自己一直在学习计算机视觉方面的东西,现在想学习一下数据挖掘和搜索引擎。自己基础有点薄弱,看了朱明的那本《数据挖掘》,只能片面地了解数据挖掘。不过最近有一本书《机器学习实战》,于是通过实战的形式了解一下基本算法的执行过程。
# -*- coding: utf-8 -*-
from numpy import *
import operator


def createDataSet():
    """Return a tiny hard-coded training set for demonstrating kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of points
        and ``labels`` is a list holding one class label per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Distances are Euclidean. Intermediate values are printed for inspection.

    Args:
        inX: The point to classify; it must broadcast against the rows of
            ``dataSet``.
        dataSet: 2-D array with one training sample per row.
        labels: Sequence giving the class label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest rows. Ties are
        resolved by the iteration order of the vote dictionary.

    Raises:
        IndexError: If ``k`` is not between 1 and the number of training rows.
    """
    # Number of rows, i.e. the number of training samples.
    dataSetSize = dataSet.shape[0]
    print('dataSetSize:', dataSetSize)
    # Broadcasting subtracts every training row from inX in one step; this
    # replaces tiling inX into a dataSetSize-row matrix first.
    diffMat = asarray(inX) - dataSet
    print('diffMat:', diffMat)
    sqDiffMat = diffMat ** 2
    print('sqDiffMat:', sqDiffMat)
    # axis=1 sums across each row, giving one squared distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    print('sqDistances:', sqDistances)
    distances = sqDistances ** 0.5
    print('distances:', distances)
    # Row indices ordered from the nearest sample to the farthest.
    sortedDistIndicies = distances.argsort()
    print('sortedDistIndicies:', sortedDistIndicies)
    # Maps each label to the number of votes it received.
    classCount = {}
    print('labels:', labels)
    for i in range(k):
        # Label of the i-th nearest training sample.
        voteIlabel = labels[sortedDistIndicies[i]]
        # dict.get returns 0 for a label that has not been seen yet.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    print('classCount:', classCount)
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    # Sort the (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    print('sortedClassCount:', sortedClassCount)
    return sortedClassCount[0][0]
1: >>> import kNN
2: >>> group,labels = kNN.createDataSet()
3: >>> kNN.classify0([0,0],group,labels,3)
4: ('dataSetSize:', 4L)
5: ('diffMat:', array([[-1. , -1.1],
6: [-1. , -1. ],
7: [ 0. , 0. ],
8: [ 0. , -0.1]]))
9: ('sqDiffMat:', array([[ 1. , 1.21],
10: [ 1. , 1. ],
11: [ 0. , 0. ],
12: [ 0. , 0.01]]))
13: ('sqDistances:', array([ 2.21, 2. , 0. , 0.01]))
14: ('distances:', array([ 1.48660687, 1.41421356, 0. , 0.1 ]))
15: ('sortedDistIndicies:', array([2, 3, 1, 0], dtype=int64))
16: ('labels:', ['A', 'A', 'B', 'B'])
17: ('classCount:', {'A': 1, 'B': 2})
18: ('sortedClassCount:', [('B', 2), ('A', 1)])
19: 'B'
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    The first three fields of every line are stored as features and the last
    field is parsed as an integer class label. Intermediate values are
    printed for inspection.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        array of shape (number of lines, 3) and ``classLabelVector`` is the
        list of integer labels, one per line.
    """
    # The context manager closes the file even if reading fails.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)
    print('numberOfLines:', numberOfLines)
    returnMat = zeros((numberOfLines, 3))
    print('returnMat:', returnMat)
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # strip() without arguments removes leading/trailing whitespace.
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    print('returnMat:', returnMat)
    print('classLabelVector:', classLabelVector[0:20])
    return returnMat, classLabelVector
1: >>> reload(kNN)
2: <module 'kNN' from 'E:\Machine Learning\exercise\ch02\kNN.py'>
3: >>> datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
4: ('numberOfLines:', 1000)
5: ('returnMat:', array([[ 0., 0., 0.],
6: [ 0., 0., 0.],
7: [ 0., 0., 0.],
8: ...,
9: [ 0., 0., 0.],
10: [ 0., 0., 0.],
11: [ 0., 0., 0.]]))
12: ('returnMat:', array([[ 4.09200000e+04, 8.32697600e+00, 9.53952000e-01],
13: [ 1.44880000e+04, 7.15346900e+00, 1.67390400e+00],
14: [ 2.60520000e+04, 1.44187100e+00, 8.05124000e-01],
15: ...,
16: [ 2.65750000e+04, 1.06501020e+01, 8.66627000e-01],
17: [ 4.81110000e+04, 9.13452800e+00, 7.28045000e-01],
18: [ 4.37570000e+04, 7.88260100e+00, 1.33244600e+00]]))
19: ('classLabelVector:', [3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3])
根据作者使用,我们画出玩视频游戏所消耗的时间百分比 与每周所消费的冰淇淋公升数
1: >>> import matplotlib
2: >>> import matplotlib.pyplot as plt
3: >>> fig = plt.figure()
4: >>> ax = fig.add_subplot(111)  # 这边111表示把绘图区域分成1行*1列共1个区域,然后在区域1上创建一个轴对象
5: >>> ax.scatter(datingDataMat[:,1],datingDataMat[:,2])  # scatter表示散点图
6: <matplotlib.collections.PathCollection object at 0x00000000064DD128>
7: >>> plt.show()
1: ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with (value - min) / (max - min).

    Intermediate values are printed for inspection.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is the
        rescaled data, ``ranges`` is the per-column ``max - min`` and
        ``minVals`` is the per-column minimum. A column whose values are all
        equal has a range of 0; it is normalised to 0 instead of NaN, while
        ``ranges`` still reports the true value 0 for it.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    print('minVals:', minVals)
    print('maxVals:', maxVals)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    print('m:', m)
    # Divide constant columns by 1 so that 0/0 does not produce NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min and range to every row, so the
    # explicit tile() copies are not needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
1: >>> reload(kNN)
2: <module 'kNN' from 'E:\Machine Learning\exercise\ch02\kNN.py'>
3: >>> normMat,ranges,minVals = kNN.autoNorm(datingDataMat)
4: ('minVals:', array([ 0. , 0. , 0.001156]))
5: ('maxVals:', array([ 9.12730000e+04, 2.09193490e+01, 1.69551700e+00]))
6: ('m:', 1000L)
7: >>> normMat
8: array([[ 0.44832535, 0.39805139, 0.56233353],
9: [ 0.15873259, 0.34195467, 0.98724416],
10: [ 0.28542943, 0.06892523, 0.47449629],
11: ...,
12: [ 0.29115949, 0.50910294, 0.51079493],
13: [ 0.52711097, 0.43665451, 0.4290048 ],
14: [ 0.47940793, 0.3768091 , 0.78571804]])
15: >>> ranges
16: array([ 9.12730000e+04, 2.09193490e+01, 1.69436100e+00])
17: >>> minVals
18: array([ 0. , 0. , 0.001156])
def datingClassTest(hoRatio=0.10, filename='datingTestSet.txt', k=5):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The first ``hoRatio`` fraction of the rows is used as test samples and
    the remaining rows as training samples. Each prediction and the final
    error rate are printed.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ZeroDivisionError: If the hold-out slice contains no rows.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # The training slice is the same for every test row, so build it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        # print(...) with one pre-formatted string behaves the same on
        # Python 2 and Python 3.
        print("the classifier came back with:%d,the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is :%f" % (errorCount / float(numTestVecs)))
1: >>> import kNN
2: >>> kNN.datingClassTest()
3: the classifier came back with:3,the real answer is: 3
4: the classifier came back with:2,the real answer is: 2
5: the classifier came back with:1,the real answer is: 1
6: 。。。。。。
7: the classifier came back with:3,the real answer is: 3
8: the classifier came back with:3,the real answer is: 3
9: the classifier came back with:2,the real answer is: 2
10: the classifier came back with:2,the real answer is: 1
11: the classifier came back with:1,the real answer is: 1
12: the total error rate is :0.040000
def classifyPerson():
    """Ask for three feature values and print the predicted liking level.

    The answers are read from standard input, normalised with the minimum
    and range of the data in 'datingTestSet.txt', and classified with
    ``classify0`` using 3 neighbours.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): this order is assumed to match the column order of the
    # data file (miles, game time, ice cream) -- confirm against the data.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Labels are 1, 2, 3 while list indices are 0, 1, 2, hence the "- 1".
    # "%s %s" reproduces the separating space that the original Python 2
    # print statement inserted between its two operands.
    print("%s %s" % ("You will probably like this person: ",
                     resultList[classifierResult - 1]))
1: >>> import kNN
2: >>> kNN.classifyPerson()
3: percentage of time spent playing video games?11
4: frequent flier miles earned per year?11111
5: liters of ice cream consumed per year?0.6
6: You will probably like this person: in large doses
7: >>> kNN.classifyPerson()
8: percentage of time spent playing video games?10
9: frequent flier miles earned per year?10000
10: liters of ice cream consumed per year?0.5
11: You will probably like this person: in small doses
# Store the image text data in returnVect.
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        An array of shape (1, 1024) in which element ``32 * i + j`` holds the
        integer value of character ``j`` on line ``i``.
    """
    # The image is 32 x 32 pixels.
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
1: >>> import kNN
2: >>> kNN.img2vector('testDigits/0_13.txt')
3: array([[ 0., 0., 0., ..., 0., 0., 0.]])
4: >>> testVector = kNN.img2vector('testDigits/0_13.txt')
5: >>> testVector[0,0:31]
6: array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
7: 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
8: 0., 0., 0., 0., 0.])
def handwritingClassTest():
    """Classify every file in 'testDigits' against 'trainingDigits' with kNN.

    The class of a sample is the integer before the first '_' in its file
    name (e.g. '0_12.txt' -> 0). Each prediction, the number of errors and
    the error rate are printed.

    Raises:
        ZeroDivisionError: If 'testDigits' contains no files.
    """
    # Imported here because the module header does not import listdir.
    from os import listdir

    hwLabels = []
    # List the training directory.
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # '0_12.txt' -> '0_12' -> 0: the file name encodes the digit class.
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # print(...) with one pre-formatted string behaves the same on
        # Python 2 and Python 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
1: >>> import kNN
2: >>> kNN.handwritingClassTest()
3: the classifier came back with: 0, the real answer is: 0
4: the classifier came back with: 0, the real answer is: 0
5: 。。。。。。
6: the classifier came back with: 9, the real answer is: 9
7: the classifier came back with: 9, the real answer is: 9
8: the classifier came back with: 9, the real answer is: 9
9: the classifier came back with: 9, the real answer is: 9
11: the total number of errors is: 11
13: the total error rate is: 0.011628