
k-近鄰算法 From Machine Learning

Python 3.6 WIN10環境下,PyCharm IDE寫代碼,前半部分(電影分類問題)使用cmd執行資料輸入,後半部分(約會網站問題開始)直接在IDE的Console區域執行資料輸入;path環境路徑使用Anaconda,暫時發現其他的環境路徑遇到matplotlib無法安裝成功的問題:








k-近鄰算法 From Machine Learning










from numpy import *
import operator
def createDataSet():
    """Build the tiny toy training set used to demonstrate the classifier.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 array holding one
        sample point per row; ``labels`` is a list with the class label
        ('A' or 'B') of the corresponding row.
    """
    # Two points near (1, 1) belong to class 'A', two near the origin to 'B'.
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classes = ['A'] * 2 + ['B'] * 2
    return array(samples), classes


>>> group,labels = kNN.createDataSet()
>>> group
array([[ 1. ,  1.1],
       [ 1. ,  1. ],
       [ 0. ,  0. ],
       [ 0. ,  0.1]])
>>> labels
['A', 'A', 'B', 'B']

分類方法,在cmd中調用執行:inX-要分類的輸入向量; dataSet-輸入的訓練樣本集;labels-标簽向量;k-選擇最近鄰居的數目


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify (a list or array with one value per
            column of ``dataSet``).
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the label of row
            ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest samples.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()

    # The vote tally must exist before the loop; it was previously never
    # created, so the first vote raised NameError.
    classCount = {}
    for i in range(k):  # tally the labels of the k closest points
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order the labels by vote count, highest first.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]


>>> kNN.classify0([0,0], group, labels, 3)
>>> kNN.classify0([0,1], group, labels, 3)
>>> kNN.classify0([1,1.1], group, labels, 3)


解析已知樣本資料,樣本資料包含特征值和目标值。将未知對象(由特征值定義)歸類(目标值),三類:1.不喜歡的人 2.魅力一般的人 3.極具魅力的人










def file2matrix(filename):
    """Parse a tab-separated dating data file into features and labels.

    Every line is expected to hold three numeric feature columns followed by
    a textual class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair. ``returnMat`` is an N x 3
        float array with the first three columns of each line;
        ``classLabelVector`` is a list of N integers: 3 for 'largeDoses',
        2 for 'smallDoses' and 1 for any other label (e.g. 'didntLike').
    """
    # Read everything up front; the context manager closes the file even if
    # parsing fails later on.
    with open(filename) as fr:
        arrayOLines = fr.readlines()

    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # Drop the trailing newline, then split the row on tab characters.
        listFromLine = line.strip().split('\t')
        # The first three fields are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last field is the class label; map it to its numeric code.
        label = listFromLine[-1]
        if label == 'largeDoses':
            classLabelVector.append(3)
        elif label == 'smallDoses':
            classLabelVector.append(2)
        else:
            # This branch was missing: 'smallDoses' rows got two labels and
            # 'didntLike' rows none, so labels no longer lined up with rows.
            classLabelVector.append(1)

    return returnMat, classLabelVector


>>> import kNN
>>> from numpy import *
>>> datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels), 15.0*array(datingLabels))
<matplotlib.collections.PathCollection object at 0x000002B853F7F860>
>>> plt.show()


Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'array' is not defined


>>>from numpy import *


k-近鄰算法 From Machine Learning


ax.scatter(datingDataMat[:,1], datingDataMat[:,2], 15.0*array(datingLabels), 15.0*array(datingLabels))


ax.scatter(datingDataMat[:,0], datingDataMat[:,1], 15.0*array(datingLabels), 15.0*array(datingLabels))


k-近鄰算法 From Machine Learning



newValue = (oldValue - min)/(max - min)



def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value is transformed column by column as
    ``newValue = (oldValue - min) / (max - min)``, so features with large
    numeric ranges do not dominate the distance computation.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple: the normalised array,
        the per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)            # per-column minimum
    ranges = dataSet.max(0) - minVals   # per-column spread
    rowCount = dataSet.shape[0]
    # tile() repeats the per-column vectors once per row so both operations
    # below are plain element-wise array arithmetic (not matrix division).
    shifted = dataSet - tile(minVals, (rowCount, 1))
    scaled = shifted / tile(ranges, (rowCount, 1))
    return scaled, ranges, minVals


>>> import kNN
>>> datingDataMat,datingLabels=kNN.file2matrix('datingTestSet.txt')
>>> normMat,ranges,minVals = kNN.autoNorm(datingDataMat)
>>> normMat
array([[ 0.44832535,  0.39805139,  0.56233353],
       [ 0.15873259,  0.34195467,  0.98724416],
       [ 0.28542943,  0.06892523,  0.47449629],
       [ 0.29115949,  0.50910294,  0.51079493],
       [ 0.52711097,  0.43665451,  0.4290048 ],
       [ 0.47940793,  0.3768091 ,  0.78571804]])
>>> ranges
array([  9.12730000e+04,   2.09193490e+01,   1.69436100e+00])
>>> minVals
array([ 0.      ,  0.      ,  0.001156])




def datingClassTest():
    """Run a hold-out evaluation of the classifier on the dating data set.

    The first 10% of the rows of 'datingTestSet.txt' are used as test
    samples and the remaining 90% as training samples. Each prediction is
    printed next to the true label, followed by the overall error rate.
    """
    holdOutRatio = 0.1
    features, answers = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(features)
    total = normMat.shape[0]
    testCount = int(total * holdOutRatio)

    # Everything after the test rows serves as the training set.
    trainingSet = normMat[testCount:total, :]
    trainingAnswers = answers[testCount:total]

    mistakes = 0.0
    for row in range(testCount):
        predicted = classify0(normMat[row, :], trainingSet, trainingAnswers, 3)
        expected = answers[row]
        print("the classifier came back with: %d, the real answer is: %d" %(predicted, expected))
        if predicted != expected:
            mistakes += 1.0
    print("the total error rate is:%f"%(mistakes/float(testCount)))


>>> import kNN
>>> from numpy import *
>>> kNN.datingClassTest()
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is:0.050000


>>> import kNN
>>> from numpy import *
>>> kNN.datingClassTest()
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the total error rate is:0.000000


最後使用該算法來判斷是否喜歡一個人:這部分直接在PyCharm IDE中運作,不再在cmd中運作,再添加如下代碼:

def classifyPerson():
    """Interactively predict how much the user would like a person.

    Prompts for the three feature values, normalises them with the minimum
    and range computed from 'datingTestSet.txt', classifies the result with
    the 3 nearest neighbours and prints the verdict.
    """
    # Index i holds the text shown for class label i + 1.
    resultList = ['不喜歡','有些喜歡','非常喜歡']
    precentTats = float(input("玩視訊遊戲所耗時間百分比:"))
    ffMiles = float(input("每年獲得的飛行常客裡程數:"))
    iceCream = float(input("每周消費的冰激淋公升數:"))
    filename = "datingTestSet.txt"
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # The module imports numpy with `from numpy import *`, so the name `np`
    # does not exist here; call `array` directly.
    inArr = array([ffMiles, precentTats, iceCream])
    # Apply the same min-max scaling that was applied to the training data.
    norminArr = (inArr - minVals) / ranges
    classifierResult = classify0(norminArr, normMat, datingLabels, 3)
    print("你可能%s這個人" % (resultList[classifierResult-1]))

if __name__ == '__main__':
    # datingClassTest()
    # A guard whose body is only a comment is a syntax error; run the
    # interactive classifier introduced in this section.
    classifyPerson()

直接Run kNN.py,在Console區域輸出結果如下:




if __name__ == '__main__':
    # datingClassTest()
    # A guard whose body is only a comment is a syntax error; run the
    # interactive classifier introduced in this section.
    classifyPerson()


def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    The file is read as 32 lines; the first 32 characters of each line are
    converted with ``int`` and stored row after row.

    Args:
        filename: Path of the text image to read.

    Returns:
        A 1x1024 array where element ``32 * i + j`` holds the value of
        character ``j`` on line ``i``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file; the handle used to be leaked,
    # which adds up when thousands of images are loaded.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

def handwritingClassTest():
    """Evaluate the classifier on the handwritten-digit text images.

    Every file in 'trainingDigits' becomes one training sample and every
    file in 'testDigits' is classified with the 3 nearest neighbours. The
    true digit is taken from the part of the file name before the first
    underscore. Each prediction, the error count, the error rate and the
    elapsed wall-clock time are printed.
    """
    # Imported locally so the function does not depend on module-level
    # imports of `time` and `listdir`, which the visible imports lack.
    import time
    from os import listdir

    cpu_start = time.time()
    print('start:%f' % cpu_start)

    # Build the training matrix and the matching label list.
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros([m, 1024])
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        # The label was parsed but never stored, leaving hwLabels empty and
        # making classify0 fail when it looked up a neighbour's label.
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    # Classify every test image and count the mismatches.
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("分類器傳回值為:%d, 實際值為:%d" %(classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\n錯誤總數為: %d" % errorCount)
    print("\n總錯誤率為: %f" %(errorCount/float(mTest)))
    cpu_end = time.time()
    print('end:%f' % cpu_end)
    print("total time: %f S" % (cpu_end - cpu_start))
if __name__ == '__main__':
    # Earlier experiments, kept for reference:
    # datingClassTest()
    # testVector=img2vector('testDigits/0_13.txt')
    # print(testVector[0,32:63])
    # A guard whose body is only comments is a syntax error; run the
    # handwriting evaluation defined above.
    handwritingClassTest()


錯誤總數為: 10

總錯誤率為: 0.010571
total time: 33.872470 S





第1列為每年獲得的飛行常客裡程數,第2列為玩視訊遊戲所耗時間百分比,第3列為每周消費的冰淇淋公升數,第4列為目标值:非常喜歡-largeDoses 有點喜歡-smallDoses 不喜歡-didntLike:

40920	8.326976	0.953952	largeDoses
14488	7.153469	1.673904	smallDoses
26052	1.441871	0.805124	didntLike
75136	13.147394	0.428964	didntLike
38344	1.669788	0.134296	didntLike


