KNN
基本原理
KNN(k-nearest neighbor,k近鄰)的基本原理很簡單,對于待預測樣本Xi,按照某種度量(一般是歐氏距離)與訓練資料進行相似度計算并排序,選擇最相似的前k個訓練資料,k個樣本中最多的類作為待預測樣本的類型。
代碼實作
本代碼基于《機器學習實戰》中的約會資料集與手寫體數字識别資料集
基本思想是利用numpy中的向量化函數來計算歐氏距離,比書上的源代碼簡潔
代碼檔案的讀取仍然是利用os模組的listdir函數實作
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Staaaying
import os
import numpy as np
def knn(data, label, input, k=5, label_num=3):
    """Classify one sample with k-nearest neighbours (Euclidean metric).

    Args:
        data: (n_samples, n_features) training matrix.
        label: (n_samples,) integer class labels in [0, label_num).
        input: (n_features,) sample to classify.  NOTE: the parameter name
            shadows the builtin ``input``; kept for caller compatibility.
        k: number of neighbours that vote.
        label_num: number of distinct classes.

    Returns:
        (predicted_class, vote_fraction): the majority class among the k
        nearest neighbours and its share of the k votes.
    """
    # Squared distance is monotone in true Euclidean distance, so the sqrt
    # is unnecessary for ranking neighbours.
    sq_dist = np.sum(np.power(data - input, 2), axis=1)
    nearest = np.argsort(sq_dist)[:k]
    # Count votes per class in C (bincount) instead of a Python loop.
    votes = np.bincount(label[nearest].astype(int), minlength=label_num) / k
    return votes.argmax(), votes.max()
def normalize(data):
    """Min-max scale every feature (column) of *data* into [0, 1].

    A constant column (max == min) would cause a 0/0 division and produce
    NaN in the naive formula; such columns are mapped to all zeros instead.

    Args:
        data: (n_samples, n_features) numeric array.

    Returns:
        Array of the same shape with each column scaled to [0, 1].
    """
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    # Substitute a unit range for constant features: (x - min) is 0 there,
    # so those columns come out as 0 rather than NaN.
    col_range = np.where(col_range == 0, 1, col_range)
    return (data - col_min) / col_range
def get_data(path=r'D:\ML_Study\ML_in_action\Ch02\datingTestSet.txt',
             test_data_ratio=0.1):
    """Load the dating data set and split it into train and test parts.

    Each line of the file is "int<TAB>float<TAB>float<TAB>label"; the label
    strings map to largeDoses -> 0, smallDoses -> 1, anything else -> 2.
    Features are min-max normalized, then the rows are shuffled (in place,
    via a random index permutation) and split.

    Args:
        path: tab-separated data file (default kept for backward compat).
        test_data_ratio: fraction of rows reserved for the test split.

    Returns:
        (data, label, test_data, test_label) as numpy arrays.
    """
    data = []
    label = []
    with open(path) as f:
        for line in f:
            a, b, c, d = line.split(sep='\t')
            data.append([int(a), float(b), float(c)])
            # d still carries the trailing newline, so use substring tests.
            if 'largeDoses' in d:
                label.append(0)
            elif 'smallDoses' in d:
                label.append(1)
            else:
                label.append(2)
    data = normalize(np.array(data))
    label = np.array(label)
    # Shuffle a row-index permutation rather than the rows themselves so the
    # same ordering is applied to data and label consistently.
    idx = np.arange(data.shape[0])
    np.random.shuffle(idx)
    n_test = int(idx.shape[0] * test_data_ratio)
    test_idx, train_idx = idx[:n_test], idx[n_test:]
    return data[train_idx], label[train_idx], data[test_idx], label[test_idx]
def read_data_from_file(path):
    """Read one text-encoded digit image file into a flat 0/1 vector.

    The digit's class is encoded as the first character of the file name
    (e.g. ``3_12.txt`` -> label 3).

    Args:
        path: path to a digits file (rows of '0'/'1' characters).

    Returns:
        (label, data): int class label and a flat numpy int array.
    """
    # os.path.basename is portable; the original '\\'-split only worked
    # with Windows-style paths.
    label = int(os.path.basename(path)[0])
    bits = []
    with open(path) as f:
        for line in f:
            # rstrip('\n') instead of line[:-1]: the old slice silently
            # dropped the last digit when the final line had no newline.
            bits.extend(int(ch) for ch in line.rstrip('\n'))
    return label, np.array(bits)
def get_digits(test_dir=r'D:\ML_Study\ML_in_action\Ch02\digits\testDigits',
               data_dir=r'D:\ML_Study\ML_in_action\Ch02\digits\trainingDigits'):
    """Load the handwritten-digits data set from per-sample text files.

    Every file in each directory holds one digit image; the file name's
    first character is its class label (see read_data_from_file).

    Args:
        test_dir: directory of test samples (default kept for compat).
        data_dir: directory of training samples (default kept for compat).

    Returns:
        (data, label, test_data, test_label) as numpy arrays.
    """
    def _load_dir(directory):
        # One (label, vector) pair per file; os.path.join is portable,
        # unlike the original '\\' string concatenation.
        samples = []
        labels = []
        for name in os.listdir(directory):
            lab, vec = read_data_from_file(os.path.join(directory, name))
            samples.append(vec)
            labels.append(lab)
        return np.array(samples), np.array(labels)

    data, label = _load_dir(data_dir)
    test_data, test_label = _load_dir(test_dir)
    return data, label, test_data, test_label
if __name__ == '__main__':
    # Exactly one loader must be active: with both commented out (as in the
    # original), test_data/data/label were undefined and the loop below
    # crashed with NameError. Swap the lines to run the digits experiment.
    data, label, test_data, test_label = get_data()
    # data, label, test_data, test_label = get_digits()
    epocs = 1  # number of k values to try; epoch e evaluates k = e + 1
    for epoc in range(epocs):
        acc = 0
        for i in range(test_data.shape[0]):
            pred, prob = knn(data, label, test_data[i], epoc + 1, 10)
            print('The {} input pred:{} real:{} prob:{}'.format(i, pred, test_label[i], prob))
            if pred == test_label[i]:
                acc += 1
        print('The {} epoc Accuracy:{}'.format(epoc, acc / test_data.shape[0]))
Tips
- os 模組拿來做path相關的事情非常友善,以後需要多學習
- numpy中的sort函數在對多元array排序時,是在指定的axis上分别對每行列(三維以上沒有行列概念,但差不多是這麼個意思QAQ)獨立地排序,會損失相關性!!!,解決辦法是使用額外的一個隨機索引序列idx,按照某一行列大小排序後argsort得到idx,再應用到其他行列上
- numpy的random.shuffle函數是in-place修改