直接上程式:
import pandas as pd
from collections import Counter
import time
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
class Bayes(object):
    """Naive Bayes classifier for discrete (categorical) feature values.

    Every feature value is treated as a category: conditional probabilities
    are estimated from exact-match value counts inside each class.

    Typical usage: ``fit()`` -> ``classifier()`` -> ``score()``.
    """

    def __init__(self):
        self.length = -1  # number of training samples; -1 means fit() has not run yet
        self.train_target_list = []  # distinct class labels seen during training
        self.p_train_target = {}  # class label -> prior probability
        self.split_data_lis = []  # per-class training subsets, aligned with train_target_list
        self.feature_p_lis = []  # per-class list of per-feature value Counters
        self.predict = []  # predicted labels from the most recent classifier() call

    def fit(self, train_data, train_target):
        """Estimate class priors and per-class feature value counts.

        All previously fitted state is discarded, so calling ``fit()`` again
        on the same instance trains a fresh model instead of mixing data.

        Args:
            train_data: 2-D array-like or DataFrame, one row per sample.
            train_target: sequence of class labels, one per row (matched by
                position).
        """
        # Work on a copy so the caller's object never gains the helper column.
        frame = pd.DataFrame(train_data).copy()
        labels = list(train_target)
        n_features = frame.shape[1]
        frame['target'] = labels  # helper column used to split the data by class
        label_counts = Counter(labels)

        # Reset fitted state; appending to stale lists would corrupt a re-fit.
        self.length = frame.shape[0]
        self.train_target_list = list(set(labels))
        self.p_train_target = {}
        self.split_data_lis = []
        self.feature_p_lis = []

        for target in self.train_target_list:
            self.p_train_target[target] = label_counts[target] / self.length
            split_data = frame[frame['target'] == target]
            self.split_data_lis.append(split_data)
            # Count values once here instead of once per predicted sample.
            # Columns are addressed by position so named columns work too.
            self.feature_p_lis.append(
                [Counter(split_data.iloc[:, col]) for col in range(n_features)]
            )

    def p_test_data(self, sample):
        """Return the most probable class label for a single sample.

        Args:
            sample: 1-D sequence of feature values in training column order.

        Returns:
            The label whose prior times the product of per-feature conditional
            probabilities is largest (first such label on ties).

        Raises:
            ValueError: if no classes have been fitted.
        """
        if not self.train_target_list:
            raise ValueError(
                'no fitted classes available; call fit() with non-empty training data first'
            )
        values = list(sample)
        result_p = []
        for this_target, this_data, feature_counts in zip(
                self.train_target_list, self.split_data_lis, self.feature_p_lis):
            class_size = this_data.shape[0]
            p_label = 1
            for i, value in enumerate(values):
                counts = feature_counts[i]
                if value in counts:
                    # Conditional probability of this feature value in the class.
                    p_label = p_label * (counts[value] / class_size)
                else:
                    # Unseen value: use a small non-zero probability so a single
                    # missing value does not zero out the whole product.
                    # NOTE(review): seen values are not smoothed, so this is
                    # not standard Laplace smoothing; kept as-is on purpose to
                    # leave existing predictions unchanged.
                    p_label = p_label * (1 / (class_size + len(counts)))
            # Joint score: likelihood times class prior.
            result_p.append(p_label * self.p_train_target.get(this_target))
        position = result_p.index(max(result_p))  # first class with the highest score
        return self.train_target_list[position]

    def classifier(self, test_data):
        """Predict a label for every row of ``test_data``.

        The predictions are stored in ``self.predict``; nothing is returned.
        The input object is not modified.

        Raises:
            ValueError: if fit() has not been called.
        """
        if self.length == -1:
            raise ValueError('please use fit() to train the train data set ')
        test_frame = pd.DataFrame(test_data)
        if test_frame.shape[0] == 0:
            # Nothing to predict; skip the row-wise apply on an empty frame.
            self.predict = []
            return
        self.predict = list(test_frame.apply(self.p_test_data, axis=1))

    def score(self, test_target, num):
        """Print and return the accuracy of the last classifier() call.

        Args:
            test_target: true labels, compared by position with ``self.predict``.
            num: fold number, used only in the printed message.

        Returns:
            Fraction of predictions equal to the true label.

        Raises:
            ValueError: if there are no predictions, or if ``test_target`` and
                the predictions differ in length.
        """
        if len(self.predict) == 0:
            raise ValueError('please use classifier() to get classifier target')
        actual = list(test_target)
        if len(actual) != len(self.predict):
            raise ValueError(
                'test_target has {} labels but {} predictions are stored'.format(
                    len(actual), len(self.predict))
            )
        count = sum(1 for truth, guess in zip(actual, self.predict) if truth == guess)
        score = count / len(actual)
        print('第{}折正确率為:'.format(num), score)
        return score
if __name__ == '__main__':
    # Evaluate the Bayes classifier on the iris data set with stratified
    # k-fold cross-validation, printing per-fold and average accuracy.
    n_splits = 5  # single source of truth for the fold count and the average
    iris = load_iris()
    data = iris.data
    target = iris.target
    fold = StratifiedKFold(n_splits=n_splits)
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    rate = 0.0
    for k, (train, test) in enumerate(fold.split(data, target), start=1):
        # A fresh model per fold keeps the folds independent of each other.
        classifier = Bayes()
        classifier.fit(data[train], target[train])
        classifier.classifier(data[test])
        re_soc = classifier.score(target[test], k)
        rate += re_soc
    print('平均正确率為:', rate / n_splits)
    end_time = time.perf_counter()
    time_d = end_time - start_time
    print("spend time:", time_d)
參考資料如下:
https://blog.csdn.net/qq_38233659/article/details/101553277
資料集切割 實作
https://blog.csdn.net/sinat_30353259/article/details/80932111
bayes算法講解