Machine Learning: Implementing the Naive Bayes Algorithm in Python
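
In Naive Bayes terms, the classifier below scores a sample x = (x_1, ..., x_n) against every class y and returns the class with the largest value of

P(y) * P(x_1 | y) * ... * P(x_n | y)

The prior P(y) is estimated from the class frequencies in the training fold, and each conditional probability P(x_i | y) from value counts inside the training rows of class y. Every distinct iris measurement is treated as its own discrete category, and a value never seen within a class falls back to a Laplace-style smoothing term so the product does not collapse to zero. A comparison against scikit-learn's GaussianNB, which models the features as per-class Gaussians instead, is sketched after the listing.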

Here is the full program:

import pandas as pd
from collections import Counter
import time
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold


class Bayes(object):
    def __init__(self):
        self.length = -1  # number of samples in the training set
        self.train_target_list = []  # distinct target classes
        self.p_train_target = {}  # prior probability of each class
        self.split_data_lis = []  # per-class subsets of the training data, used for the conditional probabilities
        self.feature_p_lis = []  # feature probabilities (reserved, not used below)
        self.predict = []  # predicted labels for the test set

    def fit(self, train_data, train_target):

        train_length = train_data.shape[0]
        self.length = train_length
        target_list = list(set(train_target))  # deduplicate the training target values
        self.train_target_list = target_list  # store the class list on the object
        target_classifier = dict(Counter(train_target))  # per-class sample counts (as a dict)
        train_data = pd.DataFrame(train_data)
        train_data['target'] = train_target  # keep the labels alongside the features so each class can be sliced out below
        for target in self.train_target_list:
            self.p_train_target[target] = target_classifier[target] / self.length  # prior probability of this class
            split_data = train_data[train_data['target'] == target]
            self.split_data_lis.append(split_data)

    def p_test_data(self, sample):

        result_p = []
        for j in range(len(self.train_target_list)):
            p_label = 1
            this_target = self.train_target_list[j]
            this_data = self.split_data_lis[j]
            for i in range(0, sample.shape[0]):
                feature_num_dict = dict(Counter(this_data[i]))  # value counts of feature i within this class
                if sample[i] in feature_num_dict:
                    label_num = feature_num_dict.get(sample[i])
                    p_label = p_label * (label_num / this_data.shape[0])  # conditional probability of this feature value
                else:
                    # Laplace-style smoothing for a value never seen in this class, so the product does not become zero
                    p_label = p_label * (1 / (this_data.shape[0] + len(feature_num_dict)))
            this_target_p = p_label * self.p_train_target.get(this_target)  # posterior score of this class for the sample
            result_p.append(this_target_p)
        position = result_p.index(max(result_p))  # index of the class with the largest score
        return self.train_target_list[position]

    def classifier(self, test_data):

        if self.length == -1:
            raise ValueError('please call fit() on the training set first')
        else:
            test_data = pd.DataFrame(test_data)
            test_data['target'] = test_data.apply(self.p_test_data, axis=1)  # classify each test row
            self.predict = list(test_data['target'])

    def score(self, test_target, num):
        if len(self.predict) == 0:
            raise ValueError('please call classifier() to generate predictions first')
        else:
            count = 0
            for i in range(0, test_target.shape[0]):
                if test_target[i] == self.predict[i]:
                    count += 1
            score = count / (test_target.shape[0])
            print('Fold {} accuracy:'.format(num), score)
            return score


if __name__ == '__main__':
    iris = load_iris()
    data = iris.data
    target = iris.target
    fold = StratifiedKFold(n_splits=5)
    start_time = time.time()
    rate = 0.0
    k = 0
    for train, test in fold.split(data, target):
        k += 1
        classifier = Bayes()
        classifier.fit(data[train], target[train])
        classifier.classifier(data[test])
        re_soc = classifier.score(target[test], k)
        rate += re_soc
    print('Average accuracy:', rate / 5)
    end_time = time.time()
    time_d = end_time - start_time
    print("spend time:", time_d)


References:

Dataset splitting implementation: https://blog.csdn.net/qq_38233659/article/details/101553277

Naive Bayes algorithm explanation: https://blog.csdn.net/sinat_30353259/article/details/80932111