import numpy
from os import listdir
import jieba
import operator
from gensim import corpora,models,similarities
from numpy import *
#貝葉斯算法的實作
class Bayes:
    """A minimal naive-Bayes classifier over discrete feature vectors.

    Training stores, per label, the prior probability and the raw list of
    training vectors; prediction estimates P(feature==value | label) by
    counting matches column-wise.
    """

    def __init__(self):
        self.length = -1           # feature-vector length; -1 means "not trained yet"
        self.labelcount = dict()   # label -> prior probability of that label
        self.vectorcount = dict()  # label -> list of training vectors for that label

    def fit(self, dataSet, labels):
        """Train on parallel sequences of vectors and labels.

        Raises ValueError when the two sequences differ in length.
        Returns self so calls can be chained.
        """
        if len(dataSet) != len(labels):
            raise ValueError("您輸入的類别與我們的資料集個數不比對")
        # Restored literal: feature length comes from the first vector.
        self.length = len(dataSet[0])
        labelsnum = len(labels)        # total number of training records
        norelabel = set(labels)        # distinct labels
        for thislabel in norelabel:
            # Prior: fraction of records carrying this label.
            self.labelcount[thislabel] = labels.count(thislabel) / labelsnum
        # Group the training vectors by label.
        for vect, label in zip(dataSet, labels):
            if label not in self.vectorcount:
                self.vectorcount[label] = []
            self.vectorcount[label].append(vect)
        print("訓練結束")
        return self

    def btest(self, TestData, labelSet):
        """Classify one vector; return the most probable label in labelSet.

        Raises ValueError when called before fit().
        """
        if self.length == -1:
            raise ValueError("沒有訓練,先訓練再測試")
        # Probability of the test vector under each candidate label.
        lbdict = dict()
        for thislb in labelSet:
            p = 1  # restored literal: multiplicative identity for the likelihood
            labelpct = self.labelcount[thislb]
            allvector = self.vectorcount[thislb]
            vnum = len(allvector)  # number of training vectors for this label
            # Transpose so allvector[index] is the column of values for one feature.
            allvector = numpy.array(allvector).T
            for index in range(0, len(TestData)):
                vector = list(allvector[index])
                # P(feature == observed value | label), estimated by counting.
                p = p * vector.count(TestData[index]) / vnum
            lbdict[thislb] = p * labelpct
        # Label with the highest posterior score.
        thislabel = sorted(lbdict, key=lambda x: lbdict[x], reverse=True)[0]
        return thislabel
#進行訓練
#從檔案名得到分類資訊
def seplabel(fname):
    """Derive the class label from a training file name.

    A name like "t_1.txt" yields 1 (ham / 不是垃圾郵件); anything else
    yields 0 (spam).  NOTE(review): the original numeric literals were
    lost; this 1/0 mapping is reconstructed from the final
    `rst == 1 -> "不是垃圾郵件"` branch — confirm against the data set.
    """
    filestr = fname.split(".")[0]        # strip the extension
    thislabel = filestr.split("_")[0]    # prefix before the first underscore
    if thislabel == "t":
        classstr = 1
    else:
        classstr = 0
    return classstr
#建立詞典
# Build the gensim vocabulary dictionary from every file in the training
# directory: read each mail, segment it with jieba, and pool all tokens.
words = []
filelist = listdir("D:/python/train")
for i in range(0, len(filelist)):
    # BUG FIX: the original concatenated "D:/python/train" + name without
    # the path separator, producing e.g. "D:/python/trainabc.txt".
    data = open("D:/python/train/" + filelist[i], "r", encoding="utf-8").read()
    cdata = jieba.cut(data)
    for j in cdata:
        words.append(j)
dictdata = " ".join(words)          # joined once instead of quadratic +=
texts = [dictdata.split()]
dictionary = corpora.Dictionary(texts)
#建構訓練集資料向量以及對應的label
def traindataSet():
    """Vectorize every training mail.

    Returns (labels, trainMat): labels[i] is seplabel() of file i, and
    trainMat[i] holds file i's doc2bow pairs flattened to
    [id, count, id, count, ...], zero-padded on the right.
    """
    labels = []
    # Trailing "/" restored — the original concatenated dirname + fnamestr
    # without a separator.
    dirname = "D:/python/train/"
    trainfilelist = listdir(dirname)
    m = len(trainfilelist)
    # Each doc2bow entry contributes two numbers (token id, count), so
    # 2 * vocabulary size bounds the flattened length.
    # NOTE(review): the original width literal was lost in extraction;
    # this bound is safe but confirm against the original constant.
    trainMat = numpy.zeros((m, 2 * len(dictionary)))
    for i in range(0, m):
        fnamestr = trainfilelist[i]
        labels.append(seplabel(fnamestr))
        data = open(dirname + fnamestr, "r", encoding="utf-8").read()
        newdata = " ".join(jieba.cut(data))
        # Sparse bag-of-words: list of (token id, count) pairs.
        new_vect = dictionary.doc2bow(newdata.split())
        # Flatten [(id, count), ...] -> [id, count, id, count, ...] directly,
        # replacing the original string round-trip through str()/split().
        new_vec = [num for pair in new_vect for num in pair]
        trainMat[i, :len(new_vec)] = new_vec
    return labels, trainMat
#接下來進行貝葉斯算法訓練
# Train the Bayes classifier on the vectorized training set.
labels, trainMat = traindataSet()
bys = Bayes()
bys.fit(trainMat, labels)

# Classify one test mail: segment, vectorize with the shared dictionary,
# and flatten the bag-of-words pairs exactly as traindataSet() does.
testdata = open("D:/python/test/abc1.txt", "r", encoding="utf-8").read()
newdata = " ".join(jieba.cut(testdata))
new_vec = dictionary.doc2bow(newdata.split())
new_vec2 = [num for pair in new_vec for num in pair]
# Width must match the training matrix (2 numbers per vocabulary entry).
# NOTE(review): the original zeros((,)) literal was lost — confirm.
testMat = numpy.zeros((1, 2 * len(dictionary)))
testMat[0, :len(new_vec2)] = new_vec2
# Candidate classes: 1 = ham ("t_" files), 0 = spam, per seplabel().
labels = [0, 1]
rst = bys.btest(testMat[0], labels)
if rst == 1:
    print("不是垃圾郵件")
else:
    print("是垃圾郵件")