樸素貝葉斯實作垃圾郵件識別

import numpy as np
from collections import Counter

'''對詞頻進行統計，統計出每個單詞出現的次數
輸入是一篇很長的文章或一個句子；斷詞可以交給jieba處理，之後再說
return：有兩種做法：1，key-value字典；2，0、1、2這類數字標記。第一種似乎比較容易實作
'''
'''用Counter一次計數檔案，這樣對原始檔案只操作一次。
統計每個單詞出現的次數
   輸入：檔案位址
   輸出:{word1:num1,word2:num2...}
   hamcnt:正常郵件
   spamcnt：垃圾郵件
   totalNum:郵件總數
   hamNum: 正常郵件數目
   spamNum: 垃圾郵件數目
'''
def seperate(filename):
    """Count per-class word frequencies from a labelled message file.

    Every line is expected to begin with a label token containing ``ham``
    or ``spam``, followed by whitespace (a tab or spaces) and the message
    text. The message text is tokenised on whitespace.

    Args:
        filename: Path of the labelled corpus. It is decoded as gb18030 and
            undecodable bytes are ignored.

    Returns:
        A tuple ``(hamcnt, spamcnt, totalNum, hamNum, spamNum)`` where
        ``hamcnt`` / ``spamcnt`` are ``Counter`` objects mapping each word to
        its number of occurrences in ham / spam messages, ``totalNum`` is the
        number of lines read (blank or unlabelled lines included), and
        ``hamNum`` / ``spamNum`` are the numbers of ham / spam lines.
    """
    hamcnt = Counter()   # word -> occurrences in ham messages
    spamcnt = Counter()  # word -> occurrences in spam messages
    totalNum = 0
    hamNum = 0
    spamNum = 0

    # ``with`` closes the handle even if parsing raises part-way through.
    with open(filename, encoding='gb18030', errors='ignore') as fh:
        for line in fh:
            totalNum += 1
            # Split the label off at the first run of whitespace, so a
            # tab-separated label cannot swallow the first word of the text.
            parts = line.split(None, 1)
            if not parts:
                continue  # blank line: counted in totalNum, belongs to no class
            label = parts[0]
            # split() with no separator also drops the trailing newline and
            # empty tokens produced by repeated spaces.
            words = parts[1].split() if len(parts) > 1 else []
            if 'ham' in label:
                hamNum += 1
                hamcnt.update(words)
            if 'spam' in label:
                spamNum += 1
                spamcnt.update(words)

    print('*********樣本的總的行數是%s**********' % totalNum)
    return hamcnt, spamcnt, totalNum, hamNum, spamNum
'''preData是一個句子，或者一封郵件。
return：1 或 0，表示是否是垃圾郵件（1 是垃圾郵件，0 是正常郵件）
過程：1,先進行分表，分為兩類---調用seperate()
     2,進行詞頻統計，為下面計算機率做準備
     3，計算兩種情況的機率：p1(spam)p(word1|spam)p(word2|spam)p(word3|spam)...
         *****修正公式p(word|spam) = (1+wordExistNum)/(wordsExistNum+wordsNum)
     4,比較機率，確定輸出結果
'''
def train(filename,preData):
    """Classify one message as ham or spam with multinomial naive Bayes.

    The corpus at ``filename`` is read through ``seperate``; each class is
    then scored as ``log P(class) + sum(log P(word | class))`` with add-one
    (Laplace) smoothing over the vocabulary shared by both classes.

    Scores are kept in log space because a raw product of many small
    probabilities underflows to 0.0 on long messages, which would make both
    classes tie. The values printed alongside the verdict are therefore
    log-scores (negative numbers, larger means more likely).

    Args:
        filename: Path of the labelled training corpus.
        preData: Text of the message to classify; tokenised on whitespace.

    Returns:
        0 when the message is judged to be ham, 1 when it is judged to be
        spam. A tie is reported as spam.

    Raises:
        ZeroDivisionError: If the corpus has no lines, or has no words at
            all while ``preData`` is non-empty.
    """
    hamcnt, spamcnt, totalNum, hamNum, spamNum = seperate(filename)

    # Total number of word occurrences seen in each class.
    wordNumerOfham = sum(hamcnt.values())
    wordNumerOfspam = sum(spamcnt.values())
    # Add-one smoothing adds one pseudo-count per distinct word of the whole
    # vocabulary, so both classes must share the same vocabulary size.
    vocabSize = len(hamcnt.keys() | spamcnt.keys())

    newPreData = preData.split()

    # Class priors: the fraction of training lines carrying each label.
    p1_ham = hamNum / totalNum
    p1_spam = spamNum / totalNum

    res1 = _class_log_score(newPreData, p1_ham, hamcnt, wordNumerOfham, vocabSize)
    res2 = _class_log_score(newPreData, p1_spam, spamcnt, wordNumerOfspam, vocabSize)

    if res1 == res2:
        print('res1',res1,'res2',res2)
    if res1 > res2:
        print('不是垃圾郵件!','正常檔案：',res1,'垃圾郵件',res2)
        return 0
    print('是垃圾郵件','正常檔案：',res1,'垃圾郵件',res2)
    return 1


def _class_log_score(words, prior, counts, classWordTotal, vocabSize):
    """Return log P(class) + sum of smoothed log P(word | class) over words."""
    import math  # local import keeps the file's import block untouched

    if prior == 0:
        # A class with no training examples can never be the answer.
        return float('-inf')
    score = math.log(prior)
    denominator = classWordTotal + vocabSize
    for word in words:
        # A Counter returns 0 for unseen words, so no lookup can fail here.
        score += math.log((counts[word] + 1) / denominator)
    return score

'''外部調用過程:
1,讀取資料
2，訓練,得到結果
'''
# Absolute Windows path of the labelled training corpus; change it to wherever
# the data set lives on the machine running the script.
filename = 'D:\\學習\\計算機教育訓練\\機器學習部分\\貝葉斯分類器\\smsspamcollection\\SMSSpamCollection.txt'

# Sample message to classify.
preData = 'You have won ?1,000 cash or a ?2,000 prize! To claim, call09050000327'
# Executed at module level: reads the corpus, prints the verdict, and keeps
# train()'s return value (0 or 1) in ``res``.
res = train(filename,preData)

樸素貝葉斯的實作理論：http://blog.csdn.net/gane_cheng/article/details/53219332 （這個連結裡面也有資料集）

英文的網址：https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html#eqn:documentprior

說明：統計文本所用的函數是 Counter，對英文文本的統計速度還是很快的。

樸素貝葉斯實作垃圾郵件識別

繼續閱讀

來自python的【條件控制/語句循環/break/continue/else/pass】一、條件控制二、語句循環

無法解析的外部符号 wmain，該符号在函數 "void cdecl mainCRTStartupHelper(struct HINSTANCE *,unsigned short con......

TestLink導出用例轉換工具(XML2Excel)

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入