天天看點

樸素貝葉斯---過濾垃圾郵件

在bayes.py中添加

#樸素貝葉斯詞袋模型
def bagOfWords2VecMN(vocabList, inputSet):
    """Build a bag-of-words count vector for one document.

    Args:
        vocabList: Sequence of vocabulary words; position ``i`` of the result
            counts occurrences of ``vocabList[i]``.
        inputSet: Iterable of the document's tokens (repeats are counted).

    Returns:
        A list of ints with the same length as ``vocabList``. Tokens that are
        not in the vocabulary are ignored.
    """
    # Map each word to its FIRST position, which is what list.index() would
    # return, so duplicates in vocabList behave as before. Building the map
    # once avoids two linear scans of vocabList per token.
    firstIndex = {}
    for position, word in enumerate(vocabList):
        firstIndex.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

#解析文本
def textParse(bigString):
    """Split raw text into lower-cased word tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lower-cased tokens, in order of appearance, keeping only
        tokens whose length is greater than 2.
    """
    import re
    # Split on runs of non-word characters. The pattern must be '\W+', not
    # '\W*': a pattern that can match the empty string makes re.split() on
    # Python 3.7+ cut between every single character.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

#垃圾郵件測試函數
def spamTest():
    """Train and evaluate the naive Bayes spam filter with a random hold-out.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and the
    same-numbered files under ``email/ham/`` (label 0), randomly holds out
    10 of the 50 documents as a test set, trains on the remaining ones, then
    prints each misclassified document followed by the error rate.

    Relies on ``createVocabList``, ``trainNB0``, ``classifyNB``, ``array``
    and ``random`` being available at module level.

    NOTE(review): the numeric literals (25 files per class, labels 1/0,
    50 documents, 10 test samples) were restored from the inline comments of
    the original listing -- confirm against the data set in use.
    """
    docsPerClass = 25
    testSize = 10

    # Load and tokenize the corpus: for every index, spam first, then ham.
    docList = []
    classList = []
    for i in range(1, docsPerClass + 1):
        for pattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            with open(pattern % i) as handle:  # close the file deterministically
                docList.append(textParse(handle.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # Start with every document index in the training set, then move
    # `testSize` randomly chosen indices over to the test set.
    # list(...) is required so that items can be removed on Python 3 too.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(testSize):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Build the count vectors and labels for the training documents.
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    # The single-argument print(...) form yields the same output as the
    # original Python 2 print statements and is also valid on Python 3.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("%s %s" % ("classification error", docList[docIndex]))
    # Convert BEFORE dividing: float(errorCount / len(testSet)) truncates to
    # an integer first on Python 2 and so reports 0.0 almost always.
    print("%s %s" % ('the error rate is : ', float(errorCount) / len(testSet)))
           

測試:

>>> import bayes
>>> bayes.spamTest()
the error rate is :  0.0
>>> 
           

繼續閱讀