
The ID3 Algorithm in Data Mining

An implementation of the ID3 data-mining algorithm, written by following guides found online; for reference only.

The dataset is as follows:

Outlook   Temperature  Humidity  Windy  Play
sunny     hot          high      FALSE  no
sunny     hot          high      TRUE   no
overcast  hot          high      FALSE  yes
rain      mild         high      FALSE  yes
rain      cool         normal    FALSE  yes
rain      cool         normal    TRUE   no
overcast  cool         normal    TRUE   yes
sunny     mild         high      FALSE  no
sunny     cool         normal    FALSE  yes
rain      mild         normal    FALSE  yes
sunny     mild         normal    TRUE   yes
overcast  mild         high      TRUE   yes
overcast  hot          normal    FALSE  yes
rain      mild         high      TRUE   no
The complete implementation in Python:

import pandas as pd
import numpy as np
from math import log
import operator


def getDataSet():
    # pandas.read_excel does not take an encoding argument, so it is dropped here
    DataSet = pd.read_excel(r"ID3資料集.xlsx")
    DataArr = np.array(DataSet)
    columns = np.array(DataSet.columns[:-1])  # feature names, excluding the label column
    return DataArr.tolist(), columns.tolist()  # rows (with labels) and feature names
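
The Excel file itself is not attached to the post. As a stand-in, the fourteen rows of the table above can be hard-coded; the name getDataSetInline below is made up for this sketch and is not part of the original code:

def getDataSetInline():
    # Hypothetical drop-in replacement for getDataSet() when the .xlsx file is missing
    columns = ['Outlook', 'Temperature', 'Humidity', 'Windy', 'Play']
    rows = [
        ['sunny',    'hot',  'high',   'FALSE', 'no'],
        ['sunny',    'hot',  'high',   'TRUE',  'no'],
        ['overcast', 'hot',  'high',   'FALSE', 'yes'],
        ['rain',     'mild', 'high',   'FALSE', 'yes'],
        ['rain',     'cool', 'normal', 'FALSE', 'yes'],
        ['rain',     'cool', 'normal', 'TRUE',  'no'],
        ['overcast', 'cool', 'normal', 'TRUE',  'yes'],
        ['sunny',    'mild', 'high',   'FALSE', 'no'],
        ['sunny',    'cool', 'normal', 'FALSE', 'yes'],
        ['rain',     'mild', 'normal', 'FALSE', 'yes'],
        ['sunny',    'mild', 'normal', 'TRUE',  'yes'],
        ['overcast', 'mild', 'high',   'TRUE',  'yes'],
        ['overcast', 'hot',  'normal', 'FALSE', 'yes'],
        ['rain',     'mild', 'high',   'TRUE',  'no'],
    ]
    return rows, columns[:-1]  # rows with labels, and the feature names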

# Compute the Shannon entropy of the class labels in dataSet
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for feaVec in dataSet:
        currentLabel = feaVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
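
As a sanity check, the full table has 9 yes rows and 5 no rows, so the entropy at the root should be -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940:

data, label = getDataSet()
print(calcShannonEnt(data))  # ≈ 0.940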


def splitDataSet(dataSet, axis, value):
    # Return the rows whose feature at index `axis` equals `value`, with that column removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
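
Continuing with the data loaded above: splitting on feature 0 (Outlook) with value 'overcast' should return the four overcast rows, all labeled yes, with the Outlook column stripped off:

sub = splitDataSet(data, 0, 'overcast')
print(len(sub), [row[-1] for row in sub])  # 4 ['yes', 'yes', 'yes', 'yes']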


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
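
For this table the gains come out to roughly Gain(Outlook) ≈ 0.246, Gain(Humidity) ≈ 0.151, Gain(Windy) ≈ 0.048 and Gain(Temperature) ≈ 0.029, so the root split should land on Outlook:

print(chooseBestFeatureToSplit(data))  # 0, the index of Outlook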


# The tree is built by consuming one attribute per level, so the attributes may run
# out while a subset still contains both classes; in that case the node's class is
# decided by majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # max(classCount) alone would return the lexicographically largest key;
    # sort by count to get the majority class instead
    return max(classCount.items(), key=operator.itemgetter(1))[0]
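
A quick check that the corrected vote picks the majority class rather than the alphabetically largest key:

print(majorityCnt(['yes', 'no', 'yes']))  # yes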


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # stop when every row has the same class
        return classList[0]
    if len(dataSet[0]) == 1:  # only the label column is left: all features used up
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # this feature is consumed at this level
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursive calls don't mutate this level's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
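
The post stops at building the tree; to predict with it, the nested dicts have to be walked down to a leaf. The classify function below is not part of the original code, only a minimal sketch (note that createTree deletes entries from the labels list it is given, so keep an untouched copy of the feature names for classification):

def classify(tree, featLabels, testVec):
    # Descend through the nested dicts until a leaf (a plain class label) is reached
    node = tree
    while isinstance(node, dict):
        featLabel = next(iter(node))           # the feature tested at this node
        featIndex = featLabels.index(featLabel)
        node = node[featLabel][testVec[featIndex]]
    return node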


def main():
    data, label = getDataSet()
    myTree = createTree(data, label)
    print(myTree)


if __name__ == '__main__':
    main()
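
With the table above, the script should print the familiar play-tennis tree, along the lines of (dict ordering may differ, and pandas may read the TRUE/FALSE cells back as Python booleans rather than strings):

{'Outlook': {'overcast': 'yes',
             'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}},
             'rain': {'Windy': {'FALSE': 'yes', 'TRUE': 'no'}}}}

Using the classify sketch above, a new day such as ['sunny', 'cool', 'high', 'FALSE'] would then come out as 'no':

data, label = getDataSet()
myTree = createTree(data, label[:])  # pass a copy: createTree consumes its labels argument
print(classify(myTree, label, ['sunny', 'cool', 'high', 'FALSE']))  # no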