天天看點

資料挖掘習題選做--第六章:Apriori算法、FP-growth

資料挖掘概念與技術習題選做

第六章習題

資料挖掘習題選做--第六章:Apriori算法、FP-growth

(1) 用python簡單實作Apriori算法

# -*- coding: utf-8 -*-
__author__ = "Yunfan Yang"

def gen_L1(TID):
    """從事務集中産生頻繁1項集"""
    initial_C1 = {}        # 定義一個空字典用于統計初始項集資訊,鍵值對形如{"M": 3}

    for tid in TID:
        for item in tid:
            if item not in initial_C1.keys():
                initial_C1[item] = 
            else:
                initial_C1[item] += 

    # 候選1項集
    C1 = []
    for key, value in initial_C1.items():
        tmp_tuple = ([key], value)
        C1.append(tmp_tuple)
    # C1結果為[(['M'], 3), (['O'], 4), (['N'], 2), (['K'], 5), (['E'], 4), (['Y'], 3), (['D'], 1), (['A'], 1), (['U'], 1), (['C'], 2), (['I'], 1)]

    # 頻繁1項集
    L1 = []
    for item in C1:
        if item[] / len(TID) >= min_sup:
            L1.append(item)

    # L1結果為[(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)]
    return L1

def generateC_k(Lk_1):
    """産生候選k項集的函數"""
    C_k = []

    # 執行連接配接步驟
    for i in range(len(Lk_1)-):     # 周遊Lk_1中的項集
        # print(C)
        for j in range(i+, len(Lk_1)):
            if len(Lk_1[i][]) <= :    # 頻繁項集隻有1項,則直接連接配接,産生候選項
                C = Lk_1[i][][:]  # 複制Lk_1[i][0]給C
                C.append(Lk_1[j][][-])
            elif len(Lk_1[i][]) > :
                if if_equal(Lk_1[i][][:-], Lk_1[j][][:-]) and Lk_1[i][][-] != Lk_1[j][][-]: # 連接配接條件
                    C = Lk_1[i][][:]
                    C.append(Lk_1[j][][-])   # 連接配接步
                else:
                    break
            if has_infrequent_subset(C, Lk_1):       # 剪枝步:删除非頻繁候選
                pass
            else:
                C_k.append(C)
    return C_k

# Lk_1 = [(['O', 'K'], 3), (['O', 'E'], 3)]
def if_equal(A, B):
    """判斷兩個長度相等的清單是否相等"""
    for i in range(len(A)):
        if A[i] == B[i]:
            continue
        else:
            return False
    return True

# L1 = [(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)]
# C = ['M', 'I']
def has_infrequent_subset(C, Lk_1):
    """利用先驗知識,判斷候選項的子集是否有子集不屬于頻繁集"""
    L = []  # 建立一個空清單隻包含頻繁項集,形如[['M'], ['O']]
    for i in range(len(Lk_1)):
        L.append(Lk_1[i][])
    # print(L)
    for element in subset(C):
        # print(element)
        if element not in L:   # 如果候選項的子集不在Lk_1中,傳回True
            return True
    return False

def subset(C):
    """産生一個清單的長度減1項子集,比如[1,2,3]生成[[1,2],[1,3],[2,3]]"""
    result = []
    for i in range(len(C)):
        copyC = C[:]
        copyC.pop(i)
        subsetC = copyC
        result.append(subsetC)
    return result

def AinB(A, B):
    """定義函數檢測清單B是否全部包含清單A的元素"""
    for a in A:
        if a in B:
            continue
        else:
            return False
    return True

def main_apriori(Lk_1):
    """Apriori主程式"""

    while Lk_1[-] != []:
        Ck = generateC_k(Lk_1[-])   # 産生候選項集
        # print(Ck)
        Lk = []                     # 初始化每次要生成的頻繁項集
        for i in range(len(Ck)):
            count =                # 初始化計數
            for tid in TID:    # 掃描事務庫,進行計數
                if AinB(Ck[i], tid):   # 判斷候選項集中的候選項是否出現在事務集的事務中
                    count += 
                    lk = (Ck[i], count)

            if lk[] / len(TID) >= min_sup:  # 判斷是否滿足最小支援度
                Lk.append(lk)
        Lk_1.append(Lk)

    return Lk_1      # 得出每次生成的頻繁項集清單

def gen_rule_set(Lk_1):
    """生成隻包含關聯規則項集的清單"""
    # 因為最後的頻繁集的所有非空子集及支援度計數都出現在清單Lk_1中,故可以跟據Lk_1的清單尋找關聯規則
    # 首先生成一個新清單隻包含最終頻繁集的及其非空子集
    rule_set = []
    for i in range(len(Lk_1) - ):  # 周遊Lk_1的每個頻繁項集
        for L in Lk_1[i]:  # 對于每個頻繁項
            if AinB(L[], Lk_1[-][][]):  # 如果頻繁項的所有元素都在最終頻繁項中,則取出該清單至rule_set中
                rule_set.append(L)
    return rule_set

def gen_rule(rule_set):
    """生成規則"""
    for i in range(len(rule_set)):
        for j in range(len(rule_set)):    # 周遊規則清單rule_set
            if not AinB(rule_set[i][], rule_set[j][]) and not AinB(rule_set[j][], rule_set[i][]) and (
                len(rule_set[i][]) + len(rule_set[j][])) <= len(rule_set[-][]): # 取出沒有項本身的另一個項,生成規則
                rule = [rule_set[i], rule_set[j]]
                join = rule_set[i][] + rule_set[j][]   # 将規則裡的兩個項集合并,即取并集

                for k in range(len(rule_set)):   # 再一次周遊關聯規則的項集合
                    if len(join) == len(rule_set[k][]) and AinB(join, rule_set[k][]):  # 取出并集的支援度計數
                        rule_union = rule_set[k]
                        # print(rule_union)
                        conf = rule_union[] / rule[][]     # 計算置信度
                        if conf >= min_conf:                  # 判斷是否滿足最小置信度
                            print(rule[][], '-->', rule[][], '置信度為:{}%'.format(conf*))

if __name__ == "__main__":

    # 原始事務集
    t100 = ('M', 'O', 'N', 'K', 'E', 'Y')
    t200 = ('D', 'O', 'N', 'K', 'E', 'Y')
    t300 = ('M', 'A', 'K', 'E')
    t400 = ('M', 'U', 'C', 'K', 'Y')
    t500 = ('C', 'O', 'O', 'K', 'I', 'E')
    TID = [t100, t200, t300, t400, t500]

    # 設定最小支援度以及置信度
    min_sup = 
    min_conf = 

    # # 原始事務集
    # t100 = ('I1', 'I2', 'I5')
    # t200 = ('I2', 'I4')
    # t300 = ('I2', 'I3')
    # t400 = ('I1', 'I2', 'I4')
    # t500 = ('I1', 'I3')
    # t600 = ('I2', 'I3')
    # t700 = ('I1', 'I3')
    # t800 = ('I1', 'I2', 'I3', 'I5')
    # t900 = ('I1', 'I2', 'I3')
    # TID = [t100, t200, t300, t400, t500, t600, t700, t800, t900]
    #
    # # 設定最小支援度以及置信度
    # min_sup = 2/9
    # min_conf = 0.7

    Lk_1 = [gen_L1(TID)]   # 将每個頻繁項集作為一個子清單儲存在Lk_1中
    Lk_1 = main_apriori(Lk_1)
    print("\n生成的頻繁項集清單為:", Lk_1[:-])

    Lk = Lk_1[-]
    print("最後生成的頻繁項集為:", Lk)

    rule_set = gen_rule_set(Lk_1)
    print("産生關聯規則的項集為:", rule_set)

    print("\n生成的強關聯規則有:")
    gen_rule(rule_set)
           

運作結果為:

生成的頻繁項集清單為: [[(['M'], ), (['O'], ), (['K'], ), (['E'], ), (['Y'], )], [(['M', 'K'], ), (['O', 'K'], ), (['O', 'E'], ), (['K', 'E'], ), (['K', 'Y'], )], [(['O', 'K', 'E'], )]]
最後生成的頻繁項集為: [(['O', 'K', 'E'], )]
産生關聯規則的項集為: [(['O'], ), (['K'], ), (['E'], ), (['O', 'K'], ), (['O', 'E'], ), (['K', 'E'], ), (['O', 'K', 'E'], )]

生成的強關聯規則有:
['K'] --> ['E'] 置信度為:%
['E'] --> ['K'] 置信度為:%
['O', 'K'] --> ['E'] 置信度為:%
['O', 'E'] --> ['K'] 置信度為:%
           

(2) FP-growth 算法

再說吧,先往下進行,有空再寫。另外Apriori寫的着實太爛,還是太菜了,繼續加油把。

資料挖掘習題選做--第六章:Apriori算法、FP-growth
資料挖掘習題選做--第六章:Apriori算法、FP-growth

繼續閱讀