
Using association rules to find associations between IPv6 address segments

Clean the data set, apply the fpgrowth algorithm to find frequent itemsets, and finally derive the association rules. The code that was originally run in Jupyter is wrapped into a class here; a minimal toy sketch of the same mlxtend pipeline is given after the full listing.

Result


The rules reveal associations between the two input address slices.

Code:

import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from io import BytesIO
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules
from time import time


class FrequentPatternsIPv6(object):
    '''
    Take the two given ipv6 bit slices and
    mine the association rules between them.
    '''

    def __init__(
            self, data_path, slice_ip_1_start, slice_ip_1_end,
            slice_ip_2_start, slice_ip_2_end
    ):
        assert (0 <= slice_ip_1_start <= 128 and 0 <= slice_ip_1_end <= 128 and
                0 <= slice_ip_2_start <= 128 and 0 <= slice_ip_2_end <= 128), \
            'please input the correct slice'
        # Bit positions are usually counted from 1, while Python strings
        # are indexed from 0, so shift every boundary down by one.
        slice_ip_1_start -= 1
        slice_ip_1_end -= 1
        slice_ip_2_start -= 1
        slice_ip_2_end -= 1
        self.slice_ip_1 = str(slice_ip_1_start) + ':' + str(slice_ip_1_end)
        self.slice_ip_2 = str(slice_ip_2_start) + ':' + str(slice_ip_2_end)
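        # e.g. with the example run (104, 120, 120, 128) these slice strings
        # become '103:119' and '119:127'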
        self.slice_ip_1_start = slice_ip_1_start
        self.slice_ip_1_end = slice_ip_1_end
        self.slice_ip_2_start = slice_ip_2_start
        self.slice_ip_2_end = slice_ip_2_end
        self.path = data_path

    def load_data(self):
        '''
        Expected data format:
        ip      label
        ...     ...
        ...     ...
        ...     ...

        DHCP addresses are labelled 0
        '''
        return pd.read_csv(self.path)

    def replenish_ip(self, ip):
        '''
        Expand '::' into the missing zero groups and
        restore the leading zeros omitted from each group.
        '''
        if '::' in ip:
            left, right = ip.split('::')
            left_groups = left.split(':') if left else []
            right_groups = right.split(':') if right else []
            # a full address always has 8 groups of 16 bits
            missing = 8 - len(left_groups) - len(right_groups)
            groups = left_groups + ['0000'] * missing + right_groups
        else:
            groups = ip.split(':')
        # left-pad every group to 4 hex digits
        return ':'.join(g.zfill(4) for g in groups)
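
    # Illustrative example (not from the data set):
    #   replenish_ip('2001:db8::1')
    #   returns '2001:0db8:0000:0000:0000:0000:0000:0001'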

    def hex_to_bin(self, ip, flag, slice_here):
        '''
        Hex to binary: convert a full address to a 128-bit binary string
        and return the requested bit slice.
        flag: 1 means the first input ip slice is being processed
        '''
        # Skip malformed entries such as "2.at.pool.ntp.org" or '[2a0a'
        if ':' not in ip or '[' in ip:
            return 'error_ip'  # remember to filter out 'error_ip' after apply()
        ip = self.replenish_ip(ip)
        ip = ip.split(':')
        memory_list = []
        for i in ip:
            # each hex group is 16 bits; left-pad the binary string accordingly
            b = bin(int(i, 16))[2:].zfill(16)
            memory_list.append(b)
        final = ''.join(memory_list)

        start, stop = slice_here.split(':')
        return final[int(start): int(stop)]
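
    # Illustrative: the hex group '0db8' becomes the 16-bit binary string
    # '0000110110111000' before the requested slice is cut out.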

    def bin_to_hex(self, string):
        '''
        Convert a binary string back to hex.
        '''
        decimal = int(string, 2)
        return hex(decimal)[2:]

    def clean_data(self):
        '''
        Data cleaning.
        :return: the two cleaned Series of IPv6 slices
        '''
        df = self.load_data()
        df = df[df['label'] == 0]  # keep only the DHCP addresses
        temp = df['ip']
        temp_1 = temp.apply(self.hex_to_bin, args=(1, self.slice_ip_1))
        error_ip_num = temp_1[temp_1 == 'error_ip'].count()
        print(f'the data set contains {error_ip_num} malformed ip entries')
        print('\n\n', '**'*30)

        err_index = temp_1[temp_1 == 'error_ip'].index
        temp_1.drop(index=err_index, inplace=True)
        pickle_name_1 = f'./from_{self.slice_ip_1_start}_to_{self.slice_ip_1_end}_first.pk'
        temp_1.to_pickle(pickle_name_1)

        temp_2 = temp.apply(self.hex_to_bin, args=(2, self.slice_ip_2))
        err_index = temp_2[temp_2 == 'error_ip'].index
        temp_2.drop(index=err_index, inplace=True)
        pickle_name_2 = f'./from_{self.slice_ip_2_start}_to_{self.slice_ip_2_end}_second.pk'
        temp_2.to_pickle(pickle_name_2)

        return temp_1, temp_2

    def slice_ipv6_describe(self):
        '''
        Summary statistics for the ip slices:
        the relative frequency of each slice value
        '''
        series_1, series_2 = self.clean_data()

        # pad the binary slices to their full widths (16 and 8 bits here,
        # matching the 104-120 and 120-128 example slices)
        def format_1(arg):
            x = len(arg)
            if x < 16:
                arg = '0'*(16 - x) + arg
            return arg

        def format_2(arg):
            x = len(arg)
            if x < 8:
                arg = '0'*(8 - x) + arg
            return arg
            
        series_1 = series_1.apply(format_1)
        series_2 = series_2.apply(format_2)     

        info_1 = series_1.value_counts() / series_1.count()
        info_2 = series_2.value_counts() / series_2.count()

        print(info_1)
        print('\n\n', '**'*30)
        print(info_2)

        series_1 = series_1.apply(self.bin_to_hex)
        series_2 = series_2.apply(self.bin_to_hex)

        # pad the hex slices with leading zeros
        def format_series_1(arg):
            l = len(arg)
            if l < 4:
                return '0' * (4 - l) + arg
            return arg

        def format_series_2(arg):
            if len(arg) == 1:
                return '0' + arg
            return arg

        series_1 = series_1.apply(format_series_1)
        series_2 = series_2.apply(format_series_2)

        return series_1, series_2

    def frequent_patterns_prepare(self, min_threshold=1000):
        '''
        Arg:
            min_threshold: minimum count an ip slice value must reach to be kept

        attention:
            Feeding everything straight into TransactionEncoder makes np.zeros()
            fail because it cannot allocate such a large boolean array:
            Unable to allocate array with shape (1994891, 44205) and data type bool
            44205 = unique values of slice 104-120 + unique values of slice 120-128,
            so the low-support values must be dropped before TransactionEncoder is used.

            This numpy memory limit did not appear when running on Linux; on Windows
            the maximum m*n is about one billion.
        '''
        columns = [self.slice_ip_1, self.slice_ip_2]
        series_1, series_2 = self.slice_ipv6_describe()
        df = pd.DataFrame(list(zip(series_1, series_2)), columns=columns)

        # index of all slice values whose count exceeds the threshold
        indices = series_1.value_counts()[series_1.value_counts() > min_threshold].index

        def filter_value(value):
            '''
            Keep slice values whose count is > min_threshold;
            mark the rest as 'verbose' (redundant).
            '''
            return value if value in indices else 'verbose'

        slice_df = df[self.slice_ip_1].apply(filter_value)

        # Replace the original series with the filtered one; the rows that were
        # marked 'verbose' become NaN and are dropped below.
        df[self.slice_ip_1] = slice_df[slice_df != 'verbose']
        df.dropna(inplace=True)

        return df
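
    # Illustrative shape of the returned DataFrame (values are made up):
    #        103:119  119:127
    #   0     0000     01
    #   1     fe80     a3
    # each row later becomes one two-item transaction for TransactionEncoder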

    def apply_(self):
        df_ = self.frequent_patterns_prepare(min_threshold=1000)
        te = TransactionEncoder()  # one-hot encode the transactions
        df_tf = te.fit_transform(df_.values)

        df = pd.DataFrame(df_tf, columns=te.columns_)

        start = time()
        # mine the frequent itemsets
        frequent_itemsets = fpgrowth(df, min_support=0.05, use_colnames=True)
        print('frequent itemset mining took:', time() - start)
        print()

        frequent_itemsets.sort_values(by='support', ascending=False, inplace=True)
        print(f'freqSet:\n{frequent_itemsets}')
        print('\n\n', '**'*30)

        # generate the association rules
        association_rule = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)  # the metric is confidence
        association_rule.sort_values(by='leverage', ascending=False, inplace=True)  # sort the rules by leverage

        print('association rules:\n{}'.format(association_rule))


if __name__ == "__main__":
    start = time()
    fq_ = FrequentPatternsIPv6('D:/ipv6_label_lzx20190904.csv', 104, 120, 120, 128)
    fq_.apply_()
    end = time()
    d = end - start
    min_ = int(d // 60)
    s = d % 60
    print(f'{min_}min{s:.2f}s')
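
For reference, here is a minimal, self-contained sketch of the same mlxtend pipeline (TransactionEncoder, then fpgrowth, then association_rules) run on a handful of made-up two-item transactions, so the output columns (antecedents, consequents, support, confidence, leverage) can be inspected without the full CSV. The slice values below are purely illustrative:

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pandas as pd

# made-up (slice_1, slice_2) transactions, one per address
transactions = [
    ['0000', '01'],
    ['0000', '01'],
    ['0000', 'a3'],
    ['fe80', 'a3'],
    ['fe80', 'a3'],
]

te = TransactionEncoder()
onehot = te.fit_transform(transactions)      # boolean matrix, one column per item
df = pd.DataFrame(onehot, columns=te.columns_)

freq = fpgrowth(df, min_support=0.2, use_colnames=True)  # frequent itemsets
rules = association_rules(freq, metric='confidence', min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'leverage']])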
           

Mid-Autumn Festival~

Spending it alone, haha
