天天看點

python手寫kmeans以及kmeans++聚類算法

自己用python手寫實作了kmeans與kmeans++算法。記錄一下,說不定以後就用着了呢。

首先是用到的幾個自定義函數:

def nearest(data, cluster_center):
    """Return the index of the cluster center closest to sample *data*.

    Parameters
    ----------
    data : sequence of n feature values (list or 1-D numpy array)
    cluster_center : sequence of centers, each indexable with n components

    Returns
    -------
    int : index into *cluster_center* of the nearest center
          (by mean squared distance; dividing by n does not change the argmin)
    """
    n = len(data)  # dimensionality of one sample
    dis = [
        sum((data[j] - center[j]) ** 2 for j in range(n)) / n
        for center in cluster_center
    ]
    return dis.index(min(dis))

def is_same(data, cluster_center):
    """Return True if *data* equals (element-wise) any row in *cluster_center*.

    Both *data* and the rows of *cluster_center* are numpy arrays; equality
    uses numpy broadcasting plus ``.all()``.
    """
    return any((data == center).all() for center in cluster_center)

def random_select(array):
    """Pick a random index with probability proportional to ``array[i]``.

    Used by k-means++ to sample the next seed weighted by each sample's
    distance to its nearest existing center.

    Parameters
    ----------
    array : sequence of non-negative weights (list or numpy array)

    Returns
    -------
    int : selected index
    """
    # Draw a point on [0, total) and walk the running sum until we pass it.
    # This is O(n), replacing the original O(n^2) cumulative-sum build, and
    # also works for plain lists (the original required numpy's `array/sum`).
    total = sum(array)
    r = random.random() * total
    acc = 0.0
    for i, w in enumerate(array):
        acc += w
        if r < acc:
            return i
    # Floating-point fallback: rounding can leave acc fractionally below
    # total; the original loop likewise fell through to the last index.
    return len(array) - 1
           

kmeans算法:

def k_means(data, k):
    """Cluster *data* into *k* groups with plain (Lloyd's) k-means.

    Parameters
    ----------
    data : indexable collection of m samples; each sample is a numpy 1-D
           array of n features (``.tolist()`` is called on rows)
    k : number of clusters

    Returns
    -------
    list[int] : cluster index (0..k-1) assigned to each sample

    Side effects: prints the assignment after each of the 40 iterations.
    """
    m = len(data)                            # number of samples
    n = len(data[0])                         # dimensionality
    cluster = [-1 for _ in range(m)]         # sample -> cluster; -1 = unassigned
    cluster_center = [[] for _ in range(k)]  # current cluster centers
    cc = [[0 for _ in range(n)] for _ in range(k)]  # per-cluster coordinate sums
    c_number = [0 for _ in range(k)]         # samples per cluster this round

    # Pick k distinct random samples as initial centers; the set() guarantees
    # the same sample index is never chosen twice.
    seeds = set()
    while len(seeds) < k:
        seeds.add(random.randint(0, m - 1))
    for i, seed in enumerate(seeds):
        cluster_center[i] = data[seed][:]

    # Fixed 40 Lloyd iterations (no convergence test, as in the original).
    for times in range(40):
        for i in range(m):
            c = nearest(data[i], cluster_center)
            cluster[i] = c
            c_number[c] += 1
            cc[c] = [a + b for a, b in zip(cc[c], data[i].tolist())]
        for i in range(k):
            # Guard against an empty cluster: keep its old center instead of
            # dividing by zero (the original crashed here).
            if c_number[i] > 0:
                cluster_center[i] = [x / c_number[i] for x in cc[i]]
            c_number[i] = 0
            cc[i] = [0 for _ in range(n)]
        print(times, cluster)
    return cluster
           

K-means++

(主要是優化了初始聚類中心的選取,增加了多次聚類取一個最優結果這兩個功能)

def k_meansa(data, k):
    """k-means++ clustering: weighted seed selection plus 10 restarts,
    returning the assignment with the lowest total within-cluster error.

    Parameters
    ----------
    data : indexable collection of m samples, each a 1-D numpy array
    k : number of clusters

    Returns
    -------
    numpy int array of length m : best cluster assignment found

    Fixes over the original:
    - ``np.int`` (removed from modern NumPy) replaced with ``int``.
    - each restart re-runs k-means++ seeding, so the 10 runs are actually
      independent (originally runs 2-10 restarted from converged centers).
    - the error is the SUM over all samples (originally only the last
      sample's term was recorded).
    - ``cluster.copy()`` is stored (originally the same array object was
      appended 10 times and mutated in place).
    - empty clusters no longer cause division by zero.
    """
    m = len(data)                       # number of samples
    n = len(data[0])                    # dimensionality

    cluster = np.zeros(m, dtype=int) - 1  # sample -> cluster; -1 = unassigned
    cc = np.zeros((k, n))               # per-cluster coordinate sums
    c_number = np.zeros(k)              # samples per cluster this round
    result = []                         # assignments from the 10 runs
    error = []                          # total error of each run

    for cnt in range(10):               # 10 restarts; keep the best
        print('===========第%d次聚類==========' % cnt)
        # Independent k-means++ seeding for this run.
        cluster_center = _kmeanspp_init(data, k, m, n)
        for times in range(10):
            for i in range(m):
                c = nearest(data[i], cluster_center)
                cluster[i] = c
                c_number[c] += 1
                cc[c] += data[i]
            for i in range(k):
                # Keep the old center for an empty cluster (avoid 0/0).
                if c_number[i] > 0:
                    cluster_center[i] = cc[i] / c_number[i]
            cc.flat = 0
            c_number.flat = 0
            print('疊代%d次:' % times)
            print(cluster)
        # Total within-cluster squared error for this run.
        total_err = 0.0
        for i, value in enumerate(cluster):
            total_err += np.sum((data[i] - cluster_center[value]) ** 2)
        error.append(total_err)
        result.append(cluster.copy())

    index = error.index(min(error))
    return result[index]


def _kmeanspp_init(data, k, m, n):
    """Choose k initial centers with the k-means++ distance weighting."""
    cluster_center = np.zeros((k, n))
    j = np.random.randint(m)            # first center: uniform random sample
    cluster_center[0] = data[j][:]
    dis = np.zeros(m) - 1               # min sq. distance to chosen centers; -1 = unset
    i = 0
    while i < k - 1:
        # Refresh each sample's distance against the newest center only;
        # dis already holds the minimum over all earlier centers.
        for j in range(m):
            d = np.sum((cluster_center[i] - data[j]) ** 2)
            if dis[j] < 0 or dis[j] > d:
                dis[j] = d
        j = random_select(dis)          # sample index with P ∝ dis[j]
        if is_same(data[j], cluster_center):
            continue                    # already a center; redraw
        i += 1
        cluster_center[i] = data[j][:]
    return cluster_center
           

繼續閱讀