天天看点

python手写kmeans以及kmeans++聚类算法

自己用python手写实现了kmeans与kmeans++算法。记录一下,说不定以后就用着了呢。

首先是用到的几个自定义函数:

def nearest(data, cluster_center):
    """Return the index of the center in *cluster_center* closest to *data*.

    The distance used is the mean of per-dimension squared differences;
    since every center is compared over the same number of dimensions,
    the argmin matches plain squared Euclidean distance.
    """
    dim = len(data)
    distances = [
        sum((data[d] - center[d]) ** 2 for d in range(dim)) / dim
        for center in cluster_center
    ]
    return distances.index(min(distances))

def is_same(data, cluster_center):
    """Return True if *data* equals (element-wise) any row of *cluster_center*.

    Relies on numpy broadcasting: ``center == data`` yields a boolean
    vector whose ``.all()`` is True only for an exact row match.
    """
    return any((center == data).all() for center in cluster_center)

def random_select(array):
    """Pick an index at random, weighted proportionally to the values in *array*.

    *array* is expected to be a numpy array of non-negative weights (the
    division by ``sum(array)`` requires numpy semantics, as in the original).
    Draws one uniform sample and returns the first index whose cumulative
    normalized weight exceeds the draw; if floating-point rounding leaves the
    draw uncovered, the last index is returned.
    """
    weights = array / sum(array)  # normalize to a probability vector
    # Build the cumulative distribution with a running sum: O(n) instead of
    # the original O(n^2) double loop, with the identical left-to-right
    # floating-point addition order.
    cumulative = []
    total = 0.0
    for w in weights:
        total += w
        cumulative.append(total)
    draw = random.random()
    for index, bound in enumerate(cumulative):
        if draw < bound:
            break
    return index
           

kmeans算法:

def k_means(data, k, max_iter=40):
    """Cluster *data* into *k* groups with Lloyd's algorithm (plain k-means).

    Parameters
    ----------
    data : sequence of numpy row vectors — m samples of n features each
        (``data[i].tolist()`` requires numpy rows, as in the original).
    k : number of clusters; must satisfy ``k <= len(data)``.
    max_iter : assignment/update rounds to run (default 40, matching the
        original fixed iteration count).

    Returns a list of length m holding each sample's cluster index.
    Prints the labeling after every round, as the original did.
    """
    m = len(data)                            # number of samples
    n = len(data[0])                         # number of features
    cluster = [-1 for _ in range(m)]         # label per sample (-1 = unassigned)
    cluster_center = [[] for _ in range(k)]  # current cluster centers
    cc = [[0 for _ in range(n)] for _ in range(k)]  # per-cluster coordinate sums
    c_number = [0 for _ in range(k)]         # member count per cluster

    # Pick k distinct random sample indices as initial centers; the set
    # guarantees no index is drawn twice.
    seeds = set()
    while len(seeds) < k:
        seeds.add(random.randint(0, m - 1))
    for i, seed in enumerate(seeds):
        # Copy the row so a center never aliases the underlying data array.
        cluster_center[i] = list(data[seed][:])

    for times in range(max_iter):
        # Assignment step: attach every sample to its nearest center and
        # accumulate the per-cluster coordinate sums.
        for i in range(m):
            c = nearest(data[i], cluster_center)
            cluster[i] = c
            c_number[c] += 1
            cc[c] = [a + b for a, b in zip(cc[c], data[i].tolist())]
        # Update step: move each center to the mean of its members.
        for i in range(k):
            # BUG FIX: the original divided by c_number[i] unconditionally,
            # raising ZeroDivisionError when a cluster lost all members.
            # An empty cluster now simply keeps its previous center.
            if c_number[i] > 0:
                cluster_center[i] = [x / c_number[i] for x in cc[i]]
            c_number[i] = 0
            cc[i] = [0 for _ in range(n)]
        print(times, cluster)
    return cluster
           

K-means++

(主要是优化了初始聚类中心的选取,并增加了多次聚类取最优结果这两个功能)

def k_meansa(data, k, restarts=10, max_iter=10):
    """K-means with k-means++ seeding; runs several independent restarts and
    returns the labeling with the smallest total within-cluster squared error.

    Parameters
    ----------
    data : 2-D numpy array, shape (m, n) — m samples of n features.
    k : number of clusters.
    restarts : independent clusterings to try (default 10, as before).
    max_iter : Lloyd iterations per restart (default 10, as before).

    Returns a 1-D integer numpy array of length m with cluster labels.
    Prints progress after every iteration, as the original did.
    """
    m = len(data)                          # number of samples
    n = len(data[0])                       # number of features

    # np.int was removed from modern numpy; the builtin int is equivalent here.
    cluster = np.zeros(m, dtype=int) - 1   # labels; -1 = not yet assigned
    cc = np.zeros((k, n))                  # per-cluster coordinate sums
    c_number = np.zeros(k)                 # member count per cluster
    result = []                            # candidate labelings, one per restart
    error = []                             # matching total squared errors

    for cnt in range(restarts):
        print('===========第%d次聚类=========='%cnt)
        # BUG FIX: re-seed the centers for every restart. The original seeded
        # once before the loop, so restarts 2..10 started from the previous
        # run's converged centers and all produced the same result.
        cluster_center = _kmeanspp_centers(data, k)
        for times in range(max_iter):
            # Assignment step.
            for i in range(m):
                c = nearest(data[i], cluster_center)
                cluster[i] = c
                c_number[c] += 1
                cc[c] += data[i]
            # Update step: mean of members; an empty cluster keeps its old
            # center instead of producing a 0/0 nan row.
            for i in range(k):
                if c_number[i] > 0:
                    cluster_center[i] = cc[i] / c_number[i]
            cc.flat = 0
            c_number.flat = 0
            print('迭代%d次:'%times)
            print(cluster)
        # BUG FIX: sum the squared error over *all* samples. The original
        # overwrote `temp` each iteration and appended only the last
        # sample's distance.
        total_error = 0.0
        for i, label in enumerate(cluster):
            total_error += np.sum((data[i] - cluster_center[label]) ** 2)
        error.append(total_error)
        # BUG FIX: store a copy. Appending the same array object meant every
        # entry of `result` aliased the final run's labels.
        result.append(cluster.copy())

    return result[error.index(min(error))]


def _kmeanspp_centers(data, k):
    """Choose k initial centers via k-means++ seeding.

    The first center is a uniform-random sample; each subsequent one is
    drawn with probability proportional to its squared distance from the
    nearest already-chosen center (via random_select). A draw that lands on
    an existing center is rejected and retried.
    """
    m = len(data)
    n = len(data[0])
    cluster_center = np.zeros((k, n))
    j = np.random.randint(m)               # uniform in [0, m)
    cluster_center[0] = data[j][:]
    dis = np.zeros(m) - 1                  # nearest sq. distance to any chosen center (-1 = unset)
    i = 0
    while i < k - 1:
        # Refresh each sample's distance against the newest center i.
        for j in range(m):
            d = np.sum((cluster_center[i] - data[j]) ** 2)
            if (dis[j] < 0) or (dis[j] > d):
                dis[j] = d
        j = random_select(dis)             # weighted draw by distance
        if is_same(data[j], cluster_center):
            continue                       # already a center; redraw
        i += 1
        cluster_center[i] = data[j][:]
    return cluster_center
           

继续阅读