自己用python手寫實作了kmeans與kmeans++算法。記錄一下,說不定以後就用得着呢。
首先是用到的幾個自定義函數:
def nearest(data, cluster_center):
    """Return the index of the center in *cluster_center* closest to *data*.

    Distance is the mean of per-coordinate squared differences; dividing by
    the dimensionality does not change the argmin, it only rescales.
    """
    dim = len(data)
    distances = [
        sum((x - center[j]) ** 2 for j, x in enumerate(data)) / dim
        for center in cluster_center
    ]
    return distances.index(min(distances))
def is_same(data, cluster_center):
    """Return True iff *data* is element-wise equal to some row of *cluster_center*.

    Expects numpy arrays: relies on `(data == center).all()`.
    """
    return any((data == center).all() for center in cluster_center)
def random_select(array):
    """Pick an index at random with probability proportional to array[i].

    array: sequence of non-negative weights (list or numpy array); must have
           a positive sum.
    Returns the chosen index.

    Fixes over the original: the cumulative distribution was built with a
    nested loop (O(n^2)) and the loop index `j` was reused as the random
    draw; here a single running sum is kept (O(n)).  Exactly one call to
    random.random() is made, so seeded sequences match the original.
    """
    total = sum(array)
    r = random.random()
    running = 0.0
    for i, weight in enumerate(array):
        running += weight / total
        if r < running:
            return i
    # Float rounding can leave the last cumulative value fractionally below
    # 1.0; fall back to the last index (the original did the same by
    # returning the final loop variable).
    return len(array) - 1
kmeans算法:
def k_means(data, k):
    """Standard k-means (Lloyd's algorithm) with random initialization.

    data: sequence of numpy 1-D sample vectors (``data[i].tolist()`` is used,
          so samples are assumed to be numpy arrays — TODO confirm at callers).
    k:    number of clusters.
    Returns a list of length len(data) giving each sample's cluster index.
    Side effect: prints the assignment after every iteration.

    Fixes over the original: an empty cluster no longer raises
    ZeroDivisionError (its center is simply left unchanged), dead
    commented-out code was removed, and the comprehension no longer shadows
    the outer loop variables.
    """
    m = len(data)       # number of samples
    n = len(data[0])    # dimensionality
    cluster = [-1 for _ in range(m)]           # cluster id per sample (-1 = unassigned)
    cluster_center = [[] for _ in range(k)]    # current centers
    cc = [[] for _ in range(k)]                # per-cluster coordinate sums for the next round
    c_number = [0 for _ in range(k)]           # sample count per cluster
    # Pick k distinct random samples as initial centers; the set prevents
    # the same sample being chosen twice.
    h = set()
    while len(h) < k:
        h.add(random.randint(0, m - 1))
    for i, seed in enumerate(h):
        cluster_center[i] = data[seed][:]
        cc[i] = [0 for _ in range(n)]
    # Fixed iteration count, as in the original (no convergence test).
    for times in range(40):
        # Assignment step: attach each sample to its nearest center.
        for i in range(m):
            c = nearest(data[i], cluster_center)
            cluster[i] = c
            c_number[c] += 1
            cc[c] = [a + b for a, b in zip(cc[c], data[i].tolist())]
        # Update step: recompute each center as the mean of its members.
        for i in range(k):
            if c_number[i]:  # empty cluster: keep the old center instead of dividing by zero
                cluster_center[i] = [x / c_number[i] for x in cc[i]]
            c_number[i] = 0
            cc[i] = [0 for _ in cc[i]]
        print(times, cluster)
    return cluster
K-means++
(主要是優化了初始聚類中心的選取,增加了多次聚類取一個最優結果這兩個功能)
def _seed_centers(data, k):
    """k-means++ seeding: first center uniform at random, each later center
    drawn with probability proportional to its squared distance from the
    nearest center chosen so far.  Returns a (k, n) numpy array.
    Note: loops forever if data has fewer than k distinct points (as the
    original did)."""
    m = len(data)
    n = len(data[0])
    cluster_center = np.zeros((k, n))
    j = np.random.randint(m)  # [0, m)
    cluster_center[0] = data[j][:]
    dis = np.zeros(m) - 1  # min squared distance to the centers chosen so far (-1 = unset)
    i = 0
    while i < k - 1:
        for j in range(m):
            d = np.sum((cluster_center[i] - data[j]) ** 2)
            if dis[j] < 0 or dis[j] > d:
                dis[j] = d
        j = random_select(dis)  # sample an index weighted by dis
        if is_same(data[j], cluster_center):
            continue  # already a center: redraw
        i += 1
        cluster_center[i] = data[j][:]
    return cluster_center


def k_meansa(data, k):
    """k-means++ : runs 10 independently-seeded clusterings and returns the
    assignment with the lowest total within-cluster squared error.

    data: sequence of numpy 1-D sample vectors.
    k:    number of clusters.
    Returns a numpy int array of length len(data) with each sample's cluster id.
    Side effect: prints progress each restart/iteration.

    Fixes over the original:
    - ``np.int`` (removed in NumPy 1.24) replaced with ``int``;
    - each restart now re-seeds its centers, so the 10 runs are actually
      independent (previously seeding happened once, before the loop);
    - ``result`` stores ``cluster.copy()`` — previously the same ndarray was
      appended 10 times, so every "result" aliased the last one;
    - ``error`` now holds one total per restart; previously it held one entry
      per sample (10*m items) and ``result[error.index(min(error))]`` could
      index far out of range;
    - empty clusters no longer divide by zero (center kept unchanged).
    """
    m = len(data)
    n = len(data[0])
    result = []  # one assignment array per restart
    error = []   # total squared error per restart
    for cnt in range(10):  # 10 restarts; keep the best
        print('===========第%d次聚類==========' % cnt)
        cluster_center = _seed_centers(data, k)
        cluster = np.zeros(m, dtype=int) - 1  # -1 = unassigned
        cc = np.zeros((k, n))       # per-cluster coordinate sums
        c_number = np.zeros(k)      # per-cluster sample counts
        for times in range(10):
            # Assignment step.
            for i in range(m):
                c = nearest(data[i], cluster_center)
                cluster[i] = c
                c_number[c] += 1
                cc[c] += data[i]
            # Update step.
            for i in range(k):
                if c_number[i]:  # keep old center for an empty cluster
                    cluster_center[i] = cc[i] / c_number[i]
            cc.flat = 0
            c_number.flat = 0
            print('疊代%d次:' % times)
            print(cluster)
        # Total within-cluster squared error for this restart.
        total = 0.0
        for i, value in enumerate(cluster):
            total += np.sum((data[i] - cluster_center[value]) ** 2)
        error.append(total)
        result.append(cluster.copy())
    return result[error.index(min(error))]