K-Means算法是一種基於距離的聚類算法,採用迭代的方法,計算出K個聚類中心,把若干個點聚成K類。
package com.immooc.spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
/** Demo driver that clusters a local text data set with Spark MLlib's KMeans and prints the result. */
object KMeansTest {

  /**
   * Trains a KMeans model on a whitespace-separated numeric text file, then prints the
   * cluster centers and the model's within-set sum of squared errors.
   *
   * @param args optional; when `args(0)` is present it is used as the input path instead of
   *             the built-in sample file location
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KMeansTest").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      Logger.getRootLogger.setLevel(Level.WARN)

      // Input is plain text: one point per line, coordinates separated by whitespace
      // (this is NOT LIBSVM format). Falls back to the original sample file when no
      // path is given on the command line.
      val dataPath = args.headOption
        .getOrElse("file:///Users/walle/Documents/D3/sparkmlib/kmeans_data.txt")
      val data = sc.textFile(dataPath)

      // Trim and drop blank lines, and split on any run of whitespace, so that stray
      // spaces or a trailing empty line do not raise NumberFormatException.
      val parsedData = data
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(line => Vectors.dense(line.split("\\s+").map(_.toDouble)))
        .cache()

      // Build and train the KMeans clustering model.
      val initMode = "k-means||"
      val numClusters = 4
      val numIterations = 100
      val model = new KMeans()
        .setInitializationMode(initMode)
        .setK(numClusters)
        .setMaxIterations(numIterations)
        .run(parsedData)

      // Print every coordinate of each center, tab-separated; for 2-D data this is the
      // same output as before, but it no longer assumes exactly two dimensions.
      println("centers")
      model.clusterCenters.foreach(center => println(center.toArray.mkString("\t")))

      // Error measure: cost of the trained model evaluated on the training data.
      val WSSSE = model.computeCost(parsedData)
      println("Within Set Sum of Squared Errors = " + WSSSE)
    } finally {
      // Always release the SparkContext, even when reading or training fails.
      sc.stop()
    }
  }
}
1. 輸出
centers
9.05 9.05
0.05 0.05
9.2 9.2
0.2 0.2
Within Set Sum of Squared Errors = 0.03000000000004321