天天看點

Spark MLlib 決策樹

package com.immooc.spark

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

/**
 * Trains a decision-tree classifier with Spark MLlib on a small text data set,
 * then prints a sample of predictions, the test error and the learned tree.
 *
 * Input format (NOT LIBSVM): one sample per line, written as
 * `label,f1 f2 f3 ...` -- the label and the feature list are separated by a
 * comma, and the individual features are separated by single spaces.
 */
object DecisionTreeTest {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultDataPath = "file:///Users/walle/Documents/D3/sparkmlib/data.txt"

  /**
   * Parses one `label,f1 f2 f3 ...` line into a [[LabeledPoint]].
   *
   * @param line a non-blank input line
   * @return the labeled point holding the label and a dense feature vector
   * @throws NumberFormatException if the label or any feature is not numeric
   */
  private def parseLine(line: String): LabeledPoint = {
    val parts = line.split(',')
    LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
  }

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` overrides the input file path. With no
   *             arguments the built-in default path is used.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DecisionTreeTest").setMaster("local[2]")
    val sc = new SparkContext(conf)

    try {
      Logger.getRootLogger.setLevel(Level.WARN)

      // Read the samples; blank lines are skipped so they do not fail numeric parsing.
      val dataPath = args.headOption.getOrElse(DefaultDataPath)
      val parsedData = sc.textFile(dataPath)
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(parseLine)

      // Split the samples into a training set (70%) and a test set (30%).
      val splits = parsedData.randomSplit(Array(0.7, 0.3), seed = 11L)
      val training = splits(0).cache()
      val test = splits(1)

      // Training parameters; the empty map declares no categorical features.
      val numClasses = 2
      val categoricalFeaturesInfo = Map[Int, Int]()
      val impurity = "gini"
      val maxDepth = 5
      val maxBins = 32

      val model = DecisionTree.trainClassifier(
        training, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

      // Predict on the test set; cached because it is consumed by several actions below.
      val labelAndPreds = test.map { point =>
        (point.label, model.predict(point.features))
      }.cache()

      // Show up to 15 (label, prediction) pairs side by side.
      println("label\tprediction")
      labelAndPreds.take(15).foreach { case (label, prediction) =>
        println(s"$label\t$prediction")
      }

      // Error rate of the tree on the test set.
      // The map above is one-to-one, so this count equals the test-set size.
      val total = labelAndPreds.count()
      val wrong = labelAndPreds.filter { case (label, prediction) => label != prediction }.count()
      // Double division: an empty test set yields NaN rather than throwing.
      val testErr = wrong.toDouble / total
      println(s"Test Error = $testErr")

      // Print the decision rules of the learned tree.
      println(s"Learned classification tree model:\n${model.toDebugString}")
    } finally {
      // Always release the Spark context, even when loading or training fails.
      sc.stop()
    }
  }
}

1. 資料

0,32 1 1 0

0,25 1 2 0

1,29 1 2 1

1,24 1 1 0

0,31 1 1 0

1,35 1 2 1

0,30 0 1 0

0,31 1 1 0

1,30 1 2 1

1,21 1 1 0

0,21 1 2 0

1,21 1 2 1

0,29 0 2 1

0,29 1 0 1

0,29 0 2 1

1,30 1 1 0

2. 結果

繼續閱讀