天天看點

Spark MLlib 機器學習之七：TF-IDF

package com.agm.FeatureExtractors

import org.apache.spark.ml.feature.{ HashingTF, IDF, Tokenizer }

import org.apache.spark.{ SparkConf, SparkContext }

import org.apache.spark.mllib.classification.{ LogisticRegressionWithLBFGS, LogisticRegressionModel }

import org.apache.spark.mllib.evaluation.MulticlassMetrics

import org.apache.spark.mllib.regression.LabeledPoint

import org.apache.spark.mllib.linalg.Vectors

import org.apache.spark.mllib.util.MLUtils

import java.io._

import org.apache.log4j.{ Level, Logger }

import org.apache.spark.sql.SQLContext

/**
 * Minimal TF-IDF example built on the spark.ml feature transformers.
 *
 * Three labelled sentences are tokenized, hashed into 20-bucket term-frequency
 * vectors, rescaled by an IDF model fitted on those same vectors, and the
 * resulting "label" / "features" columns are printed to stdout.
 */
object TFIDF {

  /**
   * Runs the example end to end and always stops the SparkContext afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Only show ERROR-level messages from loggers under the "org" namespace.
    Logger.getLogger("org").setLevel(Level.ERROR)

    // Name the application. Fall back to local mode only when no master has
    // been configured, so a master supplied externally (e.g. by spark-submit)
    // is no longer overridden by a hard-coded value.
    val conf = new SparkConf()
      .setAppName("Simple Application")
      .setIfMissing("spark.master", "local")

    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)

      // Tiny in-memory corpus: (label, sentence).
      val sentenceData = sqlContext.createDataFrame(Seq(
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"))).toDF("label", "sentence")

      // Tokenize the "sentence" column into a "words" column.
      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)

      // Term frequencies via the hashing trick, using 20 feature buckets.
      val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(20)
      val featurizedData = hashingTF.transform(wordsData)
      // alternatively, CountVectorizer can also be used to get term frequency vectors

      // Fit the IDF model on the raw term frequencies, then rescale them.
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)

      rescaledData.select("label", "features").show()
    } finally {
      // Release Spark resources even if one of the stages above throws.
      sc.stop()
    }
  }

}

繼續閱讀