// 1. Native Scala implementation, using groupBy
// 2. Spark implementation, using reduceByKey
// 3. Spark implementation, using groupByKey
// 4. Spark implementation, using groupBy
// 5. Flink DataStream implementation, using keyBy
// 6. Flink DataSet implementation, using groupBy
package com.test
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.flink.streaming.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
object WordCount {

  /** Entry point: runs one word-count variant over a small sample data set. */
  def main(args: Array[String]): Unit = {
    val stringList = List("hello world", "hello scala", "hello you", "good world")
    // Only one variant can run at a time; change the method name below to try another.
    wordCount5(stringList)
  }

  /** Native Scala word count using groupBy; prints (word, count) sorted by count ascending. */
  def wordCount1(stringList: List[String]): Unit = {
    stringList
      .flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(_._1)
      .mapValues(_.size)
      .toList
      .sortBy(_._2)
      .foreach(println)
  }

  /** Spark word count using reduceByKey; prints (word, count) sorted by count descending. */
  def wordCount2(stringList: List[String]): Unit = {
    // Local Spark configuration for a self-contained demo run.
    val conf = new SparkConf().setAppName("wordCount").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val input = sc.parallelize(stringList)
      input
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        // collect() brings results to the driver so println output is visible
        // even on a cluster; RDD.foreach(println) would print on the executors.
        .collect()
        .foreach(println)
    } finally {
      sc.stop() // release the SparkContext so the JVM can exit cleanly
    }
  }

  /** Spark word count using groupByKey; prints (word, count) sorted by count ascending. */
  def wordCount3(stringList: List[String]): Unit = {
    val conf = new SparkConf().setAppName("wordCount").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val input = sc.parallelize(stringList)
      input
        .flatMap(_.split(" "))
        .map((_, 1))
        .groupByKey()
        .map(x => (x._1, x._2.sum))
        .sortBy(_._2)
        .collect() // see wordCount2: print on the driver, not the executors
        .foreach(println)
    } finally {
      sc.stop()
    }
  }

  /** Spark word count using groupBy (keys by the whole pair's first element); unsorted output. */
  def wordCount4(stringList: List[String]): Unit = {
    val conf = new SparkConf().setAppName("wordCount").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val input = sc.parallelize(stringList)
      input
        .flatMap(_.split(" "))
        .map((_, 1))
        .groupBy(_._1)
        .mapValues(x => x.size)
        .collect() // see wordCount2: print on the driver, not the executors
        .foreach(println)
    } finally {
      sc.stop()
    }
  }

  /** Flink DataStream word count using keyBy; prints a running (word, count) total per element. */
  def wordCount5(stringList: List[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env
      .fromCollection(stringList)
      .flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(_._1) // key selector instead of the deprecated positional keyBy(0)
      .sum(1)
      // print() is a proper sink; the previous map(println(_)) abused map for
      // side effects and left the stream without a sink.
      .print()
    env.execute("wordCount")
  }

  /** Flink DataSet word count using groupBy; prints unsorted (word, count) pairs. */
  def wordCount6(stringList: List[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val result = env
      .fromCollection[String](stringList)
      .flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(0)
      .sum(1)
    // In the DataSet API print() is an eager sink that triggers execution itself;
    // a subsequent env.execute() would throw "No new data sinks have been defined
    // since the last execution", so it must NOT be called here.
    result.print()
  }
}