1. Download and extract Scala 2.11.8
[root@sht-sgmhadoopnn-01 hadoop]# tar xzvf scala-2.11.8.tgz
[root@sht-sgmhadoopnn-01 hadoop]# mv scala-2.11.8 scala
[root@sht-sgmhadoopnn-01 hadoop]#
2. Sync the scala directory to the other machines in the cluster, as shown below
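A minimal way to do this, assuming passwordless SSH from nn-01 to the other nodes, is to scp the directory to each remaining machine:
### assumes passwordless SSH; hostnames taken from the cluster below
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/scala root@sht-sgmhadoopnn-02:/hadoop/
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/scala root@sht-sgmhadoopdn-01:/hadoop/
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/scala root@sht-sgmhadoopdn-02:/hadoop/
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/scala root@sht-sgmhadoopdn-03:/hadoop/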
3. Configure the environment variables on every machine in the cluster and apply them
### Append the following two lines to the end of the file
[root@sht-sgmhadoopnn-01 hadoop]# vi /etc/profile
export SCALA_HOME=/hadoop/scala
export PATH=$SCALA_HOME/bin:$PATH
[root@sht-sgmhadoopnn-01 hadoop]# source /etc/profile
[root@sht-sgmhadoopnn-02 hadoop]# source /etc/profile
[root@sht-sgmhadoopdn-01 hadoop]# source /etc/profile
[root@sht-sgmhadoopdn-02 hadoop]# source /etc/profile
[root@sht-sgmhadoopdn-03 hadoop]# source /etc/profile
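A quick sanity check on each node after sourcing the profile; the command should report version 2.11.8:
[root@sht-sgmhadoopnn-01 hadoop]# scala -version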
---------------------------------------------------------------------------------------------------------------------
1. Download and extract Spark 2.0.0
[root@sht-sgmhadoopnn-01 hadoop]# tar xzvf spark-2.0.0-bin-hadoop2.7.tgz
[root@sht-sgmhadoopnn-01 hadoop]# mv spark-2.0.0-bin-hadoop2.7 spark
2. Configure spark-env.sh
[root@sht-sgmhadoopnn-01 conf]# pwd
/hadoop/spark/conf
[root@sht-sgmhadoopnn-01 conf]# cp spark-env.sh.template spark-env.sh
[root@sht-sgmhadoopnn-01 conf]#
### Add the following 5 lines
[root@sht-sgmhadoopnn-01 conf]# vi spark-env.sh
export JAVA_HOME=/usr/java/jdk1.7.0_67-cloudera
export SPARK_MASTER_IP=172.16.101.55
export SPARK_WORKER_MEMORY=1g
export SPARK_PID_DIR=/hadoop/pid
export HADOOP_CONF_DIR=/hadoop/hadoop/etc/hadoop
3. Configure the slaves file
[root@sht-sgmhadoopnn-01 conf]# cp slaves.template slaves
[root@sht-sgmhadoopnn-01 conf]# vi slaves
sht-sgmhadoopdn-01
sht-sgmhadoopdn-02
sht-sgmhadoopdn-03
4. Copy the spark directory to the machines listed in the slaves file, as shown below
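As with Scala, a simple sketch using scp to push the directory to the three worker nodes named in slaves (again assuming passwordless SSH):
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/spark root@sht-sgmhadoopdn-01:/hadoop/
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/spark root@sht-sgmhadoopdn-02:/hadoop/
[root@sht-sgmhadoopnn-01 hadoop]# scp -r /hadoop/spark root@sht-sgmhadoopdn-03:/hadoop/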
5. Configure the environment variables on every machine in the cluster and apply them
export SPARK_HOME=/hadoop/spark
export PATH=$SPARK_HOME/bin:$PATH
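After adding the two lines, source /etc/profile on each machine again and verify that spark-submit resolves from the new PATH and reports 2.0.0:
[root@sht-sgmhadoopnn-01 hadoop]# source /etc/profile
[root@sht-sgmhadoopnn-01 hadoop]# which spark-submit
[root@sht-sgmhadoopnn-01 hadoop]# spark-submit --version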
6. Start Spark
[root@sht-sgmhadoopnn-01 sbin]# ./start-all.sh
starting org.apache.spark.deploy.master.Master, logging to /hadoop/spark/logs/spark-root-org.apache.spark.deploy.master.Master-1-sht-sgmhadoopnn-01.out
sht-sgmhadoopdn-01: starting org.apache.spark.deploy.worker.Worker, logging to /hadoop/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-sht-sgmhadoopdn-01.telenav.cn.out
sht-sgmhadoopdn-02: starting org.apache.spark.deploy.worker.Worker, logging to /hadoop/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-sht-sgmhadoopdn-02.telenav.cn.out
sht-sgmhadoopdn-03: starting org.apache.spark.deploy.worker.Worker, logging to /hadoop/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-sht-sgmhadoopdn-03.telenav.cn.out
[root@sht-sgmhadoopnn-01 sbin]#
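When the cluster needs to be shut down later, the matching stop script lives in the same sbin directory; note it shares its name with Hadoop's stop-all.sh, so run Spark's copy explicitly:
[root@sht-sgmhadoopnn-01 sbin]# ./stop-all.sh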
7. Check the web UI
http://sht-sgmhadoopnn-01:8080/
[root@sht-sgmhadoopnn-01 sbin]# jps
27169 HMaster
26233 NameNode
26641 ResourceManager
2312 Jps
26542 DFSZKFailoverController
2092 Master
27303 RunJar
26989 JobHistoryServer
[root@sht-sgmhadoopdn-01 ~]# jps
19907 Worker
2086 jar
17265 DataNode
17486 NodeManager
20055 Jps
17377 JournalNode
17697 HRegionServer
3671 QuorumPeerMain
8. Run the WordCount example
[root@sht-sgmhadoopnn-01 hadoop]# vi wordcount.txt
hello abc 123
abc hadoop hello hdfs
spark yarn
123 abc hello hdfs spark
wjp wjp abc hello
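The spark-shell transcript below reads the file from HDFS, so wordcount.txt presumably has to be uploaded first; a minimal sketch, assuming the NameNode address from the commented-out line further down (172.16.101.56:8020):
### path is an assumption based on the commented-out sc.textFile call below
[root@sht-sgmhadoopnn-01 hadoop]# hadoop fs -put wordcount.txt hdfs://172.16.101.56:8020/wordcount.txt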
[root@sht-sgmhadoopnn-01 bin]# spark-shell
scala> val textfile = sc.textFile("hdfs://172.16.101.56:8020/wordcount.txt")
scala> val count=textfile.flatMap(line => line.split(" ")).map(word => (word,1)).reduceByKey(_+_)
count: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[4] at reduceByKey at <console>:26
scala> count.collect()
res0: Array[(String, Int)] = Array((hello,4), (123,2), (yarn,1), (abc,4), (wjp,2), (spark,2), (hadoop,1), (hdfs,2))
### Note: this earlier, commented-out attempt is wrong, since sc.textFile expects a plain URI rather than a "hadoop fs" command:
### val file=sc.textFile("hadoop fs -ls hdfs://172.16.101.56:8020/wordcount.txt")
A generic version of the same job that writes its result back to HDFS:
val file = sc.textFile("hdfs://namenode:8020/path/to/input")
val counts = file.flatMap(line => line.split(" "))
.map(word => (word, 1))
.reduceByKey(_ + _)
counts.saveAsTextFile("hdfs://namenode:8020/output")
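Once the job finishes, the result can be inspected straight from HDFS; the directory name is taken from the saveAsTextFile call above ("namenode" is the same placeholder used there):
hadoop fs -ls hdfs://namenode:8020/output
hadoop fs -cat hdfs://namenode:8020/output/part-*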
--------------------------------------------------------------------------------------------------------------------------------------------------------
a. Run in local mode with two threads
#[root@sht-sgmhadoopdn-01 ~]# ./bin/run-example SparkPi 2>&1 | grep "Pi is roughly"
[root@sht-sgmhadoopnn-01 spark]# MASTER=local[2] ./bin/run-example SparkPi 10
b. Run on the Spark Standalone cluster
[root@sht-sgmhadoopnn-01 spark]# ./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://sht-sgmhadoopnn-01:7077 \
examples/jars/spark-examples_2.11-2.0.0.jar \
100
c. Note: Spark on YARN supports two deploy modes, yarn-cluster and yarn-client; the details of the difference are covered in a separate post.
Broadly speaking, yarn-cluster suits production, while yarn-client suits interactive use and debugging, i.e. when you want to see the application's output quickly.
# Run on the YARN cluster in yarn-cluster mode
spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn-cluster \
--num-executors 3 \
--driver-memory 4g \
--executor-memory 2g \
--executor-cores 1 \
$SPARK_HOME/examples/jars/spark-examples_2.11-2.0.0.jar \
10
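For comparison, a sketch of the same job in yarn-client mode (in Spark 2.0 the preferred spelling is --master yarn with an explicit --deploy-mode), so the driver runs locally and SparkPi's output is printed straight to the console:
spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode client \
--num-executors 3 \
--executor-memory 2g \
--executor-cores 1 \
$SPARK_HOME/examples/jars/spark-examples_2.11-2.0.0.jar \
10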