一,環境建構
1、spark版本
2、用Scala語言打成的jar包。因為Scala和Java都是編譯成位元組碼、運作在JVM上的語言,Java可以直接調用該jar包。該包中包含了一些算法(LR、GBDT、決策樹、隨機森林),用於資料的訓練。
3、maven webapp項目,pom引入包如下:
<!-- Begin:spark包 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-client</artifactId>
<version>${hadoop.client.version}</version> </dependency> -->
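<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>${spark.version}</version>
</dependency>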
<!-- End:spark包 -->
版本是:<spark.version>2.2.0</spark.version>
4、Spark連線工具類:
package cfca.xfraud.afmls.util.spark;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SparkSession;
import cfca.xfraud.afmls.common.Constants;
import cfca.xfraud.afmls.exception.CodeException;
import cfca.xfraud.dataprocess.DataProcess;
import cfca.xfraud.model.DesicionTree;
import cfca.xfraud.model.GBDTree;
import cfca.xfraud.model.LR;
import cfca.xfraud.model.RFTree;
/**
* @Author zhangzhongqiu
* @Description spark工具類
* @version 1.2.0
* @since jdk1.8
* @date 2018-03-06
*
*/
public class SparkUtil {
private static SparkSession sparkSession;
static{
SparkContext sparkContext = new SparkContext(Constants.SPARK_LOCAL_PATTERN, Constants.APP_NAME);
sparkSession = new SparkSession(sparkContext);
}
* 資料預處理運作狀态
* @param appName
* @param dataPattern
* @return
* @throws Exception
* @throws CodeException
public static boolean runDataProcessStatus(int dataPattern,String trainTime) throws Exception {
boolean isSuccess=false;
try {
isSuccess = new DataProcess(sparkSession).runDataProcess(dataPattern,trainTime);
} catch (Exception e) {
throw e;
return isSuccess;
* 資料訓練過程
* @param algorithmPattern
* @param depth
* @param trainTime
public static boolean runTrainData(String algorithmPattern,int depth,String trainTime) throws Exception{
boolean isRunSuccess=false;
if(Constants.STR_ZERO.equals(algorithmPattern)){
isRunSuccess = new LR(sparkSession).lrClassificationModel(20,0.2,trainTime);
if(Constants.STR_ONE.equals(algorithmPattern)){
isRunSuccess = new DesicionTree(sparkSession).decisionTreeClassificationModel(20,trainTime);
if(Constants.STR_TWO.equals(algorithmPattern)){
isRunSuccess=new GBDTree(sparkSession).GBDTreeClassificationModel(6,20,trainTime);
if(Constants.STR_THREE.equals(algorithmPattern)){
isRunSuccess = new RFTree(sparkSession).rfTreeClassificationModel(100,trainTime);
return isRunSuccess;
主要是建立local模式下的SparkSession作為入口憑證來進行操作。原文中以藍色字型標示的代碼來自於Scala包,只需要在相應的類中提供函數入口就好了。
原文位址http://www.bieryun.com/2457.html