Java API練習MapReduce的WordCount

WordCount（單詞統計）

原理：如圖

代碼：

1.導入3個jar包：hadoop-common、hadoop-core、hadoop-client

2.Map階段：

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

//繼承Mapper類
/**
 Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 *
 * KEYIN：表示mapper資料輸入的時候key的資料類型，在預設的讀取資料元件下，叫InputFormat,它的行為是一行一行的讀取待處理的資料
 *        讀取一行，傳回一行給程式，這種情況下  keyin就表示每一行的起始偏移量  是以資料類型是Long，一般不改變它的類型
 *
 * VALUEIN:表述mapper資料輸入的時候value的資料類型，在預設的讀取資料元件下 valuein就表示讀取的這一行内容  是以資料類型是String
 *
 * KEYOUT 表示mapper資料輸出的時候key的資料類型  在本案例當中 輸出的key是單詞  是以資料類型是 String
 *
 * VALUEOUT表示mapper資料輸出的時候value的資料類型  在本案例當中 輸出的key是單詞的次數  是以資料類型是 Integer
 *
 * 這裡所說的資料類型String Long都是jdk自帶的類型   在序列化的時候  效率低下 是以hadoop自己封裝一套資料類型
 *   long---->LongWritable
 *   String-->Text
 *   Integer--->Intwritable
 *   null-->NullWritable
 * 
 * 互相轉化：以String為例
 * String ——> Test     new Test(word)
 * Test ——> String     word.toString()
 */
 
public class MyMapper extends Mapper<LongWritable,Text,Text,LongWriteable> {
	public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException {
		//value為傳入的每一行文本
		//按照空格分隔每行文本，變成單詞數組
		String[] word = value.toString().split(" ");
		//周遊數組中的單詞，每有一個單詞就标記一個數字1
		//通過context上下文将資料傳遞到shuffle階段
		for(String wd : word){
			context.write(new Text(wd),new LongWritanble(1));
		}
	}
}

3.Reduce階段

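// Extends the Reducer class
/**
 * Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 *
 * KEYIN: type of the key received by the reduce stage; it matches the mapper's output key
 *        type. In this example it is the word: Text.
 *
 * VALUEIN: type of the value received by the reduce stage; it matches the mapper's output
 *        value type. In this example it is the per-occurrence count: LongWritable.
 *
 * KEYOUT: type of the key written by the reduce stage. In this example it is the word: Text.
 *
 * VALUEOUT: type of the value written by the reduce stage. In this example it is the total
 *        number of occurrences of the word: LongWritable.
 */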
 
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
 import java.io.IOException;
 
 public class MyReduce extends Reducer<Text,LongWritable,Text,LongWritable>{
 	public void reduce(Text key,iterable<LongWritable> values,Context context) throws IOException, InterruptedException{
 		long count = 0;
 		//周遊，将同一個詞的個數相加
 		for(LongWritable value:values){
 			//get()方法将LongWritable變為long類型
 			count += value.get();
 		}
 		context.write(key,new Writable(count));
 	}
 }

4.整合

import com.bdqn.wc.MyDriver;
import com.bdqn.wc.MyMapper;
import com.bdqn.wc.Myreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class MyDriver {
   public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //準備一個空任務
        Job job = Job.getInstance(conf,"wc");
        //設定任務主啟動類
        job.setJarByClass(MyDriver.class);
        //設定任務的輸入資料源
        FileInputFormat.addInputPath(job,new Path("d:/abc.txt"));

        //設定Mapper任務
        job.setMapperClass(MyMapper.class);
        //設定Mapper任務類的輸出類型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        //設定Reduce任務類
        job.setReducerClass(Myreduce.class);
        //設定Reduce任務類的輸出類型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        //設定任務的輸出資料目标(生成的是fff檔案夾)
        FileOutputFormat.setOutputPath(job,new Path("d:/fff"));
        
        //送出程式 并且監控列印程式的執行情況、
        //隻有job.waitForCompletion也可以
        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);
    }
}

Windows下運作結果：

注：在Windows下運作要先安裝Hadoop

Java API練習MapReduce的WordCount

Java API練習MapReduce的WordCount

WordCount（單詞統計)

原理：如圖

代碼：

繼續閱讀

nginx location中斜線的位置的重要性

27 Best Free Eclipse Plug-ins for Java Developer to be ProductiveCode Quality PluginsText Editor PluginsDependency ManagementVersion Control Integration PluginsFramework Development Continuous Integration Related PluginsOther Utility Plugins

Java String.format方法的簡單使用

neo4j之cypher使用文檔

Ambari介紹和架構原理

GitHub連夜封殺！這份阿裡 10W 字内部 Java 字面試手冊到底有多強？

spark/scala關于【資源檔案】加載方法概述外部檔案加載方案測試資源檔案打包入jar包中小結

NOSQL安全攻擊

mybatis_入門程式Mybatis入門

AOP程式設計_Android優雅權限架構(1)概念基礎，2021金三銀四前言正文大綱正文

Effective Java 8:通用程式設計

OOM三種類型

工廠模式-三種類型

【遞歸】高效率求2的n次幂

win10本地scala和spark安裝安裝scala安裝spark

scala (3) Function 和 Method