用mapreduce寫wordcount

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountMR {

    /**
     * 該main方法是該mapreduce程式運作的入口，其中用一個Job類對象來管理程式運作時所需要的很多參數：
     * 比如，指定用哪個元件作為資料讀取器、資料結果輸出器 指定用哪個類作為map階段的業務邏輯類，哪個類作為reduce階段的業務邏輯類
     * 指定wordcount job程式的jar包所在路徑 .... 以及其他各種需要的參數,最後job送出運作
     */
    public static void main(String[] args) throws Exception {
        // 指定mapreduce運作的hdfs相關的參數
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        //分布式叢集設定
        // conf.set("mapred.jar", System.getProperty("user.dir")+"/WordCount.jar");
        System.setProperty("HADOOP_USER_NAME", "root");
        //設定開發環境變量
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        // 設定mapreduce運作模式，這也是預設值
        // conf.set("mapreduce.framework.name", "yarn");
        // conf.set("yarn.resourcemanager.hostname", "hadoop");

        // 擷取job對象
        Job job = Job.getInstance(conf);

        // 設定jar包所在路徑
        job.setJarByClass(WordCountMR.class);
        //job.setMapperClass(WordCountMapper.class);
        //job.setReducerClass(WordCountReducer.class);

        // 指定mapper類和reducer類
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 指定maptask的輸出key-value類型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 指定reducetask的輸出key-value類型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 設定reduceTask的個數
        // job.setNumReduceTasks(3);

        // 本地運作時也可以直接指定本地目錄
        // Path inputPath = new Path("d:/wordcount/input");
        // Path outputPath = new Path("d:/wordcount/output");

        // 指定該mapreduce程式資料的輸入路徑
        Path inputPath = new Path("/tmp/input");
        //FileInputFormat.addInputPath(job, new Path("d:/words/input"));

        // 指定該mapreduce程式資料的輸出路徑
        Path outputPath = new Path("/tmp/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // job.submit();
        // 最後送出任務
        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ?  : );
    }

    /**
     * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
     * 
     * KEYIN：是指架構讀取到的資料的key的類型，在預設的InputFormat下，讀到的key是一行文本的起始偏移量，是以key的類型是Long
     * VALUEIN：是指架構讀取到的資料的value的類型,在預設的InputFormat下，讀到的value是一行文本的内容，是以value的類型是String
     * KEYOUT：是指使用者自定義邏輯方法傳回的資料中key的類型，由使用者業務邏輯決定，在此wordcount程式中，我們輸出的key是單詞，是以是String
     * VALUEOUT：是指使用者自定義邏輯方法傳回的資料中value的類型，由使用者業務邏輯決定,在此wordcount程式中，我們輸出的value是單詞的數量，是以是Integer
     * 
     * 但是，String ，Long等jdk中自帶的資料類型，在序列化時，效率比較低，hadoop為了提高序列化效率，自定義了一套序列化架構
     * 是以，在hadoop的程式中，如果該資料需要進行序列化（寫磁盤，或者網絡傳輸），就一定要用實作了hadoop序列化架構的資料類型
     * 
     * Long ----> LongWritable 
     * String ----> Text 
     * Integer ----> IntWritable 
     * Null ----> NullWritable
     */
    static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // 按空格切割單詞
            String[] words = value.toString().split(" ");
            for (String word : words) {
                // 發送單詞
                context.write(new Text(word), new IntWritable());
            }
        }
    }

    /**
     * 首先，和前面一樣，Reducer類也有輸入和輸出，輸入就是Map階段的處理結果，輸出就是Reduce最後的輸出
     * reducetask在調我們寫的reduce方法,reducetask應該收到了前一階段（map階段）中所有maptask輸出的資料中的一部分
     * （資料的key.hashcode%reducetask數==本reductask号），是以reducetaks的輸入類型必須和maptask的輸出類型一樣
     * 
     * reducetask将這些收到kv資料拿來處理時，是這樣調用我們的reduce方法的： 先将自己收到的所有的kv對按照k分組（根據k是否相同）
     * 将某一組kv中的第一個kv中的k傳給reduce方法的key變量，把這一組kv中所有的v用一個疊代器傳給reduce方法的變量values
     * 
     * key：key
     * values：該key對應的所有的value值的集合，該架構傳給我們的是該集合的一個疊代器
     */
    static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

            // 做每個單詞的結果彙總
            int sum = ;
            for (IntWritable v : values) {
                sum += v.get();
            }
            // 寫出最後的結果
            context.write(key, new IntWritable(sum));
        }
    }
}

用mapreduce寫wordcount

繼續閱讀

Java小案例——随機數猜測随機數猜測

nginx location中斜線的位置的重要性

27 Best Free Eclipse Plug-ins for Java Developer to be Productive: Code Quality Plugins, Text Editor Plugins, Dependency Management, Version Control Integration Plugins, Framework Development, Continuous Integration Related Plugins, Other Utility Plugins

Java String.format方法的簡單使用

neo4j之cypher使用文檔

Ambari介紹和架構原理

GitHub連夜封殺！這份阿里 10W 字內部 Java 面試手冊到底有多強？

spark/scala關于【資源檔案】加載方法概述外部檔案加載方案測試資源檔案打包入jar包中小結

mybatis_入門程式Mybatis入門

AOP程式設計_Android優雅權限架構(1)概念基礎，2021金三銀四前言正文大綱正文

Effective Java 8:通用程式設計

OOM三種類型

工廠模式-三種類型

【遞歸】高效率求2的n次幂

win10本地scala和spark安裝：安裝scala、安裝spark

scala (3) Function 和 Method