用mapreduce写wordcount

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountMR {

    /**
     * 该main方法是该mapreduce程序运行的入口，其中用一个Job类对象来管理程序运行时所需要的很多参数：
     * 比如，指定用哪个组件作为数据读取器、数据结果输出器 指定用哪个类作为map阶段的业务逻辑类，哪个类作为reduce阶段的业务逻辑类
     * 指定wordcount job程序的jar包所在路径 .... 以及其他各种需要的参数,最后job提交运行
     */
    public static void main(String[] args) throws Exception {
        // 指定mapreduce运行的hdfs相关的参数
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        //分布式集群设置
        // conf.set("mapred.jar", System.getProperty("user.dir")+"/WordCount.jar");
        System.setProperty("HADOOP_USER_NAME", "root");
        //设置开发环境变量
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        // 设置mapreduce运行模式，这也是默认值
        // conf.set("mapreduce.framework.name", "yarn");
        // conf.set("yarn.resourcemanager.hostname", "hadoop");

        // 获取job对象
        Job job = Job.getInstance(conf);

        // 设置jar包所在路径
        job.setJarByClass(WordCountMR.class);
        //job.setMapperClass(WordCountMapper.class);
        //job.setReducerClass(WordCountReducer.class);

        // 指定mapper类和reducer类
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 指定maptask的输出key-value类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 指定reducetask的输出key-value类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 设置reduceTask的个数
        // job.setNumReduceTasks(3);

        // 本地运行时也可以直接指定本地目录
        // Path inputPath = new Path("d:/wordcount/input");
        // Path outputPath = new Path("d:/wordcount/output");

        // 指定该mapreduce程序数据的输入路径
        Path inputPath = new Path("/tmp/input");
        //FileInputFormat.addInputPath(job, new Path("d:/words/input"));

        // 指定该mapreduce程序数据的输出路径
        Path outputPath = new Path("/tmp/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // job.submit();
        // 最后提交任务
        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ?  : );
    }

    /**
     * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
     * 
     * KEYIN：是指框架读取到的数据的key的类型，在默认的InputFormat下，读到的key是一行文本的起始偏移量，所以key的类型是Long
     * VALUEIN：是指框架读取到的数据的value的类型,在默认的InputFormat下，读到的value是一行文本的内容，所以value的类型是String
     * KEYOUT：是指用户自定义逻辑方法返回的数据中key的类型，由用户业务逻辑决定，在此wordcount程序中，我们输出的key是单词，所以是String
     * VALUEOUT：是指用户自定义逻辑方法返回的数据中value的类型，由用户业务逻辑决定,在此wordcount程序中，我们输出的value是单词的数量，所以是Integer
     * 
     * 但是，String ，Long等jdk中自带的数据类型，在序列化时，效率比较低，hadoop为了提高序列化效率，自定义了一套序列化框架
     * 所以，在hadoop的程序中，如果该数据需要进行序列化（写磁盘，或者网络传输），就一定要用实现了hadoop序列化框架的数据类型
     * 
     * Long ----> LongWritable 
     * String ----> Text 
     * Integer ----> IntWritable 
     * Null ----> NullWritable
     */
    static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // 按空格切割单词
            String[] words = value.toString().split(" ");
            for (String word : words) {
                // 发送单词
                context.write(new Text(word), new IntWritable());
            }
        }
    }

    /**
     * 首先，和前面一样，Reducer类也有输入和输出，输入就是Map阶段的处理结果，输出就是Reduce最后的输出
     * reducetask在调我们写的reduce方法,reducetask应该收到了前一阶段（map阶段）中所有maptask输出的数据中的一部分
     * （数据的key.hashcode%reducetask数==本reductask号），所以reducetaks的输入类型必须和maptask的输出类型一样
     * 
     * reducetask将这些收到kv数据拿来处理时，是这样调用我们的reduce方法的： 先将自己收到的所有的kv对按照k分组（根据k是否相同）
     * 将某一组kv中的第一个kv中的k传给reduce方法的key变量，把这一组kv中所有的v用一个迭代器传给reduce方法的变量values
     * 
     * key：key
     * values：该key对应的所有的value值的集合，该框架传给我们的是该集合的一个迭代器
     */
    static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

            // 做每个单词的结果汇总
            int sum = ;
            for (IntWritable v : values) {
                sum += v.get();
            }
            // 写出最后的结果
            context.write(key, new IntWritable(sum));
        }
    }
}

用mapreduce写wordcount

继续阅读

Java小案例——随机数猜测随机数猜测

nginx location中斜线的位置的重要性

27 Best Free Eclipse Plug-ins for Java Developer to be ProductiveCode Quality PluginsText Editor PluginsDependency ManagementVersion Control Integration PluginsFramework Development Continuous Integration Related PluginsOther Utility Plugins

Java String.format方法的简单使用

neo4j之cypher使用文档

Ambari介绍和架构原理

GitHub连夜封杀！这份阿里 10W 字内部 Java 字面试手册到底有多强？

spark/scala关于【资源文件】加载方法概述外部文件加载方案测试资源文件打包入jar包中小结

mybatis_入门程序Mybatis入门

AOP编程_Android优雅权限框架(1)概念基础，2021金三银四前言正文大纲正文

Effective Java 8:通用程序设计

OOM三种类型

工厂模式-三种类型

【递归】高效率求2的n次幂

win10本地scala和spark安装安装scala安装spark

scala (3) Function 和 Method