
Hadoop MapReduce in Action (Two-Table Join)

Two-Table Join

  • Unoptimized version
    • Bean.java
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/*
 * Common bean for person and address records.
 */
public class Bean implements WritableComparable<Bean> {
    private String userNo = "";
    private String userName = "";
    private String addreNo = "";
    private String addreName = "";
    private int flag;

    public Bean(Bean bean) {
        this.userName = bean.getUserName();
        this.userNo = bean.getUserNo();
        this.addreName = bean.getAddreName();
        this.addreNo = bean.getAddreNo();
        this.flag = bean.getFlag();
    }

    public Bean() {
        super();
    }

    public Bean(String userNo, String userName, String addreNo,
                String addreName, int flag) {
        super();
        this.userNo = userNo;
        this.userName = userName;
        this.addreNo = addreNo;
        this.addreName = addreName;
        this.flag = flag;
    }

    public String getUserNo() {
        return userNo;
    }

    public void setUserNo(String userNo) {
        this.userNo = userNo;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getAddreNo() {
        return addreNo;
    }

    public void setAddreNo(String addreNo) {
        this.addreNo = addreNo;
    }

    public String getAddreName() {
        return addreName;
    }

    public void setAddreName(String addreName) {
        this.addreName = addreName;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userNo);
        out.writeUTF(userName);
        out.writeUTF(addreNo);
        out.writeUTF(addreName);
        out.writeInt(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userNo = in.readUTF();
        this.userName = in.readUTF();
        this.addreNo = in.readUTF();
        this.addreName = in.readUTF();
        this.flag = in.readInt();
    }

    @Override
    public int compareTo(Bean arg0) {
        // Not used: this bean is only emitted as a map value, so no ordering is needed.
        return 0;
    }

    @Override
    public String toString() {
        return "userNo=" + userNo + ", userName=" + userName + ", addreNo="
                + addreNo + ", addreName=" + addreName;
    }

}

PersonAddrMap.java

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class PersonAddrMap extends Mapper<LongWritable, Text, IntWritable, Bean> {
    @Override
    protected void map(LongWritable key, Text value,
                       Mapper<LongWritable, Text, IntWritable, Bean>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String str[] = line.split(" ");
        if (str.length == 2) { // address record: addreNo addreName
            Bean bean = new Bean();
            bean.setAddreNo(str[0]);
            bean.setAddreName(str[1]);
            bean.setFlag(0); // 0 = address record
            context.write(new IntWritable(Integer.parseInt(str[0])), bean);
        } else { // person record: userNo userName addreNo
            Bean bean = new Bean();
            bean.setUserNo(str[0]);
            bean.setUserName(str[1]);
            bean.setAddreNo(str[2]);
            bean.setFlag(1); // 1 = person record
            context.write(new IntWritable(Integer.parseInt(str[2])), bean);
        }
    }
}
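For concreteness, here is a hypothetical pair of space-delimited input files in the format the mapper expects (two fields per address line, three per person line); the file names and values are made up for illustration:

address.txt:
1 Beijing
2 Shanghai

person.txt:
u001 Alice 1
u002 Bob 2
u003 Carol 1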

PersonAddreRedu.java

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class PersonAddreRedu extends Reducer<IntWritable, Bean, NullWritable,Text> {
    @Override
    protected void reduce(IntWritable key, Iterable<Bean> values,
                          Reducer<IntWritable, Bean, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        Bean addre = null;
        List<Bean> peoples = new ArrayList<Bean>();
        /*
         * If the first element of values were guaranteed to be the address
         * record, we would not need a List to buffer the person records:
         * everything after it would be person data, which would save a huge
         * amount of memory.
         */
        /*
         * The partitioner and shuffle phases:
         * The partitioner splits the map output according to the number of
         * reducers and routes each record to the corresponding reducer. A
         * partitioner implements the Partitioner interface's getPartition()
         * method, which returns an int in the range 0..(numReduceTasks - 1).
         * Unless overridden, Hadoop uses HashPartitioner by default, which
         * picks the target reducer from the key's hashCode().
         * The shuffle then groups and sorts each partition's records by key,
         * builds an iterator over the values sharing a key, and calls the
         * user-defined reduce() method once per group in key order. This is
         * why the map output key must implement Comparable's compareTo(), so
         * that two key objects can be compared.
         */
        /*
         * To avoid the buffering, we need a custom key type (the shuffle
         * groups by key) so that for a shared addreNo the address record
         * sorts first; see the optimized version below.
         */
        for (Bean bean : values) {
            if (bean.getFlag() == 0) { // address record
                addre = new Bean(bean);
            } else {
                peoples.add(new Bean(bean)); // buffer the person record
            }
        }
        if (addre == null) {
            return; // no address record for this key; nothing to join
        }
        for (Bean peo : peoples) { // attach the address name to each person
            peo.setAddreName(addre.getAddreName());
            context.write(NullWritable.get(), new Text(peo.toString()));
        }
    }
}
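With the hypothetical input above, the job would emit one joined line per person, formatted by Bean.toString() (ordering within a key depends on the shuffle):

userNo=u001, userName=Alice, addreNo=1, addreName=Beijing
userNo=u003, userName=Carol, addreNo=1, addreName=Beijing
userNo=u002, userName=Bob, addreNo=2, addreName=Shanghai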

PersonAddreMain.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PersonAddreMain {
    public static void main(String[] args) throws Exception {

        args = new String[] { "F:\\A\\join\\", "F:\\A\\out" }; // hard-coded local test paths; remove this line to use real command-line arguments

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PersonAddreMain.class);

        job.setMapperClass(PersonAddrMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Bean.class);

        job.setReducerClass(PersonAddreRedu.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
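Packaged as a jar (and with the hard-coded args line removed), the job could be launched roughly like this; the jar name and HDFS paths are made up for illustration:

hadoop jar two-table-join.jar PersonAddreMain /input/join /output/join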

  • Optimized version

  • Bean.java
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/*
 * Common bean for person and address records.
 * Used as the map output value.
 */
public class Bean implements WritableComparable<Bean> {
    private String userNo = "";
    private String userName = "";
    private String addreNo = "";
    private String addreName = "";

    public Bean(Bean bean) {
        this.userName = bean.getUserName();
        this.userNo = bean.getUserNo();
        this.addreName = bean.getAddreName();
        this.addreNo = bean.getAddreNo();
    }

    public Bean() {
        super();
    }

    public Bean(String userNo, String userName, String addreNo,
                String addreName) {
        super();
        this.userNo = userNo;
        this.userName = userName;
        this.addreNo = addreNo;
        this.addreName = addreName;
    }

    public String getUserNo() {
        return userNo;
    }

    public void setUserNo(String userNo) {
        this.userNo = userNo;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getAddreNo() {
        return addreNo;
    }

    public void setAddreNo(String addreNo) {
        this.addreNo = addreNo;
    }

    public String getAddreName() {
        return addreName;
    }

    public void setAddreName(String addreName) {
        this.addreName = addreName;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userNo);
        out.writeUTF(userName);
        out.writeUTF(addreNo);
        out.writeUTF(addreName);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userNo = in.readUTF();
        this.userName = in.readUTF();
        this.addreNo = in.readUTF();
        this.addreName = in.readUTF();
    }

    @Override
    public int compareTo(Bean arg0) {
        // Not used: this bean is only the map output value; BeanKey handles ordering.
        return 0;
    }

    @Override
    public String toString() {
        return "userNo=" + userNo + ", userName=" + userName + ", addreNo="
                + addreNo + ", addreName=" + addreName;
    }
}

BeanKey.java

import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/*
 * Map output key: the address number plus a flag marking whether the record
 * comes from the address table or the person table.
 */
public class BeanKey implements WritableComparable<BeanKey> {
    private int AddreNo;
    private boolean isPrimary; // true:address false:person

    public BeanKey(int addreNo, boolean isPrimary) {
        super();
        this.AddreNo = addreNo;
        this.isPrimary = isPrimary;
    }

    public BeanKey() {
        super();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(AddreNo);
        out.writeBoolean(isPrimary);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.AddreNo = in.readInt();
        this.isPrimary = in.readBoolean();
    }

    // The partitioner calls hashCode(); compareTo() is the shuffle's default
    // sort order.
    @Override
    public int hashCode() {
        return this.AddreNo; // partition by AddreNo
    }

    // Sort order: records sharing an AddreNo are ordered so that the address
    // record comes first.
    @Override
    public int compareTo(BeanKey o) {
        if (this.AddreNo == o.getAddreNo()) { // same AddreNo: decide by record type
            if (this.isPrimary == o.isPrimary()) { // same table type: equal
                return 0;
            } else {
                return this.isPrimary ? -1 : 1; // address (true) sorts first, to the head of the group
            }
        } else {
            return Integer.compare(this.AddreNo, o.getAddreNo()); // otherwise order by AddreNo
        }
    }

    public int getAddreNo() {
        return AddreNo;
    }

    public void setAddreNo(int addreNo) {
        AddreNo = addreNo;
    }

    public boolean isPrimary() {
        return isPrimary;
    }

    public void setPrimary(boolean isPrimary) {
        this.isPrimary = isPrimary;
    }
}
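As a quick sanity check, here is a minimal, hypothetical sketch (not part of the job itself) showing the ordering that compareTo() produces; for a shared AddreNo the address key sorts ahead of the person keys:

import java.util.Arrays;

public class BeanKeySortDemo {
    public static void main(String[] args) {
        BeanKey[] keys = {
                new BeanKey(2, false), // person record, addreNo 2
                new BeanKey(1, false), // person record, addreNo 1
                new BeanKey(1, true),  // address record, addreNo 1
        };
        Arrays.sort(keys); // uses BeanKey.compareTo()
        for (BeanKey k : keys) {
            System.out.println(k.getAddreNo() + (k.isPrimary() ? " address" : " person"));
        }
        // prints: 1 address, then 1 person, then 2 person
    }
}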

PersonAddreMap.java

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/*
 * Mapper: builds a separate key (BeanKey) and value (Bean) for each record.
 */
public class PersonAddreMap extends Mapper<LongWritable, Text, BeanKey, Bean> {
    @Override
    protected void map(LongWritable key, Text value,
                       Mapper<LongWritable, Text, BeanKey, Bean>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String str[] = line.split(" ");
        if (str.length == 2) {
            // address record: addreNo addreName
            Bean Addre = new Bean();
            Addre.setAddreNo(str[0]);
            Addre.setAddreName(str[1]);

            BeanKey AddreKey = new BeanKey();
            AddreKey.setAddreNo(Integer.parseInt(str[0]));
            AddreKey.setPrimary(true); // true = address record
            context.write(AddreKey, Addre);
        } else {
            // person record: userNo userName addreNo
            Bean Person = new Bean();
            Person.setUserNo(str[0]);
            Person.setUserName(str[1]);
            Person.setAddreNo(str[2]);

            BeanKey PerKey = new BeanKey();
            PerKey.setAddreNo(Integer.parseInt(str[2]));
            PerKey.setPrimary(false); // false = person record
            context.write(PerKey, Person);
        }
    }

}

PersonAddreRedu.java

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class PersonAddreRedu extends Reducer<BeanKey, Bean, NullWritable, Text> {
    @Override
    protected void reduce(BeanKey key, Iterable<Bean> values,
                          Reducer<BeanKey, Bean, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        Bean addre = null;
        int num = 0;
        for (Bean bean : values) {
            if (num == 0) {
                // Thanks to the sort order plus the grouping comparator, the
                // address record (if any) is the first value of the group.
                // Hadoop refills the key object while iterating, so
                // key.isPrimary() tells us whether this first record really
                // is the address.
                if (!key.isPrimary()) {
                    return; // no address record for this AddreNo; nothing to join
                }
                addre = new Bean(bean);
                num++;
            } else {
                // All remaining values are person records, so no buffering
                // list is needed, which saves a large amount of memory.
                bean.setAddreName(addre.getAddreName());
                context.write(NullWritable.get(), new Text(bean.toString()));
            }
        }
    }
}

PKFKCompartor.java

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/*
 * Grouping comparator.
 * By default the shuffle groups values using the key's (BeanKey's)
 * compareTo() method. With the custom key above, an address record and the
 * person records sharing its AddreNo would never fall into the same group
 * (compareTo() treats them as unequal), so we define our own grouping
 * comparator that compares AddreNo only.
 */
public class PKFKCompartor extends WritableComparator {

    protected PKFKCompartor() {
        super(BeanKey.class, true);
    }

    // group two BeanKeys by AddreNo alone, ignoring isPrimary
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        BeanKey a1 = (BeanKey) a;
        BeanKey b1 = (BeanKey) b;
        if (a1.getAddreNo() == b1.getAddreNo()) {
            return 0;
        } else {
            return a1.getAddreNo() > b1.getAddreNo() ? 1 : -1;
        }
    }
}
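To see why this matters, here is a hypothetical trace with the sample input from earlier: without the grouping comparator, every distinct (AddreNo, isPrimary) pair forms its own reduce group, so an address and its people never meet in one reduce() call; with PKFKCompartor there is one group per AddreNo and, thanks to compareTo(), the address record arrives first:

without PKFKCompartor: groups (1, address), (1, persons...), (2, address), (2, persons...)
with PKFKCompartor:    group 1 = [address, persons...], group 2 = [address, persons...]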

PersonAddreMain.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PersonAddreMain {
    public static void main(String[] args) throws Exception {

        args = new String[]{"F:\\A\\join\\", "F:\\A\\out_Andy1"}; // hard-coded local test paths; remove this line to use real command-line arguments

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PersonAddreMain.class);

        // register the custom grouping comparator
        job.setGroupingComparatorClass(PKFKCompartor.class);

        job.setMapperClass(PersonAddreMap.class);
        job.setMapOutputKeyClass(BeanKey.class);
        job.setMapOutputValueClass(Bean.class);

        job.setReducerClass(PersonAddreRedu.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
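
With the same hypothetical input, the optimized job emits the same joined lines as the unoptimized version, but without buffering a group's person records in memory: the sort order plus the grouping comparator guarantee that the address record is the first value each reduce() call sees.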