Hadoop MapReduce in Practice: A Two-Table Join

Two-Table Join
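This post walks through a reduce-side join of a person table against an address table. The mappers below split each line on a single space: an address line has two fields (addreNo addreName) and a person line has three (userNo userName addreNo). As a concrete illustration (my own sample data, not from the original post), the two input files might look like:

    1 Beijing
    2 Shanghai

    101 Andy 1
    102 Lucy 2

The join key is addreNo: every person record gets matched with the name of its address.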

  • Unoptimized version
    • Bean.java
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/*
 * Common bean for both person and address records.
 */
public class Bean implements WritableComparable<Bean> {

    private String userNo = "";
    private String userName = "";
    private String addreNo = "";
    private String addreName = "";
    private int flag; // 0 = address record, 1 = person record

    public Bean(Bean bean) {
        this.userName = bean.getUserName();
        this.userNo = bean.getUserNo();
        this.addreName = bean.getAddreName();
        this.addreNo = bean.getAddreNo();
        this.flag = bean.getFlag();
    }

    public Bean() {
        super();
    }

    public Bean(String userNo, String userName, String addreNo,
                String addreName, int flag) {
        super();
        this.userNo = userNo;
        this.userName = userName;
        this.addreNo = addreNo;
        this.addreName = addreName;
        this.flag = flag;
    }

    public String getUserNo() {
        return userNo;
    }

    public void setUserNo(String userNo) {
        this.userNo = userNo;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getAddreNo() {
        return addreNo;
    }

    public void setAddreNo(String addreNo) {
        this.addreNo = addreNo;
    }

    public String getAddreName() {
        return addreName;
    }

    public void setAddreName(String addreName) {
        this.addreName = addreName;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userNo);
        out.writeUTF(userName);
        out.writeUTF(addreNo);
        out.writeUTF(addreName);
        out.writeInt(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userNo = in.readUTF();
        this.userName = in.readUTF();
        this.addreNo = in.readUTF();
        this.addreName = in.readUTF();
        this.flag = in.readInt();
    }

    @Override
    public int compareTo(Bean arg0) {
        // Bean is only ever used as a map output value, so its ordering
        // is never consulted.
        return 0;
    }

    @Override
    public String toString() {
        return "userNo=" + userNo + ", userName=" + userName + ", addreNo="
                + addreNo + ", addreName=" + addreName;
    }

}
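Not part of the original post, but a quick way to sanity-check the Writable contract: write() and readFields() must handle exactly the same fields in exactly the same order. That is easy to verify locally against the Bean above with plain java.io streams:

import java.io.*;

public class BeanRoundTripTest {

    public static void main(String[] args) throws IOException {
        Bean before = new Bean("101", "Andy", "1", "Beijing", 1);

        // Serialize with write(), exactly as the framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance with readFields().
        Bean after = new Bean();
        after.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));

        // Prints: userNo=101, userName=Andy, addreNo=1, addreName=Beijing
        System.out.println(after);
    }
}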

PersonAddrMap.java

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class PersonAddrMap extends Mapper<LongWritable, Text, IntWritable, Bean> {

    @Override
    protected void map(LongWritable key, Text value,
                       Mapper<LongWritable, Text, IntWritable, Bean>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String str[] = line.split(" ");
        if (str.length == 2) {
            // address record: addreNo addreName
            Bean bean = new Bean();
            bean.setAddreNo(str[0]);
            bean.setAddreName(str[1]);
            bean.setFlag(0); // 0 marks an address record
            context.write(new IntWritable(Integer.parseInt(str[0])), bean);
        } else {
            // person record: userNo userName addreNo
            Bean bean = new Bean();
            bean.setUserNo(str[0]);
            bean.setUserName(str[1]);
            bean.setAddreNo(str[2]);
            bean.setFlag(1); // 1 marks a person record
            context.write(new IntWritable(Integer.parseInt(str[2])), bean);
        }
    }
}

PersonAddreRedu.java

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class PersonAddreRedu extends Reducer<IntWritable, Bean, NullWritable, Text> {

    @Override
    protected void reduce(IntWritable key, Iterable<Bean> values,
                          Reducer<IntWritable, Bean, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        Bean addre = null;
        List<Bean> peoples = new ArrayList<Bean>();
        /*
         * If the first element of values were guaranteed to be the address
         * record, we would not need a List to buffer the person records:
         * everything after it would be person data, which would save a huge
         * amount of memory.
         */
        /*
         * The partitioner and shuffle phases:
         * The partitioner splits the map output into one block per reducer
         * and routes each record to the corresponding reducer. Every
         * partitioner implements the Partitioner interface's getPartition()
         * method, which returns an int in the range 0..(numReducers - 1).
         * Hadoop's default partitioner is HashPartitioner, which routes a
         * key by its hashCode().
         * The shuffle then groups and sorts each partition's output by key,
         * builds an iterator over the values sharing a key, and calls the
         * user-defined reduce() method once per group. This is why every
         * MapReduce key must implement Comparable's compareTo(), so that
         * two key objects can be ordered.
         */
        /*
         * To make the address record arrive first within each addreNo group,
         * we need a custom key type (the shuffle groups by key), which is
         * exactly what the optimized version below does.
         */
        for (Bean bean : values) {
            if (bean.getFlag() == 0) {
                // address record
                addre = new Bean(bean);
            } else {
                peoples.add(new Bean(bean)); // buffer the person record
            }
        }
        for (Bean peo : peoples) {
            // attach the address name to each buffered person record
            peo.setAddreName(addre.getAddreName());
            context.write(NullWritable.get(), new Text(peo.toString()));
        }
    }
}
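One caveat worth knowing about this version: if a person record references an addreNo with no matching address record, `addre` is still null when the second loop runs and the task dies with a NullPointerException. A minimal guard (my addition, not in the original post) that keeps inner-join semantics by dropping unmatched person records:

    // Place before the output loop: skip groups that never saw an address record.
    if (addre == null) {
        return; // unmatched person records are dropped (inner join)
    }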

PersonAddreMain.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PersonAddreMain {

    public static void main(String[] args) throws Exception {
        // Hard-coded local paths for convenience while testing.
        args = new String[] { "F:\\A\\join\\", "F:\\A\\out" };

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PersonAddreMain.class);

        job.setMapperClass(PersonAddrMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Bean.class);

        job.setReducerClass(PersonAddreRedu.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
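With the illustrative input above, the job emits one joined line per person record. The exact values below come from my sample data, not from the original post:

    userNo=101, userName=Andy, addreNo=1, addreName=Beijing
    userNo=102, userName=Lucy, addreNo=2, addreName=Shanghai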

Optimized version

  • Bean.java
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/*
 * Common bean for person and address records;
 * used as the map output value.
 */
public class Bean implements WritableComparable<Bean> {

    private String userNo = " ";
    private String userName = " ";
    private String addreNo = " ";
    private String addreName = " ";

    public Bean(Bean bean) {
        this.userName = bean.getUserName();
        this.userNo = bean.getUserNo();
        this.addreName = bean.getAddreName();
        this.addreNo = bean.getAddreNo();
    }

    public Bean() {
        super();
    }

    public Bean(String userNo, String userName, String addreNo,
                String addreName) {
        super();
        this.userNo = userNo;
        this.userName = userName;
        this.addreNo = addreNo;
        this.addreName = addreName;
    }

    public String getUserNo() {
        return userNo;
    }

    public void setUserNo(String userNo) {
        this.userNo = userNo;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getAddreNo() {
        return addreNo;
    }

    public void setAddreNo(String addreNo) {
        this.addreNo = addreNo;
    }

    public String getAddreName() {
        return addreName;
    }

    public void setAddreName(String addreName) {
        this.addreName = addreName;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userNo);
        out.writeUTF(userName);
        out.writeUTF(addreNo);
        out.writeUTF(addreName);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userNo = in.readUTF();
        this.userName = in.readUTF();
        this.addreNo = in.readUTF();
        this.addreName = in.readUTF();
    }

    @Override
    public int compareTo(Bean arg0) {
        // Ordering is handled entirely by BeanKey, never by Bean itself.
        return 0;
    }

    @Override
    public String toString() {
        return "userNo=" + userNo + ", userName=" + userName + ", addreNo="
                + addreNo + ", addreName=" + addreName;
    }
}

BeanKey.java

import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/*
 * Composite key for the map output.
 */
public class BeanKey implements WritableComparable<BeanKey> {

    private int AddreNo;
    private boolean isPrimary; // true: address record, false: person record

    public BeanKey(int addreNo, boolean isPrimary) {
        super();
        this.AddreNo = addreNo;
        this.isPrimary = isPrimary;
    }

    public BeanKey() {
        super();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(AddreNo);
        out.writeBoolean(isPrimary);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.AddreNo = in.readInt();
        this.isPrimary = in.readBoolean();
    }

    // The default HashPartitioner calls hashCode(), and the shuffle sort
    // uses compareTo() by default.
    @Override
    public int hashCode() {
        return this.AddreNo; // partition by AddreNo
    }

    // Sort order: order keys by AddreNo, and within the same AddreNo put
    // the address record ahead of the person records.
    @Override
    public int compareTo(BeanKey o) {
        if (this.AddreNo == o.getAddreNo()) {
            // Same AddreNo: decide by record type.
            if (this.isPrimary == o.isPrimary()) {
                // Same record type: treat as equal.
                return 0;
            } else {
                // true (address) compares smaller, so the address record
                // sorts to the front of its group's values.
                return this.isPrimary ? -1 : 1;
            }
        } else {
            // Otherwise order by AddreNo.
            return this.AddreNo - o.getAddreNo() > 0 ? 1 : -1;
        }
    }

    public int getAddreNo() {
        return AddreNo;
    }

    public void setAddreNo(int addreNo) {
        AddreNo = addreNo;
    }

    public boolean isPrimary() {
        return isPrimary;
    }

    public void setPrimary(boolean isPrimary) {
        this.isPrimary = isPrimary;
    }
}
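BeanKey.hashCode() returns AddreNo so that the default HashPartitioner routes every record with the same AddreNo to the same reducer. If you prefer that routing to be explicit (or plan to swap partitioners later), a minimal equivalent custom Partitioner would look like the sketch below. This is my addition, not part of the original post; it would be registered in the driver with job.setPartitionerClass(AddreNoPartitioner.class).

import org.apache.hadoop.mapreduce.Partitioner;

/*
 * Hypothetical explicit partitioner: routes records by AddreNo alone,
 * mirroring what BeanKey.hashCode() plus HashPartitioner already do.
 */
public class AddreNoPartitioner extends Partitioner<BeanKey, Bean> {

    @Override
    public int getPartition(BeanKey key, Bean value, int numPartitions) {
        // Mask the sign bit so a negative AddreNo can never yield a
        // negative partition index.
        return (key.getAddreNo() & Integer.MAX_VALUE) % numPartitions;
    }
}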

PersonAddreMap.java

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/*
 * The mapper builds the composite key (BeanKey) and the value (Bean)
 * separately for each record.
 */
public class PersonAddreMap extends Mapper<LongWritable, Text, BeanKey, Bean> {

    @Override
    protected void map(LongWritable key, Text value,
                       Mapper<LongWritable, Text, BeanKey, Bean>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String str[] = line.split(" ");
        if (str.length == 2) {
            // address record
            Bean addre = new Bean();
            addre.setAddreNo(str[0]);
            addre.setAddreName(str[1]);

            BeanKey addreKey = new BeanKey();
            addreKey.setAddreNo(Integer.parseInt(str[0]));
            addreKey.setPrimary(true); // true marks an address record
            context.write(addreKey, addre);
        } else {
            // person record
            Bean person = new Bean();
            person.setUserNo(str[0]);
            person.setUserName(str[1]);
            person.setAddreNo(str[2]);

            BeanKey perKey = new BeanKey();
            perKey.setAddreNo(Integer.parseInt(str[2]));
            perKey.setPrimary(false); // false marks a person record
            context.write(perKey, person);
        }
    }
}

PersonAddreReduce.java

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class PersonAddreReduce extends Reducer<BeanKey, Bean, NullWritable, Text> {

    @Override
    protected void reduce(BeanKey key, Iterable<Bean> values,
                          Reducer<BeanKey, Bean, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        Bean addre = null;
        int num = 0;
        for (Bean bean : values) {
            if (num == 0) {
                // The sort order guarantees the address record is the
                // first value in the group.
                addre = new Bean(bean);
                num++;
            } else {
                // Every remaining value is a person record. No List is
                // needed, which saves a large amount of memory.
                bean.setAddreName(addre.getAddreName());
                context.write(NullWritable.get(), new Text(bean.toString()));
            }
        }
    }
}

PKFKCompartor.java

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/*
 * Grouping comparator.
 * By default the shuffle groups values using the key's (BeanKey's)
 * compareTo() method. With our custom key, the address record and the
 * person records sharing an AddreNo would never land in the same group,
 * because compareTo() treats them as unequal.
 * The fix is a custom grouping comparator that compares AddreNo only.
 */
public class PKFKCompartor extends WritableComparator {

    protected PKFKCompartor() {
        super(BeanKey.class, true);
    }

    // Compare two BeanKeys by AddreNo alone, ignoring the record type.
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        BeanKey a1 = (BeanKey) a;
        BeanKey b1 = (BeanKey) b;
        if (a1.getAddreNo() == b1.getAddreNo()) {
            return 0;
        } else {
            return a1.getAddreNo() > b1.getAddreNo() ? 1 : -1;
        }
    }
}

PersonAddreMain.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PersonAddreMain {

    public static void main(String[] args) throws Exception {
        // Hard-coded local paths for convenience while testing.
        args = new String[] { "F:\\A\\join\\", "F:\\A\\out_Andy1" };

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PersonAddreMain.class);

        // register the custom grouping comparator
        job.setGroupingComparatorClass(PKFKCompartor.class);

        job.setMapperClass(PersonAddreMap.class);
        job.setMapOutputKeyClass(BeanKey.class);
        job.setMapOutputValueClass(Bean.class);

        job.setReducerClass(PersonAddreReduce.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
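The hard-coded args override is handy for local testing only. To run on a cluster you would remove that line, package the classes into a jar, and submit it with the standard launcher. The jar name and paths below are placeholders, not from the original post:

    hadoop jar join.jar PersonAddreMain /user/andy/join /user/andy/out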

Reposted from blog.csdn.net/qq_45092505/article/details/105654585