MapReduce Framework Internals: Join
Join
Much like a SQL join, a MapReduce join takes two or more tables, combines their records according to some relationship, and writes the result out in a formatted form.
1 Reduce Join
How Reduce Join works
- Main work on the Map side: for the key/value pairs coming from different tables or files, add a tag that marks which source each record came from, use the join field as the sort and grouping key (pid in this example), take the remaining fields together with the tag as the value, and write the result out.
- Main work on the Reduce side: the records arriving at the Reduce side are already grouped with the join field as the key, and each group triggers one reduce() call; all that is left is to use the tag added on the Map side to pull the records apart by source and then merge them into the desired output format.
Requirements:
(1) Order data table
id | pid | amount
---|---|---
1001 | 01 | 1
1002 | 02 | 2
1003 | 03 | 3
1004 | 01 | 4
1005 | 02 | 5
1006 | 03 | 6
(2) Product information table
pid | pname
---|---
01 | mi
02 | iphone
03 | blackberry
(3) Expected final output (ascending order by pid is guaranteed; the ids within a pid group may come out reversed or scrambled, and can be adjusted manually if a specific order is required)
id | pname | amount
---|---|---
1001 | mi | 1
1004 | mi | 4
1002 | iphone | 2
1005 | iphone | 5
1003 | blackberry | 3
1006 | blackberry | 6
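The Mapper and Driver code below assume that these two tables are stored in the input directory as tab-separated text files named order.txt and pd.txt: each line of order.txt holds id, pid and amount separated by tab characters (e.g. 1001, 01, 1), and each line of pd.txt holds pid and pname (e.g. 01, mi).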
Custom Bean
package join;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class TableBean implements WritableComparable<TableBean> {
private String id;
private String pid;
private int amount;
private String name;
private String tblName;
@Override
public String toString() {
return this.id + "\t" + this.name + "\t" + this.amount;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getTblName() {
return tblName;
}
public void setTblName(String tblName) {
this.tblName = tblName;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(this.id);
dataOutput.writeUTF(this.pid);
dataOutput.writeInt(this.amount);
dataOutput.writeUTF(this.name);
dataOutput.writeUTF(this.tblName);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.id = dataInput.readUTF();
this.pid = dataInput.readUTF();
this.amount = dataInput.readInt();
this.name = dataInput.readUTF();
this.tblName = dataInput.readUTF();
}
@Override
public int compareTo(TableBean o) {
// Sort (and, with the default grouping comparator, also group) the map output by pid,
// so that the order and pd records sharing a pid arrive in the same reduce() call.
return this.pid.compareTo(o.pid);
}
}
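One caveat worth noting: TableBean does not override hashCode(), so if the job ran with more than one reduce task, the default HashPartitioner would not reliably send records with the same pid to the same reducer (with the default single reducer this is not an issue). A small pid-based partitioner along the following lines, a sketch that is not part of the original code, would close that gap; it would be registered in the Driver with job.setPartitionerClass(PidPartitioner.class) when job.setNumReduceTasks(n) is raised above one.

package join;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
// Hypothetical partitioner (not in the original example): route map output by pid
// so that order and pd records sharing a pid reach the same reduce task.
public class PidPartitioner extends Partitioner<TableBean, NullWritable> {
    @Override
    public int getPartition(TableBean key, NullWritable value, int numPartitions) {
        return (key.getPid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}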
Mapper class
package join;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class TableMapper extends Mapper<LongWritable, Text, TableBean, NullWritable> {
private String fileName;
private TableBean bean = new TableBean();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
FileSplit split = (FileSplit) context.getInputSplit();
fileName = split.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split("\t");
bean.setTblName(fileName);
if("order.txt".equals(fileName)) {
// order table
bean.setId(fields[0]);
bean.setPid(fields[1]);
bean.setAmount(Integer.parseInt(fields[2]));
bean.setName("");
} else {
// pd (product) table
bean.setId("");
bean.setPid(fields[0]);
bean.setAmount(0);
bean.setName(fields[1]);
}
context.write(bean, NullWritable.get());
}
}
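A side note on object reuse: the Mapper writes the same TableBean instance on every map() call, which is safe because context.write() serializes the key immediately rather than keeping a reference. On the reduce input side the framework reuses key/value objects in the same way, which is exactly why the Reducer below copies each incoming key before storing it in a list.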
Reducer class
package join;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
public class TableReducer extends Reducer<TableBean, NullWritable, TableBean, NullWritable> {
private List<TableBean> list = new ArrayList<TableBean>();
private TableBean pBean = new TableBean();
@Override
protected void reduce(TableBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
for (NullWritable value : values) {
if("order.txt".equals(key.getTblName())) {
TableBean bean = new TableBean();
try {
BeanUtils.copyProperties(bean, key);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
list.add(bean);
} else {
try {
BeanUtils.copyProperties(pBean, key);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
for (TableBean bean : list) {
bean.setName(pBean.getName());
context.write(bean, NullWritable.get());
}
list.clear();
}
}
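BeanUtils.copyProperties copies the key via reflection and pulls in the commons-beanutils dependency. A minimal hand-written alternative, shown here only as an illustration and not part of the original code, does the same field-by-field copy explicitly and could replace the try/catch blocks above:

// Hypothetical helper: explicit field-by-field copy of a TableBean,
// equivalent in effect to BeanUtils.copyProperties(dst, src) for this class.
private static TableBean copyOf(TableBean src) {
    TableBean dst = new TableBean();
    dst.setId(src.getId());
    dst.setPid(src.getPid());
    dst.setAmount(src.getAmount());
    dst.setName(src.getName());
    dst.setTblName(src.getTblName());
    return dst;
}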
Driver class
package join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class TableDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path oPath = new Path("i:\\tbl_output");
if (fs.exists(oPath)) {
fs.delete(oPath, true);
}
Job job = Job.getInstance(conf);
job.setJarByClass(TableDriver.class);
job.setMapperClass(TableMapper.class);
job.setReducerClass(TableReducer.class);
job.setMapOutputKeyClass(TableBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path("i:\\tbl_input"));
FileOutputFormat.setOutputPath(job, oPath);
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
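Note that the input and output paths are hard-coded local Windows paths, so this Driver runs the job against the local file system; on a cluster one would typically take HDFS paths from the command line (for example args[0] and args[1]) instead, a common variation not shown here.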
Drawback:
The join is performed entirely on the Reduce side, which puts heavy pressure on the reducers while the mappers stay lightly loaded, and it very easily leads to data skew.
Optimization:
Use a Map Join.
2 Map Join
- Use case
Map Join suits scenarios with one small table (roughly 10-15 MB) and one large table.
- Advantages
One or more small tables can be cached and the join performed on the map side, which reduces the data sent to the reducers, eases reduce-side pressure, and minimizes data skew as far as possible.
- Concrete steps
(1) In the Driver class, specify the table(s) to add to the distributed cache.
(2) In the Mapper's setup method, read the file back out of the cache and load it into a data structure (e.g. a Map).
- Code demo (the TableBean defined above is reused)
Mapper class
package join.map;
import join.TableBean;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
public class TblMapper extends Mapper<LongWritable, Text, TableBean, NullWritable> {
private Map<String, String> map = new HashMap<String, String>();
private TableBean bean = new TableBean();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Read the cached pd table once per map task and load pid -> pname into memory.
URI[] files = context.getCacheFiles();
URI uri = files[0];
String p = uri.getPath();
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(p)));
String line = "";
while(StringUtils.isNotEmpty(line = reader.readLine())){
String[] fields = line.split("\t");
map.put(fields[0], fields[1]);
}
// Release the file handle once the lookup map is built.
reader.close();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split("\t");
bean.setId(fields[0]);
bean.setPid(fields[1]);
bean.setAmount(Integer.parseInt(fields[2]));
bean.setName(map.get(fields[1]));
// Note: every field written in the Bean's write()/readFields() must be non-null,
// otherwise writeUTF() throws a NullPointerException.
// So although tblName is not needed here, it still has to be set to a non-null value.
bean.setTblName("");
context.write(bean, NullWritable.get());
}
}
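Related to the comment above: if an order's pid does not appear in pd.txt, map.get(fields[1]) returns null and write() would fail with the same NullPointerException. A defensive tweak, hypothetical and not in the original code, is to look the value up into a local variable first and fall back to an empty string, e.g. String pname = map.get(fields[1]); bean.setName(pname == null ? "" : pname);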
Reducer class
package join.map;
import join.TableBean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class TblReducer extends Reducer<TableBean, NullWritable, TableBean, NullWritable> {
private TableBean bean = new TableBean();
@Override
protected void reduce(TableBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
for (NullWritable value : values) {
bean.setId(key.getId());
bean.setPid(key.getPid());
bean.setName(key.getName());
bean.setAmount(key.getAmount());
context.write(bean ,NullWritable.get());
}
}
}
Driver class
package join.map;
import join.TableBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class TblDriver {
public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path oPath = new Path("i:\\tbl_output");
if (fs.exists(oPath)) {
fs.delete(oPath, true);
}
Job job = Job.getInstance(conf);
// Add the contents of the smaller pd table to the distributed cache
job.addCacheFile(new URI("file:///i:/tbl_input/pd.txt"));
job.setJarByClass(TblDriver.class);
job.setMapperClass(TblMapper.class);
job.setReducerClass(TblReducer.class);
job.setMapOutputKeyClass(TableBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
// There are only two tables and pd is already loaded into the cache, so the InputFormat only needs to process the order table.
FileInputFormat.addInputPath(job, new Path("i:\\tbl_input\\order.txt"));
FileOutputFormat.setOutputPath(job, oPath);
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
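Because the join is already complete when the Mapper writes its output, the reduce phase in this example is a pure pass-through; a common variation, not shown in the original code, drops TblReducer entirely and calls job.setNumReduceTasks(0) in the Driver so that the map output is written straight to the output files.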