多表连接
订单表 order.txt
1001 01 1
1002 02 2
1003 03 3
1001 02 2
1002 03 3
1003 03 5
商品表 produce.txt
01 小米
02 华为
03 格力
- 将商品表中数据根据商品id合并到订单数据表中。最终数据形式:
- 订单id、商品id、数量、商品名称
1001 01 1 小米
1002 02 2 华为
1003 03 3 格力
1001 02 2 华为
1002 03 3 格力
1003 03 5 格力
reduce端表合并(数据倾斜)
- 以关联条件(商品id)作为map输出的key,让两表中满足join条件的记录各自携带来源文件的标记,发往同一个reduce task,再在reduce中完成数据的拼接。
JoinTableBean
package com.hadoop.mapreduce.jointable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Value object carried from the mappers to the reducer in the reduce-side join.
 *
 * <p>One instance holds either an order record (flag {@code "O"}) or a product
 * record (flag {@code "P"}); fields that do not apply to the record type are
 * filled with an empty string / zero by the mapper.
 *
 * <p>{@link #toString()} defines the text form of the joined output:
 * order id, product id, quantity, product name, separated by tabs.
 */
public class JoinTableBean implements Writable {
    private String order_id; // order id
    private String produce_id; // product id
    private int num; // quantity ordered
    private String produce_name; // product name
    private String flag; // source-table marker: "O" = order record, "P" = product record

    /** No-argument constructor; fields are populated later via setters or {@link #readFields}. */
    public JoinTableBean() {
    }

    /**
     * Sets every field in one call.
     *
     * @param order_id     order id
     * @param produce_id   product id (the join key)
     * @param num          quantity ordered
     * @param produce_name product name
     * @param flag         "O" for an order record, "P" for a product record
     */
    public void setData(String order_id, String produce_id, int num, String produce_name, String flag) {
        this.order_id = order_id;
        this.produce_id = produce_id;
        this.num = num;
        this.produce_name = produce_name;
        this.flag = flag;
    }

    /**
     * Serializes the fields in a fixed order; {@link #readFields} reads them back
     * in the same order.
     *
     * <p>Unset (null) String fields are written as empty strings, because
     * {@code DataOutput.writeUTF(null)} would throw a NullPointerException.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(order_id));
        out.writeUTF(nullToEmpty(produce_id));
        out.writeInt(num);
        out.writeUTF(nullToEmpty(produce_name));
        out.writeUTF(nullToEmpty(flag));
    }

    /** Restores the fields in exactly the order {@link #write} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readUTF();
        this.produce_id = in.readUTF();
        this.num = in.readInt();
        this.produce_name = in.readUTF();
        this.flag = in.readUTF();
    }

    /**
     * Returns the joined record as tab-separated text in the order
     * order id, product id, quantity, product name.
     */
    @Override
    public String toString() {
        return order_id + "\t" + produce_id + "\t" + num + "\t" + produce_name;
    }

    /** Maps null to the empty string so the value can be passed to writeUTF. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getProduce_id() {
        return produce_id;
    }

    public void setProduce_id(String produce_id) {
        this.produce_id = produce_id;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    public String getProduce_name() {
        return produce_name;
    }

    public void setProduce_name(String produce_name) {
        this.produce_name = produce_name;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }
}
JoinTableMapper
package com.hadoop.mapreduce.jointable;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Mapper of the reduce-side join.
 *
 * <p>Reads tab-separated lines from both the order file and the product file,
 * tags each record with its source table and emits it keyed by product id.
 */
public class JoinTableMapper extends Mapper<LongWritable, Text, Text, JoinTableBean> {
    // Output objects are reused across map() calls; context.write() is called
    // before they are overwritten by the next record.
    private JoinTableBean bean = new JoinTableBean();
    private Text text = new Text();

    /**
     * Converts one input line into a tagged {@link JoinTableBean} keyed by product id.
     *
     * <p>Files whose name starts with {@code "order"} are parsed as
     * {@code order_id \t produce_id \t num}; every other file is parsed as
     * {@code produce_id \t produce_name}. Blank lines are skipped.
     *
     * @throws IOException if a non-blank line has too few fields or a non-numeric quantity
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        if (line.trim().isEmpty()) {
            // A stray blank line (e.g. at the end of the file) carries no record.
            return;
        }
        String[] fields = line.split("\t");

        // The source table is identified by the name of the file this split belongs to.
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String file_name = fileSplit.getPath().getName();

        String produce_id;
        if (file_name.startsWith("order")) {
            // order_id produce_id num
            if (fields.length < 3) {
                throw new IOException("Malformed order record in " + file_name + ": " + line);
            }
            int num;
            try {
                num = Integer.parseInt(fields[2]);
            } catch (NumberFormatException e) {
                throw new IOException("Non-numeric quantity in " + file_name + ": " + line, e);
            }
            produce_id = fields[1];
            bean.setData(fields[0], produce_id, num, "", "O");
        } else {
            // produce_id produce_name
            if (fields.length < 2) {
                throw new IOException("Malformed product record in " + file_name + ": " + line);
            }
            produce_id = fields[0];
            bean.setData("", produce_id, 0, fields[1], "P");
        }
        text.set(produce_id);
        context.write(text, bean);
    }
}
JoinTableReducer
package com.hadoop.mapreduce.jointable;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer of the reduce-side join.
 *
 * <p>For each product id it receives the product record (flag {@code "P"}) and
 * all order records for that product, and writes every order record with the
 * product name filled in.
 */
public class JoinTableReducer extends Reducer<Text, JoinTableBean, JoinTableBean, NullWritable> {

    /**
     * Joins the order records of one product id with that product's name.
     *
     * <p>All state is local to the call, so a product id that has no product
     * record cannot pick up the name left over from a previously processed key;
     * such orders are written with the (empty) name set by the mapper.
     *
     * @param key     the product id
     * @param beans   product and order records sharing this product id
     * @param context used to emit one joined record per order
     */
    @Override
    protected void reduce(Text key, Iterable<JoinTableBean> beans, Context context) throws IOException, InterruptedException {
        String produceName = null;
        ArrayList<JoinTableBean> orders = new ArrayList<JoinTableBean>();

        for (JoinTableBean b : beans) {
            if ("P".equals(b.getFlag())) {
                // Product record: only the name is needed (String is immutable, safe to keep).
                produceName = b.getProduce_name();
            } else {
                // Order record: Hadoop reuses the value object while iterating,
                // so each order must be copied before it is stored.
                JoinTableBean copy = new JoinTableBean();
                copy.setData(b.getOrder_id(), b.getProduce_id(), b.getNum(), b.getProduce_name(), b.getFlag());
                orders.add(copy);
            }
        }

        // Combine the two kinds of records into the final result.
        for (JoinTableBean order : orders) {
            if (produceName != null) {
                order.setProduce_name(produceName);
            }
            context.write(order, NullWritable.get());
        }
    }
}
JoinTableDriver
package com.hadoop.mapreduce.jointable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the reduce-side join job (this approach is prone to data skew).
 *
 * <p>The join condition is used as the map output key, so records of both tables
 * that satisfy the join condition, each tagged with its source file, are sent to
 * the same reduce task, where they are combined.
 */
public class JoinTableDriver extends Configured implements Tool {
    private static final String USAGE = "Usage: hadoop jar <jarname> <classname> <input path> <output path>";

    /**
     * Configures and runs the job.
     *
     * <p>The argument count is validated here rather than in {@code main}, because
     * {@link ToolRunner} removes generic options (such as {@code -D key=value})
     * before calling this method; checking earlier would reject such invocations.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
     * @return 0 on success, 1 if the job failed, -1 on wrong usage
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            System.err.println(USAGE);
            return -1;
        }

        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf);

        // Jar that contains the job classes.
        job.setJarByClass(JoinTableDriver.class);

        // Mapper and reducer implementations used by this job.
        job.setMapperClass(JoinTableMapper.class);
        job.setReducerClass(JoinTableReducer.class);

        // Key/value types emitted by the mapper (set separately because they
        // differ from the final output types).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinTableBean.class);

        // Key/value types of the final output.
        job.setOutputKeyClass(JoinTableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Directory holding the raw input files.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Directory the results are written to.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean res = job.waitForCompletion(true);
        return res ? 0 : 1;
    }

    /** Entry point: delegates to {@link ToolRunner} and exits with the code returned by {@link #run}. */
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new JoinTableDriver(), args == null ? new String[0] : args);
        System.exit(ret);
    }
}
- 缺点:这种方式中,合并的操作是在reduce阶段完成,reduce端的处理压力太大,map节点的运算负载则很低,资源利用率不高,且在reduce阶段极易产生数据倾斜
- 解决方案: map端实现数据合并
map端表合并(DistributedCache)
- 适用于关联表中有小表的情形。
- 可以将小表分发到所有的map节点,map节点就可以在本地对自己所读到的大表数据进行合并并输出最终结果,可以大大提高合并操作的并发度,加快处理速度。
- map端表合并没有reduce阶段;运行程序时,输入目录中不要再包含produce.txt文件(该文件通过分布式缓存分发到各map节点)。
MapSideJoinTableMapper
package com.hadoop.mapreduce.jointable;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper of the map-side join.
 *
 * <p>Loads the whole product table from the local file {@code produce.txt} into
 * memory in {@link #setup}, then appends the product name to every order line
 * in {@link #map}.
 */
public class MapSideJoinTableMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    // Product table held in memory: product id -> product name.
    private Map<String, String> produceMap = new HashMap<String, String>();
    private Text k = new Text();

    /**
     * Loads the product table before any record is mapped.
     *
     * <p>The file is read to the end (blank lines are skipped rather than ending
     * the load), decoded as UTF-8 so it matches the encoding Hadoop's
     * {@link Text} uses for the order data, and the reader is closed even when
     * reading fails.
     *
     * @throws IOException if the file cannot be read or a non-blank line has fewer than two fields
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("produce.txt"), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line)) {
                    continue;
                }
                String[] fields = line.split("\t");
                if (fields.length < 2) {
                    throw new IOException("Malformed product record in produce.txt: " + line);
                }
                produceMap.put(fields[0], fields[1]);
            }
        } finally {
            br.close();
        }
    }

    /**
     * Appends the product name to one order line and emits the result.
     *
     * <p>The full product table is already in memory, so the join is a lookup by
     * the product id in the second field. Blank lines are skipped; an order whose
     * product id is unknown is emitted with an empty product name.
     *
     * @throws IOException if a non-blank line has fewer than two fields
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        if (StringUtils.isBlank(line)) {
            return;
        }
        String[] fields = line.split("\t");
        if (fields.length < 2) {
            throw new IOException("Malformed order record: " + line);
        }
        String produce_name = produceMap.get(fields[1]);
        if (produce_name == null) {
            // Avoid writing the literal text "null" into the output.
            produce_name = "";
        }
        k.set(line + "\t" + produce_name);
        context.write(k, NullWritable.get());
    }
}
MapSideJoinTableDriver
package com.hadoop.mapreduce.jointable;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the map-side join job (distributed cache); suitable when one of
 * the joined tables is small.
 *
 * <p>The small table is distributed to every map node, so each map task can join
 * the large-table records it reads locally and emit the final result, which
 * increases the parallelism of the join and speeds it up.
 */
public class MapSideJoinTableDriver extends Configured implements Tool {
    private static final String USAGE =
            "Usage: hadoop jar <jarname> <classname> <input path> <output path> [produce file URI]";

    /** Cache-file URI used when no third argument is given. */
    private static final String DEFAULT_PRODUCE_URI = "file:/D:/hadoop/jointableinput/produce.txt";

    /**
     * Configures and runs the map-only job.
     *
     * <p>The argument count is validated here rather than in {@code main}, because
     * {@link ToolRunner} removes generic options (such as {@code -D key=value})
     * before calling this method; checking earlier would reject such invocations.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the output
     *             directory, and the optional {@code args[2]} the URI of the product
     *             file to cache. The mapper opens the cached file under the name
     *             {@code produce.txt}, so the URI must make it available under that
     *             name (presumably via the file's own name or a {@code #produce.txt}
     *             fragment; confirm against the Hadoop distributed-cache documentation).
     * @return 0 on success, 1 if the job failed, -1 on wrong usage
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2 || args.length > 3) {
            System.err.println(USAGE);
            return -1;
        }
        String produceUri = args.length == 3 ? args[2] : DEFAULT_PRODUCE_URI;

        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MapSideJoinTableDriver.class);
        job.setMapperClass(MapSideJoinTableMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Cache the product table file into the working directory of the task nodes.
        job.addCacheFile(new URI(produceUri));

        // The map-side join needs no reduce phase, so the number of reduce tasks is 0.
        job.setNumReduceTasks(0);

        boolean res = job.waitForCompletion(true);
        return res ? 0 : 1;
    }

    /** Entry point: delegates to {@link ToolRunner} and exits with the code returned by {@link #run}. */
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new MapSideJoinTableDriver(), args == null ? new String[0] : args);
        System.exit(ret);
    }
}