MapReduce Framework Internals: Join
Join
Much like a SQL join, a MapReduce join takes two or more tables, combines their records according to some relationship, and writes the result out in a formatted form.
1 Reduce Join
How Reduce Join works
- Main work on the Map side: for the key/value pairs coming from different tables or files, add a tag that marks which source each record came from, use the join field as the sort and grouping key (pid in this example), take the remaining fields together with the tag as the value, and write the result out.
- Main work on the Reduce side: the records arriving at the Reduce side are already grouped with the join field as the key, and each group triggers one reduce() call; all that is left is to use the tag added on the Map side to pull the records apart by source and then merge them into the desired output format.
Requirements:
(1) Order data table
id | pid | amount
---|---|---
1001 | 01 | 1
1002 | 02 | 2
1003 | 03 | 3
1004 | 01 | 4
1005 | 02 | 5
1006 | 03 | 6
(2) Product information table
pid | pname
---|---
01 | mi
02 | iphone
03 | blackberry
(3) Expected final output (ascending order by pid is guaranteed; the ids within a pid group may come out reversed or scrambled, and can be adjusted manually if a specific order is required)
id | pname | amount
---|---|---
1001 | mi | 1
1004 | mi | 4
1002 | iphone | 2
1005 | iphone | 5
1003 | blackberry | 3
1006 | blackberry | 6
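The Mapper and Driver code below assume that these two tables are stored in the input directory as tab-separated text files named order.txt and pd.txt: each line of order.txt holds id, pid and amount separated by tab characters (e.g. 1001, 01, 1), and each line of pd.txt holds pid and pname (e.g. 01, mi).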
Custom Bean
package join;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class TableBean implements WritableComparable<TableBean> {
private String id;
private String pid;
private int amount;
private String name;
private String tblName;
@Override
public String toString() {
return this.id + "\t" + this.name + "\t" + this.amount;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getTblName() {
return tblName;
}
public void setTblName(String tblName) {
this.tblName = tblName;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(this.id);
dataOutput.writeUTF(this.pid);
dataOutput.writeInt(this.amount);
dataOutput.writeUTF(this.name);
dataOutput.writeUTF(this.tblName);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.id = dataInput.readUTF();
this.pid = dataInput.readUTF();
this.amount = dataInput.readInt();
this.name = dataInput.readUTF();
this.tblName = dataInput.readUTF();
}
@Override
public int compareTo(TableBean o) {
// Sort (and, with the default grouping comparator, also group) the map output by pid,
// so that the order and pd records sharing a pid arrive in the same reduce() call.
return this.pid.compareTo(o.pid);
}
}
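One caveat worth noting: TableBean does not override hashCode(), so if the job ran with more than one reduce task, the default HashPartitioner would not reliably send records with the same pid to the same reducer (with the default single reducer this is not an issue). A small pid-based partitioner along the following lines, a sketch that is not part of the original code, would close that gap; it would be registered in the Driver with job.setPartitionerClass(PidPartitioner.class) when job.setNumReduceTasks(n) is raised above one.

package join;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
// Hypothetical partitioner (not in the original example): route map output by pid
// so that order and pd records sharing a pid reach the same reduce task.
public class PidPartitioner extends Partitioner<TableBean, NullWritable> {
    @Override
    public int getPartition(TableBean key, NullWritable value, int numPartitions) {
        return (key.getPid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}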
Mapper class
package join;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class TableMapper extends Mapper<LongWritable, Text, TableBean, NullWritable> {
private String fileName;
private TableBean bean = new TableBean();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
FileSplit split = (FileSplit) context.getInputSplit();
fileName = split.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split("\t");
bean.setTblName(fileName);
if("order.txt".equals(fileName)) {
// order table
bean.setId(fields[0]);
bean.setPid(fields[1]);
bean.setAmount(Integer.parseInt(fields[2]));
bean.setName("");
} else {
// pd (product) table
bean.setId("");
bean.setPid(fields[0]);
bean.setAmount(0);
bean.setName(fields[1]);
}
context.write(bean, NullWritable.get());
}
}
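A side note on object reuse: the Mapper writes the same TableBean instance on every map() call, which is safe because context.write() serializes the key immediately rather than keeping a reference. On the reduce input side the framework reuses key/value objects in the same way, which is exactly why the Reducer below copies each incoming key before storing it in a list.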
Reducer class
package join;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
public class TableReducer extends Reducer<TableBean, NullWritable, TableBean, NullWritable> {
private List<TableBean> list = new ArrayList<TableBean>();
private TableBean pBean = new TableBean();
@Override
protected void reduce(TableBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
for (NullWritable value : values) {
if("order.txt".equals(key.getTblName())) {
TableBean bean = new TableBean();
try {
BeanUtils.copyProperties(bean, key);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
list.add(bean);
} else {
try {
BeanUtils.copyProperties(pBean, key);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
for (TableBean bean : list) {
bean.setName(pBean.getName());
context.write(bean, NullWritable.get());
}
list.clear();
}
}
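BeanUtils.copyProperties copies the key via reflection and pulls in the commons-beanutils dependency. A minimal hand-written alternative, shown here only as an illustration and not part of the original code, does the same field-by-field copy explicitly and could replace the try/catch blocks above:

// Hypothetical helper: explicit field-by-field copy of a TableBean,
// equivalent in effect to BeanUtils.copyProperties(dst, src) for this class.
private static TableBean copyOf(TableBean src) {
    TableBean dst = new TableBean();
    dst.setId(src.getId());
    dst.setPid(src.getPid());
    dst.setAmount(src.getAmount());
    dst.setName(src.getName());
    dst.setTblName(src.getTblName());
    return dst;
}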
Driver class
package join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class TableDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path oPath = new Path("i:\\tbl_output");
if (fs.exists(oPath)) {
fs.delete(oPath, true);
}
Job job = Job.getInstance(conf);
job.setJarByClass(TableDriver.class);
job.setMapperClass(TableMapper.class);
job.setReducerClass(TableReducer.class);
job.setMapOutputKeyClass(TableBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path("i:\\tbl_input"));
FileOutputFormat.setOutputPath(job, oPath);
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
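Note that the input and output paths are hard-coded local Windows paths, so this Driver runs the job against the local file system; on a cluster one would typically take HDFS paths from the command line (for example args[0] and args[1]) instead, a common variation not shown here.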
Drawback:
The join is performed entirely on the Reduce side, which puts heavy pressure on the reducers while the mappers stay lightly loaded, and it very easily leads to data skew.
Optimization:
Use a Map Join.
2 Map Join
- Use case
Map Join suits scenarios with one small table (roughly 10-15 MB) and one large table.
- Advantages
One or more small tables can be cached and the join performed on the map side, which reduces the data sent to the reducers, eases reduce-side pressure, and minimizes data skew as far as possible.
- Concrete steps
(1) In the Driver class, specify the table(s) to add to the distributed cache.
(2) In the Mapper's setup method, read the file back out of the cache and load it into a data structure (e.g. a Map).
- Code demo (the TableBean defined above is reused)
Mapper class
package join.map;
import join.TableBean;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
public class TblMapper extends Mapper<LongWritable, Text, TableBean, NullWritable> {
private Map<String, String> map = new HashMap<String, String>();
private TableBean bean = new TableBean();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Read the cached pd table once per map task and load pid -> pname into memory.
URI[] files = context.getCacheFiles();
URI uri = files[0];
String p = uri.getPath();
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(p)));
String line = "";
while(StringUtils.isNotEmpty(line = reader.readLine())){
String[] fields = line.split("\t");
map.put(fields[0], fields[1]);
}
// Release the file handle once the lookup map is built.
reader.close();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split("\t");
bean.setId(fields[0]);
bean.setPid(fields[1]);
bean.setAmount(Integer.parseInt(fields[2]));
bean.setName(map.get(fields[1]));
// Note: every field written in the Bean's write()/readFields() must be non-null,
// otherwise writeUTF() throws a NullPointerException.
// So although tblName is not needed here, it still has to be set to a non-null value.
bean.setTblName("");
context.write(bean, NullWritable.get());
}
}
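Related to the comment above: if an order's pid does not appear in pd.txt, map.get(fields[1]) returns null and write() would fail with the same NullPointerException. A defensive tweak, hypothetical and not in the original code, is to look the value up into a local variable first and fall back to an empty string, e.g. String pname = map.get(fields[1]); bean.setName(pname == null ? "" : pname);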
Reducer class
package join.map;
import join.TableBean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class TblReducer extends Reducer<TableBean, NullWritable, TableBean, NullWritable> {
private TableBean bean = new TableBean();
@Override
protected void reduce(TableBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
for (NullWritable value : values) {
bean.setId(key.getId());
bean.setPid(key.getPid());
bean.setName(key.getName());
bean.setAmount(key.getAmount());
context.write(bean ,NullWritable.get());
}
}
}
Driver class
package join.map;
import join.TableBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class TblDriver {
public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path oPath = new Path("i:\\tbl_output");
if (fs.exists(oPath)) {
fs.delete(oPath, true);
}
Job job = Job.getInstance(conf);
// Add the contents of the smaller pd table to the distributed cache
job.addCacheFile(new URI("file:///i:/tbl_input/pd.txt"));
job.setJarByClass(TblDriver.class);
job.setMapperClass(TblMapper.class);
job.setReducerClass(TblReducer.class);
job.setMapOutputKeyClass(TableBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
// There are only two tables and pd is already loaded into the cache, so the InputFormat only needs to process the order table.
FileInputFormat.addInputPath(job, new Path("i:\\tbl_input\\order.txt"));
FileOutputFormat.setOutputPath(job, oPath);
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
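Because the join is already complete when the Mapper writes its output, the reduce phase in this example is a pure pass-through; a common variation, not shown in the original code, drops TblReducer entirely and calls job.setNumReduceTasks(0) in the Driver so that the map output is written straight to the output files.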