现有user表以及order表,要求利用MapReduce实现两张表的userId字段join算法。
user表
u001,senge,18,angelababy
u002,laozhao,48,ruhua
u003,xiaoxu,16,chunge
u004,laoyang,28,zengge
u005,nana,14,huangbo
order表
order001,u001
order002,u001
order003,u005
order004,u002
order005,u003
order006,u004
最终输出:
order002,u001,18,senge,angelababy
order001,u001,18,senge,angelababy
order004,u002,48,laozhao,ruhua
order005,u003,16,xiaoxu,chunge
order006,u004,28,laoyang,zengge
order003,u005,14,nana,huangbo
首先构造一个实体类JoinBean来存放最终join之后的字段 orderId userId age userName userFriend,另外增加一个字段 tableName,用来在map阶段标记该Bean来自哪张表。
map阶段: 读取输入目录的每一个文件,首先判断该文件类型user表还是order表,将文件的每一行构造成一个Bean写入context 其中key=userId, value=Bean 相同用户的Bean会分发到同一个reduce中。
reduce阶段: 相同userId的Bean会分到同一组,找出这些Bean中的user以及order,相当于一对多,然后遍历order集合,将user信息写入Bean中,最后写入context中即可。
JoinBean
/**
 * Join result bean: carries the joined order/user fields plus a tableName tag
 * that marks which source table a record came from during the map phase.
 *
 * Implements WritableComparable (not just Writable) because the optimized job
 * uses this bean as the map output KEY; Hadoop requires map output keys to be
 * WritableComparable, and the shuffle sort relies on compareTo() below.
 */
public class JoinBean implements WritableComparable<JoinBean> {
    private String orderId;
    private String userId;
    private String userName;
    private int userAge;
    private String userFriend;
    // "order" or "user": which table this record came from
    private String tableName;

    /** Populates every field in one call; "NULL" / -1 act as placeholders for the absent side. */
    public void set(String orderId, String userId, String userName, int userAge, String userFriend, String tableName) {
        this.orderId = orderId;
        this.userId = userId;
        this.userName = userName;
        this.userAge = userAge;
        this.userFriend = userFriend;
        this.tableName = tableName;
    }

    public String getTableName() {
        return tableName;
    }

    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public int getUserAge() {
        return userAge;
    }

    public void setUserAge(int userAge) {
        this.userAge = userAge;
    }

    public String getUserFriend() {
        return userFriend;
    }

    public void setUserFriend(String userFriend) {
        this.userFriend = userFriend;
    }

    /** Output line format: orderId,userId,age,name,friend — matches the expected result sample. */
    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.userAge + "," + this.userName + "," + this.userFriend;
    }

    // Serialization: field order here MUST match the read order in readFields().
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.userName);
        out.writeInt(this.userAge);
        out.writeUTF(this.userFriend);
        out.writeUTF(this.tableName);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.userName = in.readUTF();
        this.userAge = in.readInt();
        this.userFriend = in.readUTF();
        this.tableName = in.readUTF();
    }

    /**
     * Shuffle sort for the optimized join: primary key userId ascending; within
     * one userId the tableName comparison is negated so the single "user" record
     * sorts ahead of its "order" records ("user" > "order" lexicographically).
     */
    @Override
    public int compareTo(JoinBean o) {
        int byUserId = this.userId.compareTo(o.userId);
        return byUserId != 0 ? byUserId : -this.tableName.compareTo(o.tableName);
    }
}
Map方法:
注意在setup方法中判断当前处理文件的文件名
/**
 * Mapper for the reduce-side join: tags each input line with its source table
 * and emits (userId, JoinBean) so records sharing a userId meet in one reduce call.
 */
public static class JoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {
    String fileName = null;
    // Reused across map() calls; context.write() serializes immediately, so reuse is safe.
    JoinBean bean = new JoinBean();
    Text k = new Text();

    /**
     * Hadoop invokes setup() once per map task before the repeated map() calls;
     * we use it to capture which input file this split belongs to.
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, JoinBean>.Context context)
            throws IOException, InterruptedException {
        String[] cols = value.toString().split(",");
        boolean fromOrderTable = fileName.startsWith("order");
        if (fromOrderTable) {
            // order line: orderId,userId — user fields get placeholders
            bean.set(cols[0], cols[1], "NULL", -1, "NULL", "order");
        } else {
            // user line: userId,name,age,friend — no orderId yet
            bean.set("NULL", cols[0], cols[1], Integer.parseInt(cols[2]), cols[3], "user");
        }
        k.set(bean.getUserId());
        context.write(k, bean);
    }
}
Reduce方法:
按照userId聚合,由于user表每个userId是唯一的,因此聚合数据中只有一个user表Bean,其余为order表Bean,遍历order表Bean集合,将user表信息写入,最后输出。
/**
 * Reducer: all beans sharing a userId arrive together. At most one should be
 * the "user" record; the rest are "order" records. Copies the user fields onto
 * each order bean and emits it.
 *
 * Replaces reflective BeanUtils.copyProperties (and its swallowed exceptions)
 * with an explicit field copy, and guards against an order with no matching
 * user record, which previously threw a NullPointerException.
 */
public static class JoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<JoinBean> beans, Context context)
            throws IOException, InterruptedException {
        ArrayList<JoinBean> orderList = new ArrayList<>();
        JoinBean userBean = null;
        // Hadoop reuses one bean instance across the value iterator, so every
        // record we keep must be copied into a fresh object.
        for (JoinBean bean : beans) {
            JoinBean copy = new JoinBean();
            copy.set(bean.getOrderId(), bean.getUserId(), bean.getUserName(),
                    bean.getUserAge(), bean.getUserFriend(), bean.getTableName());
            if ("order".equals(bean.getTableName())) {
                orderList.add(copy);
            } else {
                userBean = copy;
            }
        }
        // Enrich each order with the matching user's fields, then emit.
        for (JoinBean order : orderList) {
            if (userBean != null) {
                order.setUserName(userBean.getUserName());
                order.setUserAge(userBean.getUserAge());
                order.setUserFriend(userBean.getUserFriend());
            }
            // Orders without a matching user keep their "NULL"/-1 placeholders
            // instead of crashing the task with a NullPointerException.
            context.write(order, NullWritable.get());
        }
    }
}
提交任务:
/**
 * Job driver for the basic reduce-side join.
 * Input/output paths may be supplied as args[0]/args[1]; the original
 * hard-coded local paths remain the defaults for backward compatibility.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(Join.class);
    job.setMapperClass(JoinMapper.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(1);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(JoinBean.class);
    job.setOutputKeyClass(JoinBean.class);
    job.setOutputValueClass(NullWritable.class);
    String input = args.length > 0 ? args[0] : "F:\\hadoop-2.8.1\\data\\join\\index";
    // Timestamp suffix keeps repeated runs from failing on an existing output dir.
    String output = args.length > 1 ? args[1]
            : "F:\\hadoop-2.8.1\\data\\join\\output" + System.currentTimeMillis();
    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    // Propagate job success/failure to the shell instead of always exiting 0.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
优化实现方式:使用Partitioner + compareTo + GroupingComparator
1.map阶段读取user Bean以及order Bean 此时使用Bean来作为key Null作为value
2.重写Partitioner,使相同userId的Bean分发到同一个reduce
3.重写Bean中compareTo方法,使得同一个reduce中Bean首先按照userId来排序,其次按照tableName来排序
4.重写groupingComparator方法,使得同一个reduce中,相同的userId来作为一组聚合
5.在reduce中,读取第一个元素为user Bean 后面元素为order Bean组装写入context
map阶段
/**
 * Mapper for the optimized join: the JoinBean itself is the map output key
 * (value is NullWritable), so partitioning / sorting / grouping on the shuffle
 * side can drive the join.
 */
public static class Join2Mapper extends Mapper<LongWritable, Text, JoinBean, NullWritable> {
    String fileName = null;
    // Reused across map() calls; context.write() serializes immediately, so reuse is safe.
    JoinBean bean = new JoinBean();

    /**
     * setup() runs once per map task before the repeated map() calls;
     * we use it to capture which input file this split belongs to.
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] cols = value.toString().split(",");
        boolean fromOrderTable = fileName.startsWith("order");
        if (fromOrderTable) {
            // order line: orderId,userId — user fields get placeholders
            bean.set(cols[0], cols[1], "NULL", -1, "NULL", "order");
        } else {
            // user line: userId,name,age,friend — no orderId yet
            bean.set("NULL", cols[0], cols[1], Integer.parseInt(cols[2]), cols[3], "user");
        }
        context.write(bean, NullWritable.get());
    }
}
重写Partitioner
/**
 * Routes every bean to a reduce partition derived from its userId only, so a
 * user record and all of its orders land in the same reducer regardless of the
 * other fields carried by the key.
 */
public class UserIdPartitioner extends Partitioner<JoinBean, NullWritable> {
    @Override
    public int getPartition(JoinBean key, NullWritable value, int numPartitions) {
        // Mask the sign bit rather than Math.abs(): abs(Integer.MIN_VALUE) is still negative.
        int nonNegativeHash = key.getUserId().hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
重写Bean compareTo方法
/**
 * Shuffle sort order for the optimized join: primary key userId ascending;
 * within one userId the tableName comparison is negated so the single "user"
 * record sorts ahead of its "order" records ("user" > "order" lexicographically).
 * Computes the userId comparison once instead of twice.
 */
@Override
public int compareTo(JoinBean o) {
    int byUserId = this.userId.compareTo(o.userId);
    if (byUserId != 0) {
        return byUserId;
    }
    // Descending tableName: the user record first, then its orders.
    return -this.tableName.compareTo(o.tableName);
}
重写GroupingComparator
/**
 * Grouping comparator for the optimized join: despite the class name it groups
 * by userId alone, so all beans sharing a userId (the user record plus its
 * orders) form one reduce group even though compareTo() also sorts by tableName.
 */
public class TableNameGroupingComparator extends WritableComparator {
    public TableNameGroupingComparator() {
        // true: instantiate JoinBean keys so compare() receives deserialized objects.
        super(JoinBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        JoinBean left = (JoinBean) a;
        JoinBean right = (JoinBean) b;
        return left.getUserId().compareTo(right.getUserId());
    }
}
reduce方法
/**
 * Reducer for the optimized join. Thanks to compareTo() the first key in each
 * userId group is the "user" record (when one exists) and the rest are orders.
 *
 * Note the Hadoop iteration quirk: advancing the value iterator also mutates
 * the reused `key` object, so the user's fields must be copied into a separate
 * bean before iterating on.
 *
 * Fixes two defects: the first record was assumed to be a user (an order with
 * no matching user was silently consumed and dropped), and the reflective
 * BeanUtils copy with its swallowed exceptions is replaced by an explicit copy.
 */
public static class Join2Reducer extends Reducer<JoinBean, NullWritable, JoinBean, NullWritable> {
    @Override
    protected void reduce(JoinBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        Iterator<NullWritable> it = values.iterator();
        JoinBean userBean = null;
        if (it.hasNext()) {
            it.next(); // advancing the iterator refreshes `key` with the first record
            if ("user".equals(key.getTableName())) {
                // Deep-copy: `key` is reused by the framework on every next().
                userBean = new JoinBean();
                userBean.set(key.getOrderId(), key.getUserId(), key.getUserName(),
                        key.getUserAge(), key.getUserFriend(), key.getTableName());
            } else {
                // No user record for this userId: emit the order with placeholders
                // instead of dropping it.
                context.write(key, NullWritable.get());
            }
        }
        while (it.hasNext()) {
            it.next(); // `key` now holds the next order record
            if (userBean != null) {
                key.setUserName(userBean.getUserName());
                key.setUserAge(userBean.getUserAge());
                key.setUserFriend(userBean.getUserFriend());
            }
            context.write(key, NullWritable.get());
        }
    }
}
这里要注意Bean的复制拷贝以及key-value迭代机制
/**
 * Job driver for the optimized join (custom Partitioner + sort + grouping).
 * Input/output paths may be supplied as args[0]/args[1]; the original
 * hard-coded local paths remain the defaults for backward compatibility.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(Join2.class);
    job.setMapperClass(Join2Mapper.class);
    job.setReducerClass(Join2Reducer.class);
    job.setNumReduceTasks(1);
    job.setMapOutputKeyClass(JoinBean.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(JoinBean.class);
    job.setOutputValueClass(NullWritable.class);
    // Partition by userId; group by userId; sort order comes from JoinBean.compareTo().
    job.setPartitionerClass(UserIdPartitioner.class);
    job.setGroupingComparatorClass(TableNameGroupingComparator.class);
    String input = args.length > 0 ? args[0] : "F:\\hadoop-2.8.1\\data\\join\\index";
    // Timestamp suffix keeps repeated runs from failing on an existing output dir.
    String output = args.length > 1 ? args[1]
            : "F:\\hadoop-2.8.1\\data\\join\\output" + System.currentTimeMillis();
    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    // Propagate job success/failure to the shell instead of always exiting 0.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
最终结果(注:下列各行末尾的 ",order" 来自 tableName 字段;前文给出的 JoinBean.toString() 并未拼接该字段,若按其实现实际输出应无此后缀——需与运行时使用的 toString() 核对):
order002,u001,18,senge,angelababy,order
order001,u001,18,senge,angelababy,order
order004,u002,48,laozhao,ruhua,order
order005,u003,16,xiaoxu,chunge,order
order006,u004,28,laoyang,zengge,order
order003,u005,14,nana,huangbo,order