现有user表以及order表,要求利用MapReduce实现两张表的userId字段join算法。
user表
u001,senge,18,angelababy
u002,laozhao,48,ruhua
u003,xiaoxu,16,chunge
u004,laoyang,28,zengge
u005,nana,14,huangbo
order表
order001,u001
order002,u001
order003,u005
order004,u002
order005,u003
order006,u004
最终输出:
order002,u001,18,senge,angelababy
order001,u001,18,senge,angelababy
order004,u002,48,laozhao,ruhua
order005,u003,16,xiaoxu,chunge
order006,u004,28,laoyang,zengge
order003,u005,14,nana,huangbo
首先构造一个实体类JoinBean来存放最终join之后的字段 orderId userId age userName userFriend,另外增加一个字段 tableName,用来在map阶段标记该Bean来自哪张表。
map阶段: 读取输入目录的每一个文件,首先判断该文件类型user表还是order表,将文件的每一行构造成一个Bean写入context 其中key=userId, value=Bean 相同用户的Bean会分发到同一个reduce中。
reduce阶段: 相同userId的Bean会分到同一组,找出这些Bean中的user以及order,相当于一对多,然后遍历order集合,将user信息写入Bean中,最后写入context中即可。
JoinBean
/**
 * Join result bean: carries the joined order/user fields plus a tableName tag
 * that marks which source table a record came from during the map phase.
 *
 * Implements WritableComparable (not just Writable) because the optimized job
 * uses this bean as the map output KEY; Hadoop requires map output keys to be
 * WritableComparable, and the shuffle sort relies on compareTo() below.
 */
public class JoinBean implements WritableComparable<JoinBean> {
    private String orderId;
    private String userId;
    private String userName;
    private int userAge;
    private String userFriend;
    // "order" or "user": which table this record came from
    private String tableName;

    /** Populates every field in one call; "NULL" / -1 act as placeholders for the absent side. */
    public void set(String orderId, String userId, String userName, int userAge, String userFriend, String tableName) {
        this.orderId = orderId;
        this.userId = userId;
        this.userName = userName;
        this.userAge = userAge;
        this.userFriend = userFriend;
        this.tableName = tableName;
    }

    public String getTableName() {
        return tableName;
    }

    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public int getUserAge() {
        return userAge;
    }

    public void setUserAge(int userAge) {
        this.userAge = userAge;
    }

    public String getUserFriend() {
        return userFriend;
    }

    public void setUserFriend(String userFriend) {
        this.userFriend = userFriend;
    }

    /** Output line format: orderId,userId,age,name,friend — matches the expected result sample. */
    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.userAge + "," + this.userName + "," + this.userFriend;
    }

    // Serialization: field order here MUST match the read order in readFields().
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.userName);
        out.writeInt(this.userAge);
        out.writeUTF(this.userFriend);
        out.writeUTF(this.tableName);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.userName = in.readUTF();
        this.userAge = in.readInt();
        this.userFriend = in.readUTF();
        this.tableName = in.readUTF();
    }

    /**
     * Shuffle sort for the optimized join: primary key userId ascending; within
     * one userId the tableName comparison is negated so the single "user" record
     * sorts ahead of its "order" records ("user" > "order" lexicographically).
     */
    @Override
    public int compareTo(JoinBean o) {
        int byUserId = this.userId.compareTo(o.userId);
        return byUserId != 0 ? byUserId : -this.tableName.compareTo(o.tableName);
    }
}
Map方法:
注意在setup方法中判断当前处理文件的文件名
/**
 * Mapper for the reduce-side join: tags each input line with its source table
 * and emits (userId, JoinBean) so records sharing a userId meet in one reduce call.
 */
public static class JoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {
    String fileName = null;
    // Reused across map() calls; context.write() serializes immediately, so reuse is safe.
    JoinBean bean = new JoinBean();
    Text k = new Text();

    /**
     * Hadoop invokes setup() once per map task before the repeated map() calls;
     * we use it to capture which input file this split belongs to.
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, JoinBean>.Context context)
            throws IOException, InterruptedException {
        String[] cols = value.toString().split(",");
        boolean fromOrderTable = fileName.startsWith("order");
        if (fromOrderTable) {
            // order line: orderId,userId — user fields get placeholders
            bean.set(cols[0], cols[1], "NULL", -1, "NULL", "order");
        } else {
            // user line: userId,name,age,friend — no orderId yet
            bean.set("NULL", cols[0], cols[1], Integer.parseInt(cols[2]), cols[3], "user");
        }
        k.set(bean.getUserId());
        context.write(k, bean);
    }
}
Reduce方法:
按照userId聚合,由于user表每个userId是唯一的,因此聚合数据中只有一个user表Bean,其余为order表Bean,遍历order表Bean集合,将user表信息写入,最后输出。
/**
 * Reducer: all beans sharing a userId arrive together. At most one should be
 * the "user" record; the rest are "order" records. Copies the user fields onto
 * each order bean and emits it.
 *
 * Replaces reflective BeanUtils.copyProperties (and its swallowed exceptions)
 * with an explicit field copy, and guards against an order with no matching
 * user record, which previously threw a NullPointerException.
 */
public static class JoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<JoinBean> beans, Context context)
            throws IOException, InterruptedException {
        ArrayList<JoinBean> orderList = new ArrayList<>();
        JoinBean userBean = null;
        // Hadoop reuses one bean instance across the value iterator, so every
        // record we keep must be copied into a fresh object.
        for (JoinBean bean : beans) {
            JoinBean copy = new JoinBean();
            copy.set(bean.getOrderId(), bean.getUserId(), bean.getUserName(),
                    bean.getUserAge(), bean.getUserFriend(), bean.getTableName());
            if ("order".equals(bean.getTableName())) {
                orderList.add(copy);
            } else {
                userBean = copy;
            }
        }
        // Enrich each order with the matching user's fields, then emit.
        for (JoinBean order : orderList) {
            if (userBean != null) {
                order.setUserName(userBean.getUserName());
                order.setUserAge(userBean.getUserAge());
                order.setUserFriend(userBean.getUserFriend());
            }
            // Orders without a matching user keep their "NULL"/-1 placeholders
            // instead of crashing the task with a NullPointerException.
            context.write(order, NullWritable.get());
        }
    }
}
提交任务:
/**
 * Job driver for the basic reduce-side join.
 * Input/output paths may be supplied as args[0]/args[1]; the original
 * hard-coded local paths remain the defaults for backward compatibility.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(Join.class);
    job.setMapperClass(JoinMapper.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(1);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(JoinBean.class);
    job.setOutputKeyClass(JoinBean.class);
    job.setOutputValueClass(NullWritable.class);
    String input = args.length > 0 ? args[0] : "F:\\hadoop-2.8.1\\data\\join\\index";
    // Timestamp suffix keeps repeated runs from failing on an existing output dir.
    String output = args.length > 1 ? args[1]
            : "F:\\hadoop-2.8.1\\data\\join\\output" + System.currentTimeMillis();
    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    // Propagate job success/failure to the shell instead of always exiting 0.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
优化实现方式:使用Partitioner + compareTo + GroupingComparator
1.map阶段读取user Bean以及order Bean 此时使用Bean来作为key Null作为value
2.重写Partitioner,使相同userId的Bean分发到同一个reduce
3.重写Bean中compareTo方法,使得同一个reduce中Bean首先按照userId来排序,其次按照tableName来排序
4.重写groupingComparator方法,使得同一个reduce中,相同的userId来作为一组聚合
5.在reduce中,读取第一个元素为user Bean 后面元素为order Bean组装写入context
map阶段
/**
 * Mapper for the optimized join: the JoinBean itself is the map output key
 * (value is NullWritable), so partitioning / sorting / grouping on the shuffle
 * side can drive the join.
 */
public static class Join2Mapper extends Mapper<LongWritable, Text, JoinBean, NullWritable> {
    String fileName = null;
    // Reused across map() calls; context.write() serializes immediately, so reuse is safe.
    JoinBean bean = new JoinBean();

    /**
     * setup() runs once per map task before the repeated map() calls;
     * we use it to capture which input file this split belongs to.
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] cols = value.toString().split(",");
        boolean fromOrderTable = fileName.startsWith("order");
        if (fromOrderTable) {
            // order line: orderId,userId — user fields get placeholders
            bean.set(cols[0], cols[1], "NULL", -1, "NULL", "order");
        } else {
            // user line: userId,name,age,friend — no orderId yet
            bean.set("NULL", cols[0], cols[1], Integer.parseInt(cols[2]), cols[3], "user");
        }
        context.write(bean, NullWritable.get());
    }
}
重写Partitioner
/**
 * Routes every bean to a reduce partition derived from its userId only, so a
 * user record and all of its orders land in the same reducer regardless of the
 * other fields carried by the key.
 */
public class UserIdPartitioner extends Partitioner<JoinBean, NullWritable> {
    @Override
    public int getPartition(JoinBean key, NullWritable value, int numPartitions) {
        // Mask the sign bit rather than Math.abs(): abs(Integer.MIN_VALUE) is still negative.
        int nonNegativeHash = key.getUserId().hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
重写Bean compareTo方法
/**
 * Shuffle sort order for the optimized join: primary key userId ascending;
 * within one userId the tableName comparison is negated so the single "user"
 * record sorts ahead of its "order" records ("user" > "order" lexicographically).
 * Computes the userId comparison once instead of twice.
 */
@Override
public int compareTo(JoinBean o) {
    int byUserId = this.userId.compareTo(o.userId);
    if (byUserId != 0) {
        return byUserId;
    }
    // Descending tableName: the user record first, then its orders.
    return -this.tableName.compareTo(o.tableName);
}
重写GroupingComparator
/**
 * Grouping comparator for the optimized join: despite the class name it groups
 * by userId alone, so all beans sharing a userId (the user record plus its
 * orders) form one reduce group even though compareTo() also sorts by tableName.
 */
public class TableNameGroupingComparator extends WritableComparator {
    public TableNameGroupingComparator() {
        // true: instantiate JoinBean keys so compare() receives deserialized objects.
        super(JoinBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        JoinBean left = (JoinBean) a;
        JoinBean right = (JoinBean) b;
        return left.getUserId().compareTo(right.getUserId());
    }
}
reduce方法
/**
 * Reducer for the optimized join. Thanks to compareTo() the first key in each
 * userId group is the "user" record (when one exists) and the rest are orders.
 *
 * Note the Hadoop iteration quirk: advancing the value iterator also mutates
 * the reused `key` object, so the user's fields must be copied into a separate
 * bean before iterating on.
 *
 * Fixes two defects: the first record was assumed to be a user (an order with
 * no matching user was silently consumed and dropped), and the reflective
 * BeanUtils copy with its swallowed exceptions is replaced by an explicit copy.
 */
public static class Join2Reducer extends Reducer<JoinBean, NullWritable, JoinBean, NullWritable> {
    @Override
    protected void reduce(JoinBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        Iterator<NullWritable> it = values.iterator();
        JoinBean userBean = null;
        if (it.hasNext()) {
            it.next(); // advancing the iterator refreshes `key` with the first record
            if ("user".equals(key.getTableName())) {
                // Deep-copy: `key` is reused by the framework on every next().
                userBean = new JoinBean();
                userBean.set(key.getOrderId(), key.getUserId(), key.getUserName(),
                        key.getUserAge(), key.getUserFriend(), key.getTableName());
            } else {
                // No user record for this userId: emit the order with placeholders
                // instead of dropping it.
                context.write(key, NullWritable.get());
            }
        }
        while (it.hasNext()) {
            it.next(); // `key` now holds the next order record
            if (userBean != null) {
                key.setUserName(userBean.getUserName());
                key.setUserAge(userBean.getUserAge());
                key.setUserFriend(userBean.getUserFriend());
            }
            context.write(key, NullWritable.get());
        }
    }
}
这里要注意Bean的复制拷贝以及key-value迭代机制
/**
 * Job driver for the optimized join (custom Partitioner + sort + grouping).
 * Input/output paths may be supplied as args[0]/args[1]; the original
 * hard-coded local paths remain the defaults for backward compatibility.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(Join2.class);
    job.setMapperClass(Join2Mapper.class);
    job.setReducerClass(Join2Reducer.class);
    job.setNumReduceTasks(1);
    job.setMapOutputKeyClass(JoinBean.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(JoinBean.class);
    job.setOutputValueClass(NullWritable.class);
    // Partition by userId; group by userId; sort order comes from JoinBean.compareTo().
    job.setPartitionerClass(UserIdPartitioner.class);
    job.setGroupingComparatorClass(TableNameGroupingComparator.class);
    String input = args.length > 0 ? args[0] : "F:\\hadoop-2.8.1\\data\\join\\index";
    // Timestamp suffix keeps repeated runs from failing on an existing output dir.
    String output = args.length > 1 ? args[1]
            : "F:\\hadoop-2.8.1\\data\\join\\output" + System.currentTimeMillis();
    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    // Propagate job success/failure to the shell instead of always exiting 0.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
最终结果(注:下列各行末尾的 ",order" 来自 tableName 字段;前文给出的 JoinBean.toString() 并未拼接该字段,若按其实现实际输出应无此后缀——需与运行时使用的 toString() 核对):
order002,u001,18,senge,angelababy,order
order001,u001,18,senge,angelababy,order
order004,u002,48,laozhao,ruhua,order
order005,u003,16,xiaoxu,chunge,order
order006,u004,28,laoyang,zengge,order
order003,u005,14,nana,huangbo,order