Reduce端join算法实现 - （订单跟商品）

代码地址：
https://gitee.com/tanghongping/hadoopMapReduce/tree/master/src/com/thp/bigdata/rjon

现在有两张表 1.订单表 2.商品表
订单数据表t_order：

id	date	pid	amount
1001	20150710	P0001	2
1002	20150710	P0001	3
1002	20150710	P0002	3
1003	20150710	P0003	2

商品信息表t_product:

id	pname	category_id	price
P0001	小米5	1000	2
P0002	锤子T1	1000	3
P0003	华为	1001	5

假如数据量巨大，两张表的数据是以文件的形式存储在HDFS中，需要用mapreduce程序来实现如下SQL查询运算：

select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id

实现机制：

通过将关联条件作为map输出的key，把两张表中满足join条件的数据连同其来源文件的信息一起发往同一个reduce task，在reduce中进行数据的串联。
首先有两个文件：
订单文件
order.txt

1001	20150710	P0001	2
1002	20150710	P0001	3
1002	20150710	P0002	3
1003	20150710	P0003	2

商品信息文件：
product.txt

P0001	小米5	1000	2
P0002	锤子T1	1000	3

首先我们应该构建一个实体类，这个实体类里面要有订单表和商品表中的所有的字段。
由于map阶段我们是需要读取两个文件的，一个是订单文件，一个是商品文件，但是我们使用的是同一个实体类。所以我们应该有一个属性用来标识哪个是从订单中读取到的数据，哪个是从商品表中读取到的数据。

因为我们在reduce阶段需要将两张表的数据进行合并，两张表的关系是一对多的关系，所以我们在设置属性进行数据封装的时候，是需要知道哪个是一哪个是多。
所以标识的字段必须要有。

InfoBean

package com.thp.bigdata.rjon;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Value object shared by both sides of the reduce-side join.
 *
 * <p>A single bean type carries the columns of the order file and of the
 * product file; {@code flag} records which file a given instance was read
 * from (0 = order record, 1 = product record). Columns that do not belong to
 * the source file are left at their defaults.
 *
 * <p>String fields default to the empty string, and {@link #write(DataOutput)}
 * serializes a {@code null} string as the empty string, because
 * {@code DataOutput.writeUTF} throws {@code NullPointerException} on
 * {@code null}. A bean that was only partially populated through the setters
 * can therefore still be serialized.
 *
 * @author tommy
 */
public class InfoBean implements Writable{

	// Columns of the order file
	private int order_id;              // order id
	private String dateString = "";    // order date
	private String pid = "";           // product id (the join key)
	private int amount;                // ordered quantity
	
	
	// Columns of the product file
	private String pname = "";         // product name
	private int category_id;           // product category
	private float price;               // product price
	
	// flag = 0  the bean holds an order record
	// flag = 1  the bean holds a product record
	private int flag;
	
	
	
	/** No-arg constructor; the class is populated through {@link #readFields(DataInput)} or the setters. */
	public InfoBean() {}
	
	/**
	 * Assigns every field in one call so that a single instance can be reused.
	 *
	 * @param order_id    order id
	 * @param dateString  order date
	 * @param pid         product id
	 * @param amount      ordered quantity
	 * @param pname       product name
	 * @param category_id product category
	 * @param price       product price
	 * @param flag        0 for an order record, 1 for a product record
	 */
	public void set(int order_id, String dateString, String pid, int amount, String pname, int category_id, float price, int flag) {
		this.order_id = order_id;
		this.dateString = dateString;
		this.pid = pid;
		this.amount = amount;
		this.pname = pname;
		this.category_id = category_id;
		this.price = price;
		this.flag = flag;
	}

	public int getOrder_id() {
		return order_id;
	}

	public void setOrder_id(int order_id) {
		this.order_id = order_id;
	}

	public String getDate() {
		return dateString;
	}

	public void setDate(String dateString) {
		this.dateString = dateString;
	}

	public String getPid() {
		return pid;
	}

	public void setPid(String pid) {
		this.pid = pid;
	}

	public int getAmount() {
		return amount;
	}

	public void setAmount(int amount) {
		this.amount = amount;
	}

	public String getPname() {
		return pname;
	}

	public void setPname(String pname) {
		this.pname = pname;
	}

	public int getCategory_id() {
		return category_id;
	}

	public void setCategory_id(int category_id) {
		this.category_id = category_id;
	}

	public float getPrice() {
		return price;
	}

	public void setPrice(float price) {
		this.price = price;
	}
	
	public int getFlag() {
		return flag;
	}

	public void setFlag(int flag) {
		this.flag = flag;
	}

	/** Renders the joined record; {@code flag} is intentionally not part of the output. */
	@Override
	public String toString() {
		return "InfoBean [order_id=" + order_id + ", date=" + dateString + ", pid=" + pid + ", amount=" + amount + ", pname="
				+ pname + ", category_id=" + category_id + ", price=" + price + "]";
	}
	
	
	/**
	 * Serializes all fields in a fixed order; {@link #readFields(DataInput)}
	 * must read them back in exactly the same order.
	 *
	 * @param out sink to write to
	 * @throws IOException if the underlying stream fails
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(order_id);
		out.writeUTF(nullToEmpty(dateString));
		out.writeUTF(nullToEmpty(pid));
		out.writeInt(amount);
		out.writeUTF(nullToEmpty(pname));
		out.writeInt(category_id);
		out.writeFloat(price);
		out.writeInt(flag);
	}

	/**
	 * Restores all fields, in the order written by {@link #write(DataOutput)}.
	 *
	 * @param in source to read from
	 * @throws IOException if the underlying stream fails
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.order_id = in.readInt();
		this.dateString = in.readUTF();
		this.pid = in.readUTF();
		this.amount = in.readInt();
		this.pname = in.readUTF();
		this.category_id = in.readInt();
		this.price = in.readFloat();
		this.flag = in.readInt();
	}

	/** Maps {@code null} to the empty string so that {@code writeUTF} never sees {@code null}. */
	private static String nullToEmpty(String value) {
		return value == null ? "" : value;
	}

}

Map阶段：

Map阶段要做的事就是从这两个文件中将数据拿出来，然后使用实体类 InfoBean进行封装数据，封装完数据之后，需要将数据发送给reduce阶段进行处理。由于两张表中的数据是通过商品id 进行关联的。所以context在往外写数据的时候需要将pid (商品id) 作为key输出，这样，reduce阶段接收到的bean，都是相同pid的数据。说明是相同商品的数据。
就相当于我们写SQL中的查询条件： on a.pid = b.id

由于我们是从两个文件中读取数据，两个文件的数据格式是不一样的，所以我们应该根据文件名字进行数据的封装。
封装时，只对这个文件中有的属性进行封装，没有的属性则使用默认值代替。

   /**
    * Map side of the reduce-side join.
    *
    * <p>Reads lines from the order file and the product file, wraps each
    * record in an {@link InfoBean} tagged with its origin (flag 0 = order,
    * flag 1 = product) and emits it keyed by product id, so that all records
    * sharing a product id reach the same reduce call.
    *
    * <p>Record format: lines containing a comma are split on commas; other
    * lines are split on tabs, or on runs of whitespace when no tab is present.
    * Blank lines are skipped. Lines with fewer than four fields are skipped
    * and counted under the counter {@code RJoin / MALFORMED_RECORDS}.
    */
   static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
		
		// Created once and reused: map() is invoked for every input line, and
		// the objects are only needed as carriers for the values being written.
		InfoBean bean = new InfoBean();
		Text outKey = new Text();  // the output key must be the column that links the two tables

		// True when the split handled by this mapper belongs to an order file.
		private boolean orderFile;

		/**
		 * Determines once per task which table the input split belongs to,
		 * based on the file name (files starting with "order" are order data,
		 * everything else is treated as product data).
		 */
		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			FileSplit fileSplit = (FileSplit) context.getInputSplit();
			String fileName = fileSplit.getPath().getName();
			orderFile = fileName.startsWith("order");
		}

		/**
		 * Parses one line and emits {@code (pid, bean)}.
		 *
		 * @param key     position of the line in the input (unused)
		 * @param value   the raw line
		 * @param context task context used to emit the record
		 * @throws NumberFormatException if a numeric column cannot be parsed
		 */
		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {
			String line = value.toString().trim();
			if(line.isEmpty()) {
				return;
			}

			// Comma keeps priority so comma-separated input behaves as before.
			String delimiter;
			if(line.indexOf(',') >= 0) {
				delimiter = ",";
			} else if(line.indexOf('\t') >= 0) {
				delimiter = "\t";
			} else {
				delimiter = "\\s+";
			}
			String[] fields = line.split(delimiter);
			for(int i = 0; i < fields.length; i++) {
				fields[i] = fields[i].trim();
			}
			if(fields.length < 4) {
				// Both tables have four columns; make bad records visible instead of crashing on an index error.
				context.getCounter("RJoin", "MALFORMED_RECORDS").increment(1);
				return;
			}
			
			String pid; // the join key: records are distributed to reducers by this value
			if(orderFile) { // record from the order table
				pid = fields[2];
			    bean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]), "", 0, 0, 0);
			} else { // record from the product table
				pid = fields[0];
				bean.set(0, "", pid, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), 1);
			}
			outKey.set(pid);
			context.write(outKey, bean);
		}
	}

封装好bean并输出之后，所有商品id (pid) 相同的数据都会进入同一个reduce，这样就可以在reduce阶段进行数据的合并。

Reduce 阶段：

订单表是多的一方，商品表是一的一方

需要注意一点的是，我们进行bean的复制的时候，需要使用工具类，或者自己一个一个手动set对应的值，不能直接使用 = 进行赋值。因为bean是引用类型，而且Hadoop在遍历values时会复用同一个对象，直接赋值的话，保存下来的所有引用最终都会指向最后一条数据。

   // reduce 阶段现在我们只需要将Bean输出打印就可以了
	/**
	 * Reduce side of the join: receives every record that shares one product
	 * id, separates the single product record from the order records, fills
	 * the product columns into each order and emits the joined beans.
	 *
	 * <p>Implements inner-join semantics: if no product record arrives for a
	 * product id, its orders are not emitted and are counted under the counter
	 * {@code RJoin / ORDERS_WITHOUT_PRODUCT}.
	 */
	static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
		/**
		 * Joins the orders of one product with that product's attributes.
		 *
		 * @param pid     the product id shared by all incoming beans
		 * @param beans   order records (flag 0) and the product record (flag 1)
		 * @param context task context used to emit the joined records
		 * @throws IOException if a bean cannot be copied or the output fails
		 */
		@Override
		protected void reduce(Text pid, Iterable<InfoBean> beans,Context context) throws IOException, InterruptedException {
			// The product record for this pid; stays null when the product file has no such id.
			InfoBean pdBean = null;
			// One product may have many orders, so they are buffered in a list.
			ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();
			for(InfoBean bean : beans) {
				// The framework reuses the value object while iterating, so a
				// plain reference assignment would alias every entry to the
				// last record; each bean has to be copied.
				InfoBean copy = copyOf(bean);
				if(bean.getFlag() == 1) {  // record from the product file (one per product)
					pdBean = copy;
				} else {  // record from the order file
					orderBeans.add(copy);
				}
			}
			
			if(pdBean == null) {
				// Inner join: orders without a matching product produce no output.
				context.getCounter("RJoin", "ORDERS_WITHOUT_PRODUCT").increment(orderBeans.size());
				return;
			}
			
			// Stitch the product columns onto every order and write it out.
			for(InfoBean bean: orderBeans) {
				bean.setPname(pdBean.getPname());
				bean.setCategory_id(pdBean.getCategory_id());
				bean.setPrice(pdBean.getPrice());
				context.write(bean, NullWritable.get());
			}
			
		}

		/**
		 * Returns an independent copy of {@code source}.
		 *
		 * @throws IOException if the property copy fails, so that the task
		 *         fails visibly instead of emitting half-empty records
		 */
		private static InfoBean copyOf(InfoBean source) throws IOException {
			InfoBean target = new InfoBean();
			try {
				BeanUtils.copyProperties(target, source);
			} catch (IllegalAccessException | InvocationTargetException e) {
				throw new IOException("Failed to copy InfoBean for pid " + source.getPid(), e);
			}
			return target;
		}
	}

Job启动（驱动程序）：

/**
 * Configures and submits the reduce-side join job, then exits with 0 on
 * success and 1 on failure.
 *
 * @param args generic Hadoop options followed by {@code <in> <out>}; the
 *             output directory is deleted first if it already exists
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		
		// Strip generic Hadoop options (-D, -fs, ...) first; only the remaining
		// arguments are the input and output paths.
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if(otherArgs.length != 2) {
			System.err.println("Usage : rjoin <in> <out>");
			System.exit(2);
		}
		
		// Echo the paths only after the argument count has been validated.
		System.err.println(otherArgs[0]);
		System.err.println(otherArgs[1]);
		
		Job job = Job.getInstance(conf);
		
		
		// NOTE(review): hard-coded local jar path; it only works on a machine
		// where this file exists — consider making it configurable.
		job.setJar("f:/rjoin.jar");
		
		// Mapper / Reducer classes used by this job
		job.setMapperClass(RJoinMapper.class);
		job.setReducerClass(RJoinReducer.class);
		
		// Key/value types emitted by the mapper
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(InfoBean.class);
		
		// Key/value types of the final output
		job.setOutputKeyClass(InfoBean.class);
		job.setOutputValueClass(NullWritable.class);
		
		
		
		// Remove a pre-existing output directory, otherwise the job refuses to start.
		Path outputPath = new Path(otherArgs[1]);
		FileSystem fileSystem = outputPath.getFileSystem(conf);
		if(fileSystem.exists(outputPath)) {
			fileSystem.delete(outputPath, true);  // true = recursive, delete even if the directory is not empty
		}
		
		
		// Directory holding the raw input files of the job
		FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
		
		// Directory receiving the job output
		FileOutputFormat.setOutputPath(job, outputPath);
		
		
		// Submit the job configuration and jar to the cluster and block until it finishes.
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}

这个是可以直接运行在hadoop集群上的。因为我之前的项目复制了hadoop的一些配置文件。