Hadoop--MapReduce7--Custom GroupingComparator

On the reduce side, records whose keys compare as equal are gathered into one group and passed to a single reduce() call; the GroupingComparator is the rule that decides which keys count as equal for this grouping.

Given the following data, where each line gives the left and right endpoints of one segment:

1,4
2,5
3,4
2,6
4,7
5,8
5,9
6,10
10,15
11,16
12,18
13,17

For each point, find how many segments overlap it (its overlap depth).

For example, point 1 has depth 1, covered only by (1,4); point 2 has depth 3, covered by (1,4), (2,5), and (2,6).

map: for each input line, parse the left and right endpoints and emit every integer point between them (inclusive) as the key, with value = 1.

reduce: values for the same key (point) are aggregated; summing them gives the final depth.
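For example, the input line "2,5" makes the mapper emit four (point, 1) pairs, which the shuffle then merges with the pairs produced from all the other segments:

(2,1)  (3,1)  (4,1)  (5,1)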

The code is as follows:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {
	
	public static class LineMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] line = value.toString().split(",");
			// emit every integer point covered by the segment [left, right]
			for(int i=Integer.parseInt(line[0]); i<=Integer.parseInt(line[1]); i++){
				context.write(new IntWritable(i), new IntWritable(1));
			}
		}
	} 
	
	public static class LineReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
		@Override
		protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
			int count = 0;
			Iterator<IntWritable> it = values.iterator();
			while(it.hasNext()){
				count += it.next().get();
			}
			context.write(key, new IntWritable(count));
		}
	}
	
    public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);
		job.setJarByClass(JobSubmitter.class);
		job.setMapperClass(LineMapper.class);
		job.setReducerClass(LineReducer.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path("F:\\hadoop-2.8.1\\data\\line\\input"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop-2.8.1\\data\\line\\output"));
		
		job.setNumReduceTasks(1);
		
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
		
	}
	
}
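Since the reduce logic is a plain associative sum, the same class could also be registered as a combiner to pre-aggregate on the map side (an optional optimization, not part of the original job):

job.setCombinerClass(LineReducer.class);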

Find the point(s) with the highest overlap count, together with that count.

Map phase: read each line (the output of the previous job) and wrap it in a bean (point, count) as the key, with value = null. The bean implements the WritableComparable interface, which fixes the sort order of the keys.

Override the grouping comparator (GroupingComparator) to define how the keys arriving at one reduce task are split into groups.

Reduce phase: all keys are gathered into a single reduce task; because they arrive already sorted by count (descending) and grouped by the comparator, the points with the highest overlap come out first.
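To see where the grouping comparator fits, here is a conceptual sketch (illustrative only, not actual Hadoop framework code) of what one reduce task does with its sorted key stream:

// Conceptual sketch only. Keys in a partition arrive sorted by the sort
// comparator (the key's compareTo by default); the framework scans them in
// order and starts a new reduce() call whenever the grouping comparator
// reports that two adjacent keys differ.
KEY prev = null;
for (KEY k : sortedKeys) {
    if (prev == null || groupingComparator.compare(prev, k) != 0) {
        // group boundary: finish the current group, begin a new reduce(k, values)
    }
    prev = k;
}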

Wrapping the bean; the key implementation points are as follows:

1. Implement the WritableComparable interface: the write, readFields, and compareTo methods.

2. A no-argument constructor.

3. The toString() method, which determines the format of the results written to the HDFS output file.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class LineBean implements WritableComparable<LineBean>{
	
	private int point;
	private int count;
	
	public LineBean(){}
	
	public LineBean(int point, int count) {
		this.point = point;
		this.count = count;
	}

	public int getPoint() {
		return point;
	}

	public void setPoint(int point) {
		this.point = point;
	}

	public int getCount() {
		return count;
	}

	public void setCount(int count) {
		this.count = count;
	}
	

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(this.point);
		out.writeInt(this.count);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.point = in.readInt();
		this.count = in.readInt();
	}

	// Sort rule: descending by count. Returning 0 for equal counts also means
	// the grouping comparator below treats equal-count beans as one group.
	@Override
	public int compareTo(LineBean o2) {
		return Integer.compare(o2.count, this.count);
	}

	@Override
	public String toString() {
		StringBuilder builder = new StringBuilder();
		builder.append("LineBean [point=");
		builder.append(point);
		builder.append(", count=");
		builder.append(count);
		builder.append("]");
		return builder.toString();
	}
	
}

Implementing the grouping comparator (it simply delegates to LineBean.compareTo, so beans with equal counts fall into one group):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class LineGroupComparator extends WritableComparator{
	
	public LineGroupComparator(){
		// true: have WritableComparator create LineBean instances, so its default
		// byte-level compare can deserialize keys and delegate to compare() below
		super(LineBean.class, true);
	}
	
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		LineBean o1 = (LineBean) a;
		LineBean o2 = (LineBean) b;
		return o1.compareTo(o2);
	}

}

The job code is as follows:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter2 {
	
	public static class LineMapper extends Mapper<LongWritable, Text, LineBean, NullWritable> {
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// the first job's output lines look like: point \t count
			String[] line = value.toString().split("\t");
			context.write(new LineBean(Integer.parseInt(line[0]), Integer.parseInt(line[1])), NullWritable.get());
		}
	} 
	
	public static class LineReducer extends Reducer<LineBean, NullWritable, LineBean, NullWritable> {
		@Override
		protected void reduce(LineBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
			// each step of the values iterator deserializes the next record into
			// the same key object, so writing key inside the loop emits every
			// point of the group, not just the first one
			for(NullWritable v : values){
				context.write(key, v);
			}
		}
	}
	
    public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);
		job.setJarByClass(JobSubmitter2.class);
		job.setMapperClass(LineMapper.class);
		job.setReducerClass(LineReducer.class);
		job.setMapOutputKeyClass(LineBean.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(LineBean.class);
		job.setOutputValueClass(NullWritable.class);
		
		job.setGroupingComparatorClass(LineGroupComparator.class);
		
		// the input is the output directory of the first job
		FileInputFormat.setInputPaths(job, new Path("F:\\hadoop-2.8.1\\data\\line\\output"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop-2.8.1\\data\\line\\output"+System.currentTimeMillis()));
		
		job.setNumReduceTasks(1);
		
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
		
	}
	
}

Final result (in descending order of count; the format is controlled through the bean's toString() method):

LineBean [point=6, count=5]
LineBean [point=4, count=5]
LineBean [point=5, count=5]
LineBean [point=3, count=4]
LineBean [point=15, count=4]
LineBean [point=7, count=4]
LineBean [point=14, count=4]
LineBean [point=13, count=4]
LineBean [point=2, count=3]
LineBean [point=16, count=3]
LineBean [point=12, count=3]
LineBean [point=8, count=3]
LineBean [point=9, count=2]
LineBean [point=11, count=2]
LineBean [point=17, count=2]
LineBean [point=10, count=2]
LineBean [point=18, count=1]
LineBean [point=1, count=1]

Computing top-N per group

Given the following data, output the N records with the largest transaction amount within each order:

order001,u001,小米6,1999.9,2
order001,u001,雀巢咖啡,99.0,2
order001,u001,安慕希,250.0,2
order001,u001,经典红双喜,200.0,4
order001,u001,防水电脑包,400.0,2
order002,u002,小米手环,199.0,3
order002,u002,榴莲,15.0,10
order002,u002,苹果,4.5,20
order002,u002,肥皂,10.0,40

map: read each line, wrap it in a bean as the key, with value = null.

A custom Partitioner sends beans with the same orderId to the same reduce task.

A custom grouping comparator puts beans with the same orderId into one group; within a group the records are already ordered by transaction amount, because the bean's compareTo sorts by amount within each orderId.

reduce: each group (one orderId) arrives at one reduce() call with its records sorted by amount, so the reducer just writes out the first N of the group; the expected output for the sample data is sketched below.
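With N = 3, as hard-coded in the reducer below, the per-record amounts work out to 3999.8, 198.0, 500.0, 800.0, 800.0 for order001 and 597.0, 150.0, 90.0, 400.0 for order002, so the job should emit roughly the following (the formatting comes from OrderBean.toString(); the two 800.0 records of order001 may appear in either order):

order001,u001,小米6,1999.9,2,3999.8
order001,u001,经典红双喜,200.0,4,800.0
order001,u001,防水电脑包,400.0,2,800.0
order002,u002,小米手环,199.0,3,597.0
order002,u002,肥皂,10.0,40,400.0
order002,u002,榴莲,15.0,10,150.0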

The bean:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean>{

	private String orderId;
	private String userId;
	private String pdtName;
	private float price;
	private int number;
	private float amountFee;

	public void set(String orderId, String userId, String pdtName, float price, int number) {
		this.orderId = orderId;
		this.userId = userId;
		this.pdtName = pdtName;
		this.price = price;
		this.number = number;
		this.amountFee = price * number;
	}

	public String getOrderId() {
		return orderId;
	}

	public void setOrderId(String orderId) {
		this.orderId = orderId;
	}

	public String getUserId() {
		return userId;
	}

	public void setUserId(String userId) {
		this.userId = userId;
	}

	public String getPdtName() {
		return pdtName;
	}

	public void setPdtName(String pdtName) {
		this.pdtName = pdtName;
	}

	public float getPrice() {
		return price;
	}

	public void setPrice(float price) {
		this.price = price;
	}

	public int getNumber() {
		return number;
	}

	public void setNumber(int number) {
		this.number = number;
	}

	public float getAmountFee() {
		return amountFee;
	}

	public void setAmountFee(float amountFee) {
		this.amountFee = amountFee;
	}

	@Override
	public String toString() {

		return this.orderId + "," + this.userId + "," + this.pdtName + "," + this.price + "," + this.number + ","
				+ this.amountFee;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.orderId);
		out.writeUTF(this.userId);
		out.writeUTF(this.pdtName);
		out.writeFloat(this.price);
		out.writeInt(this.number);

	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.orderId = in.readUTF();
		this.userId = in.readUTF();
		this.pdtName = in.readUTF();
		this.price = in.readFloat();
		this.number = in.readInt();
		this.amountFee = this.price * this.number;
	}

	// Sort rule: first by orderId, so that records of the same order are
	// adjacent in the sorted stream (the grouping comparator can only cut
	// groups between adjacent keys), then by total amount in descending order.
	@Override
	public int compareTo(OrderBean o) {
		int oc = this.orderId.compareTo(o.getOrderId());
		if (oc != 0)
			return oc;
		return Float.compare(o.getAmountFee(), this.getAmountFee());
	}
}

The custom Partitioner:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderIdPartitioner extends Partitioner<OrderBean, NullWritable>{
	@Override
	public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
		// dispatch by orderId; the Integer.MAX_VALUE mask clears the sign bit so
		// a negative hashCode cannot yield a negative partition index
		return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}

The custom grouping comparator (two keys belong to one group when their orderId fields are equal):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderIdGroupingComparator extends WritableComparator{
	
	public OrderIdGroupingComparator() {
		// true: have WritableComparator instantiate OrderBeans so its default
		// byte-level compare can deserialize keys and delegate to compare() below
		super(OrderBean.class, true);
	}
	
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		OrderBean o1 = (OrderBean) a;
		OrderBean o2 = (OrderBean) b;
		return o1.getOrderId().compareTo(o2.getOrderId());
	}
}

The MapReduce job:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderTopn {

	public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{
		OrderBean orderBean = new OrderBean();
		NullWritable v = NullWritable.get();
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, OrderBean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			String[] fields = value.toString().split(",");
			orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
			context.write(orderBean, v);
		}	
	}
	
	
	public static class OrderTopnReducer extends Reducer< OrderBean, NullWritable,  OrderBean, NullWritable>{
		
		/**
		 * reduce() receives a single key object, but every step of the values
		 * iterator deserializes the next record into that same object, so the
		 * key's contents change as the iteration advances
		 */
		@Override
		protected void reduce(OrderBean key, Iterable<NullWritable> values,
				Reducer<OrderBean, NullWritable, OrderBean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			int i=0;
			for (NullWritable v : values) {
				context.write(key, v);
				if(++i==3) return;   // top N, with N = 3 hard-coded here
			}	
		}			
	}
	
	public static void main(String[] args) throws Exception {		
		Configuration conf = new Configuration(); 
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(OrderTopn.class);
		job.setMapperClass(OrderTopnMapper.class);
		job.setReducerClass(OrderTopnReducer.class);
		
		job.setPartitionerClass(OrderIdPartitioner.class);
		job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
		
		job.setNumReduceTasks(4);

		job.setMapOutputKeyClass(OrderBean.class);
		job.setMapOutputValueClass(NullWritable.class);
		
		job.setOutputKeyClass(OrderBean.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path("F:\\hadoop-2.8.1\\data\\order\\input"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop-2.8.1\\data\\order\\output" + System.currentTimeMillis()));

		job.waitForCompletion(true);
	}
	
}
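N is hard-coded as 3 in OrderTopnReducer. A common refinement (a sketch, not part of the original code; the property name order.top.n is made up here) is to pass N through the Configuration:

// In the driver, before submitting the job:
conf.setInt("order.top.n", 3);

// In OrderTopnReducer, read it once per task in setup():
private int n;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
	n = context.getConfiguration().getInt("order.top.n", 3);  // default 3
}

// ...then in reduce(), replace the literal:  if(++i==n) return;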

Specifying the grouping comparator

job.setGroupingComparatorClass(OrderIdGroupingComparator.class);

The class passed in must extend WritableComparator; which of its compare methods to override is up to you (excerpt from the Hadoop WritableComparator source):

/** Optimization hook.  Override this to make SequenceFile.Sorter's scream.
 *
 * <p>The default implementation reads the data into two {@link
 * WritableComparable}s (using {@link
 * Writable#readFields(DataInput)}, then calls {@link
 * #compare(WritableComparable,WritableComparable)}.
 */
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { /* ... */ }

/** Compare two WritableComparables.
 *
 * <p> The default implementation uses the natural ordering, calling {@link
 * Comparable#compareTo(Object)}. */
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
  return a.compareTo(b);
}

@Override
public int compare(Object a, Object b) {
  return compare((WritableComparable)a, (WritableComparable)b);
}
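The byte-level compare(byte[], ...) is the performance hook: overriding it lets the framework compare serialized keys without deserializing them at all. As an illustration (a hedged sketch; LineBeanRawComparator is a made-up name, not in the original post), LineBean serializes two 4-byte ints, point then count, so a raw comparator can read the counts straight out of the byte buffers:

import org.apache.hadoop.io.WritableComparator;

public class LineBeanRawComparator extends WritableComparator {

	public LineBeanRawComparator() {
		super(LineBean.class);
	}

	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		// LineBean.write() emits point (4 bytes) then count (4 bytes), so the
		// count of each serialized key starts 4 bytes past the key's offset
		int count1 = readInt(b1, s1 + 4);
		int count2 = readInt(b2, s2 + 4);
		return Integer.compare(count2, count1);  // descending by count, like compareTo
	}
}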

Reposted from blog.csdn.net/u014106644/article/details/88310053