Hadoop cluster MapReduce practice (10)

Custom GroupingComparator
1 Requirement
Given the following order data (three fields per line; in the actual input file the fields are comma-separated, matching the split in the mapper below):

order id commodity id transaction amount
Order_0000001 Pdt_01 222.8
Order_0000001 Pdt_05 25.8
Order_0000002 Pdt_03 522.8
Order_0000002 Pdt_04 122.4
Order_0000002 Pdt_05 722
Order_0000003 Pdt_01 222.8

We now need to find, for each order, the transaction with the largest amount.

2 Analysis
1. In the map phase, emit a composite key of "order id + transaction amount" for every record. The key's compareTo sorts by order id and, within an order, by amount in descending order; a custom Partitioner on the order id alone sends all records of one order to the same reducer, largest amount first.

2. On the reduce side, use a GroupingComparator to treat all kv pairs with the same order id as one group; the first key of each group is then the maximum, as illustrated below.
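
For example, with the sample data above, the reducer handling Order_0000002 sees the sorted keys

Order_0000002	722.0
Order_0000002	522.8
Order_0000002	122.4

and because the GroupingComparator compares only the order id, reduce() fires once for this group with the first (largest) key.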

3 Implementation

Define the order information bean

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean>{

	//Note: despite the name, itemid carries the ORDER id (fields[0] of the input line)
	private Text itemid;
	private DoubleWritable amount;

	public OrderBean() {
	}

	public OrderBean(Text itemid, DoubleWritable amount) {
		set(itemid, amount);

	}

	public void set(Text itemid, DoubleWritable amount) {

		this.itemid = itemid;
		this.amount = amount;
	}

	public Text getItemid() {
		return itemid;
	}

	public DoubleWritable getAmount() {
		return amount;
	}

	@Override
	public int compareTo(OrderBean o) {
		//Sort by order id first, then by amount in DESCENDING order
		//(the leading minus flips the natural ascending comparison)
		int cmp = this.itemid.compareTo(o.getItemid());
		if (cmp == 0) {
			cmp = -this.amount.compareTo(o.getAmount());
		}
		return cmp;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(itemid.toString());
		out.writeDouble(amount.get());
		
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		String readUTF = in.readUTF();
		double readDouble = in.readDouble();
		
		this.itemid = new Text(readUTF);
		this.amount= new DoubleWritable(readDouble);
	}

	@Override
	public String toString() {

		return itemid.toString() + "\t" + amount.get();
		
	}
}
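
A quick sanity check of this ordering (a minimal sketch, not part of the job; the class name SortOrderDemo is made up for illustration):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

public class SortOrderDemo {
	public static void main(String[] args) {
		List<OrderBean> beans = new ArrayList<OrderBean>();
		beans.add(new OrderBean(new Text("Order_0000002"), new DoubleWritable(122.4)));
		beans.add(new OrderBean(new Text("Order_0000001"), new DoubleWritable(25.8)));
		beans.add(new OrderBean(new Text("Order_0000002"), new DoubleWritable(722)));
		beans.add(new OrderBean(new Text("Order_0000001"), new DoubleWritable(222.8)));

		Collections.sort(beans);

		//Prints ids ascending, amounts within an id descending:
		//Order_0000001 222.8 / Order_0000001 25.8 / Order_0000002 722.0 / Order_0000002 122.4
		for (OrderBean b : beans) {
			System.out.println(b);
		}
	}
}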

Customize the GroupingComparator so that all beans with the same order id are treated as one key

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Use the GroupingComparator on the reduce side to treat a group of beans as the same key
 * @author [email protected]
 *
 */
public class ItemidGroupingComparator extends WritableComparator {

	//Pass the key's bean class to the superclass, with createInstances=true so the framework creates key instances via reflection
	protected ItemidGroupingComparator() {
		super(OrderBean.class, true);
	}	

	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		OrderBean abean = (OrderBean) a;
		OrderBean bbean = (OrderBean) b;
		
		//Compare only the order id of the two beans; the amount is ignored, so keys from the same order compare as equal
		return abean.getItemid().compareTo(bbean.getItemid());
	}
}
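
A minimal check of what this comparator decides (not part of the job; GroupingDemo is a made-up class that must sit in the same package as the comparator, since its constructor is protected):

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

public class GroupingDemo {
	public static void main(String[] args) {
		ItemidGroupingComparator gc = new ItemidGroupingComparator();

		OrderBean a = new OrderBean(new Text("Order_0000002"), new DoubleWritable(722));
		OrderBean b = new OrderBean(new Text("Order_0000002"), new DoubleWritable(122.4));

		//0 means "same key" to the framework: both records go into one reduce() call
		System.out.println(gc.compare(a, b));
	}
}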

Custom Partitioner: send beans with the same order id to the same partition

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable>{

	@Override
	public int getPartition(OrderBean bean, NullWritable value, int numReduceTasks) {
		//Order beans with the same id will be sent to the same partition
		//Moreover, the number of partitions generated will be consistent with the number of reduce tasks set by the user
		return (bean.getItemid().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
		
	}

}
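
Why the & Integer.MAX_VALUE: a hash code can be negative, and a negative left operand makes Java's % return a negative number, which is an illegal partition index. Masking off the sign bit keeps the result in [0, numReduceTasks). A standalone check (HashDemo is a made-up class for illustration):

public class HashDemo {
	public static void main(String[] args) {
		int h = -1735975357; //a hash code that happens to be negative
		System.out.println(h % 2);                       //-1: illegal partition index
		System.out.println((h & Integer.MAX_VALUE) % 2); //1: valid partition index
	}
}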

Write the MapReduce processing flow

import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 *
 * @author [email protected]
 *
 */
public class SecondarySort {
	
	static class SecondarySortMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{
		
		OrderBean bean = new OrderBean();
		
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String line = value.toString();
			//The input file is comma-separated: order id, commodity id, amount
			String[] fields = StringUtils.split(line, ",");
			
			//fields[0] = order id, fields[2] = amount; the commodity id is not carried in the key
			bean.set(new Text(fields[0]), new DoubleWritable(Double.parseDouble(fields[2])));
			
			context.write(bean, NullWritable.get());
			
		}
		
	}
	
	static class SecondarySortReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable>{
		
		
		//By the time reduce() runs, all beans with the same order id form one group, sorted largest amount first, so writing only the incoming key emits the maximum
		@Override
		protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
			context.write(key, NullWritable.get());
		}
	}
	
	
	public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(SecondarySort.class);
		
		job.setMapperClass(SecondarySortMapper.class);
		job.setReducerClass(SecondarySortReducer.class);
		
		
		job.setOutputKeyClass(OrderBean.class);
		job.setOutputValueClass(NullWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path("c:/wordcount/gpinput"));
		FileOutputFormat.setOutputPath(job, new Path("c:/wordcount/gpoutput"));
		
		//Set the custom Groupingcomparator class here
		job.setGroupingComparatorClass(ItemidGroupingComparator.class);
		//Set the custom partitioner class here
		job.setPartitionerClass(ItemIdPartitioner.class);
		
		job.setNumReduceTasks(2);
		
		job.waitForCompletion(true);
		
	}

}
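
For the sample data above, the job should emit one line per order with its maximum amount (OrderBean carries only the order id and the amount, so the commodity id does not appear; with two reduce tasks the lines are spread across two part files):

Order_0000001	222.8
Order_0000002	722.0
Order_0000003	222.8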
