MapReduce编程之自定义序列化类及自定义排序

数据：

//姓名 收入 支出 时间
zhangsan 6000 0 2016-05-01
lisi 2000 0 2016-05-01
lisi 0 100 2016-05-01
zhangsan 3000 0 2016-05-01
wangwu 9000 0 2016-05-01
wangwu 0 200 2016-05-01
zhangsan 200 400 2016-05-01

需求：

计算每个用户的收入、支出及利润情况，并优先显示利润最大的用户（按利润降序、如果利润相同则按收入降序）

分析：

自定义的 TradeBean 需要实现 WritableComparable 接口。Writable 是 Hadoop 定义的序列化接口：在 Hadoop 中定义一个结构化对象都要实现 Writable 接口，这样该对象才能通过 write() 序列化为字节流，也才能通过 readFields() 从字节流反序列化回结构化对象。WritableComparable 接口则是在 Writable 的基础上增加了比较能力，即“可序列化并且可比较”的接口。MapReduce 中所有 key 的类型都必须实现这个接口：既然是可序列化的，就必须实现 write() 和 readFields() 这两个序列化与反序列化方法；既然是可比较的，就必须实现 compareTo() 方法，比较和排序规则就写在这个方法里。这样，该类型的对象就既可以被序列化，又可以参与比较和排序。

代码：

TradeBean类：

package com.wqs.myWritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Hadoop-serializable record holding one user's name, income, expenditure and profit.
 *
 * <p>Wire format (see {@link #write} / {@link #readFields}): name as modified UTF-8,
 * followed by income, pay and profit as 32-bit ints, in that order.
 *
 * <p>Natural ordering (see {@link #compareTo}): profit descending, ties broken by income
 * descending. The ordering ignores {@code name} and {@code pay}, so two beans that compare
 * as 0 are not necessarily identical records.
 */
public class TradeBean implements WritableComparable<TradeBean>{
	private String name;
	private int income;
	private int pay;
	private int profit;
	
	/**
	 * Creates an empty bean (null name, zero amounts). A no-arg constructor is needed so the
	 * bean can be instantiated first and populated afterwards via setters or {@link #readFields}.
	 */
	public TradeBean() {
		super();
	}

	/**
	 * Creates a fully populated bean.
	 *
	 * @param name   user name
	 * @param income total income
	 * @param pay    total expenditure
	 * @param profit profit; stored as given, it is not derived from income and pay here
	 */
	public TradeBean(String name, int income, int pay, int profit) {
		super();
		this.name = name;
		this.income = income;
		this.pay = pay;
		this.profit = profit;
	}

	/**
	 * Restores all fields from {@code in}, reading them in the order {@link #write} emits them.
	 *
	 * @param in source of the serialized bean
	 * @throws IOException if reading from {@code in} fails
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		name = in.readUTF();
		income = in.readInt();
		pay = in.readInt();
		profit = in.readInt();
	}

	/**
	 * Serializes all fields to {@code out}: name, income, pay, profit.
	 * The name must be non-null when this is called ({@code writeUTF} rejects null).
	 *
	 * @param out sink for the serialized bean
	 * @throws IOException if writing to {@code out} fails
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		out.writeInt(income);
		out.writeInt(pay);
		out.writeInt(profit);
	}

	/**
	 * Orders beans by profit descending, then by income descending.
	 *
	 * @param tradeBean the bean to compare against
	 * @return a negative value if this bean sorts before {@code tradeBean}, a positive value
	 *         if it sorts after, and 0 if profit and income are both equal
	 */
	@Override
	public int compareTo(TradeBean tradeBean) {
		// Arguments are swapped (other first) to obtain descending order.
		int byProfit = Integer.compare(tradeBean.getProfit(), this.profit);
		if (byProfit != 0) {
			return byProfit;
		}
		// Equal profit: the higher income sorts first. A lower income must yield a positive
		// result, otherwise the comparison is asymmetric and sorting becomes unreliable.
		return Integer.compare(tradeBean.getIncome(), this.income);
	}
	
	/** Returns the fields separated by single spaces: {@code name income pay profit}. */
	@Override
	public String toString() {
		return name + " " + income + " " + pay + " " + profit;
	}

	/** @return the user name */
	public String getName() {
		return name;
	}

	/** @param name the user name */
	public void setName(String name) {
		this.name = name;
	}

	/** @return the income amount */
	public int getIncome() {
		return income;
	}

	/** @param income the income amount */
	public void setIncome(int income) {
		this.income = income;
	}

	/** @return the expenditure amount */
	public int getPay() {
		return pay;
	}

	/** @param pay the expenditure amount */
	public void setPay(int pay) {
		this.pay = pay;
	}

	/** @return the profit amount */
	public int getProfit() {
		return profit;
	}

	/** @param profit the profit amount */
	public void setProfit(int profit) {
		this.profit = profit;
	}

}

Map类：

package com.wqs.myWritableComparable;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that parses one trade record per input line and emits it keyed by user name.
 *
 * <p>Expected line layout (whitespace separated): {@code name income pay [further fields]}.
 * Only the first three fields are used. Output key is the name; output value is a
 * {@link TradeBean} carrying name, income and pay, with profit set to 0.
 *
 * <p>Blank lines are skipped. Lines with fewer than three fields or with non-numeric
 * income/pay are skipped and counted under the {@code TradeBean/MALFORMED_RECORDS} counter
 * instead of failing the whole task.
 */
public class Map extends Mapper<Object, Text, Text, TradeBean>{
	private static final String COUNTER_GROUP = "TradeBean";
	private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

	// Output objects are reused across map() calls rather than allocated per record.
	private TradeBean bean = new TradeBean();
	private Text name = new Text();

	/**
	 * Parses a single input line and writes {@code (name, bean)} to the context.
	 *
	 * @param key     input key supplied by the input format; not used
	 * @param value   one line of input text
	 * @param context job context used to emit output and update counters
	 * @throws IOException          if writing the output fails
	 * @throws InterruptedException if writing the output is interrupted
	 */
	@Override
	protected void map(Object key, Text value, Context context) throws IOException, InterruptedException{
		String line = value.toString().trim();
		if (line.isEmpty()) {
			// e.g. a trailing newline at the end of the input file
			return;
		}

		// Split on runs of whitespace so repeated spaces or tabs do not create empty fields.
		String[] fields = line.split("\\s+");
		if (fields.length < 3) {
			context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
			return;
		}

		int income;
		int pay;
		try {
			income = Integer.parseInt(fields[1]);
			pay = Integer.parseInt(fields[2]);
		} catch (NumberFormatException e) {
			// Non-numeric amount (for example a header line): record it and move on.
			context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
			return;
		}

		name.set(fields[0]);
		bean.setName(fields[0]);
		bean.setIncome(income);
		bean.setPay(pay);
		// Profit is computed later, once all of a user's records have been summed.
		bean.setProfit(0);
		context.write(name, bean);
	}
}

Reduce类：

package com.wqs.myWritableComparable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer that sums income and expenditure per user, derives the profit, and emits the
 * per-user totals sorted by {@link TradeBean}'s natural ordering once all groups are processed.
 */
public class Reduce extends Reducer<Text, TradeBean, TradeBean, NullWritable>{
	// One summary bean per key seen by this reducer instance; written out in cleanup().
	ArrayList<TradeBean> tradeBeans = new ArrayList<>();

	/**
	 * Aggregates all records of one user into a single summary bean and buffers it.
	 * Nothing is written to the context here.
	 *
	 * @param k2      the user name (map output key)
	 * @param vs2     every record emitted for that user
	 * @param context job context; unused in this method
	 */
	@Override
	protected void reduce(Text k2, Iterable<TradeBean> vs2, Context context) 
			throws IOException, InterruptedException {
		int totalIncome = 0;
		int totalPay = 0;
		for (TradeBean record : vs2) {
			totalIncome += record.getIncome();
			totalPay += record.getPay();
		}

		TradeBean summary = new TradeBean();
		summary.setName(k2.toString());
		summary.setIncome(totalIncome);
		summary.setPay(totalPay);
		summary.setProfit(totalIncome - totalPay);
		tradeBeans.add(summary);
	}
	
	/**
	 * Sorts the buffered summaries and writes them out.
	 *
	 * <p>The map phase keys records by user name so that each user's records can be summed
	 * together, which means the groups reach this reducer ordered by name rather than by
	 * profit as the requirement asks. Sorting the collected results here, after every
	 * reduce() call has run, produces the required order.
	 *
	 * <p>NOTE(review): this orders only the results held by this reducer instance; the
	 * overall output is globally ordered only if the job runs a single reduce task — confirm
	 * against the job configuration.
	 *
	 * @param context job context used to emit the sorted summaries
	 * @throws IOException          if writing the output fails
	 * @throws InterruptedException if writing the output is interrupted
	 */
	@Override
	protected void cleanup(Context context) throws IOException, InterruptedException {
		Collections.sort(tradeBeans);
		NullWritable nothing = NullWritable.get();
		for (int i = 0; i < tradeBeans.size(); i++) {
			context.write(tradeBeans.get(i), nothing);
		}
	}
}

Main：

package com.wqs.myWritableComparable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Job driver: wires {@link Map} and {@link Reduce} together and submits the job.
 *
 * <p>Usage: {@code Main [generic options] [<in> <out>]}. Both paths are resolved against
 * {@link #HDFS_URI}. When no paths are given, {@link #DEFAULT_PATHS} are used.
 */
public class Main {

    /** Local Hadoop installation directory, exported as the {@code hadoop.home.dir} system property. */
    private static final String HADOOP_HOME = "E:/hadoop-2.7.7";

    /** File system URI that the input and output paths are appended to. */
    private static final String HDFS_URI = "hdfs://192.168.222.128:9000";

    /** Input and output paths used when none are supplied on the command line. */
    private static final String[] DEFAULT_PATHS = { "/demo03/in/", "/demo03/out" };

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     * Exits with 2 if the number of path arguments is neither zero nor two.
     *
     * @param args optional Hadoop generic options followed by the input and output paths
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        System.setProperty("hadoop.home.dir", HADOOP_HOME);

        // Generic options (-D, -fs, ...) are applied to conf; what remains are our own arguments.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length == 0) {
            otherArgs = DEFAULT_PATHS.clone();
        }
        if (otherArgs.length != 2) {
            System.err.println("Usage: Main [generic options] <in> <out>");
            System.exit(2);
        }

        // Pass conf along so that the parsed generic options actually reach the job.
        Job job = Job.getInstance(conf);
        job.setJarByClass(Main.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TradeBean.class);
        job.setOutputKeyClass(TradeBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Reduce sorts its results in cleanup(); a single reduce task keeps that order global.
        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(HDFS_URI + otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(HDFS_URI + otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

MapReduce编程之自定义序列化类及自定义排序

猜你喜欢