Custom sorting in MapReduce

Copyright notice: 数据丁, https://blog.csdn.net/reasery/article/details/82875815

The bean class

package mrpro927;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

/*
 * When a custom class is used as the object transferred between MapReduce stages,
 * it must be serializable, i.e. implement the WritableComparable interface.
 * The generic type parameter is the type of the map output key.
 * All the fields we need can be wrapped into this bean, so the map output value
 * can simply be NullWritable.
 */
public class phoneBeanCustomSort implements WritableComparable<phoneBeanCustomSort>{
	private int upflow;
	private int downflow;
	private int sum;
	private long pnum;
	// the phone number cannot be an int (the value is too long); a String would also work

	public int getUpflow() {
		return upflow;
	}
	public void setUpflow(int upflow) {
		this.upflow = upflow;
	}
	public int getDownflow() {
		return downflow;
	}
	public void setDownflow(int downflow) {
		this.downflow = downflow;
	}
	public int getSum() {
		return sum;
	}
	public void setSum(int sum) {
		this.sum = sum;
	}



	public long getPnum() {
		return pnum;
	}
	public void setPnum(long pnum) {
		this.pnum = pnum;
	}
	@Override
	public String toString() {
		return upflow + "\t" + downflow + "\t" + sum +"\t"+pnum;
	}
	// Serialization method: object => binary.
	// Runs when the map output is sent to the reduce side.
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(upflow);
		out.writeInt(downflow);
		out.writeInt(sum);
		out.writeLong(pnum);
		
	}
	
	// Deserialization method: runs on the reduce side; the field order must match write() exactly.
	@Override
	public void readFields(DataInput in) throws IOException {
		this.upflow = in.readInt();
		this.downflow = in.readInt();
		this.sum = in.readInt();
		this.pnum = in.readLong();
	}
	// Sort rule: `this` is the current object, o is the object it is compared against.
	// Sort by upstream traffic first, then by downstream traffic (both descending).
	@Override
	public int compareTo(phoneBeanCustomSort o) {
		// compare upstream traffic first; Integer.compare avoids overflow from int subtraction
		int tmp = Integer.compare(o.upflow, this.upflow);
		// if the upstream traffic is equal, compare downstream traffic
		if (tmp == 0) {
			tmp = Integer.compare(o.downflow, this.downflow);
		}
		return tmp;
	}

}
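
The compareTo method above is what the shuffle phase uses to order the map output keys. Below is a minimal sketch of the resulting order on two beans; the class SortOrderCheck and the traffic values are made up purely for illustration and are not part of the original job.

package mrpro927;

public class SortOrderCheck {
	public static void main(String[] args) {
		phoneBeanCustomSort a = new phoneBeanCustomSort();
		a.setUpflow(500);
		a.setDownflow(300);
		phoneBeanCustomSort b = new phoneBeanCustomSort();
		b.setUpflow(500);
		b.setDownflow(800);
		// upflow ties, so downflow decides: b has the larger downflow,
		// hence a.compareTo(b) > 0 and b sorts ahead of a (descending order)
		System.out.println(a.compareTo(b) > 0); // prints true
	}
}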

The main class

package mrpro927;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;



/*
 * Custom sort on a custom key class:
 * sort by upstream traffic first, then by downstream traffic.
 */
public class phoneDataCustomSort {
	// Mapper: parses each input line and emits the bean as the map output key
	public static class MyMapper extends Mapper<LongWritable, Text, phoneBeanCustomSort, NullWritable>{
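	// a single bean instance is reused for every input record; this is safe because
	// context.write() serializes the key into the output buffer as soon as it is called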
		phoneBeanCustomSort p = new phoneBeanCustomSort();
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, phoneBeanCustomSort, NullWritable>.Context context)
				throws IOException, InterruptedException {
			// take each line and split it on \t
			String[] split = value.toString().split("\t");
			// keep only valid records (lines with exactly 11 fields)
			if(split.length == 11){
				p.setUpflow(Integer.parseInt(split[7]));
				p.setDownflow(Integer.parseInt(split[8]));				
				p.setSum(p.getUpflow()+p.getDownflow());
				p.setPnum(Long.parseLong(split[1]));
				context.write(p, NullWritable.get());
			}
		}
	}
	
	// reduce() is called once per group of keys that compare as equal
	public static class MyReducer extends Reducer<phoneBeanCustomSort, NullWritable,phoneBeanCustomSort, NullWritable>{
		@Override
		protected void reduce(phoneBeanCustomSort key, Iterable<NullWritable> values,
				Reducer<phoneBeanCustomSort, NullWritable, phoneBeanCustomSort, NullWritable>.Context context) 
						throws IOException, InterruptedException {
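			// writing the key once per value keeps every original record: during iteration
			// the framework refreshes the key object's fields for each value, so beans that
			// compare equal (same up/down flow) but carry different phone numbers are all emitted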
			for(NullWritable n:values){
				context.write(key, NullWritable.get());
			}
		}
	}
	
	
	public static class MyPartitioner extends Partitioner<phoneBeanCustomSort, NullWritable>{

		@Override
		public int getPartition(phoneBeanCustomSort key, NullWritable value, int numPartitions) {
			String s = String.valueOf(key.getPnum());
			if(s.startsWith("136")){
				return 0;
			}else if(s.startsWith("137")){
				return 1;
			}else if(s.startsWith("138")){
				return 2;
			}else if(s.startsWith("139")){
				return 3;
			}else {
				return 4;
			}
		}
		
	}
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		// load the configuration
		Configuration conf = new Configuration();
		// when running from Eclipse, set the Linux user name used to access HDFS
		System.setProperty("HADOOP_USER_NAME", "mading");
		// create a job
		Job job = Job.getInstance(conf);
		// set the main class of this job
		job.setJarByClass(phoneDataCustomSort.class);
		// set the mapper and reducer classes
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);
		// set the map output key/value types; these can be omitted when they match the reduce output types
		job.setMapOutputKeyClass(phoneBeanCustomSort.class);
		job.setMapOutputValueClass(NullWritable.class);
		// set the reduce output key/value types
		job.setOutputKeyClass(phoneBeanCustomSort.class);
		job.setOutputValueClass(NullWritable.class);
		// set the partitioner (optional; uncomment to partition by phone-number prefix)
		//job.setPartitionerClass(MyPartitioner.class);
		// set the number of reduce tasks (must be 5 to match MyPartitioner)
		//job.setNumReduceTasks(5);
		// set the input path on the HDFS cluster
		FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/pout01"));
		// set the output path (it must not already exist)
		FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/sout01"));
		// submit the job and wait for it to complete
		job.waitForCompletion(true);
	}
}

P.S.: the phonenum field must not be an int, otherwise the value is too long to fit; this error took me a long time to track down.
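
For reference, Integer.MAX_VALUE is 2,147,483,647 (10 digits), while the phone numbers here have 11 digits, so parsing one as an int overflows. A quick illustration (the number below is made up):

long ok = Long.parseLong("13812345678");      // fits easily in a long
// int bad = Integer.parseInt("13812345678"); // throws NumberFormatException: exceeds Integer.MAX_VALUE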
