Custom Sorting and Grouping in MapReduce 2

1 Mapper, Reducer, and the driver class

  

package com.wzt.mapreduce.secondsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecSortMain {

	public static class SecSortMapper extends Mapper<LongWritable, Text, FirstSortEntity, IntWritable> {

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {

			// Each input line is expected to look like "<word> <number>", separated by a single space.
			String line = value.toString();
			String[] fields = line.split(" ");

			// Printed only to make the mapper's output key/value pairs visible.
			System.out.println("Mapper output <" + fields[0] + "," + fields[1] + "> " + this);
			context.write(new FirstSortEntity(fields[0], Integer.parseInt(fields[1])),
					new IntWritable(Integer.parseInt(fields[1])));
		}

	}

	public static class SecSortReducer extends Reducer<FirstSortEntity, IntWritable, FirstSortEntity, IntWritable> {

		@Override
		protected void reduce(FirstSortEntity key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {

			// Printed once per call: the number of these lines shows how many groups (k2) the reducer receives.
			System.out.println("Reducer input group <" + key + ",N(N>=1)> " + this);
			StringBuffer sb = new StringBuffer();
			for (IntWritable value : values) {
				// Printed once per value: the number of these lines shows how many <k2,v2> pairs came in.
				sb.append(value + " , ");
				System.out.println("Reducer input pair <" + key.toString() + "," + value.get() + ">  group " + sb.toString());
			}

			context.write(key, key.getSecondkey());
			// Alternative output: emit the whole group as one line of text
			// (would require changing the job's output key/value classes).
			// context.write(key.getFirstkey(), new Text(sb.toString()));
		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(SecSortMain.class);

		job.setMapperClass(SecSortMapper.class);
		job.setMapOutputKeyClass(FirstSortEntity.class);
		job.setMapOutputValueClass(IntWritable.class);

		// Custom partitioner: decides which reduce task each map output record goes to.
		job.setPartitionerClass(SSPartintioner.class);
		// One reduce task per partition, and one output file per reduce task.
		// With a single reduce task Hadoop short-circuits partitioning, so the custom
		// partitioner has no visible effect (and its log statements may never appear).
		job.setNumReduceTasks(1);

		// Sort comparator: orders the full set of map output keys before reduce.
		job.setSortComparatorClass(MySSSortComparator.class);
		// Grouping comparator: decides which of the sorted keys share a reduce() call.
		job.setGroupingComparatorClass(GroupComparator.class);

		job.setReducerClass(SecSortReducer.class);
		job.setOutputKeyClass(FirstSortEntity.class);
		job.setOutputValueClass(IntWritable.class);

//		FileInputFormat.setInputPaths(job, "/wc/input/xiyou.txt");
//		FileOutputFormat.setOutputPath(job, new Path("/wc/output6"));
		FileInputFormat.setInputPaths(job, "/sort/input");
		FileOutputFormat.setOutputPath(job, new Path("/sort/output1"));

		job.waitForCompletion(true);
	}
}
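
To see the whole pipeline on concrete data, suppose (purely as a hypothetical example) that /sort/input contains the lines "hadoop 3", "hadoop 1" and "hive 2". The mapper emits the composite keys (hadoop,3), (hadoop,1) and (hive,2); the sort comparator orders them as (hadoop,1), (hadoop,3), (hive,2); and the grouping comparator folds the two hadoop keys into one group. The reducer is therefore called twice: once for the hadoop group, whose values iterate in ascending order of the second field, and once for the hive group.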

2 Custom composite key

package com.wzt.mapreduce.secondsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * Custom composite key used as the map output key.
 * @author root
 *
 */
public class FirstSortEntity implements WritableComparable<FirstSortEntity>{

	private Text firstkey ; 
	private IntWritable secondkey ;
	
	public FirstSortEntity( ) {
	}
	
	public FirstSortEntity(Text firstkey, IntWritable secondkey) {
		this.firstkey = firstkey;
		this.secondkey = secondkey;
	}
	public FirstSortEntity(String firstkey, int secondkey) {
		this.firstkey = new Text(firstkey);
		this.secondkey = new IntWritable(secondkey);
	}
	
	public Text getFirstkey() {
		return firstkey;
	}
	public void setFirstkey(Text firstkey) {
		this.firstkey = firstkey;
	}
	public IntWritable getSecondkey() {
		return secondkey;
	}
	public void setSecondkey(IntWritable secondkey) {
		this.secondkey = secondkey;
	}
	/**
	 * Serialize the object.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		 out.writeUTF(firstkey.toString() );
		 out.writeInt(  secondkey.get() );
	}

	// Deserialize the object.
	@Override
	public void readFields(DataInput in) throws IOException {
		 
		firstkey = new Text(in.readUTF() );
		secondkey = new IntWritable(in.readInt()); 
	}

	
	/**
	 * Default key ordering, called when the map output is sorted after map() runs.
	 * After map, if a partitioner is configured and there is more than one reduce task,
	 * the partitioner is also invoked to assign each record to a partition.
	 */
	@Override
	public int compareTo(FirstSortEntity entity) {
		// Controls ascending vs. descending order:
		// putting this object first gives ascending order,
		// putting it second gives descending order.
		return this.firstkey.compareTo(entity.getFirstkey());
		// return this.secondkey.get() > entity.getSecondkey().get() ? 1 : -1;
	}
	@Override
	public String toString() {
		return this.getFirstkey() +" "+this.getSecondkey()+ "   "  ;
	} 

}
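
A side note: because this job always uses the custom partitioner in the next section, FirstSortEntity never relies on hashCode(). If the composite key were instead used with Hadoop's default HashPartitioner, it should override hashCode() and equals(); a minimal sketch (my addition, not part of the original code) might look like this:

	// Hypothetical additions to FirstSortEntity, only needed with the default HashPartitioner.
	// Hashing on the first field alone keeps every record of one word in the same partition.
	@Override
	public int hashCode() {
		return firstkey.hashCode();
	}

	@Override
	public boolean equals(Object obj) {
		if (!(obj instanceof FirstSortEntity)) {
			return false;
		}
		FirstSortEntity other = (FirstSortEntity) obj;
		return firstkey.equals(other.firstkey) && secondkey.equals(other.secondkey);
	}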

 3 Custom partitioner

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Custom partitioner
public class SSPartintioner extends Partitioner<FirstSortEntity, IntWritable> {

	/**
	 * key:   the map output key
	 * value: the map output value
	 * Every map output record passes through this method; records for which it returns
	 * the same number end up in the same partition (the same "pile"), and each partition
	 * is then sorted. At that point the data is split into N sorted piles.
	 */
	@Override
	public int getPartition(FirstSortEntity key, IntWritable value, int numPartitions) {
		System.out.println("Partitioner  key:" + key.getFirstkey() + "  value:" + value + "  "
				+ ((key.getFirstkey().hashCode() & Integer.MAX_VALUE) % numPartitions) + "   " + this);

		// Partition by the hash of the first field, so all records for one word go to the same reducer.
		return (key.getFirstkey().hashCode() & Integer.MAX_VALUE) % numPartitions;
		// Alternative: partition by the second field instead.
		// return (key.getSecondkey().get() & Integer.MAX_VALUE) % numPartitions;
	}
	 
	
}
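
As a quick sanity check (a standalone sketch with hypothetical values, not part of the original post), the partition formula can be evaluated directly; two keys that share the same first field always land in the same partition, because only firstkey's hash is used:

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.IntWritable;

// Hypothetical demo class: evaluates the partition formula outside of a real job.
public class PartitionDemo {

	public static void main(String[] args) {
		SSPartintioner partitioner = new SSPartintioner();
		int numPartitions = 4; // hypothetical number of reduce tasks

		// Both calls return the same partition number, since only the first field is hashed.
		System.out.println(partitioner.getPartition(new FirstSortEntity("hadoop", 1), new IntWritable(1), numPartitions));
		System.out.println(partitioner.getPartition(new FirstSortEntity("hadoop", 3), new IntWritable(3), numPartitions));
	}
}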

   My understanding: everything above happens during the map phase (i.e. locally on the map side); what follows happens between map and reduce.

4 Custom overall sort comparator

  

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;


// Custom sort comparator for all map output keys
/**
 * @author root
 *
 */
public class MySSSortComparator extends WritableComparator {

	public MySSSortComparator() {
		// Register the key type this comparator handles (true = create instances for deserialization).
		super(FirstSortEntity.class, true);
	}
	
	/**
	 * Sorts the full set of keys before reduce processes them.
	 * Logic: keys in the same group (same first field) are ordered by the second field;
	 * keys in different groups are ordered by the first field.
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {

		FirstSortEntity e1 = (FirstSortEntity) a;
		FirstSortEntity e2 = (FirstSortEntity) b;
		System.out.println(e1.getFirstkey() + "==MySSSortComparator sorting== " + e2.getFirstkey());
		// A group is identified by the first field, so only keys with equal first fields
		// fall through to the second-field comparison.
		if (!e1.getFirstkey().equals(e2.getFirstkey())) {
			return e1.getFirstkey().compareTo(e2.getFirstkey());
		} else {
			// Integer.compare avoids the overflow risk of plain subtraction.
			return Integer.compare(e1.getSecondkey().get(), e2.getSecondkey().get());
		}
	}
}
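
For intuition (again a hypothetical snippet, not from the original post), calling the comparator directly shows the intended ordering: within the same word the smaller number sorts first, and different words sort by the first field:

package com.wzt.mapreduce.secondsort;

// Hypothetical demo class: checks the sort comparator's ordering on a few keys.
public class SortComparatorDemo {

	public static void main(String[] args) {
		MySSSortComparator comparator = new MySSSortComparator();
		// Same first field: ordered by the second field, so this prints a positive number (3 sorts after 1).
		System.out.println(comparator.compare(new FirstSortEntity("hadoop", 3), new FirstSortEntity("hadoop", 1)));
		// Different first fields: ordered by the first field, so this prints a negative number ("hadoop" < "hive").
		System.out.println(comparator.compare(new FirstSortEntity("hadoop", 3), new FirstSortEntity("hive", 2)));
	}
}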

5 Custom grouping comparator

   

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;


// Grouping comparator: before records reach reduce(), keys that compare equal here
// are put into the same group and handled by a single reduce() call.
public class GroupComparator extends WritableComparator {

	public GroupComparator() {
		// Register the key type this comparator handles (true = create instances for deserialization).
		super(FirstSortEntity.class, true);
	}

	/**
	 * Groups the already-sorted keys: keys with the same first field share one
	 * reduce() call and one value iterator.
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		FirstSortEntity e1 = (FirstSortEntity) a;
		FirstSortEntity e2 = (FirstSortEntity) b;
		System.out.println(e1.getFirstkey() + "==GroupComparator grouping==" + e2.getFirstkey());
		return e1.getFirstkey().toString().compareTo(e2.getFirstkey().toString());
		// return e1.getSecondkey().compareTo(e2.getSecondkey());
	}
}
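
The effect, shown with the same kind of hypothetical keys as above, is that keys differing only in the second field compare as equal here, so they collapse into one reduce group even though the sort comparator keeps them apart:

package com.wzt.mapreduce.secondsort;

// Hypothetical demo class: checks which keys the grouping comparator treats as one group.
public class GroupComparatorDemo {

	public static void main(String[] args) {
		GroupComparator grouper = new GroupComparator();
		// Prints 0: same first field, so (hadoop,1) and (hadoop,3) share one reduce() call.
		System.out.println(grouper.compare(new FirstSortEntity("hadoop", 1), new FirstSortEntity("hadoop", 3)));
		// Prints a non-zero value: a different first field starts a new group.
		System.out.println(grouper.compare(new FirstSortEntity("hadoop", 1), new FirstSortEntity("hive", 2)));
	}
}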

 After this point, the reduce method in the driver class above processes the grouped data.

  The following class is only kept as a personal note and is not used here:

   

package com.wzt.mapreduce.secondsort;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;

//Custom grouping comparator written directly against RawComparator.
//Not used in this job and not tested; kept only as a personal reference.
public class SSGroupComparator implements RawComparator<FirstSortEntity>{

	@Override
	public int compare(FirstSortEntity o1, FirstSortEntity o2) {
		// Object-level comparison by the second field.
		return Integer.compare(o1.getSecondkey().get(), o2.getSecondkey().get());
	}

	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {

		// Byte-level comparison: deserialize each key from its slice of the buffer
		// (respecting the given offsets and lengths), then compare by the first field.
		System.out.println("SSGroupComparator custom grouping");
		ByteArrayInputStream bis = new ByteArrayInputStream(b1, s1, l1);
		DataInput in1 = new DataInputStream(bis);
		FirstSortEntity entity1 = new FirstSortEntity();

		ByteArrayInputStream bis2 = new ByteArrayInputStream(b2, s2, l2);
		DataInput in2 = new DataInputStream(bis2);
		FirstSortEntity entity2 = new FirstSortEntity();
		try {
			entity1.readFields(in1);
			entity2.readFields(in2);
		} catch (IOException e) {
			e.printStackTrace();
		}

		return entity1.getFirstkey().compareTo(entity2.getFirstkey());
	}
 

}
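
If this raw comparator were actually wired in, it would presumably replace GroupComparator in the driver via job.setGroupingComparatorClass(SSGroupComparator.class). Note that implementing RawComparator is normally done to compare keys directly on their serialized bytes without deserializing them; this version deserializes both keys anyway, so it offers no performance advantage over extending WritableComparator.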

Reposted from username2.iteye.com/blog/2274802