Custom grouping mapreduce

Default grouping

By default, grouping and sorting follow the same rule: both call the key's compareTo() method.
Sorting: records are ordered according to the sign of the compareTo() return value;
records that compare equal end up adjacent.
Grouping: the compareTo() return value decides group membership —
a return value of 0 puts the two keys into the same group,
a non-zero return value starts a new group.
Underlying implementation: both paths go through the generic WritableComparator class.

/*
 * (Excerpt from Hadoop's WritableComparator.)
 * Grouping compares the map output keys (after deserialization).
 * The map output key is always a subclass of WritableComparable.
 */
  @SuppressWarnings("unchecked")
  public int compare(WritableComparable a, WritableComparable b) {
   // delegate to the compareTo() of the map output key itself
    return a.compareTo(b);
  }
  The default grouping therefore just calls the key's compareTo():
 - if the key is a built-in type, its compareTo compares the actual value (number, Text);
 - if the key is a custom type, your own compareTo implementation is used.

Custom grouping

1) Write a grouping class that extends WritableComparator.
2) Override the compare method to redefine the grouping rule.
3) Register the grouping class on the job:
// register the grouping comparator
job.setGroupingComparatorClass(Group.class);

Error: Caused by: java.lang.NullPointerException
Cause: a bug in the grouping class — by default WritableComparator does NOT
create the WritableComparable key objects that are passed to compare().

Fix:
// Override the parent-class constructor:
		public Group() {
			// call the parent constructor; the second argument 'true' makes
			// WritableComparator create the map key objects used by compare()
			super(Stu.class,true);
		}
Grouping runs on top of the sorted output, so only records that are adjacent
in sort order are compared — the comparison never jumps around.
No-arg constructor (Hadoop source):
// (Excerpt from Hadoop's WritableComparator.)
// Delegates with a null key class, so no key instances are created.
protected WritableComparator() {
    this(null);
  }
  /** Construct for a {@link WritableComparable} implementation.
   * keyClass -> the map output key class;
   * conf is passed as null, createInstances as false,
   * so no key objects are created.
   */
  // Single-argument constructor (Hadoop source): forwards with
  // conf = null and createInstances = false.
  protected WritableComparator(Class<? extends WritableComparable> keyClass) {
    this(keyClass, null, false);
  }
  /*
   * (Excerpt from Hadoop's WritableComparator.)
   * keyClass        - the map output key class
   * conf            - configuration (null -> a fresh Configuration is used)
   * createInstances - whether to create the two key objects used by compare
   */
protected WritableComparator(Class<? extends WritableComparable> keyClass,
                               Configuration conf,
                               boolean createInstances) {
    this.keyClass = keyClass;
    this.conf = (conf != null) ? conf : new Configuration();
    if (createInstances) {
      key1 = newKey();  // deserialization targets for the two keys being compared
      key2 = newKey();
      buffer = new DataInputBuffer();
    } else {
      key1 = key2 = null;  // null keys: this is why a grouping comparator built
      buffer = null;       // via the single-arg super() throws NullPointerException
    }
  }

Use a custom grouping comparator when the default cannot satisfy the
requirement (i.e. when grouping and sorting conflict):
  group by: A
  sort by:  B
What you actually write:
  sort by:  A + B  (sort on the grouping field first so records of the same
            group end up adjacent, then apply the normal sort field)
  group by: A  (the grouping comparator only looks at the grouping field)

Case:

package GroupByMapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Composite map output key: one student's record for one course.
 * Serialized via write/readFields; ordered by {@link #compareTo(Stu)}
 * (course ascending, then average score descending).
 */
public class Stu implements WritableComparable<Stu> {
	private String course;
	private String name;
	private double avgscore;

	/** No-arg constructor required by Hadoop serialization (readFields). */
	public Stu() {
		super();
	}

	public Stu(String course, String name, double avgscore) {
		super();
		this.course = course;
		this.name = name;
		this.avgscore = avgscore;
	}
	public String getCourse() {
		return course;
	}
	public void setCourse(String course) {
		this.course = course;
	}
	public String getName() {
		return name;
	}
	public void setName(String name) {
		this.name = name;
	}
	public double getAvgscore() {
		return avgscore;
	}
	public void setAvgscore(double avgscore) {
		this.avgscore = avgscore;
	}
	@Override
	public String toString() {
		return  course + "\t" + name + "\t" + avgscore ;
	}
	/** Serializes the three fields in declaration order. */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(course);
		out.writeUTF(name);
		out.writeDouble(avgscore);
	}
	/** Deserializes in the exact order used by {@link #write(DataOutput)}. */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.course = in.readUTF();
		this.name = in.readUTF();
		this.avgscore = in.readDouble();
	}
	/**
	 * Sort order: course ascending, then average score DESCENDING.
	 *
	 * BUG FIX: the original returned 1 when the two scores were equal, which
	 * violates the compareTo contract (both a.compareTo(b) and b.compareTo(a)
	 * would be positive) and can make the framework's sort fail with
	 * "Comparison method violates its general contract!".
	 * Double.compare also avoids the error-prone manual subtraction.
	 */
	@Override
	public int compareTo(Stu o) {
		int byCourse = this.getCourse().compareTo(o.getCourse());
		if (byCourse != 0) {
			return byCourse;
		}
		// same course: higher average score sorts first (descending)
		return Double.compare(o.getAvgscore(), this.getAvgscore());
	}

}
package GroupByMapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Top-2 averages per course, implemented with a custom grouping comparator:
 * the whole Stu record is the map output key; sorting puts records of the
 * same course together (best average first) and the Group comparator makes
 * every course one reduce group.
 */
public class myGroup {

	static class myMapper extends Mapper<LongWritable, Text, Stu, NullWritable> {
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// input line layout: course,name,score1,score2,...
			String[] parts = value.toString().split(",");
			// average over every score column (index 2 onward)
			int total = 0;
			for (int i = 2; i < parts.length; i++) {
				total += Integer.parseInt(parts[i].trim());
			}
			double average = total * 1.0 / (parts.length - 2);
			// the full record is the key; the value carries no information
			context.write(new Stu(parts[0], parts[1], average), NullWritable.get());
		}
	}

	/**
	 * Custom grouping comparator: two keys belong to the same reduce group
	 * exactly when their course fields are equal, regardless of name/score.
	 */
	static class Group extends WritableComparator {
		public Group() {
			// 'true' tells the parent to instantiate the two Stu key objects;
			// without it compare() would be handed nulls (NullPointerException).
			super(Stu.class, true);
		}

		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			// both arguments are really map output keys, i.e. Stu instances
			Stu left = (Stu) a;
			Stu right = (Stu) b;
			// group purely by course
			return left.getCourse().compareTo(right.getCourse());
		}
	}

	static class myReduce extends Reducer<Stu, NullWritable, Stu, NullWritable> {
		/**
		 * Keys arrive sorted (course asc, average desc) and grouped by course.
		 * The framework re-points 'key' at the current record as the value
		 * iterator advances, so writing the key inside the loop emits the
		 * successive records of the group; stopping after two writes yields
		 * the top-2 averages per course.
		 */
		@Override
		protected void reduce(Stu key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
			int written = 0;
			for (NullWritable ignored : values) {
				context.write(key, NullWritable.get());
				if (++written == 2) {
					break; // only the two best averages per course
				}
			}
		}

		// NOTE(review): main() sits inside myReduce in the original; kept here
		// unchanged so the job is still launched with the same class name.
		public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
			Configuration conf = new Configuration();

			Job job = Job.getInstance(conf);
			job.setJarByClass(myGroup.class);

			job.setMapperClass(myMapper.class);
			job.setReducerClass(myReduce.class);

			job.setOutputKeyClass(Stu.class);
			job.setOutputValueClass(NullWritable.class);

			// plug in the custom grouping comparator
			job.setGroupingComparatorClass(Group.class);

			FileInputFormat.addInputPath(job, new Path("E:\\stu.txt"));

			// the output directory also receives the _SUCCESS marker file
			FileOutputFormat.setOutputPath(job, new Path("e:\\stu_out2"));

			System.exit(job.waitForCompletion(true) ? 0 : 1);
		}
	}
}
Published 28 original articles · won praise 0 · Views 766

Guess you like

Origin blog.csdn.net/qq_27347421/article/details/104104401