Default grouping
By default the grouping rule is the same as the sorting rule: both call the key's compareTo() method.
Sorting: the return value of compareTo() decides the order of the records;
records that compare equal end up adjacent to each other.
Grouping: the return value of compareTo() is inspected —
a return value of 0 means the records belong to the same group;
a non-zero return value starts a new group.
Underlying implementation: a utility class named WritableComparator is called.
/*
 * Grouping compares the keys emitted by map (serialized form, compared via compareTo).
 * A map output key is always a subclass of WritableComparable.
 */
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
// Delegates to the compareTo method of the map output key itself.
return a.compareTo(b);
}
The default grouping calls the key's compareTo():
- if the key is a built-in type, the default compareTo compares the actual value (numeric value, Text contents);
- if the key is a custom type, the user-defined compareTo is used.
Custom grouping
1) Write a grouping class that extends WritableComparator.
2) Override the compare method to redefine the grouping rule.
3) Register the grouping class on the job:
// specify the grouping comparator class
job.setGroupingComparatorClass(Group.class);
Error: Caused by: java.lang.NullPointerException
Cause: a mistake in the grouping class —
by default WritableComparator does NOT create the WritableComparable key objects that are passed into compare().
Fix:
// Override the superclass constructor.
public Group() {
// Call the superclass constructor; passing true as the second argument makes it create the map key objects.
super(Stu.class,true);
}
Grouping is performed on top of the sorted output: only records that are adjacent in the sorted result are compared — the comparison never skips over records.
No-arg constructor:
// No-arg constructor: no key class is recorded.
protected WritableComparator() {
this(null);
}
/** Construct for a {@link WritableComparable} implementation.
 * keyClass -> the map output key class
 * conf is passed as null here (no configuration supplied)
 * createInstances = false -> key instances are NOT created
 */
protected WritableComparator(Class<? extends WritableComparable> keyClass) {
this(keyClass, null, false);
}
/*
 * keyClass        - the map output key class
 * conf            - job configuration (a fresh Configuration is created when null is passed)
 * createInstances - whether to pre-create the two key instances used for comparison
 */
protected WritableComparator(Class<? extends WritableComparable> keyClass,
Configuration conf,
boolean createInstances) {
this.keyClass = keyClass;
this.conf = (conf != null) ? conf : new Configuration();
if (createInstances) {
// Pre-create the two key objects and the buffer used to deserialize into them.
key1 = newKey();
key2 = newKey();
buffer = new DataInputBuffer();
} else {
// Without created instances, compare() would receive null key objects.
key1 = key2 = null;
buffer = null;
}
}
When the default grouping cannot satisfy the requirement (i.e. the grouping field and the sort field conflict), a custom grouping comparator is needed.
Group by: field A
Sort by: field B
What is actually written:
Sort: A + B
Group: A
The sort comparator compares the grouping field first — so that records of the same group end up adjacent — and then the ordinary sort field.
The grouping comparator compares only the grouping field.
package GroupByMapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Map output key: one student's course, name and average score.
 * Sort order: course ascending, then average score descending.
 * Implements Hadoop's Writable serialization via write/readFields.
 */
public class Stu implements WritableComparable<Stu> {
    private String course;
    private String name;
    private double avgscore;

    /** No-arg constructor required by Hadoop to deserialize instances. */
    public Stu() {
    }

    public Stu(String course, String name, double avgscore) {
        this.course = course;
        this.name = name;
        this.avgscore = avgscore;
    }

    public String getCourse() {
        return course;
    }

    public void setCourse(String course) {
        this.course = course;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getAvgscore() {
        return avgscore;
    }

    public void setAvgscore(double avgscore) {
        this.avgscore = avgscore;
    }

    @Override
    public String toString() {
        return course + "\t" + name + "\t" + avgscore ;
    }

    /** Serialization: field order must match readFields exactly. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(course);
        out.writeUTF(name);
        out.writeDouble(avgscore);
    }

    /** Deserialization: reads fields in the same order write() emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.course = in.readUTF();
        this.name = in.readUTF();
        this.avgscore = in.readDouble();
    }

    /**
     * Sort: course ascending; within one course, average score descending.
     * FIX: the original returned 1 when the two average scores were equal,
     * which violates the compareTo contract (antisymmetry) and can make the
     * shuffle sort fail with "Comparison method violates its general contract!".
     * Double.compare also avoids the precision issues of manual subtraction.
     */
    @Override
    public int compareTo(Stu o) {
        int byCourse = this.getCourse().compareTo(o.getCourse());
        if (byCourse != 0) {
            return byCourse;
        }
        // Arguments swapped so that higher averages sort first (descending).
        return Double.compare(o.getAvgscore(), this.getAvgscore());
    }
}
package GroupByMapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job demonstrating a custom grouping comparator (secondary sort):
 * sort by (course asc, average score desc), group by course only, and emit
 * the top-2 students of each course.
 */
public class myGroup {

    /** Mapper: parses each CSV input line into a (Stu, NullWritable) pair. */
    static class myMapper extends Mapper<LongWritable, Text, Stu, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Expected line format: course,name,score1,score2,...
            String[] fields = value.toString().split(",");
            // Average every score column (index 2 onward).
            int sum = 0;
            for (int i = 2; i < fields.length; i++) {
                sum += Integer.parseInt(fields[i].trim());
            }
            double avg = sum * 1.0 / (fields.length - 2);
            context.write(new Stu(fields[0], fields[1], avg), NullWritable.get());
        }
    }

    /*
     * Custom grouping comparator.
     * Shuffle first sorts by Stu.compareTo (course, then average score desc);
     * this comparator then groups adjacent records sharing the same course.
     */
    static class Group extends WritableComparator {
        // Must call the superclass constructor with createInstances = true,
        // otherwise the framework hands null keys to compare() and throws NPE.
        public Group() {
            super(Stu.class, true);
        }

        /*
         * a and b are both map output keys (Stu instances seen through the
         * WritableComparable supertype). A return value of 0 means "same
         * group", so each course forms exactly one group.
         */
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Stu as = (Stu) a;
            Stu bs = (Stu) b;
            return as.getCourse().compareTo(bs.getCourse());
        }
    }

    /*
     * Reducer: called once per course group. Within a group the records are
     * already sorted by average score descending, so writing the first two
     * keys emits the top-2 students per course.
     * NOTE: inside the values loop the 'key' reference is advanced together
     * with the iterator, so each iteration sees the Stu of that value.
     */
    static class myReduce extends Reducer<Stu, NullWritable, Stu, NullWritable> {
        @Override
        protected void reduce(Stu key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (NullWritable v : values) {
                count++;
                context.write(key, NullWritable.get());
                if (count == 2) {
                    break; // only the top 2 of each course
                }
            }
        }
    }

    /*
     * Job driver.
     * FIX: main() was originally declared inside the nested class myReduce,
     * making the entry point myGroup$myReduce instead of myGroup (which is
     * what setJarByClass(myGroup.class) implies). It belongs on the
     * top-level class.
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(myGroup.class);
        job.setMapperClass(myMapper.class);
        job.setReducerClass(myReduce.class);
        // Map output types equal the final output types here; set both explicitly.
        job.setMapOutputKeyClass(Stu.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Stu.class);
        job.setOutputValueClass(NullWritable.class);
        // Register the custom grouping comparator.
        job.setGroupingComparatorClass(Group.class);
        FileInputFormat.addInputPath(job, new Path("E:\\stu.txt"));
        // An output path is required whenever there is a reduce phase;
        // it also receives the _SUCCESS marker file.
        FileOutputFormat.setOutputPath(job, new Path("e:\\stu_out2"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}