Default grouping
By default, grouping and sorting follow the same rule: both call the key's compareTo() method.
Sorting: the return value of compareTo() determines the order of the records.
Records that sort as equal end up next to each other.
Grouping: only looks at whether compareTo() returns 0.
Keys for which the result is 0 all go into one group.
A non-zero result starts a new group.
Underlying implementation: the framework uses WritableComparator, which is an ordinary class:
/*
 Grouping works on the map output key (serialized and comparable).
 The map output key is always a subclass of WritableComparable.
*/
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
    // delegates to the compareTo() of the map output key
    return a.compareTo(b);
}
So the default grouping simply calls the key's compareTo():
If the key is a built-in type, the built-in compareTo() compares the actual value (numeric types, Text, ...).
If the key is a custom type, your own compareTo() defines the rule (see the small sketch below).
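A minimal sketch of what that means in plain Java (outside any MapReduce job, assuming only Hadoop's Text class on the classpath; the course names are illustrative): two keys would land in the same reduce group exactly when their compareTo() returns 0.

import org.apache.hadoop.io.Text;

public class DefaultGroupingSketch {
    public static void main(String[] args) {
        Text k1 = new Text("algorithm");
        Text k2 = new Text("algorithm");
        Text k3 = new Text("computer");

        // 0 -> same group: the two "algorithm" keys would be grouped together
        System.out.println(k1.compareTo(k2));
        // non-zero -> a new group starts when the key changes
        System.out.println(k1.compareTo(k3));
    }
}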
Custom grouping
1) Write a grouping class that extends WritableComparator.
2) Override the compare() method to define the grouping rule.
3) Register the grouping class on the job:
// register the grouping comparator class
job.setGroupingComparatorClass(Group.class);
Error: Caused by: java.lang.NullPointerException
The problem is in the grouping class:
by default, WritableComparator does not create the WritableComparable key objects that are passed into compare().
Fix:
// override the parent's no-arg constructor
public Group() {
    // call the parent constructor; the second argument true makes it create the map key objects
    super(Stu.class, true);
}
Grouping happens on top of the sorted output: only records that are adjacent in the sorted order are compared, never records further apart.
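A sketch in ordinary Java (hypothetical data, not part of the job) of how the reduce side walks the sorted keys: each key is only compared with the one directly before it, which is why the grouping field must already be contiguous after sorting.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class AdjacentGroupingSketch {
    public static void main(String[] args) {
        // already sorted, as the shuffle guarantees
        List<String> sortedCourses =
                Arrays.asList("algorithm", "algorithm", "computer", "computer", "english");
        Comparator<String> grouping = String::compareTo;

        String prev = null;
        for (String key : sortedCourses) {
            // only the previous (adjacent) key is consulted, never an earlier one
            if (prev == null || grouping.compare(prev, key) != 0) {
                System.out.println("--- new group: " + key);
            }
            System.out.println(key);
            prev = key;
        }
    }
}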
The no-arg constructor:
protected WritableComparator() {
this(null);
}
/** Construct for a {@link WritableComparable} implementation.
    keyClass ------> the map output key class
    conf:            null
    createInstances: false, so the key objects are NOT created
*/
protected WritableComparator(Class<? extends WritableComparable> keyClass) {
this(keyClass, null, false);
}
/*
    keyClass:        the map output key class
    conf:            the Configuration (none is passed here)
    createInstances: whether to create the key objects
*/
protected WritableComparator(Class<? extends WritableComparable> keyClass,
Configuration conf,
boolean createInstances) {
this.keyClass = keyClass;
this.conf = (conf != null) ? conf : new Configuration();
if (createInstances) {
key1 = newKey();
key2 = newKey();
buffer = new DataInputBuffer();
} else {
key1 = key2 = null;
buffer = null;
}
}
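This explains the NullPointerException above: during the shuffle, the framework first calls the byte-level compare, which deserializes the two serialized keys into key1 and key2 and only then calls the object-level compare shown earlier. A rough paraphrase of that method (not copied verbatim; details can differ between Hadoop versions):

public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    try {
        buffer.reset(b1, s1, l1);   // point the buffer at the first serialized key
        key1.readFields(buffer);    // NPE here if createInstances was false

        buffer.reset(b2, s2, l2);   // same for the second key
        key2.readFields(buffer);

        buffer.reset(null, 0, 0);   // drop the reference to the byte array
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return compare(key1, key2);     // delegate to the object-level compare
}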
When the default grouping cannot meet the requirement (i.e. the grouping rule conflicts with the sort rule), a custom grouping class is needed.
Wanted grouping:  A
Wanted ordering:  B
What to actually write:
Sort key:  A + B
Group key: A
The sort key leads with the grouping field, so records of the same group end up next to each other, and only then sorts by the real ordering field.
The grouping comparator then only needs to look at the grouping field, as the worked sketch below shows.
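A worked sketch of the rule with hypothetical records (course is the grouping field A, average score is the ordering field B):

sorted by (course asc, avgscore desc)      what the reducer sees
computer   92.0                            group 1: all "computer" records, best score first
computer   85.5
computer   71.0
english    88.0                            group 2: all "english" records
english    64.5

Because the sort key leads with the grouping field, each course's records are contiguous, and the grouping comparator only has to test whether the course changed between two adjacent records.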
Case study:
package GroupByMapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class Stu implements WritableComparable<Stu> {
private String course;
private String name;
private double avgscore;
// no-arg constructor, required so Hadoop can deserialize Stu objects
public Stu() {
super();
}
public Stu(String course, String name, double avgscore) {
super();
this.course = course;
this.name = name;
this.avgscore = avgscore;
}
public String getCourse() {
return course;
}
public void setCourse(String course) {
this.course = course;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public double getAvgscore() {
return avgscore;
}
public void setAvgscore(double avgscore) {
this.avgscore = avgscore;
}
@Override
public String toString() {
return course + "\t" + name + "\t" + avgscore ;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(course);
out.writeUTF(name);
out.writeDouble(avgscore);
}
@Override
public void readFields(DataInput in) throws IOException {
this.course=in.readUTF();
this.name=in.readUTF();
this.avgscore=in.readDouble();
}
// sort rule: by course, then by average score in descending order
@Override
public int compareTo(Stu o) {
    int course = this.getCourse().compareTo(o.getCourse());
    if (course == 0) {
        // same course: order by average score, descending
        double tmp = o.getAvgscore() - this.getAvgscore();
        if (tmp > 0) {
            return 1;
        } else if (tmp == 0) {
            return 0;
        } else {
            return -1;
        }
    }
    return course;
}
}
package GroupByMapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class myGroup {
static class myMapper extends Mapper<LongWritable, Text, Stu, NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// split the line into fields: course, name, then the individual scores
String[] datas = value.toString().split(",");
// compute the average score
int sum=0;
for (int i=2;i<datas.length;i++) {
sum+= Integer.parseInt(datas[i].trim());
}
double avg=sum*1.0/(datas.length-2);
Stu stu = new Stu(datas[0], datas[1], avg);
context.write(stu, NullWritable.get());
}
}
/*
 * Shuffle phase:
 * records are sorted first, by course and then by average score (Stu.compareTo)
 * */
/*
 * Custom grouping class
 * */
static class Group extends WritableComparator{
// override the parent's no-arg constructor
public Group() {
// call the parent constructor; the second argument true makes it create the map key objects
super(Stu.class,true);
}
// override compare() to define the grouping rule
/*
 * a and b both represent map output keys;
 * their declared type is the parent WritableComparable, their runtime type is Stu
 * */
@Override
public int compare(WritableComparable a, WritableComparable b) {
// custom grouping rule
// downcast from the parent type to the concrete key type Stu
Stu as=(Stu)a;
Stu bs=(Stu)b;
// compare by course only: same course means same group
return as.getCourse().compareTo(bs.getCourse());
}
}
static class myReduce extends Reducer<Stu, NullWritable, Stu, NullWritable>{
/*
 * Grouping is by course, so each course forms one group (four groups for four courses).
 * values holds all the NullWritable values of one course, e.g. for "algorithm":
 * algorithm huangjiaju 82.28571428571429 null
 * algorithm liutao 82.0 null
 * algorithm huanglei 74.42857142857143 null
 * algorithm huangzitao 72.75 null
 * algorithm liuyifei 62.142857142857146 null
 * algorithm huangdatou 56.0 null
 * The keys arriving here are different Stu objects that share the same course.
 * key works like a pointer: as the iterator over values advances, key is
 * re-filled with the Stu record behind the current value.
 * Before the iteration starts, key points at the first record of the group.
 * */
@Override
protected void reduce(Stu key, Iterable<NullWritable> values,
        Context context) throws IOException, InterruptedException {
    int count = 0;
    for (NullWritable v : values) {
        count++;
        // key is re-filled with the Stu behind the current value, so this writes
        // the group's records in sorted order (highest average score first)
        context.write(key, NullWritable.get());
        if (count == 2) {
            // stop after two records: keep only the top 2 students per course
            break;
        }
    }
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job=Job.getInstance(conf);
job.setJarByClass(myGroup.class);
job.setMapperClass(myMapper.class);
job.setReducerClass(myReduce.class);
job.setOutputKeyClass(Stu.class);
job.setOutputValueClass(NullWritable.class);
// register the custom grouping comparator
job.setGroupingComparatorClass(Group.class);
FileInputFormat.addInputPath(job, new Path("E:\\stu.txt"));
// a job with a reduce phase always needs an output path
// the output directory also holds the _SUCCESS marker file
FileOutputFormat.setOutputPath(job, new Path("e:\\stu_out2"));
boolean waitForCompletion = job.waitForCompletion(true);
System.exit(waitForCompletion?0:1);
}
}
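With the break at count == 2, the job effectively emits the two highest-averaging students of each course. For the "algorithm" group listed in the reducer comment above, that would be (output format: course \t name \t avgscore):

algorithm	huangjiaju	82.28571428571429
algorithm	liutao	82.0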