MapReduce自定义排序、分区、分组案例

一、题目

数据：由于数据量比较大，放入百度网盘中链接: https://pan.baidu.com/s/13vHZ1v7Rw2Vbb5wZrWX0cA 提取码: 6qug

字段说明
班级 学号 姓名 语文 数学 英语
1307 7026 邝卓男 95 88 98

1.求每个学生的总分和平均分，并按总分降序排序

2.求每个班级每一门课程的平均分，不同班级的结果输出到不同的结果文件

3.求每个班级的总分最高的前5个学生

二、答案

1、求每个学生的总分和平均分，并按总分降序排序

思路： 当看到“每个”时，就把后面的字段当成分组字段，需要对总分进行排序，所以需要自定义排序

在开始代码的编写前，首先要确定map、reduce中的key和value各是什么？

| 阶段 | key | value |
| --- | --- | --- |
| map | 自定义类 StudentScore | Text |
| reduce | Text | StudentScore |

代码实现：

1）StudentScore类

/**
 * Composite value describing one student's result: name, total score and
 * average score.
 *
 * <p>It is used as the map output key of {@code MyScoreMapReduce}, so its
 * {@link #compareTo(StudentScore)} defines the sort order of the job output:
 * descending by total score. Only the total takes part in the comparison, so
 * two students with the same total compare as equal.
 */
public class StudentScore implements WritableComparable<StudentScore> {
    private String name;
    private int sum;
    private double avg;

    /** Creates an empty instance; fields are filled in later by the setters or {@link #readFields(DataInput)}. */
    public StudentScore(){
        super();
    }

    /**
     * @param name student name
     * @param sum  total score over all courses
     * @param avg  average score over all courses
     */
    public StudentScore(String name,int sum,Double avg){
        this.name=name;
        this.sum=sum;
        this.avg=avg;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    public double getAvg() {
        return avg;
    }

    public void setAvg(double avg) {
        this.avg = avg;
    }

    /** Renders the record as {@code name<TAB>sum<TAB>avg}; this is the text written to the result file. */
    @Override
    public String toString() {
        return this.name+"\t"+this.sum+"\t"+this.avg;
    }

    /** Serializes the fields in the order name, sum, avg. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(sum);
        out.writeDouble(avg);
    }

    /** Restores the fields in exactly the order {@link #write(DataOutput)} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name=in.readUTF();
        this.sum=in.readInt();
        this.avg=in.readDouble();
    }

    /**
     * Orders records by total score, highest first.
     *
     * @return a negative value when this record has the larger total, zero
     *         when the totals are equal, a positive value otherwise
     */
    @Override
    public int compareTo(StudentScore o) {
        // Integer.compare avoids the overflow a plain subtraction can produce.
        return Integer.compare(o.getSum(), this.getSum());
    }
}

2）主类 MyScoreMapReduce

/**
 * Job 1: computes every student's total and average score and writes the
 * students ordered by total score, highest first.
 *
 * <p>The ordering comes from using {@code StudentScore} as the map output
 * key; the reducer only swaps key and value back into output order.
 */
public class MyScoreMapReduce {

    /**
     * Parses one tab-separated input line (class, student id, name, Chinese,
     * math, English) and emits {@code (StudentScore, "class<TAB>studentId")}.
     */
    public static class MyMapper extends Mapper<LongWritable,Text,StudentScore,Text> {
        // Reused for every record instead of allocating a new Text per line.
        Text ovalue=new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line into its columns.
            String[] fields = value.toString().split("\t");
            // Columns 3..5 hold the three course scores as decimal strings.
            int chinese=Integer.parseInt(fields[3]);
            int math=Integer.parseInt(fields[4]);
            int english=Integer.parseInt(fields[5]);
            int sum=chinese+math+english;
            Double avg=(1.0)*sum/3;
            StudentScore ss=new StudentScore(fields[2],sum,avg);
            ovalue.set(fields[0]+"\t"+fields[1]);
            context.write(ss,ovalue);
        }
    }

    /**
     * Writes one output line per incoming value, as
     * {@code (class<TAB>studentId, StudentScore)}.
     */
    public static class MyReduce extends Reducer<StudentScore,Text,Text,StudentScore>{

        @Override
        protected void reduce(StudentScore key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Keys with the same total arrive in one call, so every value is written out.
            for(Text v: values){
                context.write(v,key);
            }
        }
    }

    /**
     * Configures and runs the job against {@code /student/shuju}, writing to
     * {@code /student/result01} (removed first if it already exists).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf=new Configuration();
        // Use the same property name as the other drivers in this tutorial;
        // the original "HADOOP_HOME_USER" is not the name they set.
        System.setProperty("HADOOP_USER_NAME","qyl");
        conf .set( "fs.defaultFS" , "hdfs://qyl02:9000" );
        try {
            Job job=Job.getInstance(conf);
            job.setJarByClass(MyScoreMapReduce.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReduce.class);

            job.setMapOutputKeyClass(StudentScore.class);
            job.setMapOutputValueClass(Text.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(StudentScore.class);

            Path inpath=new Path("/student/shuju");
            FileInputFormat.addInputPath(job,inpath);
            Path outpath=new Path("/student/result01");
            // Delete output left over from a previous run before the job starts.
            if(outpath.getFileSystem(conf).exists(outpath)){
                outpath.getFileSystem(conf).delete(outpath,true);
            }
            FileOutputFormat.setOutputPath(job,outpath);

            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

3）结果

只列举一下
1304	4021	张晓宸	293	97.66666666666667
1307	7007	刘俊辉	293	97.66666666666667
1307	7019	刘程望	291	97.0
1304	4054 	谭凌云	291	97.0
1305	5024	吴思妮	291	97.0
1306	6001 	张轩铭	291	97.0
1304	4028 	宇佳杨	290	96.66666666666667
1304	4027	张鑫	289	96.33333333333333
1304	4026 	胡量	289	96.33333333333333
1303	3003 	王凭	289	96.33333333333333
1303	3004 	唐翔	289	96.33333333333333

2.求每个班级每一门课程的平均分，不同班级的结果输出到不同的结果文件

思路：分区字段为班级

排序字段为班级和课程

分组字段为班级和课程

代码编写

1）自定义分区

/**
 * Routes records to reducers by class name so that each class ends up in its
 * own result file: 1303, 1304, 1305 and 1306 get slots 0 to 3, every other
 * class gets slot 4.
 */
public class MyPartition  extends Partitioner<MySort, IntWritable> {
    /**
     * @param key  composite key whose class name selects the partition
     * @param arg1 score value; not used for partitioning
     * @param arg2 number of partitions configured for the job
     * @return the partition index, always within {@code [0, arg2)}
     */
    @Override
    public int getPartition(MySort key, IntWritable arg1, int arg2) {
        int slot;
        switch (key.getClassname()) {
            case "1303":
                slot = 0;
                break;
            case "1304":
                slot = 1;
                break;
            case "1305":
                slot = 2;
                break;
            case "1306":
                slot = 3;
                break;
            default:
                slot = 4;
                break;
        }
        // With the intended five reducers this returns slot unchanged; with
        // fewer reducers it keeps the index in range instead of returning an
        // invalid partition number.
        return slot % arg2;
    }
}

2）自定义排序

/**
 * Composite key made of a class name and a course name.
 *
 * <p>Keys sort by class name first and by course name second, both in
 * natural {@link String} order.
 */
public class MySort implements WritableComparable<MySort> {
    private String classname;
    private String course;

    /** Creates an empty key; fields are set later by the setters or {@link #readFields(DataInput)}. */
    public MySort() {
        super();
    }

    /**
     * @param classname class identifier
     * @param course    course name
     */
    public MySort(String classname, String course) {
        this.classname = classname;
        this.course = course;
    }

    public String getClassname() {
        return classname;
    }

    public void setClassname(String classname) {
        this.classname = classname;
    }

    public String getCourse() {
        return course;
    }

    public void setCourse(String course) {
        this.course = course;
    }

    /** Serializes the class name followed by the course name. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(classname);
        out.writeUTF(course);
    }

    /** Reads the fields back in the order {@link #write(DataOutput)} wrote them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.classname = in.readUTF();
        this.course = in.readUTF();
    }

    /** Compares by class name, falling back to the course name when the class names match. */
    @Override
    public int compareTo(MySort o) {
        int byClass = this.getClassname().compareTo(o.getClassname());
        if (byClass != 0) {
            return byClass;
        }
        return this.getCourse().compareTo(o.getCourse());
    }

    /** Renders the key as {@code classname<TAB>course}. */
    @Override
    public String toString() {
        return classname+"\t"+course;
    }
}

3)自定义分组

/**
 * Grouping comparator for {@code MySort} keys: two keys belong to the same
 * reduce group only when both class name and course name are equal.
 */
public class MyGrouping extends WritableComparator {
    /** Registers {@code MySort} as the key type this comparator works on. */
    public MyGrouping(){
        super(MySort.class,true);
    }

    /**
     * Compares two {@code MySort} keys by class name, then by course name.
     *
     * @return zero when both fields match, otherwise the result of the first
     *         differing string comparison
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MySort left=(MySort)a;
        MySort right=(MySort)b;
        int byClass=left.getClassname().compareTo(right.getClassname());
        if(byClass!=0){
            return byClass;
        }
        return left.getCourse().compareTo(right.getCourse());
    }
}

4）编写主类 MyClassMapReduce

public class MyClassMapReduce {
    public static class MyMapper extends Mapper<LongWritable,Text,MySort,IntWritable>{
        MySort ms=new MySort();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            int score=0;
            ms.setClassname(fields[0]);
              for(int i=3;i<fields.length;i++){
                  if(i==3){
                  ms.setCourse("语文");
                  score=Integer.parseInt(fields[3]);
                  context.write(ms,new IntWritable(score));
                  }
                  if(i==4){
                      ms.setCourse("数学");
                      score=Integer.parseInt(fields[4]);
                      context.write(ms,new IntWritable(score));
                  }else{
                      ms.setCourse("英语");
                      score=Integer.parseInt(fields[5]);
                      context.write(ms,new IntWritable(score));
                  }
              }
        }
    }
    public static class MyReducer extends Reducer<MySort,IntWritable,MySort,Text>{
        @Override
        protected void reduce(MySort key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum=0;int count=0;
          for(IntWritable v: values){
              sum+=v.get();
              count++;
          }
          context.write(key,new Text(""+1.0*sum/count));
          System.out.println(key.toString()+"---------"+1.0*sum/count);


             }
    }
    public static void main(String[] args) {
        Configuration conf =new Configuration();
        System.setProperty("HADOOP_USER_NAME", "qyl");
        conf .set( "fs.defaultFS" , "hdfs://qyl02:9000" );
        try {
            Job job=Job.getInstance(conf);
            job.setJarByClass(MyClassMapReduce.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);

            job.setMapOutputKeyClass(MySort.class);
            job.setMapOutputValueClass(IntWritable.class);

            job.setOutputKeyClass(MySort.class);
            job.setOutputValueClass(Text.class);

            job.setPartitionerClass(MyPartition.class);
            job.setGroupingComparatorClass(MyGrouping.class);
            job.setNumReduceTasks(5);

            //指定需要统计的文件输入路径
            Path inpath=new Path("/student/shuju");
            FileInputFormat.addInputPath(job, inpath);

            //指定输出目录 输出路径不能存在，否则就会报错 默认是覆盖式的输出
            Path outpath=new Path("/student/result02");
            if(outpath.getFileSystem(conf).exists(outpath)){
                outpath.getFileSystem(conf).delete(outpath,true);
            }
            FileOutputFormat.setOutputPath(job, outpath);

            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

5）结果

列举一下
1303	数学	86.78181818181818
1303	英语	90.0
1303	语文	95.12727272727273

3.求每个班级的总分最高的前5个学生

思路：自定义排序字段为班级和总成绩

自定义分组字段为班级

1）自定义排序

/**
 * Composite key made of a class name and a student's total score.
 *
 * <p>Keys sort by class name ascending and, within one class, by total score
 * descending, so the best students of each class come first.
 */
public class MyClassAndScore  implements WritableComparable<MyClassAndScore> {
    private String classname;
    private int sum;

    /** Creates an empty key; fields are set later by the setters or {@link #readFields(DataInput)}. */
    public MyClassAndScore() {
        super();
    }

    /**
     * @param classname class identifier
     * @param sum       total score of one student
     */
    public MyClassAndScore(String classname, int sum) {
        this.classname = classname;
        this.sum = sum;
    }

    public String getClassname() {
        return classname;
    }

    public void setClassname(String classname) {
        this.classname = classname;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    /** Renders the key as {@code classname<TAB>sum}. */
    @Override
    public String toString() {
        return classname+"\t"+sum;
    }

    /** Serializes the class name followed by the total score. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(classname);
        out.writeInt(sum);
    }

    /** Reads the fields back in the order {@link #write(DataOutput)} wrote them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.classname=in.readUTF();
        this.sum=in.readInt();
    }

    /**
     * Compares by class name first; within the same class the higher total
     * score sorts first.
     */
    @Override
    public int compareTo(MyClassAndScore o) {
        int temp=this.getClassname().compareTo(o.getClassname());
        if(temp==0){
            // Integer.compare avoids the overflow a plain subtraction can produce.
            temp=Integer.compare(o.getSum(), this.getSum());
        }
        return temp;
    }
}

2）自定义分组

package com.qyl.lt.mapreduce.test03;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator for {@code MyClassAndScore} keys that looks at the
 * class name only, so all students of one class fall into the same reduce
 * group regardless of their total score.
 */
public class MyGroup2 extends WritableComparator {
    /** Registers {@code MyClassAndScore} as the key type this comparator works on. */
    public MyGroup2(){
        super(MyClassAndScore.class,true);
    }

    /**
     * Compares two {@code MyClassAndScore} keys by class name.
     *
     * @return the result of comparing the two class-name strings
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyClassAndScore left=(MyClassAndScore)a;
        MyClassAndScore right=(MyClassAndScore)b;
        String leftClass=left.getClassname();
        String rightClass=right.getClassname();
        return leftClass.compareTo(rightClass);
    }
}

3）编写主类 MyAllSource

/**
 * Job 3: lists the five students with the highest total score in each class.
 *
 * <p>Keys sort by class and descending total ({@code MyClassAndScore}) and
 * are grouped by class only ({@code MyGroup2}), so each reduce call sees one
 * class with its students already ordered from best to worst.
 */
public class MyAllSource {

    /**
     * Parses one tab-separated input line and emits
     * {@code ((class, total), "studentId<TAB>name")}.
     */
    public static class MyMapper extends Mapper<LongWritable,Text,MyClassAndScore,Text>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            // Columns 3..5 hold the three course scores as decimal strings.
            int chinese=Integer.parseInt(fields[3]);
            int math=Integer.parseInt(fields[4]);
            int english=Integer.parseInt(fields[5]);
            int sum=chinese+math+english;
            MyClassAndScore ms=new MyClassAndScore(fields[0],sum);
            context.write(ms,new Text(fields[1]+"\t"+fields[2]));
        }
    }

    /**
     * Writes the first {@link #TOP_N} values of each class group as
     * {@code (class, "studentId<TAB>name<TAB>total")}.
     */
    public static class MyReducer extends Reducer<MyClassAndScore,Text,Text,Text>{
        /** Number of students reported per class. */
        private static final int TOP_N = 5;

        @Override
        protected void reduce(MyClassAndScore key, Iterable<Text> values, Context  context) throws IOException, InterruptedException {
            int written=0;
            for(Text v:values){
                // Values arrive best-first, so nothing after the first TOP_N
                // is needed; stop instead of scanning the whole class.
                if(written>=TOP_N){
                    break;
                }
                // key.getSum() is read per value so each line carries that
                // student's total.
                context.write(new Text(key.getClassname()), new Text(v.toString() + "\t" + key.getSum()));
                written++;
            }
        }
    }

    /**
     * Configures and runs the job against {@code /student/shuju}, writing to
     * {@code /student/result03} (removed first if it already exists).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf =new Configuration();
        System.setProperty("HADOOP_USER_NAME", "qyl");
        conf .set( "fs.defaultFS" , "hdfs://qyl02:9000" );
        try {
            Job job=Job.getInstance(conf);
            job.setJarByClass(MyAllSource .class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);

            job.setMapOutputKeyClass(MyClassAndScore.class);
            job.setMapOutputValueClass(Text.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            job.setGroupingComparatorClass(MyGroup2.class);

            // Input path of the data to analyse.
            Path inpath=new Path("/student/shuju");
            FileInputFormat.addInputPath(job, inpath);

            // Output directory; remove any output left by a previous run
            // before the job starts.
            Path outpath=new Path("/student/result03");
            if(outpath.getFileSystem(conf).exists(outpath)){
                outpath.getFileSystem(conf).delete(outpath,true);
            }
            FileOutputFormat.setOutputPath(job, outpath);

            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

4)结果

1303	3001 	谢雨泽	289
1303	3003 	王凭	289
1303	3006 	钟英杰	289
1303	3010 	吕俊刚	289
1303	3013	曹娴瑶	289
1304	4021	张晓宸	293
1304	4054 	谭凌云	291
1304	4028 	宇佳杨	290
1304	4002 	罗斐丹	289
1304	4053 	雷磊	289
1305	5024	吴思妮	291
1305	5009 	冯志超	289
1305	5053	曹能兴鑫	289
1305	5052 	莫涟欢	289
1305	5046	马小雅	289
1306	6001 	张轩铭	291
1306	6017	唐昕	289
1306	6002 	邹明慧	289
1306	6043 	李君清	289
1306	6042 	罗天	289
1307	7007	刘俊辉	293
1307	7019	刘程望	291
1307	7006 	苏新兴	289
1307	7001 	邓思维	289
1307	7048 	吴芷馨	289