MapReduce练习之二次排序

0. 运行环境

idea+hadoop 2.9.0 本地调试
关于idea上hadoop的配置,见前文
https://blog.csdn.net/wxfghy/article/details/80521577
输入文件格式如下,每行为"人名 分数",以空格分隔。输出按人名分区,分区内按分数升序排序;也可以不分区,此时先按人名、再按分数排序
刘备 15
关羽 60
张飞 8
刘备 75
关羽 65
张飞 98
刘备 55
刘备 23
关羽 85
张飞 67
张飞 58
输出按姓名分为3个文件保存,每行格式为"人名 分数",文件内按分数升序排列。以刘备对应的文件为例:
刘备 15
刘备 23
刘备 55
刘备 75

1. 主方法

/**
 * Configures the secondary-sort job, runs it, and terminates the JVM with
 * exit status 0 when the job succeeds or 1 when it fails.
 *
 * @param args command-line arguments (not used)
 * @throws IOException            propagated from job setup or execution
 * @throws ClassNotFoundException propagated from {@code waitForCompletion}
 * @throws InterruptedException   propagated from {@code waitForCompletion}
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    final Job sortJob = Job.getInstance(new Configuration());
    // SecondSort is the class that contains this main method.
    sortJob.setJarByClass(SecondSort.class);

    // Map side: emits <composite key, score>.
    sortJob.setMapperClass(SeMaper.class);
    sortJob.setMapOutputKeyClass(SeKey.class);
    sortJob.setMapOutputValueClass(IntWritable.class);

    // Shuffle: custom partitioner feeding three reduce tasks.
    sortJob.setPartitionerClass(SePart.class);
    sortJob.setNumReduceTasks(3);

    // Reduce side: emits <name, score> as text.
    sortJob.setReducerClass(SeReduceer.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(Text.class);

    // Input file and output directory.
    FileInputFormat.addInputPath(sortJob, new Path("d:\\mr\\ssort.txt"));
    FileOutputFormat.setOutputPath(sortJob, new Path("d:\\mr\\output"));

    final boolean succeeded = sortJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}

2. map

//输入文件为<偏移量,当前行字符串>,输出为<组合键类,分数>
/**
 * Mapper for the secondary sort.
 *
 * <p>Input is {@code <byte offset, line text>}; each line is expected to hold a
 * name and an integer score separated by whitespace. Output is
 * {@code <composite key "name,score", score>}.
 */
static class SeMaper extends Mapper<LongWritable,Text,SeKey,IntWritable>{
    // Reused across map() calls to avoid allocating one object per record.
    private final SeKey sekey=new SeKey();
    private final IntWritable svalue=new IntWritable();

    /**
     * Parses one input line and emits its composite key and score.
     * Lines with fewer than two whitespace-separated fields are skipped.
     *
     * @param key     offset of the line in the input (unused)
     * @param value   the text of the current line
     * @param context sink for the emitted pair
     * @throws NumberFormatException if the second field is not a valid integer
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("二次排序map");
        // Trim first so leading/trailing whitespace does not create empty tokens.
        String line = value.toString().trim();
        // Split on runs of whitespace so repeated spaces or tabs are tolerated.
        String[] fields = line.split("\\s+");
        // Both the name and the score are required; skip blank or incomplete lines.
        if (fields.length < 2) {
            return;
        }
        // The composite key carries "name,score" so the shuffle sorts by both.
        sekey.setMkey(fields[0] + "," + fields[1]);
        // The score is also emitted as the map output value.
        svalue.set(Integer.parseInt(fields[1]));
        context.write(sekey, svalue);
    }
}

3. 组合键类SeKey,实现WritableComparable接口

/**
 * Composite map-output key holding a single string of the form
 * {@code "name,score"}.
 *
 * <p>Ordering is by name first, then by the numeric value of the score.
 */
static class SeKey implements Writable,WritableComparable<SeKey>{
    // The composite key text, "name,score".
    private String mkey;

    /** @return the composite key text */
    public String getMkey() {
        return mkey;
    }

    /** @param mkey the composite key text, expected as "name,score" */
    public void setMkey(String mkey) {
        this.mkey = mkey;
    }

    /**
     * Compares by name, then by score as an integer (so 8 sorts before 15).
     *
     * @param o the other key
     * @return negative, zero, or positive per the {@code Comparable} contract
     * @throws NumberFormatException if either score part is not a valid integer
     */
    @Override
    public int compareTo(SeKey o) {
        String[] mine = this.mkey.split(",");
        String[] theirs = o.getMkey().split(",");
        // Primary order: the name part.
        int res = mine[0].compareTo(theirs[0]);
        if (res == 0) {
            // Secondary order: numeric score, compared without boxing.
            res = Integer.compare(Integer.parseInt(mine[1]), Integer.parseInt(theirs[1]));
        }
        return res;
    }

    /** Serializes the composite key text with {@code writeUTF}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.mkey);
    }

    /** Restores the composite key text with {@code readUTF}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.mkey = in.readUTF();
    }
}

4. 自定义分区类SePart,继承Partitioner

/**
 * Partitioner that routes map output by the name part of the composite key:
 * "刘备" to slot 0, "关羽" to slot 1, every other name to slot 2.
 */
static class SePart extends Partitioner<SeKey,IntWritable>{
    /**
     * Chooses the partition for one map output record.
     *
     * @param seKey         composite key of the form "name,score"
     * @param intWritable   the score value (unused)
     * @param numPartitions number of partitions passed in by the framework
     * @return the slot for the name, wrapped into {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(SeKey seKey, IntWritable intWritable, int numPartitions) {
        // Extract the name once instead of splitting the key per comparison.
        String name = seKey.getMkey().split(",")[0];
        int slot;
        if ("刘备".equals(name)) {
            slot = 0;
        } else if ("关羽".equals(name)) {
            slot = 1;
        } else {
            slot = 2;
        }
        // Wrap the slot so it stays in range when fewer than three partitions
        // are configured; with three or more the result is unchanged.
        return numPartitions > 0 ? slot % numPartitions : slot;
    }
}

5. reduce

/**
 * Reducer that writes one {@code <name, score>} text record per incoming value.
 */
static class SeReduceer extends Reducer<SeKey,IntWritable,Text,Text>{
    // Reused output holders.
    private final Text rkey=new Text();
    private final Text rvalue=new Text();

    /**
     * Emits the name part of the composite key together with each score in
     * the group, one output record per score.
     *
     * @param skey    composite key of the form "name,score"
     * @param iter    the scores grouped under this key
     * @param context sink for the emitted records
     */
    @Override
    protected void reduce(SeKey skey, Iterable<IntWritable> iter, Context context) throws IOException, InterruptedException {
        System.out.println("二次排序reduce");
        // Only the name part of "name,score" is written as the output key.
        rkey.set(skey.getMkey().split(",")[0]);
        // Write each score as its own record; concatenating them would fuse
        // duplicate records into a single number (e.g. 15 and 15 -> "1515").
        for (IntWritable score : iter) {
            rvalue.set(String.valueOf(score.get()));
            context.write(rkey, rvalue);
        }
    }
}

MapReduce练习之二次排序

0. 运行环境

1. 主方法

2. map

3. 组合键类SeKey,实现WritableComparable接口

4. 自定义分区类SePart,继承Partitioner

5. reduce

猜你喜欢