A Detailed Introduction to MapReduce (Part 2)


MapReduce in Practice

A Detailed Beginner Example - WordCount

Problem:
Count how many times each word appears in a file.
Approach:
1. In the map phase, map() is called once per input line. Split the line on spaces, emit each resulting word as the key with a value of 1, and pass the pairs on to the reduce phase.
2. In the reduce phase, the pairs received from the map phase are grouped by key, and reduce() is called once per group. Summing the values within a group yields that word's count; finally the group's key is written out as the key with the accumulated sum as the value. A small worked example follows.
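For instance, with a hypothetical two-line input file (the mapper below drops tokens shorter than two characters, so all sample words are longer):

// hypothetical input
hello tom hello bob
hello tom

// map output, one <word,1> pair per token
<hello,1> <tom,1> <hello,1> <bob,1>
<hello,1> <tom,1>

// reduce output, one tab-separated line per distinct word
bob	1
hello	3
tom	2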

public class WordCountMR2 extends Configured implements Tool {
    /**
     * KEYIN: by default, the byte offset at which the current line starts in the input; a Long,
     * but Hadoop ships its own leaner serialization types, so LongWritable is used instead of Long
     * VALUEIN: by default, the content of the current input line; a String, hence Text for the same reason
     * KEYOUT: the key of the user-defined map output, here the word; a String, hence Text
     * VALUEOUT: the value of the user-defined map output, here the word count; an Integer, hence IntWritable
     */
    public static class WCMapper extends Mapper<LongWritable,Text, Text, IntWritable> {
        /**
         * The map-phase business logic lives in this overridden map() method.
         * The map task calls map() once for every line of input.
         * context is the job context, used to emit the output pairs.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // tokenize on spaces, trim, drop tokens shorter than two characters, then emit <word,1>;
            // ExceptionConsumer wraps the checked exceptions thrown by context.write
            Collections.list(new StringTokenizer(value.toString()," ")).stream().map(s -> ((String)s).trim())
                    .filter(s -> s.length() > 1).forEach(ExceptionConsumer.of(word -> context.write(new Text(word),new IntWritable(1))));
        }
    }

    /**
     * KEYIN and VALUEIN correspond to the mapper's KEYOUT and VALUEOUT types
     * KEYOUT and VALUEOUT are the output types of the user-defined reduce logic:
     * KEYOUT is the word
     * VALUEOUT is its total count
     */
    public static class WCReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
        /**
         * The reduce-phase business logic lives in this overridden reduce() method.
         * The reduce task calls reduce() once per group of identical keys.
         * context is the job context, used to emit the output pairs.
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            //the output of the map phase is the input of the reduce phase, shaped like:
            //<hello,1><hello,1><hello,1><hello,1><hello,1><hello,1>
            //<tom,1><tom,1><tom,1>
            //<good,1>

//            int count = 0;
//            for (IntWritable value : values){
//                count += value.get();
//            }
//            context.write(key, new IntWritable(count));

            // stream equivalent of the commented loop above; unbox with get() before
            // aggregating, because Hadoop reuses one IntWritable instance across the
            // values iterator (collecting the objects themselves would give wrong counts)
            int count = StreamSupport.stream(values.spliterator(), false)
                    .mapToInt(IntWritable::get).sum();
            context.write(key, new IntWritable(count));
        }
    }
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        //create the Job instance
        Job job = Job.getInstance(conf,"test_fun_wordcount2");
        //set the local path of the jar containing this program
        job.setJarByClass(this.getClass());
        //set the Mapper/Reducer classes this job uses
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);
        //set the kv types of the mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        //set the kv types of the final output
        //note: there is no setReduceOutput*, because a job can be map-only and emit the map output directly
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //set the directory of the job's input files
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        //set the directory of the job's output
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        //set the number of reduce tasks
        job.setNumReduceTasks(1);
        //submit the job configuration, plus the jar containing the job's classes, to YARN for execution
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception{
        ToolRunner.run(new WordCountMR2(),args);
    }
}
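The ExceptionConsumer used in the mapper above is a custom helper that this post does not list. A minimal sketch of what it must look like, reconstructed purely from the of(...) call sites, is shown here; it adapts a lambda that throws checked exceptions (such as context.write) into a plain java.util.function.Consumer:

import java.util.function.Consumer;

@FunctionalInterface
public interface ExceptionConsumer<T> extends Consumer<T> {
    // the lambda body goes here and may throw checked exceptions
    void acceptThrows(T t) throws Exception;

    @Override
    default void accept(T t) {
        try {
            acceptThrows(t);
        } catch (Exception e) {
            // rethrow unchecked so the lambda satisfies the Consumer contract
            throw new RuntimeException(e);
        }
    }

    // factory used at the call sites, e.g. forEach(ExceptionConsumer.of(...))
    static <T> Consumer<T> of(ExceptionConsumer<T> consumer) {
        return consumer;
    }
}

Since every driver in this post reads its paths from conf.get("inpath") and conf.get("outpath"), and ToolRunner's GenericOptionsParser turns -D name=value options into Configuration entries, a job can be launched like this (jar name and paths are hypothetical):

hadoop jar mr-examples.jar WordCountMR2 -D inpath=/data/wc/in -D outpath=/data/wc/out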

Deduplication - DuplicateRemoveMR

Problem:
Remove all duplicate rows from a list, ignoring the order of values within a row.
Approach:
Split each line on the delimiter, re-sort the tokens, and join them back together as the key, with a NullWritable value, before handing them to the reduce phase. Since reduce emits each distinct key exactly once, the duplicates disappear, as the example below shows.
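For example, the two hypothetical input lines below normalize to the same sorted key, so only one line is emitted (note that the mapper also drops tokens shorter than two characters):

// input
tom,bob,alice
bob, alice ,tom

// output
alice,bob,tom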

public class DuplicateRemoveMR extends Configured implements Tool {
    public static class DRMapper extends Mapper<LongWritable,Text, Text, NullWritable>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // split on commas, trim, drop tokens shorter than two characters,
            // then sort and re-join: the normalized line becomes the key
            String str = Collections.list(new StringTokenizer(value.toString(), ",")).stream()
                    .map(s -> ((String) s).trim()).filter(s -> s.length() > 1).sorted()
                    .collect(Collectors.joining(","));
            context.write(new Text(str), NullWritable.get());
        }
    }

    public static class DRReducer extends Reducer<Text,NullWritable,Text,NullWritable>{
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key,NullWritable.get());
        }
    }
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf,"dup_remove_xj");
        job.setJarByClass(DuplicateRemoveMR.class);

        job.setMapperClass(DRMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(DRReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        job.setNumReduceTasks(1);
        return job.waitForCompletion(true)? 0 : 1;
    }
    public static void main(String[] args) throws Exception{
        ToolRunner.run(new DuplicateRemoveMR(),args);
    }
}

Inverted Index - InvertIndexMR

Problem:
Count how often each word occurs in each of several files, and also report which files the word appears in.
Approach:
Split every input line on the delimiter and emit each word as the key with the current file name as the value. In the reduce phase, counting how often each file name occurs under a key gives the per-file word count; the counts are then formatted and joined, and the word is written as the key with the concatenated file:count entries as the value, as illustrated below.
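With two hypothetical input files a.txt and b.txt, each output line is a word followed by file:count pairs (the order of the pairs depends on the grouping map and may vary):

hello	a.txt:3 b.txt:1
spark	b.txt:2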

public class InvertIndexMR extends Configured implements Tool {

    public static class IIMapper extends Mapper<LongWritable,Text, Text, Text> {
        Text file = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context){
            // ExceptionConsumer is the custom exception-wrapping helper sketched earlier; a plain try/catch works too
            Collections.list(new StringTokenizer(value.toString()," ")).stream().map(s -> ((String)s).trim())
                    .filter(s -> s.length() > 1).forEach(ExceptionConsumer.of(name -> context.write(new Text(name),file)));
        }

        // setup() runs once before any map() call
        @Override
        protected void setup(Context context){
            String name = ((FileSplit) context.getInputSplit()).getPath().getName();
            file.set(name);
        }
    }

    public static class IIReducer extends Reducer<Text,Text,Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // StreamSupport.stream(values.spliterator(), false) turns the Iterable into a Stream
            String str = StreamSupport.stream(values.spliterator(), false)
                    .collect(Collectors.groupingBy(Text::toString, Collectors.counting())).entrySet().stream()
                    .map(en -> en.getKey() + ":" + en.getValue()).collect(Collectors.joining(" "));
            context.write(key,new Text(str));
        }
    }
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "invert_index_xj");
        job.setJarByClass(InvertIndexMR.class);

        job.setMapperClass(IIMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(IIReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        job.setNumReduceTasks(1);
        return job.waitForCompletion(true)? 0 : 1;
    }
    public static void main(String[] args) throws Exception{
        ToolRunner.run(new InvertIndexMR(),args);
    }
}

Co-occurrence Matrix - ConcurrenceMR

Problem:
Count how many times each pair of people co-occur as friends. For example, if A's friend list contains 1 and 2, and B's friend list also contains 1 and 2, then the pair (1,2) co-occurs twice; the higher the co-occurrence count, the more likely the two are related.
Approach:
Step one: output each person's complete friend list. Step two: in the map phase, generate every sorted two-way combination of each person's friends and emit each combination as the key with a value of 1; in the reduce phase, summing the values of identical keys gives the number of common friends for that pair. See the expansion example after this paragraph.
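For example, one hypothetical line of step-1 output expands in step 2 as follows:

// step 1 output: person TAB comma-joined friend list
tom	alice,bob,carol

// step 2 map output: a <pair,1> record per sorted two-way combination
<"alice,bob", 1>  <"alice,carol", 1>  <"bob,carol", 1>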
Step 1: FlatFriendsMR

public class FlatFriendsMR extends Configured implements Tool{
    static class FFMapper extends Mapper<LongWritable,Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each input line is expected to be "person,friend"; emit <person, friend>
            Stream.of(value.toString()).filter(s->s.length()>1).map(line->line.split(","))
                    .filter(arr->arr.length==2).forEach(ExceptionConsumer.of(arr->context
                    .write(new Text(arr[0].trim()),new Text(arr[1].trim()))));
        }
    }

    static class FFReducer extends Reducer<Text,Text,Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String fs = StreamSupport.stream(values.spliterator(), false).map(s -> s.toString())
                    .collect(Collectors.joining(","));
            context.write(key,new Text(fs));
        }
    }
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf,"flat_friends_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(FFMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FFReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        job.setNumReduceTasks(1);
        return job.waitForCompletion(true)? 0 : 1;
    }
    public static void main(String[] args) throws Exception{
        ToolRunner.run(new FlatFriendsMR(),args);
    }
}

Step 2: ConcurrenceMR

public class ConcurrenceMR extends Configured implements Tool{
    static class CCMapper extends Mapper<LongWritable,Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String s = value.toString();
            String[] arr = s.split("\t");
            String[] names = arr[1].split(",");
            // emit every two-way combination of the friend list
            for (int i = 0; i < names.length-1; i++){
                for (int j = i+1; j < names.length; j++){
                    String first = names[i];
                    String second = names[j];
                    String pair = getPair(first,second);
                    context.write(new Text(pair),new IntWritable(1));
                }
            }
        }

        /**
         * Order the two names so that (a,b) and (b,a) yield the same key
         * @param first
         * @param second
         * @return the pair joined in ascending order
         */
        public String getPair(String first,String second){
            if(first.compareTo(second) > 0){
                return second+","+first;
            }else{
                return first+","+second;
            }
        }
    }

    static class CCReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // sum the 1s emitted for this friend pair; the input value type is
            // IntWritable, matching the mapper's output
            int count = StreamSupport.stream(values.spliterator(), false)
                    .mapToInt(IntWritable::get).sum();
            context.write(key,new IntWritable(count));
        }
    }
    }
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf,"concurrence_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(CCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(CCReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        job.setNumReduceTasks(1);
        return job.waitForCompletion(true)? 0 : 1;
    }

    public static void main(String[] args) throws Exception{
        ToolRunner.run(new ConcurrenceMR(),args);
    }
}

Sorting in MapReduce

Partition-Local Sort - PartitionSortMR

Problem:
Sort all records by temperature within each partition; no ordering holds between partitions, only inside each one.
Approach:
This is the framework's default behavior: the shuffle sorts map output by key within every partition, so it suffices to make the temperature the key. With five reducers, each output file part-r-0000x is internally sorted, but the files do not form one global order.

public class PartitionSortMR extends Configured implements Tool {
    public static class PSMapper extends Mapper<LongWritable, Text, DoubleWritable, Text>{
        // make the temperature the key and the whole line the value
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] ss = line.split("\t");
            String tmp = ss[2];
            context.write(new DoubleWritable(Double.parseDouble(tmp)),value);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf,"part_sort_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(PSMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        //-D mapreduce.job.reduces
        job.setNumReduceTasks(5);
        return job.waitForCompletion(true)? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new PartitionSortMR(),args);
    }
}
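As the -D mapreduce.job.reduces comment hints, the reduce count could also be supplied at launch time instead of hard-coding setNumReduceTasks(5); dropping that call, a hypothetical invocation would be:

hadoop jar mr-examples.jar PartitionSortMR -D mapreduce.job.reduces=5 -D inpath=/data/tmp/in -D outpath=/data/tmp/out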

Total Sort - TotalSortMR

Problem:
Sort all records by temperature, with the ordering holding across partitions as well.
Approach:
Configure sample-based range partitioning. The sampler requires that the key type be consistent end to end, so the default text input format cannot be used directly; either modify the InputFormat or use a SequenceFile, which stores the data's types alongside the data. This example takes the SequenceFile route: first convert the input to a SequenceFile, then read it back from the SequenceFile for the range-partitioned sort.
OutSequenceMR

public class OutSequenceMR extends Configured implements Tool {
    public static class OSMapper extends Mapper<LongWritable, Text, DoubleWritable, Text>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] ss = line.split("\t");
            String tmp = ss[2];
            context.write(new DoubleWritable(Double.parseDouble(tmp)),value);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "out_sequence_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(OSMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        SequenceFileOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));
        //-D mapreduce.job.reduces
        //job.setNumReduceTasks(1);
        return job.waitForCompletion(true)? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new OutSequenceMR(),args);
    }
}
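The two jobs run back to back: the SequenceFile written by OutSequenceMR becomes the input of the TotalSortMR job shown next. A hypothetical sequence of launches:

hadoop jar mr-examples.jar OutSequenceMR -D inpath=/data/tmp/text -D outpath=/data/tmp/seq
hadoop jar mr-examples.jar TotalSortMR -D inpath=/data/tmp/seq -D outpath=/data/tmp/sorted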

TotalSortMR

public class TotalSortMR extends Configured implements Tool {
    public static class TSMapper extends Mapper<DoubleWritable, Text, DoubleWritable, Text>{
        @Override
        protected void map(DoubleWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "total_sort_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(TSMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        SequenceFileInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));

        // partition by sampled key ranges
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // draw a random sample of the keys:
        // 0.8  - the probability with which each record is selected
        // 1000 - the maximum total number of samples to draw
        // 10   - the maximum number of input splits to sample
        InputSampler.RandomSampler<DoubleWritable,Text> sam = new InputSampler.RandomSampler<>(0.8,1000,10);
        // compute the partition boundaries from the sample and write them to the
        // partition file; this uses the job's current reduce count, so set the
        // number of reducers before calling writePartitionFile
        InputSampler.writePartitionFile(job,sam);
        String file = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
        job.addCacheFile(URI.create(file));
//        job.setNumReduceTasks(5);
        return job.waitForCompletion(true)? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new TotalSortMR(),args);
    }
}

Secondary Sort - SecondarySortMR

Problem:
Sort all records by year ascending and, within a year, by temperature descending.
Approach:
Secondary sort needs a composite type to serve as the sort key. This composite type implements the WritableComparable interface, holds the year and temperature fields, and overrides compareTo() to order by year ascending and temperature descending. In addition, records from the same year must land in the same partition so that their temperatures can be compared, which requires a class extending the abstract Partitioner class and overriding getPartition() to partition by year. Grouping must also be configured by year alone, so another class extends WritableComparator and overrides compare() to place all records of one year into the same reduce group; because the sort order within a year is temperature-descending, each group's values reach the reducer already sorted by temperature. Finally, the driver uses the composite type as the map output key type and registers the custom partitioner and grouping comparator on the job.
YearTmp (the composite key type)

public class YearTmp implements WritableComparable<YearTmp> {
    private IntWritable year = new IntWritable(); // year
    private DoubleWritable tmp = new DoubleWritable(); // average temperature

    public YearTmp() {
    }

    public YearTmp(IntWritable year, DoubleWritable tmp) {
        this.year = new IntWritable(year.get());
        this.tmp = new DoubleWritable(tmp.get());
    }

    public YearTmp(int year, double tmp) {
        this.year = new IntWritable(year);
        this.tmp = new DoubleWritable(tmp);
    }

    public IntWritable getYear() {
        return year;
    }

    public void setYear(IntWritable year) {
        this.year = new IntWritable(year.get());
    }

    public DoubleWritable getTmp() {
        return tmp;
    }

    public void setTmp(DoubleWritable tmp) {
        this.tmp = new DoubleWritable(tmp.get());
    }

    // the sort order: year ascending, temperature descending
    @Override
    public int compareTo(YearTmp o) {
        return this.year.compareTo(o.year)==0 ? o.tmp.compareTo(this.tmp): this.year.compareTo(o.year);
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        year.write(dataOutput);
        tmp.write(dataOutput);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        year.readFields(dataInput);
        tmp.readFields(dataInput);
    }
}

YearPartitioner (custom partitioning rule)

public class YearPartitioner extends Partitioner<YearTmp, Text> {
    public YearPartitioner() {
    }
    @Override
    public int getPartition(YearTmp key, Text value, int numPartitions) {
        // route all records from the same year to the same reducer
        return key.getYear().get() % numPartitions;
    }
}

YearGroupComparator (custom grouping rule)

public class YearGroupComparator extends WritableComparator {
    public YearGroupComparator() {
        // true: instantiate YearTmp keys so compare() below receives real objects
        super(YearTmp.class,true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // group by year only; the temperature is deliberately ignored here
        YearTmp y1 = (YearTmp)a;
        YearTmp y2 = (YearTmp)b;
        return y1.getYear().compareTo(y2.getYear());
    }
}

SecondarySortMR (the driver)

public class SecondarySortMR extends Configured implements Tool {
    public static class SSMapper extends Mapper<LongWritable, Text, YearTmp,Text>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] infos = line.split("\t");
            YearTmp yt = new YearTmp(Integer.parseInt(infos[0]), Double.parseDouble(infos[2]));
            context.write(yt,new Text(infos[1]));
        }
    }
    public static class SSReducer extends Reducer<YearTmp,Text,Text,Text>{
        @Override
        protected void reduce(YearTmp key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // with the grouping comparator, one group spans a whole year; the key object
            // is updated as the iterator advances, so key.getTmp() reflects each record
            for (Text value : values) {
                String str = key.getYear() + "\t" + key.getTmp();
                context.write(new Text(str),value);
            }
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "secondary_sort_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(SSMapper.class);
        job.setMapOutputKeyClass(YearTmp.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(SSReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath")));

        // register the custom partitioner
        job.setPartitionerClass(YearPartitioner.class);
        // register the custom grouping comparator
        job.setGroupingComparatorClass(YearGroupComparator.class);

        return job.waitForCompletion(true)? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new SecondarySortMR(),args);
    }
}
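Assuming the input columns are year, station id, and average temperature (tab-separated; only the year and temperature columns are actually interpreted by the code), a hypothetical run looks like:

// input
1949	station1	20.5
1949	station2	30.1
1950	station1	25.0

// output: years ascending, temperatures within a year descending
1949	30.1	station2
1949	20.5	station1
1950	25.0	station1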

Source: blog.csdn.net/a1135497143/article/details/82346673