MapReduce进阶(2)---------------------数据倾斜的解决。

数据倾斜定义:数据倾斜是指大量相同的key被partition分配到同一个分区里,其他key的数据量不大,对应的task很快就完成了计算,而数据量特别大的那个key所在的task却迟迟运行不出结果,造成了“一个人累死,其他人闲死”的情况。


数据倾斜的解决
1.前面文章中论述的Combiner组件:利用map阶段的局部聚合来减轻reduce端的负担,但是需要注意的地方太多
2.常规有效的解决方案 --> 打散倾斜的key
整体思路:
(1) 在key的后面加上一个随机数,方便分配task的时候均匀分配
(2) 加上随机数的key会被均匀地分配到各个reduce task,计算出局部结果后,再作为输入传入第二个作业的map
(3) 在第二个map中将传入的key切割,还原成原来的key,再进行聚合
解决实例,还是以经典的词频分析为例
(1)第一个map

   public class SkewWordcount {

    public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    //这块主要是代码的优化,减少创建对象的空间
        Random random = new Random();
        Text k = new Text();
        IntWritable v = new IntWritable(1);
        int numReduceTasks = 0;

        @Override
        protected void setup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            numReduceTasks = context.getNumReduceTasks();//获得用来计算的ReduceTask
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String[] words = value.toString().split(" ");
            for (String w : words) {
//加上随机数

                k.set(w + "\001" + random.nextInt(numReduceTasks));             context.write(k, v);

            }

        }

    }

    public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();//分别计算每个task上的个数
            }
            v.set(count);
            context.write(key, v);
        }

    }

    public static void main(String[] args) throws Exception {


        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(SkewWordcount.class);

        job.setMapperClass(SkewWordcountMapper.class);
        job.setReducerClass(SkewWordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 设置maptask端的局部聚合逻辑类
        job.setCombinerClass(SkewWordcountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("f:/mrdata/wordcount/input"));
        FileOutputFormat.setOutputPath(job, new Path("f:/mrdata/wordcount/skew-out"));

        job.setNumReduceTasks(3);

        boolean res = job.waitForCompletion(true);
        System.exit(res?0:1);

    }


}

(2)第二个map

/**
 * Second stage of the skew-tolerant word count.
 *
 * <p>Reads text lines of the form {@code word + "\001" + salt + "\t" + count},
 * removes the salt so that all partial counts of one word share the same key
 * again, and sums them into the final count per word.
 */
public class SkewWordcount2 {

    /**
     * Parses one {@code saltedKey<TAB>count} line and emits {@code (word, count)}.
     */
    public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Reused across map() calls to avoid per-record allocation.
        Text k = new Text();
        IntWritable v = new IntWritable(1);

        /**
         * Restores the original word from the salted key and forwards its partial count.
         *
         * @param key     input key supplied by the framework (not used)
         * @param value   one line holding a salted key, a tab and a count
         * @param context used to emit the restored word and its partial count
         * @throws IOException if the line has no tab or the count is not an integer
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            // The count is the last tab-separated field; searching from the right keeps
            // the record parseable even if the key itself contains a tab.
            int tab = line.lastIndexOf('\t');
            if (tab < 0) {
                throw new IOException("Malformed input line (expected <key>\\t<count>): " + line);
            }
            try {
                v.set(Integer.parseInt(line.substring(tab + 1).trim()));
            } catch (NumberFormatException e) {
                throw new IOException("Malformed count in input line: " + line, e);
            }

            // The salt follows the last \001; cutting there (rather than at the first one)
            // keeps words that themselves contain \001 intact. A key without any
            // separator is passed through unchanged.
            String saltedKey = line.substring(0, tab);
            int sep = saltedKey.lastIndexOf('\001');
            k.set(sep >= 0 ? saltedKey.substring(0, sep) : saltedKey);

            context.write(k, v);

        }

    }

    /**
     * Sums the partial counts of one word. The driver also registers this class
     * as the combiner.
     */
    public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @param key     word with the salt already removed
         * @param values  partial counts for that word
         * @param context used to emit the total count
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // NOTE: the total is accumulated in an int and wraps beyond Integer.MAX_VALUE.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get(); // total for all records sharing this key
            }
            v.set(count);
            context.write(key, v);
        }

    }

    /**
     * Configures and runs the second-stage job, then exits with 0 on success
     * and 1 on failure.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(SkewWordcount2.class);

        job.setMapperClass(SkewWordcountMapper.class);
        job.setReducerClass(SkewWordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Use the reducer as the map-side local aggregation (combiner) class.
        job.setCombinerClass(SkewWordcountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("f:/mrdata/wordcount/skew-out"));
        FileOutputFormat.setOutputPath(job, new Path("f:/mrdata/wordcount/skew-out2"));

        job.setNumReduceTasks(3);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);

    }

}

猜你喜欢

转载自blog.csdn.net/qq_41166135/article/details/82117450
今日推荐