MapReduce Distributed Computing (4)

File Join

Requirement

order.txt

order011 u001
order012 u001
order033 u005
order034 u002
order055 u003
order066 u004
order077 u010

user.txt
u001,hangge,18,male,angelababy
u002,huihui,58,female,ruhua
u003,guanyu,16,male,chunge
u004,laoduan,38,male,angelababy
u005,nana,24,femal,huangbo
u006,xingge,18,male,laoduan


Expected result (order077 has no matching user, so its user fields come out as null):
u001,hangge,18,male,angelababy,order012
u001,hangge,18,male,angelababy,order011
u002,huihui,58,female,ruhua,order034
u003,guanyu,16,male,chunge,order055
u004,laoduan,38,male,angelababy,order066
u005,nana,24,femal,huangbo,order033
null,order077


Code Implementation

This is a reduce-side join: the mapper keys every record by uid (taken from either file), and the reducer pairs each user record with that user's orders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class JoinDemo {
    private static class  JoinMapper extends Mapper<LongWritable, Text,Text,Text>{
        private String fileName;

        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
          
          //Get the name of the file this split came from
            FileSplit f =  (FileSplit)context.getInputSplit();
            Path path = f.getPath();
            fileName = path.getName();
        }

        Text t = new Text();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
             //user file: the uid is the first comma-separated field
             if (fileName.startsWith("user")){
                 String uid = value.toString().split(",")[0];
                 t.set(uid);
             }else{
                 //order file: the uid is the second whitespace-separated field
                 String uid = value.toString().split("\\s+")[1];
                 t.set(uid);
             }
             context.write(t,value);
        }
    }

    private static class JoinReducer extends Reducer<Text,Text,Text, NullWritable>{

        /*
            key:    u001
            values: {order011 u001, order012 u001, u001,hangge,18,male,angelababy}
         */

        Text k3 = new Text();
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {

            String user = null;
            List<String> orderList = new ArrayList<>();
            //A value containing commas is the user record; anything else is an order line
            for (Text value : values) {
                if(value.toString().contains(",")){
                    user = value.toString();
                }else{
                    orderList.add(value.toString().split("\\s+")[0]);
                }
            }

            //Emit one "user,orderId" line per order of this user
            for (String s : orderList) {
                k3.set(user+","+s);
                context.write(k3,NullWritable.get());
            }


        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        //Create the job
        Job job = Job.getInstance(conf, "join");
        //Set the Mapper class
        job.setMapperClass(JoinMapper.class);
        //Set the Reducer class
        job.setReducerClass(JoinReducer.class);
        //Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        //Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        //Set the input path
        FileInputFormat.setInputPaths(job,new Path("d:\\work\\abc\\input"));
        //Set the output path
        FileOutputFormat.setOutputPath(job,new Path("d:\\work\\abc\\out_res"));

        //Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}

Word Frequency Across Multiple Files

Requirement

a.html
hello world
hello lucy
hello jack
hello liuyan

b.html
hello aaa
aaa bbb
bbb ccc
hello liuyan 
liuyan  tangyan

c.html
world hello 
liuyan tangyan
tangyan aaa
bbb	ccc


Count how many times each word appears in each file; for each word, list its per-file counts in descending order:
aaa	b.html-2 c.html-1 
bbb	b.html-2 c.html-1 
ccc	b.html-1 
hello	a.html-4 b.html-2 c.html-1 
jack	a.html-1 
liuyan	b.html-2 c.html-1 
lucy	a.html-1 
tangyan	c.html-2 b.html-1 
world	a.html-1 

Code Implementation

This takes two MR jobs. The first job counts occurrences of each word-file pair, producing lines such as "hello-a.html 4":

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Index01 {

    private static class Index01Mapper extends Mapper<LongWritable, Text,Text,LongWritable>{

        String fileName;

        @Override
        protected void setup(Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //Get the name of the file this split came from
            FileSplit in = (FileSplit) context.getInputSplit();
            fileName = in.getPath().getName();
        }

        Text k2 = new Text();
        LongWritable v2 = new LongWritable();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {

            String[] split = value.toString().split("\\s+");

            for (String s : split) {
                k2.set(s + "-" + fileName);
                v2.set(1);

                context.write(k2,v2 );
            }

        }
    }

    private static  class Index01Reduce extends Reducer<Text,LongWritable,Text,LongWritable>{

        LongWritable v3 = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //e.g. key: hello-a.html  values: <1,1,1,1>

            long sum = 0;
            for (LongWritable value : values) {
                sum+=value.get();
            }

            v3.set(sum);


            context.write(key,v3);

        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        //Create the job
        Job job = Job.getInstance(conf, "index01");
        //Set the Mapper class
        job.setMapperClass(Index01.Index01Mapper.class);
        //Set the Reducer class
        job.setReducerClass(Index01.Index01Reduce.class);
        //Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        //Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);


        //Set the input path
        FileInputFormat.setInputPaths(job,new Path("d:\\work\\abc\\input"));
        //Set the output path
        FileOutputFormat.setOutputPath(job,new Path("d:\\work\\abc\\out_put3"));

        //Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}

The second job reads the first job's output, splits each line on "-", groups by the word, and concatenates the per-file counts in descending order of count:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

public class Index02Demo {
    private static class  Index02Mapper extends Mapper<LongWritable, Text,Text,Text>{

        Text k2 = new Text();
        Text v2 = new Text();
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            //Input line from the first job looks like "hello-a.html<TAB>4"
            String s = value.toString();
            String[] split = s.split("-");

            //key: the word
            k2.set(split[0]);

            //value: "a.html-4" (turn the tab between file name and count into "-")
            v2.set(split[1].replaceAll("\\s+","-"));
            context.write(k2,v2);
        }
    }

    private static class Index02Reducer extends Reducer<Text,Text,Text, NullWritable>{

        Text k3 = new Text();
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {

            ArrayList<String> list = new ArrayList<>();

            for (Text value : values) {
               list.add(value.toString());
            }

            //Sort the "file-count" entries by count in descending order
            //(parse the count as a number so that e.g. 10 sorts above 9)
            Collections.sort(list, new Comparator<String>() {
                @Override
                public int compare(String s1, String s2) {
                    int c1 = Integer.parseInt(s1.split("-")[1]);
                    int c2 = Integer.parseInt(s2.split("-")[1]);
                    return Integer.compare(c2, c1);
                }
            });

            //Concatenate: word file1-count1 file2-count2 ...
            StringBuilder sb = new StringBuilder(key.toString());
            for (String s : list) {
                sb.append(" ").append(s);
            }
            k3.set(sb.toString());

            context.write(k3,NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "index02");

        //Set the Mapper class
        job.setMapperClass(Index02Mapper.class);
        //Set the Reducer class
        job.setReducerClass(Index02Reducer.class);
        //Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        //Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);



        //Set the input path to the first job's output
        FileInputFormat.setInputPaths(job,new Path("d:\\work\\abc\\out_put3"));
        //Set the output path
        FileOutputFormat.setOutputPath(job,new Path("d:\\work\\abc\\out_put4"));

        //Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}

Data Skew

Data skew is the most common problem you will run into in distributed computing.
Symptom: after a job is submitted, most of its tasks finish quickly, but one task keeps running and never seems to end, so the overall progress hangs at 99% or 100%. When this happens you can be fairly sure the job has a data-skew problem.
Cause: the data is distributed unevenly across tasks.
Solution 1

Consider using a Combiner so that data is pre-aggregated on the map side before it is shuffled to the reducers (see the sketch below).
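
As a minimal sketch of what that looks like (assuming the usual word-count WordCountMapper and WordCountReducer with Text keys and IntWritable counts, as used later in this article), enabling a Combiner is essentially a one-line change in the driver. Because summing counts is associative and commutative, the Reducer class can be reused as the Combiner:

        //Sketch only: WordCountMapper/WordCountReducer are the ordinary
        //word-count classes, not new code introduced here
        Job job = Job.getInstance(conf, "wordcount-combiner");
        job.setMapperClass(WordCountMapper.class);
        //Each map task sums its own counts locally before the shuffle,
        //so far less data for the hot key is sent to a single reducer
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

A Combiner only helps when the reduce logic can be applied to partial results (sums, counts, max). It shrinks the data shuffled for the hot key, but all records for that key still end up on one reducer, which is why Solution 2 goes further and changes the key itself.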

Solution 2

Data skew is mainly caused by keys being distributed unevenly. Since the default partitioner hashes the key, we can append a random number to each key to change its hash value,
so that the hot key's records are spread roughly evenly across the reducers. This approach needs two MR jobs to arrive at the final result.

For example, if "hello" is by far the most frequent word: with 2 reduce tasks we can emit hello-0 and hello-1;
with 3 reduce tasks, hello-0, hello-1 and hello-2.
This changes the hash of the hot key so that its records are distributed evenly (a sketch of the partitioning logic is shown below).
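
Why does changing the key help? The snippet below is a simplified sketch of how Hadoop's default HashPartitioner assigns a key to a reduce task (plain String is used here instead of Text for brevity; it mirrors, but is not copied from, the real partitioner):

public class PartitionSketch {

    //All records with the same key go to the same reduce task, so a hot key
    //such as "hello" piles up on a single reducer.
    static int partitionFor(String key, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }

    public static void main(String[] args) {
        //The salted keys hash differently, so records that used to pile up
        //under "hello" are now spread over the reduce tasks.
        System.out.println(partitionFor("hello-0", 2));
        System.out.println(partitionFor("hello-1", 2));
    }
}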
Sample input word.txt:

hello world
hello hadoop
hello 51doit
hadoop mapreduce
mapreduce spark

The first MR job appends a random suffix (based on the number of reduce tasks) to each word:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Random;

public class Test {

    public static class WordCountMapper  extends Mapper<LongWritable, Text,Text, IntWritable>{

        Text k2 = new Text();
        IntWritable v2 = new IntWritable();
        Random random = new Random();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {

            String[] arr = value.toString().split("\\s+");

            //Get the number of reduce tasks
            int numReduceTasks = context.getNumReduceTasks();

            //Append a random suffix in [0, numReduceTasks) to each word,
            //so that a hot key is spread across all reduce tasks
            for (String s : arr) {
                k2.set(s + "-" + random.nextInt(numReduceTasks));
                v2.set(1);

                context.write(k2,v2);
            }
        }
    }

    public static class WordCountReducer extends  Reducer<Text,IntWritable,Text,IntWritable>{

        IntWritable v3 = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum+=value.get();
            }

            v3.set(sum);

            context.write(key,v3);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Configuration conf = new Configuration();


        //Create the job
        Job job = Job.getInstance(conf, "wordcount");
        job.setNumReduceTasks(2);

        //Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        //Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //Set the input path
        FileInputFormat.setInputPaths(job,new Path("d:\\work\\word.txt"));
        //Set the output path
        FileOutputFormat.setOutputPath(job,new Path("d:\\work\\abc\\out_res5"));

        //Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}

Output of the first job (with 2 reduce tasks, the salted keys are split across two output files):

51doit-0	1
hadoop-1	1
hello-0	1
mapreduce-0	1
spark-1	1


hadoop-0	1
hello-1	2
mapreduce-1	1
world-1	1

The second MR job reads the first job's output, strips the random suffix from each key, and aggregates the real counts:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URISyntaxException;

public class Test02 {
    public static class WordCountMapper  extends Mapper<LongWritable, Text,Text, IntWritable> {

        Text k2 = new Text();
        IntWritable v2 = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {

            //Input line looks like "hello-0<TAB>2": salted key, tab, partial count
            String[] arr = value.toString().split("\\s+");
            //Strip the "-<random>" suffix to recover the original word
            String word = arr[0].split("-")[0];

            k2.set(word);
            v2.set(Integer.parseInt(arr[1]));

            context.write(k2,v2);
        }
    }

    public static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

        IntWritable v3 = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum+=value.get();
            }

            v3.set(sum);

            context.write(key,v3);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Configuration conf = new Configuration();


        //Create the job
        Job job = Job.getInstance(conf, "wordcount2");
        job.setNumReduceTasks(2);

        //Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        //Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //Set the input path to the first job's output
        FileInputFormat.setInputPaths(job, new Path("d:\\work\\abc\\out_res5"));
        //Set the output path
        FileOutputFormat.setOutputPath(job, new Path("d:\\work\\abc\\out_res6"));

        //Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}

Final output of the second job (again split across two output files, one per reduce task):

hadoop	2
spark	1

51doit	1
hello	3
mapreduce	2
world	1

Reposted from blog.csdn.net/qq_61162288/article/details/131344424