Hadoop Detailed Notes (14): Mutual Friends, a MapReduce Data Analysis Case

Free video tutorials: https://www.51doit.com/ or contact the blogger on WeChat: 17710299606

1 Data

A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

2 Requirement

Find the mutual friends of every pair of people.

For example, the mutual friends of A and B are: [E, C],
the mutual friends of A and C are: [D, F],
and the mutual friends of B and F are: [E, A, C].

3 Implementation

A single MR program cannot produce the final result for this requirement, so we chain two MR programs: MR1 -> intermediate result -> MR2 -> final result.
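Take friend F as an example of how the pipeline implemented below works. The map side of MR1 emits one (friend, owner) pair for every entry in a friend list, so the reducer for key F receives all the people whose lists contain F, namely A, C, D, G, L and M. The reducer sorts these owners and emits every pair of them with F as the value, for example:

A and C's mutual friends:   F
A and D's mutual friends:   F
...
L and M's mutual friends:   F

MR2 then groups these lines by pair, so each pair such as "A and C" collects all of its common friends into a single list.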

3.1 The first MR program 

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description:
 */
public class Friend1 {

    static class Friend1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each line looks like "A:B,C,D,F,E,O" -> owner : friend list
            String line = value.toString();
            String[] split = line.split(":");
            String id = split[0];
            String[] fs = split[1].split(",");
            v.set(id);
            // emit (friend, owner) so the reducer groups by friend
            for (String f : fs) {
                k.set(f);
                context.write(k, v);
            }
        }
    }

    static class Friend1Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // key is one friend; values are everyone who has this friend,
            // e.g. for key A the values are B C D F G H I K O
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            // sort so that each pair is always emitted in the same order
            Collections.sort(list);
            // every pair of people in the list shares this friend (the key)
            for (int i = 0; i < list.size() - 1; i++) {
                for (int j = i + 1; j < list.size(); j++) {
                    String pre = list.get(i);
                    String post = list.get(j);
                    context.write(new Text(pre + " and " + post + "'s mutual friends:"), key);
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second parameter is the job name
        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());

        job.setMapperClass(Friend1Mapper.class);
        job.setReducerClass(Friend1Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // data types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\out2"));
        job.waitForCompletion(true);
    }
}
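With the sample data, the output of MR1 contains one line per (pair, shared friend) combination, key and value separated by the default tab of TextOutputFormat, for example:

A and B's mutual friends:   C
A and B's mutual friends:   E
A and C's mutual friends:   D
A and C's mutual friends:   F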

3.2 The second MR program 

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description:
 */
public class Friend2 {

    static class Friend2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each MR1 output line is "pair description \t shared friend"
            String[] split = value.toString().split("\t");
            k.set(split[0]);  // the pair, e.g. "A and B's mutual friends:"
            v.set(split[1]);  // one common friend of that pair
            context.write(k, v);
        }
    }

    static class Friend2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // key is a pair; values are all of its common friends
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            context.write(key, new Text(list.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second parameter is the job name
        Job job = Job.getInstance(conf, Friend2.class.getSimpleName());

        job.setMapperClass(Friend2Mapper.class);
        job.setReducerClass(Friend2Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // data types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // input: the intermediate result of MR1
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\out2"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\res"));
        job.waitForCompletion(true);
    }

}
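Running MR2 on the MR1 output produces the final result, for example:

A and B's mutual friends:   [C, E]
A and C's mutual friends:   [D, F]
B and F's mutual friends:   [A, C, E]

(The order of the friends inside each list depends on the order in which the values reach the reducer.)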

3.3 Knowledge points

Each map task writes the data it has processed to its local working directory and waits for the corresponding reduce tasks to pull it. This intermediate data can be stored in a special file format or compressed to improve efficiency.
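For instance, shuffle compression can be switched on through standard Hadoop configuration properties. A minimal sketch, not part of the original program (DefaultCodec is just one choice; any installed CompressionCodec works):

// enable compression of the map output pulled by the reducers
Configuration conf = new Configuration();
conf.setBoolean("mapreduce.map.output.compress", true);
conf.set("mapreduce.map.output.compress.codec",
        "org.apache.hadoop.io.compress.DefaultCodec");
Job job = Job.getInstance(conf, "friend-step1");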

Likewise, the intermediate result produced by MR1 is written to disk and becomes the input of the MR2 job, so it too can use a special file format or compression to improve efficiency.

As an example, we use the sequence file format for the intermediate result file.

3.3.1 MR1

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description:
 */
public class Friend1 {

    static class Friend1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each line looks like "A:B,C,D,F,E,O" -> owner : friend list
            String line = value.toString();
            String[] split = line.split(":");
            String id = split[0];
            String[] fs = split[1].split(",");
            v.set(id);
            // emit (friend, owner) so the reducer groups by friend
            for (String f : fs) {
                k.set(f);
                context.write(k, v);
            }
        }
    }

    static class Friend1Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // key is one friend; values are everyone who has this friend,
            // e.g. for key A the values are B C D F G H I K O
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            // sort so that each pair is always emitted in the same order
            Collections.sort(list);
            // every pair of people in the list shares this friend (the key)
            for (int i = 0; i < list.size() - 1; i++) {
                for (int j = i + 1; j < list.size(); j++) {
                    String pre = list.get(i);
                    String post = list.get(j);
                    context.write(new Text(pre + " and " + post + "'s mutual friends:"), key);
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second parameter is the job name
        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());

        job.setMapperClass(Friend1Mapper.class);
        job.setReducerClass(Friend1Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // data types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // write the intermediate result as a sequence file
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\seq"));
        job.waitForCompletion(true);
    }

}

The call job.setOutputFormatClass(SequenceFileOutputFormat.class); sets the output format of the job, so the intermediate result is written as a sequence file instead of plain text.
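The sequence file itself can also be compressed. A minimal sketch using the standard Hadoop API (these calls are not in the original job and would need the extra imports org.apache.hadoop.io.SequenceFile and org.apache.hadoop.io.compress.DefaultCodec):

// block-compress the intermediate sequence file
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);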

3.3.2 MR2

When MR2 reads the sequence file, the input of its map task is no longer (line offset, line text) but the key-value pair stored in the file:

  static class Friend2Mapper extends Mapper<Text , Text , Text , Text>

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description:
 */
public class Friend2 {

    static class Friend2Mapper extends Mapper<Text, Text, Text, Text> {

        @Override
        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            // the sequence file already stores (pair, friend), so just forward it
            context.write(key, value);
        }
    }

    static class Friend2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // key is a pair; values are all of its common friends
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            context.write(key, new Text(list.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second parameter is the job name
        Job job = Job.getInstance(conf, Friend2.class.getSimpleName());

        job.setMapperClass(Friend2Mapper.class);
        job.setReducerClass(Friend2Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // data types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // read the intermediate result as a sequence file
        job.setInputFormatClass(SequenceFileInputFormat.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\seq"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\seq2"));
        job.waitForCompletion(true);
    }

}
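Finally, the two jobs do not have to be launched from two separate main methods. A minimal driver sketch that chains them (FriendDriver is a hypothetical class name; it reuses the Friend1/Friend2 mapper and reducer classes above and the same local paths):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class FriendDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // step 1: emit one (pair, shared friend) record per combination
        Job job1 = Job.getInstance(conf, "friend-step1");
        job1.setMapperClass(Friend1.Friend1Mapper.class);
        job1.setReducerClass(Friend1.Friend1Reducer.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        job1.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job1, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job1, new Path("D:\\data\\friend\\seq"));
        if (!job1.waitForCompletion(true)) {
            System.exit(1);  // do not start step 2 if step 1 failed
        }

        // step 2: group by pair and collect the lists of common friends
        Job job2 = Job.getInstance(conf, "friend-step2");
        job2.setMapperClass(Friend2.Friend2Mapper.class);
        job2.setReducerClass(Friend2.Friend2Reducer.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        job2.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job2, new Path("D:\\data\\friend\\seq"));
        FileOutputFormat.setOutputPath(job2, new Path("D:\\data\\friend\\res"));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}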
