Hadoop Detailed Notes (14): MapReduce Data Analysis Case - Common Friends


1 Data

A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

2 Requirement

Find the common friends of every pair of users.

For example:  A and B's common friends are: [E, C]
              A and C's common friends are: [D, F]
              B and F's common friends are: [E, A, C]

3 Implementation

A single MR job cannot produce the final result for this requirement, so we chain two jobs: MR1 -> intermediate result -> MR2 -> final result. MR1 inverts the friend lists: for each friend it gathers every user who lists that friend, sorts those users, and emits one record per pair of them, keyed by the pair and valued by the shared friend. MR2 then groups by pair and collects all the shared friends into one list, as traced below.
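
A sketch of how one friend flows through the two jobs, using the sample data above (the pair-key text matches the code in 3.1):

MR1 map input :  B:A,C,E,K                         user B's friend list
MR1 map output:  (A,B) (C,B) (E,B) (K,B)           friend -> user who lists that friend
MR1 reduce    :  key=E, values=[A,B,D,F,G,H,L,M]   everyone who lists E as a friend
                 -> emits ("A and B common friends:", E), ("A and D common friends:", E), ...
MR2 reduce    :  key="A and B common friends:", values=[C, E]
                 -> writes  A and B common friends:  [C, E]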

3.1 The first MR job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description: MR1 - invert the friend lists and emit every user pair that shares a friend
 */
public class Friend1 {

    static class Friend1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // one input line looks like  A:B,C,D,F,E,O  (user : friend list)
            String line = value.toString();
            String[] split = line.split(":");
            String id = split[0];
            String[] fs = split[1].split(",");
            v.set(id);
            // emit (friend, user) so the reduce side receives, for each friend,
            // all users who list that friend
            for (String f : fs) {
                k.set(f);
                context.write(k, v);
            }
        }
    }

    static class Friend1Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // values = all users who list this friend, e.g. key=F -> [A, C, D, G, L, M]
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            // sort so that every pair is always emitted in the same order (A-B, never B-A)
            Collections.sort(list);
            for (int i = 0; i < list.size() - 1; i++) {
                for (int j = i + 1; j < list.size(); j++) {
                    String pre = list.get(i);
                    String post = list.get(j);
                    // key = the pair, value = the friend they share
                    context.write(new Text(pre + " and " + post + " common friends:"), key);
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second argument is the job name
        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());

        job.setMapperClass(Friend1Mapper.class);
        job.setReducerClass(Friend1Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // path of the input data
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\out2"));
        job.waitForCompletion(true);
    }
}
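
With the sample data, this job's tab-separated output contains one line per pair per shared friend; for example, the reduce calls for keys C, E, D and F produce the lines below. MR2 then only has to group these lines by their key.

A and B common friends:	C
A and B common friends:	E
A and C common friends:	D
A and C common friends:	F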

3.2 The second MR job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description: MR2 - group the pair records from MR1 and collect the shared friends
 */
public class Friend2 {

    static class Friend2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // MR1's text output is tab separated: pair \t shared friend
            String[] split = value.toString().split("\t");
            k.set(split[0]);
            v.set(split[1]);
            context.write(k, v);
        }
    }

    static class Friend2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // values = all friends shared by this pair (the key)
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            context.write(key, new Text(list.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second argument is the job name
        Job job = Job.getInstance(conf, Friend2.class.getSimpleName());

        job.setMapperClass(Friend2Mapper.class);
        job.setReducerClass(Friend2Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // input: the intermediate result written by MR1
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\out2"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\res"));
        job.waitForCompletion(true);
    }

}
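
Running the two jobs in sequence yields the final result; for the sample data it contains lines such as the following (the order of the names inside each list depends on the order in which values arrive at the reducer):

A and B common friends:	[C, E]
A and C common friends:	[D, F]
B and F common friends:	[A, C, E]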

3.3 Key points

A map task writes the data it produces to its local working directory, where it waits for the corresponding reduce tasks to pull it. This shuffled data can use a special file format, or compression, to improve efficiency.

The intermediate result produced by MR1 is written to disk and serves as the input of MR2, so this data can likewise use a special file format or compression to improve efficiency, as sketched below.
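
As a minimal sketch of the compression option (my example, not from the original code): the Hadoop 2.x property names below compress the shuffled map output, and FileOutputFormat's static helpers compress the job's final files; DefaultCodec is used because it needs no native library. These lines go in main() before the job is submitted.

        Configuration conf = new Configuration();
        // compress the map output that is shuffled to the reducers
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec",
                org.apache.hadoop.io.compress.DefaultCodec.class,
                org.apache.hadoop.io.compress.CompressionCodec.class);

        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());
        // compress the job's own output files, i.e. MR2's input
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job,
                org.apache.hadoop.io.compress.DefaultCodec.class);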

Below, as an example, we write the intermediate result file in the SequenceFile format.

3.3.1 MR1

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description: MR1, now writing its intermediate result as a SequenceFile
 */
public class Friend1 {

    static class Friend1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // one input line looks like  A:B,C,D,F,E,O  (user : friend list)
            String line = value.toString();
            String[] split = line.split(":");
            String id = split[0];
            String[] fs = split[1].split(",");
            v.set(id);
            // emit (friend, user) so the reduce side receives, for each friend,
            // all users who list that friend
            for (String f : fs) {
                k.set(f);
                context.write(k, v);
            }
        }
    }

    static class Friend1Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // values = all users who list this friend, e.g. key=F -> [A, C, D, G, L, M]
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            // sort so that every pair is always emitted in the same order (A-B, never B-A)
            Collections.sort(list);
            for (int i = 0; i < list.size() - 1; i++) {
                for (int j = i + 1; j < list.size(); j++) {
                    String pre = list.get(i);
                    String post = list.get(j);
                    // key = the pair, value = the friend they share
                    context.write(new Text(pre + " and " + post + " common friends:"), key);
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second argument is the job name
        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());

        job.setMapperClass(Friend1Mapper.class);
        job.setReducerClass(Friend1Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // write the intermediate result as a SequenceFile
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // path of the input data
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\seq"));
        job.waitForCompletion(true);
    }

}

The extra line job.setOutputFormatClass(SequenceFileOutputFormat.class); sets the file format of the job's output.
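
To verify what was written, the SequenceFile can be read back directly with SequenceFile.Reader. A minimal sketch (the SeqPeek class and the part-r-00000 file name are my assumptions; a single-reducer job usually produces that file):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqPeek {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // the reducer output file of the job above (assumed file name)
        Path path = new Path("D:\\data\\friend\\seq\\part-r-00000");
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(path))) {
            Text key = new Text();
            Text value = new Text();
            // iterate over every (pair, shared friend) record in the file
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}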

3.3.2 MR2

The map task's input is no longer a line offset plus the line's text, but the key/value pairs stored in the SequenceFile:

  static class Friend2Mapper extends Mapper<Text, Text, Text, Text>

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description: MR2, now reading MR1's SequenceFile output directly as KV pairs
 */
public class Friend2 {

    static class Friend2Mapper extends Mapper<Text, Text, Text, Text> {

        @Override
        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            // the SequenceFile already stores (pair, shared friend) records,
            // so the mapper just forwards them
            context.write(key, value);
        }
    }

    static class Friend2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // values = all friends shared by this pair (the key)
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            context.write(key, new Text(list.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // the second argument is the job name
        Job job = Job.getInstance(conf, Friend2.class.getSimpleName());

        job.setMapperClass(Friend2Mapper.class);
        job.setReducerClass(Friend2Reducer.class);
        // output types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // output types of the final result
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // read the SequenceFile written by MR1
        job.setInputFormatClass(SequenceFileInputFormat.class);
        // job.setNumReduceTasks(2);  // launch 2 reduce tasks
        // input: MR1's SequenceFile output
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\seq"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\seq2"));
        job.waitForCompletion(true);
    }

}
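
One practical note: FileOutputFormat refuses to write into a directory that already exists, so re-running any of these jobs throws a FileAlreadyExistsException. A minimal sketch of the usual fix, placed in main() before setOutputPath (FileSystem comes from org.apache.hadoop.fs):

        Path out = new Path("D:\\data\\friend\\seq2");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = delete recursively
        }
        FileOutputFormat.setOutputPath(job, out);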
