Free video tutorial https://www.51doit.com/ or contact the blogger on WeChat 17710299606
1 data
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
2 demand
Get mutual friends between any two people
For example, the mutual friends of A and B are: [E, C]. The mutual friends of A and C are: [D, F]. The mutual friends of B and F are: [E, A, C].
3 implementation
A single MR program cannot produce the final result for this requirement, so we chain two MR programs: MR1 -> intermediate result -> MR2 -> final result.
3.1 The first MR program
import com._51doit.mr.join.Join;
import com._51doit.mr.line.LineDemo;
import com._51doit.pojo.JoinBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.zookeeper.txn.Txn;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* Author: 多易教育-行哥
* Date: 2020/7/12
* Description:
*/
/**
 * MR job 1 of the "common friends" pipeline.
 *
 * <p>Input line format: {@code A:B,C,D} meaning person A has friends B, C, D.
 * The mapper inverts the relation and emits (friend, owner); the reducer then
 * receives, for each friend F, every person who has F, and emits one line per
 * pair of those people — F is a common friend of that pair. The second job
 * (Friend2) groups the pairs to collect the full common-friend lists.
 */
public class Friend1 {

    static class Friend1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused output holders to avoid per-record allocation.
        private final Text k = new Text();
        private final Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split(":");
            if (split.length < 2 || split[0].isEmpty()) {
                return; // skip malformed/blank lines instead of throwing AIOOBE
            }
            String id = split[0];
            v.set(id);
            for (String f : split[1].split(",")) {
                // Emit (friend, owner): all owners of the same friend
                // meet in a single reduce() call.
                k.set(f);
                context.write(k, v);
            }
        }
    }

    static class Friend1Reducer extends Reducer<Text, Text, Text, Text> {
        // Reused output key instead of allocating a new Text per pair.
        private final Text outKey = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Collect every person who has `key` as a friend, e.g. B C D F G H I K O.
            List<String> owners = new ArrayList<>();
            for (Text value : values) {
                owners.add(value.toString());
            }
            // Sort so each emitted pair is canonical (pre < post); the same
            // pair is therefore never produced in two different orders.
            Collections.sort(owners);
            for (int i = 0; i < owners.size() - 1; i++) {
                for (int j = i + 1; j < owners.size(); j++) {
                    outKey.set(owners.get(i) + "和" + owners.get(j) + "共同好友是:");
                    context.write(outKey, key); // key == the common friend
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Fix: the job was previously named after LineDemo (an unrelated class
        // instantiated solely to fetch its name); name it after this class.
        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());
        job.setJarByClass(Friend1.class); // required when submitting to a cluster
        job.setMapperClass(Friend1Mapper.class);
        job.setReducerClass(Friend1Reducer.class);
        // Map-phase output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Input data and intermediate-result paths.
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\out2"));
        // Propagate job success/failure to the shell instead of always exiting 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.2 The second MR program
import com._51doit.mr.line.LineDemo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* Author: 多易教育-行哥
* Date: 2020/7/12
* Description:
*/
/**
 * MR job 2 of the "common friends" pipeline.
 *
 * <p>Consumes the tab-separated intermediate output of Friend1, where each
 * line is {@code "X和Y共同好友是:<TAB>F"}. The mapper re-emits (pair, friend);
 * the reducer gathers all friends for a pair and writes the final list,
 * e.g. {@code A和B共同好友是: [C, E]}.
 */
public class Friend2 {

    static class Friend2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused output holders to avoid per-record allocation.
        private final Text k = new Text();
        private final Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            if (split.length < 2) {
                return; // skip malformed lines instead of throwing AIOOBE
            }
            k.set(split[0]); // the pair description
            v.set(split[1]); // one common friend
            context.write(k, v);
        }
    }

    static class Friend2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Collect every common friend of this pair, e.g. A C D G L M.
            List<String> friends = new ArrayList<>();
            for (Text value : values) {
                friends.add(value.toString());
            }
            context.write(key, new Text(friends.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Fix: the job was previously named after LineDemo (an unrelated class
        // instantiated solely to fetch its name); name it after this class.
        Job job = Job.getInstance(conf, Friend2.class.getSimpleName());
        job.setJarByClass(Friend2.class); // required when submitting to a cluster
        job.setMapperClass(Friend2Mapper.class);
        job.setReducerClass(Friend2Reducer.class);
        // Map-phase output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Input is the intermediate result produced by Friend1.
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\out2"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\res"));
        // Propagate job success/failure to the shell instead of always exiting 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.3 Knowledge points
Each maptask writes the data it has processed to the task's working directory and waits for the corresponding reducetask to pull it. This intermediate data can use a special file format or compression to improve efficiency.
The intermediate results produced by MR1 are written to disk and serve as the input data source for the MR2 job, so this data can likewise be stored in a special file format or compressed to improve efficiency.
We use the sequence file format as an example to output the intermediate result file
3.3.1 MR1
import com._51doit.mr.join.Join;
import com._51doit.mr.line.LineDemo;
import com._51doit.pojo.JoinBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.zookeeper.txn.Txn;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* Author: 多易教育-行哥
* Date: 2020/7/12
* Description:
*/
/**
 * MR job 1 of the "common friends" pipeline — SequenceFile variant.
 *
 * <p>Identical logic to the plain-text version: the mapper inverts
 * {@code A:B,C,D} into (friend, owner) pairs, and the reducer emits every
 * canonical pair of owners with the shared friend. The only difference is
 * that the intermediate result is written as a SequenceFile
 * ({@code job.setOutputFormatClass(SequenceFileOutputFormat.class)}) so the
 * second job can read it more efficiently.
 */
public class Friend1 {

    static class Friend1Mapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused output holders to avoid per-record allocation.
        private final Text k = new Text();
        private final Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split(":");
            if (split.length < 2 || split[0].isEmpty()) {
                return; // skip malformed/blank lines instead of throwing AIOOBE
            }
            String id = split[0];
            v.set(id);
            for (String f : split[1].split(",")) {
                // Emit (friend, owner): all owners of the same friend
                // meet in a single reduce() call.
                k.set(f);
                context.write(k, v);
            }
        }
    }

    static class Friend1Reducer extends Reducer<Text, Text, Text, Text> {
        // Reused output key instead of allocating a new Text per pair.
        private final Text outKey = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Collect every person who has `key` as a friend.
            List<String> owners = new ArrayList<>();
            for (Text value : values) {
                owners.add(value.toString());
            }
            // Sort so each emitted pair is canonical (pre < post); the same
            // pair is therefore never produced in two different orders.
            Collections.sort(owners);
            for (int i = 0; i < owners.size() - 1; i++) {
                for (int j = i + 1; j < owners.size(); j++) {
                    outKey.set(owners.get(i) + "和" + owners.get(j) + "共同好友是:");
                    context.write(outKey, key); // key == the common friend
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Fix: the job was previously named after LineDemo (an unrelated class
        // instantiated solely to fetch its name); name it after this class.
        Job job = Job.getInstance(conf, Friend1.class.getSimpleName());
        job.setJarByClass(Friend1.class); // required when submitting to a cluster
        job.setMapperClass(Friend1Mapper.class);
        job.setReducerClass(Friend1Reducer.class);
        // Map-phase output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Write the intermediate result as a SequenceFile for the next job.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\seq"));
        // Propagate job success/failure to the shell instead of always exiting 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
`job.setOutputFormatClass(SequenceFileOutputFormat.class);` sets the job's output data format to SequenceFile.
3.3.2 MR2
When reading a SequenceFile, the maptask's input is no longer (line offset, line text) but the stored key-value pair:
static class Friend2Mapper extends Mapper<Text , Text , Text , Text>
import com._51doit.mr.line.LineDemo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* Author: 多易教育-行哥
* Date: 2020/7/12
* Description:
*/
/**
 * MR job 2 of the "common friends" pipeline — SequenceFile variant.
 *
 * <p>Reads the SequenceFile produced by the first job
 * ({@code job.setInputFormatClass(SequenceFileInputFormat.class)}), so the
 * mapper receives the original (pair, friend) key-value directly instead of
 * (line offset, line text) and can forward it unchanged. The reducer gathers
 * all common friends of each pair and writes the final list.
 */
public class Friend2 {

    static class Friend2Mapper extends Mapper<Text, Text, Text, Text> {
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // Identity map: the SequenceFile already stores (pair, friend).
            context.write(key, value);
        }
    }

    static class Friend2Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Collect every common friend of this pair, e.g. A C D G L M.
            List<String> friends = new ArrayList<>();
            for (Text value : values) {
                friends.add(value.toString());
            }
            context.write(key, new Text(friends.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Fix: the job was previously named after LineDemo (an unrelated class
        // instantiated solely to fetch its name); name it after this class.
        Job job = Job.getInstance(conf, Friend2.class.getSimpleName());
        job.setJarByClass(Friend2.class); // required when submitting to a cluster
        job.setMapperClass(Friend2Mapper.class);
        job.setReducerClass(Friend2Reducer.class);
        // Map-phase output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Read the intermediate result written by job 1 as a SequenceFile.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\friend\\seq"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\friend\\seq2"));
        // Propagate job success/failure to the shell instead of always exiting 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}