7.1 Inverted Index Case (Chaining Multiple Jobs)
1) Requirement: given a large collection of text (documents, web pages), build a search index.
a.txt
atguigu pingping
atguigu ss
atguigu ss
b.txt
atguigu pingping
atguigu pingping
pingping ss
c.txt
atguigu ss
atguigu pingping
(1) Expected output of the first pass
atguigu--a.txt	3
atguigu--b.txt	2
atguigu--c.txt	2
pingping--a.txt	1
pingping--b.txt	3
pingping--c.txt	1
ss--a.txt	2
ss--b.txt	1
ss--c.txt	1
(2) Expected output of the second pass
atguigu	c.txt-->2	b.txt-->2	a.txt-->3
pingping	c.txt-->1	b.txt-->3	a.txt-->1
ss	c.txt-->1	b.txt-->1	a.txt-->2
2) First pass
(1) First pass: write OneIndexMapper
package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    String name;
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the name of the file this split belongs to
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();

        // 2 Split on spaces
        String[] fields = line.split(" ");

        for (String word : fields) {
            // 3 Concatenate word and file name as the key
            k.set(word + "--" + name);
            v.set(1);

            // 4 Emit <word--filename, 1>
            context.write(k, v);
        }
    }
}
(2) First pass: write OneIndexReducer
package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int count = 0;
        // 1 Sum the counts for this word--filename key
        for (IntWritable value : values) {
            count += value.get();
        }

        // 2 Emit <word--filename, total count>
        context.write(key, new IntWritable(count));
    }
}
(3) First pass: write OneIndexDriver
package com.atguigu.mapreduce.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneIndexDriver {

    public static void main(String[] args) throws Exception {

        // Input and output paths, hardcoded here for local testing
        args = new String[] { "e:/input/inputoneindex", "e:/output5" };

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}
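Because OneIndexReducer only sums IntWritable counts, the aggregation is associative, so the same class could also serve as a combiner to cut shuffle traffic. A hedged one-line addition to the driver above (an optional tweak, not part of the original code):

    // Optional (assumption, not in the original): counts are associative,
    // so the reducer can double as a combiner and pre-aggregate map output
    job.setCombinerClass(OneIndexReducer.class);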
(4) Check the output of the first pass
atguigu--a.txt	3
atguigu--b.txt	2
atguigu--c.txt	2
pingping--a.txt	1
pingping--b.txt	3
pingping--c.txt	1
ss--a.txt	2
ss--b.txt	1
ss--c.txt	1
3) Second pass
(1) Second pass: write TwoIndexMapper
package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line of the first pass's output
        String line = value.toString();

        // 2 Split on "--", e.g. "atguigu--a.txt\t3" -> ["atguigu", "a.txt\t3"]
        String[] fields = line.split("--");

        k.set(fields[0]);
        v.set(fields[1]);

        // 3 Emit <word, filename\tcount>
        context.write(k, v);
    }
}
(2) Second pass: write TwoIndexReducer
package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Input for one key, e.g.:
        //   atguigu  a.txt  3
        //   atguigu  b.txt  2
        //   atguigu  c.txt  2
        // Desired output:
        //   atguigu  c.txt-->2  b.txt-->2  a.txt-->3

        StringBuilder sb = new StringBuilder();

        // 1 Concatenate, replacing the tab between filename and count with "-->"
        for (Text value : values) {
            sb.append(value.toString().replace("\t", "-->")).append("\t");
        }

        // 2 Emit <word, postings list>
        context.write(key, new Text(sb.toString()));
    }
}
(3) Second pass: write TwoIndexDriver
package com.atguigu.mapreduce.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoIndexDriver {

    public static void main(String[] args) throws Exception {

        // Input and output paths, hardcoded here for local testing
        args = new String[] { "e:/input/inputtwoindex", "e:/output6" };

        Configuration config = new Configuration();
        Job job = Job.getInstance(config);

        job.setJarByClass(TwoIndexDriver.class);
        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
(4) Check the final output of the second pass
atguigu	c.txt-->2	b.txt-->2	a.txt-->3
pingping	c.txt-->1	b.txt-->3	a.txt-->1
ss	c.txt-->1	b.txt-->1	a.txt-->2
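The two drivers above are run one after another by hand, with the first job's output directory feeding the second job. If you prefer a single entry point, the two jobs can also be chained sequentially in one main method. A minimal sketch under that assumption (the class name ChainedIndexDriver is hypothetical; the mapper/reducer classes and local paths are the ones used above):

    package com.atguigu.mapreduce.index;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Hypothetical chained driver, not part of the original tutorial code
    public class ChainedIndexDriver {

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();

            // Job 1: count occurrences of word--filename
            Job job1 = Job.getInstance(conf);
            job1.setJarByClass(ChainedIndexDriver.class);
            job1.setMapperClass(OneIndexMapper.class);
            job1.setReducerClass(OneIndexReducer.class);
            job1.setMapOutputKeyClass(Text.class);
            job1.setMapOutputValueClass(IntWritable.class);
            job1.setOutputKeyClass(Text.class);
            job1.setOutputValueClass(IntWritable.class);
            FileInputFormat.setInputPaths(job1, new Path("e:/input/inputoneindex"));
            FileOutputFormat.setOutputPath(job1, new Path("e:/output5"));

            // Run job 2 only if job 1 succeeded; job 2 reads job 1's output
            if (job1.waitForCompletion(true)) {
                Job job2 = Job.getInstance(conf);
                job2.setJarByClass(ChainedIndexDriver.class);
                job2.setMapperClass(TwoIndexMapper.class);
                job2.setReducerClass(TwoIndexReducer.class);
                job2.setOutputKeyClass(Text.class);
                job2.setOutputValueClass(Text.class);
                FileInputFormat.setInputPaths(job2, new Path("e:/output5"));
                FileOutputFormat.setOutputPath(job2, new Path("e:/output6"));
                System.exit(job2.waitForCompletion(true) ? 0 : 1);
            }
            System.exit(1);
        }
    }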
7.2 Finding Common Blog Friends Case
1) Requirement:
Below is friend-list data from a blog. Before each colon is a user; after the colon are all of that user's friends (friendships in this data are one-directional).
friends.txt
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
Find which pairs of people have common friends, and who those common friends are.
2) Analysis:
First invert the relationship: for each of A, B, C, ..., find everyone who lists them as a friend. Then, in a second pass, emit every sorted pair of people from each such list, keyed by the pair, so that all of a pair's common friends gather in one reduce call.
Output of the first pass
A	I,K,C,B,G,F,H,O,D,
B	A,F,J,E,
C	A,E,B,H,F,G,K,
D	G,C,K,A,L,F,E,H,
E	G,M,L,H,A,F,B,D,
F	L,M,D,C,G,A,
G	M,
H	O,
I	O,C,
J	O,
K	B,
L	D,E,
M	E,F,
O	A,H,I,J,F,
Output of the second pass
A-B	E C
A-C	D F
A-D	E F
A-E	D B C
A-F	O B C D E
A-G	F E C D
A-H	E C D O
A-I	O
A-J	O B
A-K	D C
A-L	F E D
A-M	E F
B-C	A
B-D	A E
B-E	C
B-F	E A C
B-G	C E A
B-H	A E C
B-I	A
B-K	C A
B-L	E
B-M	E
B-O	A
C-D	A F
C-E	D
C-F	D A
C-G	D F A
C-H	D A
C-I	A
C-K	A D
C-L	D F
C-M	F
C-O	I A
D-E	L
D-F	A E
D-G	E A F
D-H	A E
D-I	A
D-K	A
D-L	E F
D-M	F E
D-O	A
E-F	D M C B
E-G	C D
E-H	C D
E-J	B
E-K	C D
E-L	D
F-G	D C A E
F-H	A D O E C
F-I	O A
F-J	B O
F-K	D C A
F-L	E D
F-M	E
F-O	A
G-H	D C E A
G-I	A
G-K	D A C
G-L	D F E
G-M	E F
G-O	A
H-I	O A
H-J	O
H-K	A C D
H-L	D E
H-M	E
H-O	A
I-J	O
I-K	A
I-O	A
K-L	D
K-O	A
L-M	E F
3) Implementation:
(1) First-pass Mapper
package com.atguigu.mapreduce.friends;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class OneShareFriendsMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line, e.g. A:B,C,D,F,E,O
        String line = value.toString();

        // 2 Split on ":"
        String[] fields = line.split(":");

        // 3 Extract the person and their friends
        String person = fields[0];
        String[] friends = fields[1].split(",");

        // 4 Emit <friend, person> for every friend
        for (String friend : friends) {
            context.write(new Text(friend), new Text(person));
        }
    }
}
(2) First-pass Reducer
package com.atguigu.mapreduce.friends;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class OneShareFriendsReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        StringBuilder sb = new StringBuilder();

        // 1 Concatenate everyone who lists this key as a friend
        for (Text person : values) {
            sb.append(person).append(",");
        }

        // 2 Emit <friend, person,person,...>
        context.write(key, new Text(sb.toString()));
    }
}
(3) First-pass Driver
package com.atguigu.mapreduce.friends;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneShareFriendsDriver {

    public static void main(String[] args) throws Exception {

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar class
        job.setJarByClass(OneShareFriendsDriver.class);

        // 3 Set the mapper and reducer classes
        job.setMapperClass(OneShareFriendsMapper.class);
        job.setReducerClass(OneShareFriendsReducer.class);

        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
(4) Second-pass Mapper
package com.atguigu.mapreduce.friends;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TwoShareFriendsMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input, e.g. "A\tI,K,C,B,G,F,H,O,D,"
        // i.e. <friend, everyone who has that friend>
        String line = value.toString();
        String[] friend_persons = line.split("\t");

        String friend = friend_persons[0];
        String[] persons = friend_persons[1].split(",");

        // Sort so each pair is always emitted in one canonical order
        // (B-C, never C-B) and both orderings meet at the same key
        Arrays.sort(persons);

        for (int i = 0; i < persons.length - 1; i++) {
            for (int j = i + 1; j < persons.length; j++) {
                // Emit <person-person, friend>; all common friends of the
                // same pair end up in the same reduce call
                context.write(new Text(persons[i] + "-" + persons[j]), new Text(friend));
            }
        }
    }
}
(5) Second-pass Reducer
package com.atguigu.mapreduce.friends;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TwoShareFriendsReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        StringBuilder sb = new StringBuilder();

        // Concatenate all common friends of this person-person pair
        for (Text friend : values) {
            sb.append(friend).append(" ");
        }

        context.write(key, new Text(sb.toString()));
    }
}
(6) Second-pass Driver
package com.atguigu.mapreduce.friends;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoShareFriendsDriver {

    public static void main(String[] args) throws Exception {

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar class
        job.setJarByClass(TwoShareFriendsDriver.class);

        // 3 Set the mapper and reducer classes
        job.setMapperClass(TwoShareFriendsMapper.class);
        job.setReducerClass(TwoShareFriendsReducer.class);

        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
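The correctness of the second pass hinges on the Arrays.sort call in TwoShareFriendsMapper: without it, the same pair could be emitted as both B-C and C-B, splitting its common friends across two keys. A small standalone sketch of the pair-generation step (plain Java, no Hadoop dependencies; the class name PairDemo is hypothetical):

    import java.util.Arrays;

    // Hypothetical demo class, not part of the MapReduce code above
    public class PairDemo {
        public static void main(String[] args) {
            // People who all list "A" as a friend (one line of first-pass output)
            String[] persons = { "I", "K", "C", "B", "G", "F", "H", "O", "D" };

            // Sorting gives each pair exactly one canonical spelling, e.g. "B-C"
            Arrays.sort(persons);

            for (int i = 0; i < persons.length - 1; i++) {
                for (int j = i + 1; j < persons.length; j++) {
                    // Each of these pairs has "A" as a common friend
                    System.out.println(persons[i] + "-" + persons[j] + "\tA");
                }
            }
        }
    }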
7.3 Top10 Case
1) Requirement: post-process the output of case 2.4 and output the information of the top 10 users by traffic usage.
2) Implementation
(1) Write the JavaBean class
package com.atguigu.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {

    private Long sumFlow;     // total traffic
    private String phoneNum;  // phone number

    // No-arg constructor, required for deserialization
    public FlowBean() {
        super();
    }

    public Long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(Long sumFlow) {
        this.sumFlow = sumFlow;
    }

    public String getPhoneNum() {
        return phoneNum;
    }

    public void setPhoneNum(String phoneNum) {
        this.phoneNum = phoneNum;
    }

    @Override
    public String toString() {
        return sumFlow + "\t" + phoneNum;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize
        out.writeLong(sumFlow);
        out.writeUTF(phoneNum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize, in the same order as write()
        this.sumFlow = in.readLong();
        this.phoneNum = in.readUTF();
    }

    @Override
    public int compareTo(FlowBean o) {
        // Order by total traffic; break ties by phone number so that two
        // beans with equal traffic remain distinct TreeMap keys
        int result = this.sumFlow.compareTo(o.sumFlow);
        if (result == 0) {
            result = this.phoneNum.compareTo(o.phoneNum);
        }
        return result;
    }
}
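The phoneNum tiebreak in compareTo matters because TreeMap treats compareTo() == 0 as the same key: two users with identical traffic would otherwise overwrite each other in the top-10 map. A small standalone check (the class name TieBreakDemo and the sample values are hypothetical; it assumes the FlowBean above):

    package com.atguigu.mr;

    import java.util.TreeMap;

    // Hypothetical demo illustrating why compareTo breaks ties on phoneNum
    public class TieBreakDemo {
        public static void main(String[] args) {
            TreeMap<FlowBean, String> map = new TreeMap<FlowBean, String>();

            FlowBean a = new FlowBean();
            a.setSumFlow(1000L);
            a.setPhoneNum("13500000001");

            FlowBean b = new FlowBean();
            b.setSumFlow(1000L);   // same traffic as a
            b.setPhoneNum("13500000002");

            map.put(a, "user a");
            map.put(b, "user b");

            // Prints 2: with the tiebreak both users survive; if compareTo
            // looked only at sumFlow, b would replace a and this would print 1
            System.out.println(map.size());
        }
    }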
(2) Write the TopTenMapper class
package com.atguigu.mr;

import java.io.IOException;
import java.util.TreeMap;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopTenMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    // A TreeMap keeps its entries sorted by key, so it can hold the
    // running top 10 for this map task
    private TreeMap<FlowBean, Text> flowMap = new TreeMap<FlowBean, Text>();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        FlowBean bean = new FlowBean();

        // 1 Read one line
        String line = value.toString();

        // 2 Split on tabs; field 0 is the phone number, field 3 the total traffic
        String[] fields = line.split("\t");
        long sumFlow = Long.parseLong(fields[3]);

        bean.setSumFlow(sumFlow);
        bean.setPhoneNum(fields[0]);

        // 3 Add the record to the TreeMap
        flowMap.put(bean, new Text(value));

        // 4 Cap the TreeMap at 10 entries by evicting the smallest traffic
        if (flowMap.size() > 10) {
            flowMap.remove(flowMap.firstKey());
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit this task's surviving top 10 once all input has been mapped
        for (Text t : flowMap.values()) {
            context.write(NullWritable.get(), t);
        }
    }
}
(3) Write the TopTenReducer class
package com.atguigu.mr;

import java.io.IOException;
import java.util.TreeMap;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopTenReducer extends Reducer<NullWritable, Text, NullWritable, Text> {

    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        // 1 A TreeMap again holds the running top 10, now across all map tasks
        TreeMap<FlowBean, Text> flowMap = new TreeMap<FlowBean, Text>();

        for (Text value : values) {
            FlowBean bean = new FlowBean();
            bean.setPhoneNum(value.toString().split("\t")[0]);
            bean.setSumFlow(Long.parseLong(value.toString().split("\t")[3]));
            flowMap.put(bean, new Text(value));

            // 2 Cap the TreeMap at 10 entries by evicting the smallest traffic
            if (flowMap.size() > 10) {
                flowMap.remove(flowMap.firstKey());
            }
        }

        // 3 Emit the global top 10, largest traffic first
        for (Text t : flowMap.descendingMap().values()) {
            context.write(NullWritable.get(), t);
        }
    }
}
(4) Write the driver class
package com.atguigu.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TopTenDriver {

    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {

        // 1 Get the configuration and job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar class
        job.setJarByClass(TopTenDriver.class);

        // 3 Set the mapper and reducer classes
        job.setMapperClass(TopTenMapper.class);
        job.setReducerClass(TopTenReducer.class);

        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the final output key/value types
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Submit the job and its jar to YARN
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
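A global top 10 requires all mapper outputs to reach a single reducer. With a NullWritable key every record lands in the same partition, and the default reduce count is already 1, but the intent can be made explicit in the driver. A hedged one-line addition (an assumption, not in the original driver):

    // Make the single global reducer explicit so the top 10 is computed
    // over all map outputs together (the default is already 1)
    job.setNumReduceTasks(1);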
This tutorial is produced by the Big Data Research Institute of Atguigu Education; please credit the source when reproducing it.