1 Join case
1.1 POJO type
An MR job often has to process several kinds of data at once; the records can be told apart by the name of the file they come from.
Define a POJO that carries the attributes of both data sets, plus a tag field that marks which table each record belongs to.
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @Classname UserAndOrdersWritable
 * @Date 2020/9/24 0024 9:43
 * @Created by 多易教育-DOIT18
 * @Description: holds the fields of both user records and order records
 */
public class UserAndOrdersWritable implements Writable {
    // user fields
    private String uid;
    private String name;
    private int age;
    private String gender;
    private String friend;
    // order field
    private String oid;
    /**
     * data tag: marks which table a record came from
     */
    private String tbName;

    public String getUid() {
        return uid;
    }

    public void set(String uid, String name, int age, String gender, String friend, String oid, String tbName) {
        this.uid = uid;
        this.name = name;
        this.age = age;
        this.gender = gender;
        this.friend = friend;
        this.oid = oid;
        this.tbName = tbName;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getGender() {
        return gender;
    }

    public void setGender(String gender) {
        this.gender = gender;
    }

    public String getFriend() {
        return friend;
    }

    public void setFriend(String friend) {
        this.friend = friend;
    }

    public String getOid() {
        return oid;
    }

    public void setOid(String oid) {
        this.oid = oid;
    }

    public String getTbName() {
        return tbName;
    }

    public void setTbName(String tbName) {
        this.tbName = tbName;
    }

    @Override
    public String toString() {
        return uid + "," + name + "," + age + "," + gender + "," + friend;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(uid);
        dataOutput.writeUTF(name);
        dataOutput.writeInt(age);
        dataOutput.writeUTF(gender);
        dataOutput.writeUTF(friend);
        dataOutput.writeUTF(oid);
        dataOutput.writeUTF(tbName);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // read the fields back in exactly the order they were written
        this.uid = dataInput.readUTF();
        this.name = dataInput.readUTF();
        this.age = dataInput.readInt();
        this.gender = dataInput.readUTF();
        this.friend = dataInput.readUTF();
        this.oid = dataInput.readUTF(); // "" rather than null when the record is a user record
        this.tbName = dataInput.readUTF();
    }
}
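As a quick sanity check, a Writable must survive a serialize/deserialize round trip. The following is a minimal local sketch (a test harness I am assuming here, not part of the original course code) that writes the bean to a byte array and reads it back. Note that the absent order field is set to an empty string rather than null, because DataOutput.writeUTF(null) throws a NullPointerException:

import java.io.*;

public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        UserAndOrdersWritable in = new UserAndOrdersWritable();
        // "" (not null) for the absent order id: writeUTF(null) would throw
        in.set("u001", "tom", 20, "male", "jerry", "", "tb_user");
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bos));
        UserAndOrdersWritable out = new UserAndOrdersWritable();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(out); // u001,tom,20,male,jerry
    }
}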
1.2 Step analysis
1. In setup(), read the name of the file the current split comes from (via FileSplit), so map() knows which kind of data it is parsing.
2. In map(), parse each line according to its source file, tag it with "tb_user" or "tb_orders", and emit it keyed by uid.
3. In reduce(), all records with the same uid arrive together: separate the single user record from the order records, then concatenate the user fields onto each order and write the joined line out.
1.3 Code implementation
package com._51doit.mr.day05.join;
import com._51doit.mr.day05.beans.UserAndOrdersWritable;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
/**
* @Classname Join
* @Date 2020/9/24 0024 9:40
* @Created by 多易教育-DOIT18
* @Description: reduce-side join of user data and order data
* 1 The input path of an MR job can be a folder.
* The folder may contain different kinds of data files; the file name is used to tell them apart.
*/
public class Join {
static class JoinMapper extends Mapper<LongWritable, Text , Text , UserAndOrdersWritable> {
String fileName = null ;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
FileSplit f = (FileSplit) context.getInputSplit();
fileName = f.getPath().getName();
}
Text k = new Text() ;
UserAndOrdersWritable v = new UserAndOrdersWritable() ;
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
try {
String line = value.toString();
// use the file name to decide which kind of data this line is
if("user.txt".equals(fileName)){ // user data, e.g. u003,jiajia,16,female,chunge
String[] split = line.split(",");
String uid = split[0] ;
String name = split[1] ;
int age = Integer.parseInt(split[2]) ;
String gender = split[3] ;
String friend = split[4] ;
String tbName = "tb_user" ;
k.set(uid);
// every field of the custom Writable must be set (writeUTF cannot handle null)
v.set(uid,name,age,gender,friend,"",tbName);
}else{ // order data, e.g. order033 u005
String[] split = line.split("\\s+");
String oid = split[0] ;
String uid = split[1] ;
k.set(uid);
v.set(uid,"",-1 , "","",oid,"tb_orders");
}
context.write(k,v);
} catch (Exception e) {
e.printStackTrace();
}
}
}
static class JoinReducer extends Reducer<Text, UserAndOrdersWritable ,Text, NullWritable>{
Text k = new Text() ;
/**
* Records with the same user id are grouped together:
* each uid comes with one user record and possibly several order records.
* @param key
* @param values
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void reduce(Text key, Iterable<UserAndOrdersWritable> values, Context context) throws IOException, InterruptedException {
try {
// the single user record for this uid
UserAndOrdersWritable user = new UserAndOrdersWritable() ;
// the order records for this uid
List<UserAndOrdersWritable> ordersList = new ArrayList<>() ;
// split the grouped values: the user record goes into user, each order record into the list
for (UserAndOrdersWritable value : values) {
String tbName = value.getTbName();
if("tb_user".equals(tbName)){ // 用户数据
//用户数据
BeanUtils.copyProperties(user , value);
}else{// 订单数据
// 将订单数据存储在list集合中
UserAndOrdersWritable orders = new UserAndOrdersWritable() ;
BeanUtils.copyProperties(orders , value);
// 添加到list集合
ordersList.add(orders);
}
}
/* inner join version: only users that have orders would be emitted
for (UserAndOrdersWritable ordersWritable : ordersList) {
String oid = ordersWritable.getOid();
//oid,uid,name,age,gender,friend
String kk = oid+","+user ;
k.set(kk) ;
context.write(k,NullWritable.get());
}*/
if(ordersList!=null && ordersList.size()>0){ // right join
// 遍历list 拼接用户数据 写出去
for (UserAndOrdersWritable ordersWritable : ordersList) {
String oid = ordersWritable.getOid();
//oid,uid,name,age,gender,friend
String kk = oid+","+user ;
k.set(kk) ;
context.write(k,NullWritable.get());
}
}else{ // right join
String kk = "null"+","+user ;
k.set(kk) ;
context.write(k,NullWritable.get());
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// 2 get a Job object
Job job = Job.getInstance(conf, "join");
// 3 set the mapper and reducer classes
job.setMapperClass(JoinMapper.class);
job.setReducerClass(JoinReducer.class);
// 4 set the key/value output types of the map and reduce tasks
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(UserAndOrdersWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// 5 set the number of reduce tasks
// job.setNumReduceTasks(2);
// 6 set the input path (local test)
FileInputFormat.setInputPaths(job,new Path("D:\\data\\join\\input"));
// 7 set the output path
FileOutputFormat.setOutputPath(job,new Path("D:\\data\\join\\output3"));
// 8 submit the job
job.waitForCompletion(true) ;
}
}
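For reference, here is a hypothetical run with made-up file contents that match the formats shown in the map comments (the data below is illustrative only, not from the original):

user.txt:
    u003,jiajia,16,female,chunge
orders.txt:
    order033 u003
    order044 u003
output (part-r-00000):
    order033,u003,jiajia,16,female,chunge
    order044,u003,jiajia,16,female,chunge

A user with no orders would come out as null,u00x,... because of the else branch in the reducer.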
1.4 An MR program can run without a reduce stage
Set the number of reduce tasks to 0 and the job becomes map-only: the map output is written directly to the output path with no shuffle or sort, and the result files are named part-m-xxxxx instead of part-r-xxxxx.
package com._51doit.mr.day05.noreducer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @Classname Demo
 * @Date 2020/9/24 0024 11:09
 * @Created by 多易教育-DOIT18
 * @Description: map-only job that upper-cases every input line
 */
public class Demo {
    static class DemoMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        Text k = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String upstr = value.toString().toUpperCase();
            k.set(upstr);
            context.write(k, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // 2 get a Job object
        Job job = Job.getInstance(conf, "demo");
        // 3 set the mapper class (there is no reducer)
        job.setMapperClass(DemoMapper.class);
        // 4 with 0 reducers these describe the map output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5 zero reduce tasks: the job becomes map-only
        job.setNumReduceTasks(0);
        // 6 set the input path (local test)
        FileInputFormat.setInputPaths(job,new Path("D://word.txt"));
        // 7 set the output path
        FileOutputFormat.setOutputPath(job,new Path("D://demo/"));
        // 8 submit the job
        job.waitForCompletion(true) ;
    }
}
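Since there is no shuffle or sort, the output is simply whatever each map task wrote, in input order. A local run over a small file would typically leave something like this (illustrative listing):

D:/demo/
    part-m-00000    <- the upper-cased lines, one file per map task
    _SUCCESS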
2 Data skew case
When the keys are unevenly distributed, some reduce tasks receive a huge share of the data while others receive very little, so a few overloaded reduce tasks drag out the whole job. A common fix, used below, is salting: the first job appends a random suffix to every key so that hot keys are spread evenly across the reducers, and a second job strips the suffix and merges the partial counts, as sketched after this paragraph.
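Before the full MR code, here is a minimal plain-Java sketch of the idea (class and variable names here are illustrative only): a hot key such as "a" is spread over the reducers by a random suffix in stage 1, and stage 2 strips the suffix and merges the partial counts.

import java.util.HashMap;
import java.util.Map;
import java.util.Random;

public class SaltingSketch {
    public static void main(String[] args) {
        Random random = new Random();
        int reduceTasks = 2;
        // stage 1: salt the hot key so it no longer lands on a single reducer
        Map<String, Integer> stage1 = new HashMap<>();
        for (int i = 0; i < 1000; i++) {
            stage1.merge("a-" + random.nextInt(reduceTasks), 1, Integer::sum);
        }
        // stage 2: strip the suffix and merge the partial counts
        Map<String, Integer> stage2 = new HashMap<>();
        stage1.forEach((k, v) -> stage2.merge(k.split("-")[0], v, Integer::sum));
        System.out.println(stage1); // e.g. {a-0=513, a-1=487}
        System.out.println(stage2); // {a=1000}
    }
}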
package com._51doit.mr.day05.skew;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Random;
/**
* @Classname Skew2
* @Date 2020/9/24 0024 14:39
* @Created by 多易教育-DOIT18
* @Description: stage 1 - word count over salted keys
*/
public class Skew2 {
static class Skew2Mapper extends Mapper<LongWritable , Text , Text, IntWritable> {
int reduceTasks = 0 ;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
reduceTasks = context.getNumReduceTasks();
}
Random random = new Random();
Text k = new Text() ;
IntWritable v = new IntWritable(1) ;
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] words = line.split("\\s+");
for (String word : words) {
// generate a random number in [0, reduceTasks)
int i = random.nextInt(reduceTasks); // e.g. 0 or 1 with two reducers
// append it to the word so one hot word is spread across reducers
String kk = word+"-"+i ; // "a" becomes "a-0" or "a-1"
k.set(kk);
context.write(k,v);
}
}
}
static class Skew2Reducer extends Reducer<Text, IntWritable, Text , IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0 ;
for (IntWritable value : values) { // every value is 1, so counting the entries gives the partial count
count++ ;
}
context.write(key ,new IntWritable(count));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// 2 get a Job object
Job job = Job.getInstance(conf, "skew2");
// 3 set the mapper and reducer classes
job.setMapperClass(Skew2Mapper.class);
job.setReducerClass(Skew2Reducer.class);
// 4 set the key/value output types of the map and reduce tasks
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 5 set the number of reduce tasks
job.setNumReduceTasks(2);
// 6 set the input path (local test)
FileInputFormat.setInputPaths(job,new Path("D:\\data\\skew\\input"));
// 7 set the output path
FileOutputFormat.setOutputPath(job,new Path("D:\\data\\skew\\output1"));
// 8 submit the job
job.waitForCompletion(true) ;
}
}
The second job reads the first job's output, strips the random suffix off each key, and merges the partial counts into the final result:
package com._51doit.mr.day05.skew;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Classname Skew3
* @Date 2020/9/24 0024 14:50
* @Created by 多易教育-DOIT18
* @Description: stage 2 - strip the random suffix and merge the partial counts
*/
public class Skew3 {
static class Skew3Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
Text k = new Text();
IntWritable v = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString(); // stage-1 output line, e.g. "a-0\t846": salted word, tab, partial count
String[] split = line.split("-"); // note: assumes the original words contain no "-"
String word = split[0];
// split[1] is the salt digit plus the count, e.g. "0\t846"
String[] split1 = split[1].split("\\s+");
int count = Integer.parseInt(split1[1]);
k.set(word);
v.set(count);
context.write(k,v);
}
}
static class Skew3Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
v.set(count);
context.write(key, v);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// 2 get a Job object
Job job = Job.getInstance(conf, "skew3");
// 3 set the mapper and reducer classes
job.setMapperClass(Skew3Mapper.class);
job.setReducerClass(Skew3Reducer.class);
// 4 set the key/value output types of the map and reduce tasks
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 5 a single reduce task gathers the final result
job.setNumReduceTasks(1);
// 6 the input path is the output of the first job
FileInputFormat.setInputPaths(job,new Path("D:\\data\\skew\\output1"));
// 7 set the output path
FileOutputFormat.setOutputPath(job,new Path("D:\\data\\skew\\res"));
// 8 submit the job
job.waitForCompletion(true) ;
}
}
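The two stages are separate programs here and must be run in order: Skew2 first, then Skew3. A hypothetical driver class (not in the original) could chain them; note that both mains ignore the boolean returned by waitForCompletion, so a stricter driver would have each stage return that flag and abort if stage 1 fails:

package com._51doit.mr.day05.skew;

public class SkewDriver {
    public static void main(String[] args) throws Exception {
        Skew2.main(args); // stage 1: salted counts into D:\data\skew\output1
        Skew3.main(args); // stage 2: reads output1, writes the final counts to D:\data\skew\res
    }
}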