一、group by:
HiveQL:select deptno, sum(sal) from emp group by deptno order by deptno;
场景模拟:要求计算出每个部门的工资总额。
package com.szh.hadoop;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * MapReduce counterpart of the HiveQL statement
 * {@code select deptno, sum(sal) from emp group by deptno order by deptno}.
 *
 * <p>Input rows are comma-separated employee records laid out as
 * {@code userId,name,job,jobId,time,sal,xx,deptId}, for example
 * {@code 7900,JAMES,CLERK,7698,1981/12/3,950,,30}. The job emits one
 * {@code <deptId, salary total>} pair per department.
 */
public class SalaryTotalMain {
    protected static final Logger LOG = LoggerFactory.getLogger(SalaryTotalMain.class);

    /** Zero-based index of the salary column in an input row. */
    private static final int SALARY_COLUMN = 5;

    /** Zero-based index of the department id column in an input row. */
    private static final int DEPT_COLUMN = 7;

    /** Counter group used to report rows the mapper had to skip. */
    private static final String COUNTER_GROUP = "SalaryTotal";

    /**
     * Configures and runs the job, then terminates the JVM with the job status.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path;
     *             any further arguments are ignored
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        LOG.info("main方法执行在机器: {}上", InetAddress.getLocalHost());
        LOG.info("Args: {}", Arrays.toString(args));
        if (args.length < 2) {
            // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
            LOG.error("Usage: SalaryTotalMain <in> <out>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.job.jar", System.getProperty("user.dir") + File.separator + "test.jar");

        Job job = Job.getInstance(conf);
        job.setJarByClass(SalaryTotalMain.class);
        job.setMapperClass(SalaryTotalMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(SalaryTotalReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job outcome to the caller; previously a failed job still exited with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits {@code <deptId, salary>} for every well-formed employee row.
     * Rows that are too short or whose department/salary is not an integer are
     * logged, counted under the {@code SalaryTotal} counter group and skipped.
     */
    public static class SalaryTotalMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
        // Reused across calls; context.write serializes the values immediately.
        private final IntWritable deptId = new IntWritable();
        private final IntWritable salary = new IntWritable();

        /**
         * @param key     byte offset of the row in the input file
         * @param text    one comma-separated employee row
         * @param context task context used to emit the pair and to count skipped rows
         */
        @Override
        protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
            LOG.info("源文本数据(map输入数据): {} : {}", key, text);
            String[] cols = text.toString().split(",");
            // String.split drops trailing empty columns, so a row ending in ",," is shorter than expected.
            if (cols.length <= DEPT_COLUMN) {
                LOG.warn("Skipping row at offset {}: expected at least {} columns but found {}", key,
                        DEPT_COLUMN + 1, cols.length);
                context.getCounter(COUNTER_GROUP, "TOO_FEW_COLUMNS").increment(1);
                return;
            }
            try {
                deptId.set(Integer.parseInt(cols[DEPT_COLUMN].trim()));
                salary.set(Integer.parseInt(cols[SALARY_COLUMN].trim()));
            } catch (NumberFormatException e) {
                LOG.warn("Skipping row at offset {}: non-numeric deptId '{}' or salary '{}'", key,
                        cols[DEPT_COLUMN], cols[SALARY_COLUMN]);
                context.getCounter(COUNTER_GROUP, "NON_NUMERIC_FIELD").increment(1);
                return;
            }
            context.write(deptId, salary);
            LOG.info("\tmap: <{},{}>", cols[DEPT_COLUMN], cols[SALARY_COLUMN]);
        }
    }

    /**
     * Sums all salaries received for one department and emits {@code <deptId, total>}.
     */
    public static class SalaryTotalReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        /**
         * @param deptId  department id shared by all values
         * @param salarys salaries emitted by the mappers for this department
         * @param context task context used to emit the total
         * @throws ArithmeticException if the total does not fit into an {@code int}
         */
        @Override
        protected void reduce(IntWritable deptId, Iterable<IntWritable> salarys, Context context)
                throws IOException, InterruptedException {
            LOG.info("reduce输入数据: {},{}", deptId, salarys);
            // Accumulate in a long so that an int overflow is detected instead of silently wrapping.
            long total = 0L;
            for (IntWritable salary : salarys) {
                total += salary.get();
                LOG.info("\treduce循环: <{},{}>", deptId, salary.get());
            }
            if (total > Integer.MAX_VALUE || total < Integer.MIN_VALUE) {
                throw new ArithmeticException(
                        "Salary total " + total + " of department " + deptId + " does not fit into an int");
            }
            context.write(deptId, new IntWritable((int) total));
            LOG.info("本次reduce结束: <{},{}>", deptId, total);
        }
    }
}
二、reduce side join:
HiveQL:select * from user right join history on user.id=history.user_id;
场景模拟:要求输出每一条登录历史且包含所属用户的完整信息。
实现方式:对于输入的两个表或hdfs文件的每一行信息,都需要根据当前信息的来源路径,给自身加区分标志,以便在reduce端区分开来,最后在reduce端进行join。
相关API:
// 获得文件输入路径
String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
package com.szh.hadoop;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reduce-side join implementing
 * {@code select * from user right join history on user.id=history.user_id}.
 *
 * <p>The mapper tags every row with its origin ({@code 0} for rows read from a path
 * containing {@code users}, {@code 1} for rows read from a path containing
 * {@code histories}) and keys it by the numeric value of its first column. The
 * reducer then appends the user row to every history row that shares the key.
 */
public class ReduceJoin {
    protected static final Logger LOG = LoggerFactory.getLogger(ReduceJoin.class);
    private static final String SEPERATOR = "#";

    /** Tag prepended to rows that come from the user input. */
    private static final String USER_TAG = "0";

    /** Tag prepended to rows that come from the history input. */
    private static final String HISTORY_TAG = "1";

    /** Counter group used to report rows the mapper had to skip. */
    private static final String COUNTER_GROUP = "ReduceJoin";

    /**
     * Configures and runs the join job, then terminates the JVM with the job status.
     *
     * @param args generic Hadoop options followed by {@code <in1> <in2> <out>}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.job.jar", System.getProperty("user.dir") + File.separator + "test.jar");
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            LOG.error("Usage: ReduceJoin <in1> <in2> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "szh's reducejoin 20190617");
        job.setJarByClass(ReduceJoin.class);
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Every argument except the last one is an input path; the last one is the output path.
        int outputIndex = otherArgs.length - 1;
        for (int i = 0; i < outputIndex; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
            LOG.info("Input path: {}", otherArgs[i]);
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));
        LOG.info("Output path: {}", otherArgs[outputIndex]);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Keys each row by its first column and prefixes the remaining columns with a
     * tag telling the reducer which input the row came from.
     */
    public static class ReduceJoinMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        // Reused across calls; context.write serializes the values immediately.
        private final LongWritable outKey = new LongWritable();
        private final Text outValue = new Text();

        /**
         * @param key     byte offset of the row in its input file
         * @param value   one comma-separated row whose first column is the join key
         * @param context task context; its input split must be a {@link FileSplit}
         * @throws IOException if the row comes from a path that is neither a user
         *                     nor a history input
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // The path of the file this split belongs to decides which table the row is from.
            String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
            LOG.info("源文本数据(map输入数据)来自文件{}: {} : {}", pathName, key, value);

            boolean fromUsers = pathName.contains("users");
            if (!fromUsers && !pathName.contains("histories")) {
                LOG.error("Inner Error: {}", pathName);
                // Fail the task through the framework; System.exit would kill the task JVM
                // without giving Hadoop a chance to report the cause.
                throw new IOException("Input path is neither a users nor a histories file: " + pathName);
            }

            // First column is the join key, the rest of the line is carried along unchanged.
            // (Join keys that themselves contain a comma are not supported.)
            String line = value.toString();
            int comma = line.indexOf(',');
            String idText = (comma < 0 ? line : line.substring(0, comma)).trim();
            String otherInfo = comma < 0 ? "" : line.substring(comma + 1);

            long id;
            try {
                id = Long.parseLong(idText);
            } catch (NumberFormatException e) {
                // Blank lines, header lines and corrupt ids must not abort the whole job.
                LOG.warn("Skipping row at offset {} of {}: join key '{}' is not a number", key, pathName, idText);
                context.getCounter(COUNTER_GROUP, "MALFORMED_JOIN_KEY").increment(1);
                return;
            }

            outKey.set(id);
            outValue.set((fromUsers ? USER_TAG : HISTORY_TAG) + SEPERATOR + otherInfo);
            context.write(outKey, outValue);
            if (fromUsers) {
                LOG.info("\tmap_user: <{},{}>", idText, otherInfo);
            } else {
                LOG.info("\tmap_history: <{},{}>", idText, otherInfo);
            }
            LOG.info("本次map结束!");
        }
    }

    /**
     * Joins the tagged rows of one key: every history row is emitted with the user
     * row appended, or with {@code null,null} when no user row exists for the key.
     */
    public static class ReduceJoinReducer extends Reducer<LongWritable, Text, LongWritable, Text> {
        /**
         * @param key     join key (user id)
         * @param values  tagged rows of the form {@code <tag>#<columns>}
         * @param context task context used to emit the joined rows
         * @throws IOException if a value carries an unknown tag
         */
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values,
                Reducer<LongWritable, Text, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            LOG.info("reduce输入数据: {},{}", key, values);
            String userInfo = null;
            List<String> historyLst = new ArrayList<String>();
            for (Text text : values) {
                LOG.info("\treduce循环: <{},{}>", key, text);
                // Cut at the FIRST separator only: the payload may itself contain '#'
                // or be empty, both of which String.split would mishandle.
                String tagged = text.toString();
                int sep = tagged.indexOf(SEPERATOR);
                String tag = sep < 0 ? tagged : tagged.substring(0, sep);
                String payload = sep < 0 ? "" : tagged.substring(sep + SEPERATOR.length());
                if (USER_TAG.equals(tag)) {
                    userInfo = payload;
                } else if (HISTORY_TAG.equals(tag)) {
                    historyLst.add(payload);
                } else {
                    LOG.error("Inner Error: {}", tag);
                    throw new IOException("Unknown source tag '" + tag + "' for key " + key);
                }
            }
            // user right join history: keep every history row even without a matching user.
            if (userInfo == null) {
                userInfo = "null,null";
            }
            for (String history : historyLst) {
                context.write(key, new Text(history + "," + userInfo));
            }
            LOG.info("本次reduce结束!");
        }
    }
}
三、map side join:
HiveQL:select * from user right join history on user.id=history.user_id;
场景模拟:要求输出每一条登录历史且包含所属用户的完整信息。
实现方式:对于输入的两个表或HDFS文件,通常会存在一个相对较小的表(登录历史数据是海量的,而用户数量却相对少很多)。先通过job.addCacheFile把小表user注册为缓存文件;JobTracker在作业启动之前会获取缓存文件的URI列表,并将相应的文件拷贝到各个TaskTracker的本地磁盘上,以实现user数据的共享。然后在map端将小表user加载到内存中,对每一条history,从内存中取出对应的user信息进行join拼接。相比reduce join的方式,避免了把user的信息发送到reduce端的开销。
相关API:
// 为job添加缓存文件
job.addCacheFile(URI.create(HDFS_URI + otherArgs[0]));
// 获取缓存文件
URI[] cacheFiles = context.getCacheFiles();
package com.szh.hadoop;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Map-side join implementing
 * {@code select * from user right join history on user.id=history.user_id}.
 *
 * <p>The (small) user file is registered as a cache file of the job. Each mapper
 * reads it once into memory and then joins every history row against that map,
 * so no user row has to be sent to the reducers. The reducer only forwards the
 * already joined rows.
 */
public class MapJoin {
    protected static final Logger LOG = LoggerFactory.getLogger(MapJoin.class);
    private static final String HDFS_URI = "hdfs://10.1.4.18:8020";

    /** Counter group used to report rows the mapper had to skip. */
    private static final String COUNTER_GROUP = "MapJoin";

    /**
     * Configures and runs the join job, then terminates the JVM with the job status.
     *
     * @param args generic Hadoop options followed by {@code <in1> <in2> <out>}, where
     *             {@code in1} is the user file (path relative to the HDFS root URI),
     *             {@code in2} the history input and {@code out} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.job.jar", System.getProperty("user.dir") + File.separator + "test.jar");
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            LOG.error("Usage: MapJoin <in1> <in2> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "szh's mapjoin 20190617");
        job.setJarByClass(MapJoin.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setReducerClass(MapJoinReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // The user table is shipped as a cache file; only the history table is a regular input.
        job.addCacheFile(URI.create(HDFS_URI + otherArgs[0]));
        FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
        LOG.info("Input path: {}", otherArgs[1]);
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Joins every history row with the user row of the same id, taken from an
     * in-memory copy of the cached user file.
     */
    public static class MapJoinMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        /** userId -> remaining user columns; filled once per task in {@code setup}. */
        private final Map<String, String> userMap = new HashMap<String, String>();

        // Reused across calls; context.write serializes the values immediately.
        private final LongWritable outKey = new LongWritable();
        private final Text outValue = new Text();

        /**
         * Loads the cached user file into {@link #userMap}. Doing this here instead of
         * in {@code map} avoids re-reading the whole file from HDFS for every history row.
         *
         * @throws IOException if the job has no cache file or the file cannot be read
         */
        @Override
        protected void setup(Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            URI[] cacheFiles = context.getCacheFiles();
            if (cacheFiles == null || cacheFiles.length == 0) {
                LOG.error("Inner Error!");
                // Fail the task through the framework rather than killing the JVM with System.exit.
                throw new IOException("No cache file with user data was registered for this job");
            }
            LOG.info("Cached file: {}", cacheFiles[0]);

            Configuration conf = new Configuration();
            conf.set("dfs.client.use.datanode.hostname", "true");
            FileSystem fs = FileSystem.get(URI.create(HDFS_URI), conf, "hdfs");
            Path path = fs.getFileStatus(new Path(cacheFiles[0])).getPath();
            LOG.info("要查看的缓存文件路径为: {}", path);

            FSDataInputStream in = fs.open(path);
            try {
                // Decode explicitly as UTF-8; the stream's own deprecated readLine() maps each
                // byte to one char and therefore corrupts non-ASCII user data.
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    int comma = line.indexOf(',');
                    if (comma < 0) {
                        // Blank or single-column lines carry no user information.
                        LOG.warn("Skipping user line without a comma: '{}'", line);
                        continue;
                    }
                    String userId = line.substring(0, comma).trim();
                    String otherInfo = line.substring(comma + 1);
                    userMap.put(userId, otherInfo);
                    LOG.info("\tuser: <{},{}>", userId, otherInfo);
                }
            } finally {
                // Close even when reading fails so the HDFS stream is not leaked.
                in.close();
            }
        }

        /**
         * @param key     byte offset of the row in the history input
         * @param value   one comma-separated history row whose first column is the user id
         * @param context task context used to emit {@code <userId, history columns + user columns>}
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            LOG.info("源文本数据(map输入数据): {} : {}", key, value);
            String line = value.toString();
            int comma = line.indexOf(',');
            if (comma < 0) {
                LOG.warn("Skipping history row at offset {}: no comma found", key);
                context.getCounter(COUNTER_GROUP, "MALFORMED_HISTORY_ROW").increment(1);
                return;
            }
            String userId = line.substring(0, comma).trim();
            String otherInfo = line.substring(comma + 1);

            long id;
            try {
                id = Long.parseLong(userId);
            } catch (NumberFormatException e) {
                LOG.warn("Skipping history row at offset {}: user id '{}' is not a number", key, userId);
                context.getCounter(COUNTER_GROUP, "MALFORMED_JOIN_KEY").increment(1);
                return;
            }

            // user right join history: keep the history row even without a matching user.
            String userInfo = userMap.get(userId);
            String fullInfo = otherInfo + "," + (userInfo != null ? userInfo : "null,null");

            outKey.set(id);
            outValue.set(fullInfo);
            context.write(outKey, outValue);
            LOG.info("\tmap: <{},{}>", userId, fullInfo);
            LOG.info("本次map结束!");
        }
    }

    /**
     * Pass-through reducer: writes every joined row it receives unchanged.
     */
    public static class MapJoinReducer extends Reducer<LongWritable, Text, LongWritable, Text> {
        /**
         * @param key     user id
         * @param values  joined rows produced by the mapper for this user id
         * @param context task context used to emit the rows
         */
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values,
                Reducer<LongWritable, Text, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            LOG.info("reduce输入数据: {},{}", key, values);
            for (Text text : values) {
                context.write(key, text);
                LOG.info("\treduce循环: <{},{}>", key, text);
            }
            LOG.info("本次reduce结束!");
        }
    }
}