Implementing SQL GROUP BY and JOIN with MapReduce

1. group by:

HiveQL: select deptno, sum(sal) from emp group by deptno order by deptno;

Scenario: compute the total salary of each department.
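
A minimal worked illustration, using the column layout from the comment in the code below (userId,name,job,jobId,time,sal,xx,deptId). The 7900 row is the sample from the code; the other two rows are made up for illustration:

7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
7782,CLARK,MANAGER,7839,1981/6/9,2450,,10
7900,JAMES,CLERK,7698,1981/12/3,950,,30

Expected output (deptno and salary total, tab-separated by the default TextOutputFormat, sorted by deptno):

10	2450
30	2550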

package com.szh.hadoop;

import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SalaryTotalMain {

	protected static final Logger LOG = LoggerFactory.getLogger(SalaryTotalMain.class);

	public static void main(String[] args) throws Exception {
		LOG.info("main方法执行在机器: {}上", InetAddress.getLocalHost());
		LOG.info("Args: " + Arrays.toString(args));
		Configuration conf = new Configuration();
		conf.set("mapreduce.job.jar", System.getProperty("user.dir") + File.separator + "test.jar");
		// Create the job: select deptno, sum(sal) from emp group by deptno order by deptno;
		// userId,name,job,jobId,time,sal,xx,deptId
		// 7900,JAMES,CLERK,7698,1981/12/3,950,,30
		Job job = Job.getInstance(conf);
		job.setJarByClass(SalaryTotalMain.class);

		job.setMapperClass(SalaryTotalMapper.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(IntWritable.class);

		job.setReducerClass(SalaryTotalReducer.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		job.waitForCompletion(true);
	}

	public static class SalaryTotalMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

		@Override
		protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
			LOG.info("源文本数据(map输入数据): {} : {}", key, text);
			// 数据:7900,JAMES,CLERK,7698,1981/12/3,950,,30
			String data = text.toString();
			String[] cols = data.split(",");
			context.write(new IntWritable(Integer.parseInt(cols[7])), new IntWritable(Integer.parseInt(cols[5])));
			LOG.info("\tmap: <{},{}>", cols[7], cols[5]);
		}

	}

	public static class SalaryTotalReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

		@Override
		protected void reduce(IntWritable deptId, Iterable<IntWritable> salarys, Context context)
				throws IOException, InterruptedException {
			LOG.info("reduce输入数据: {},{}", deptId, salarys);
			int total = 0;
			for (IntWritable salary : salarys) {
				total += salary.get();
				LOG.info("\treduce循环: <{},{}>", deptId, salary.get());
			}
			context.write(deptId, new IntWritable(total));
			LOG.info("本次reduce结束: <{},{}>", deptId, total);
		}
	}
}
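
Since the summation is associative and SalaryTotalReducer's input and output types are identical (IntWritable/IntWritable), the same class could also be registered as a combiner to pre-aggregate salaries on the map side. This is not part of the original job setup, just an optional sketch that would sit next to the other job.set* calls:

		// Optional: pre-aggregate per-department salary sums on the map side
		job.setCombinerClass(SalaryTotalReducer.class);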

2. reduce side join:

HiveQL: select * from user right join history on user.id=history.user_id;

Scenario: output every login-history record, each joined with the full information of the user it belongs to.

Approach: every line read from the two tables (HDFS files) is tagged, based on the path it came from, with a marker identifying its source table, so that the two sides can be told apart on the reduce side; the join itself is then performed in the reducer.

Relevant API:

// Get the input path of the current split
String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
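
A small trace of the tagging, with hypothetical file contents (only the first column, the id, matters to the join logic; SEPERATOR is "#"):

users/part-0:        1,Tom,tom@xx.com
histories/part-0:    1,2019-06-17 09:00,success
                     2,2019-06-17 10:00,failed

map output:          <1, 0#Tom,tom@xx.com>  <1, 1#2019-06-17 09:00,success>  <2, 1#2019-06-17 10:00,failed>
reduce output:       1	2019-06-17 09:00,success,Tom,tom@xx.com
                     2	2019-06-17 10:00,failed,null,null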

package com.szh.hadoop;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ReduceJoin {

	protected static final Logger LOG = LoggerFactory.getLogger(ReduceJoin.class);
	private static final String SEPERATOR = "#";

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		conf.set("mapreduce.job.jar", System.getProperty("user.dir") + File.separator + "test.jar");
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 3) {
			LOG.error("Usage: ReduceJoin <in1> <in2> <out>");
			System.exit(2);
		}

		Job job = Job.getInstance(conf, "szh's reducejoin 20190617");
		job.setJarByClass(ReduceJoin.class);
		job.setMapperClass(ReduceJoinMapper.class);
		job.setReducerClass(ReduceJoinReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(Text.class);
		for (int i = 0; i < otherArgs.length - 1; i++) {
			FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
			LOG.info("Input path: {}", otherArgs[i]);
		}
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[(otherArgs.length - 1)]));
		LOG.info("Output path: {}", otherArgs[(otherArgs.length - 1)]);

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	public static class ReduceJoinMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context)
				throws IOException, InterruptedException {
			// Get the input path of the current split
			String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
			LOG.info("Raw input record (map input) from file {}: {} : {}", pathName, key, value);
			if (pathName.contains("users")) {
				// This line comes from the first input, user (a userId column that itself contains commas would need extra handling; ignored here)
				String[] userInfos = value.toString().split(",");
				String otherInfo = value.toString().substring(value.toString().indexOf(",") + 1,
						value.toString().length());
				context.write(new LongWritable(Long.valueOf(userInfos[0])), new Text(0 + SEPERATOR + otherInfo));
				LOG.info("\tmap_user: <{},{}>", userInfos[0], otherInfo);
			} else if (pathName.contains("histories")) {
				// This line comes from the second input, history (same caveat as above)
				String[] histories = value.toString().split(",");
				String otherInfo = value.toString().substring(value.toString().indexOf(",") + 1,
						value.toString().length());
				context.write(new LongWritable(Long.valueOf(histories[0])), new Text(1 + SEPERATOR + otherInfo));
				LOG.info("\tmap_history: <{},{}>", histories[0], otherInfo);
			} else {
				LOG.error("Inner Error: {}", pathName);
				System.exit(-1);
			}
			LOG.info("本次map结束!");
		}
	}

	public static class ReduceJoinReducer extends Reducer<LongWritable, Text, LongWritable, Text> {

		@Override
		protected void reduce(LongWritable key, Iterable<Text> values,
				Reducer<LongWritable, Text, LongWritable, Text>.Context context)
				throws IOException, InterruptedException {
			LOG.info("reduce输入数据: {},{}", key, values);
			String userInfo = null;
			List<String> historyLst = new ArrayList<String>();
			for (Text text : values) {
				LOG.info("\treduce循环: <{},{}>", key, text);
				String[] texts = text.toString().split(SEPERATOR);
				if ("0".equals(texts[0])) {
					userInfo = texts[1];
				} else if ("1".equals(texts[0])) {
					historyLst.add(texts[1]);
				} else {
					LOG.error("Inner Error: {}", texts[0]);
					System.exit(-1);
				}
			}
			// user right join history: a history without a matching user is still emitted, padded with nulls
			userInfo = userInfo == null ? "null,null" : userInfo;
			for (String history : historyLst) {
				context.write(key, new Text(history + "," + userInfo));
			}
			LOG.info("本次reduce结束!");
		}
	}

}
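
A hedged invocation sketch (the test.jar name comes from the driver code above; the HDFS paths are hypothetical). Note that the mapper tells the two inputs apart purely by whether the split's path contains "users" or "histories", so the input directories must be named accordingly:

hadoop jar test.jar com.szh.hadoop.ReduceJoin /user/szh/users /user/szh/histories /user/szh/join_out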

3. map side join:

HiveQL: select * from user right join history on user.id=history.user_id;

Scenario: output every login-history record, each joined with the full information of the user it belongs to.

Approach: of the two input tables (HDFS files), one is usually relatively small (login-history data is massive, while the number of users is comparatively tiny). The small user table is held in memory: before the job starts, the JobTracker obtains the list of cache-file URIs and copies the corresponding files to the local disk of each TaskTracker, so the user data is shared across tasks; then, in map(), each history record looks up the matching user information in memory and the join is stitched together there. Compared with the reduce-side join, this avoids the cost of shipping the user information to the reduce side.

Relevant APIs:

// Add a cache file to the job
job.addCacheFile(URI.create(HDFS_URI + otherArgs[0]));

// Get the cache files
URI[] cacheFiles = context.getCacheFiles();
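
In this driver, the first argument is the user file that gets registered as a cache file (the code prepends HDFS_URI, so it is an HDFS path), the second is the history input, and the third is the output directory; the reducer below is essentially a pass-through, since the join already happens in the mapper. A hedged invocation sketch with hypothetical paths:

hadoop jar test.jar com.szh.hadoop.MapJoin /user/szh/users/users.csv /user/szh/histories /user/szh/mapjoin_out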

package com.szh.hadoop;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MapJoin {

	protected static final Logger LOG = LoggerFactory.getLogger(MapJoin.class);
	private static final String HDFS_URI = "hdfs://10.1.4.18:8020";

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		conf.set("mapreduce.job.jar", System.getProperty("user.dir") + File.separator + "test.jar");
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 3) {
			LOG.error("Usage: MapJoin <in1> <in2> <out>");
			System.exit(2);
		}
		Job job = Job.getInstance(conf, "szh's mapjoin 20190617");
		job.setJarByClass(MapJoin.class);
		job.setMapperClass(MapJoinMapper.class);
		job.setReducerClass(MapJoinReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(Text.class);

		job.addCacheFile(URI.create(HDFS_URI + otherArgs[0]));

		FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
		LOG.info("Input path: {}", otherArgs[1]);
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	public static class MapJoinMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

		// In-memory copy of the small user table, keyed by userId
		private final Map<String, String> userMap = new HashMap<String, String>();

		@Override
		protected void setup(Mapper<LongWritable, Text, LongWritable, Text>.Context context)
				throws IOException, InterruptedException {
			// Load the cached user file once per task (instead of once per record)
			URI[] cacheFiles = context.getCacheFiles();
			if (cacheFiles == null || cacheFiles.length == 0) {
				LOG.error("Inner Error: no cache file found!");
				System.exit(-1);
			}
			LOG.info("Cached file: {}", cacheFiles[0]);
			Configuration conf = new Configuration();// load the client configuration
			conf.set("dfs.client.use.datanode.hostname", "true");
			FileSystem fs = FileSystem.get(URI.create(HDFS_URI), conf, "hdfs");
			Path path = new Path(cacheFiles[0]);
			LOG.info("Reading the cached user file from: {}", fs.getFileStatus(path).getPath());

			FSDataInputStream fsis = fs.open(fs.getFileStatus(path).getPath());
			String str;
			while ((str = fsis.readLine()) != null) {
				String userId = str.substring(0, str.indexOf(","));
				String otherInfo = str.substring(str.indexOf(",") + 1, str.length());
				userMap.put(userId, otherInfo);
				LOG.info("\tuser: <{},{}>", userId, otherInfo);
			}
			fsis.close();
		}

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context)
				throws IOException, InterruptedException {
			LOG.info("Raw input record (map input): {} : {}", key, value);
			// Join each history record against the in-memory user table
			String userId = value.toString().substring(0, value.toString().indexOf(","));
			String otherInfo = value.toString().substring(value.toString().indexOf(",") + 1, value.toString().length());
			String fullInfo = null;
			if (userMap.containsKey(userId)) {
				fullInfo = otherInfo + "," + userMap.get(userId);
			} else {
				// user right join history: keep the history even when no user matches
				fullInfo = otherInfo + "," + "null,null";
			}
			context.write(new LongWritable(Long.valueOf(userId)), new Text(fullInfo));
			LOG.info("\tmap: <{},{}>", userId, fullInfo);
			LOG.info("Map call finished!");
		}
	}

	public static class MapJoinReducer extends Reducer<LongWritable, Text, LongWritable, Text> {
		@Override
		protected void reduce(LongWritable key, Iterable<Text> values,
				Reducer<LongWritable, Text, LongWritable, Text>.Context context)
				throws IOException, InterruptedException {
			LOG.info("reduce输入数据: {},{}", key, values);
			for (Text text : values) {
				context.write(key, text);
				LOG.info("\treduce循环: <{},{}>", key, text);
			}
			LOG.info("本次reduce结束!");
		}
	}

}

Reprinted from blog.csdn.net/songzehao/article/details/92767411