MapReduce求最大值

思想：

在map端找出map端的最大值（局部）。map的cleanup方法输出这个最大值，再通过reduce端将map端的局部最大进行比较。求出最终最大的那个值。

怎么样找出map端的最大值？老样子，先看数据！

销售时间	        社保卡号	        商品编码	商品名称	    销售数量	应收金额	 实收金额 
2018-01-01	001616528	236701	强力VC银翘片	6.0	82.8	69.0
2018-01-01	0012697828	861464	复方利血平片(复方降压片)	4.0	10.0	9.4
2018-01-01	0010060654328	861458	复方利血平氨苯蝶啶片(北京降压0号)	1.0	10.3	9.2
2018-01-01	0011811728	861456	酒石酸美托洛尔片(倍他乐克)	1.0	7.0	6.3
2018-01-01	0013448228	861507	苯磺酸氨氯地平片(安内真)	1.0	9.5	8.5

数据准备了两个文件 file1 和file2 这样容易体现最后reduce的作用

需求是计算这个数据片段中的最大值,并输出药品名称和实收金额;

也就是要输出商品的名称和实收金额，并且找出金额最大值的商品

package com.hnxy.mr.max;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.hnxy.mr.max.MaxWrod.MaxMapper;
import com.hnxy.mr.max.MaxWrod.MaxReducer;

public class MaxWrod3 extends Configured implements Tool {
	public static class MaxMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
		private Text outkey = new Text();
		private DoubleWritable outval = new DoubleWritable();
		private Double maxval = 0D;
		private String maxkey = "";
		String[] star = null;

		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			// 数据 T2018-01-01 001616528 236701 强力VC银翘片 6.0 82.8 69.0
			// 按照\t分割
			star = value.toString().split("\t");
			// 每行正确的数据数组的长度为7 并且部位null
			if (star.length == 7 && null != star) {
				// maxkey 如果小于数组第六为也就是实收金额 那么就maxval就等于这个数
				if (maxval < Double.parseDouble(star[6])) {
					maxval = Double.parseDouble(star[6]);
					// 这时候maxkey就等于数组的第三位也就是药品名称
					maxkey = star[3];
				}
			}

		}

		@Override
		protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			// 这里为什么要用cleanup，因为cleanup只执行一次，而且是最后一次执行。因为要释放资源
			// 输出最后最大值
			outkey.set(maxkey);
			outval.set(maxval);
			context.write(outkey, outval);

		}
	}

	public static class MaxReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
		private Text outkey = new Text();
		private DoubleWritable outval = new DoubleWritable();
		private Double maxval = 0D;
		private String maxkey = "";

		@Override
		protected void reduce(Text key, Iterable<DoubleWritable> values,
				Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			// 迭代器遍历所有的实收金额
			DoubleWritable value = values.iterator().next();
			// 最后进行判断 将多个map的最大值进行比较、
			if (maxval < value.get()) {
				maxval = value.get();
				maxkey = key.toString();
			}

		}

		@Override
		protected void cleanup(Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			// TODO Auto-generated method stub
			outkey.set(maxkey);
			outval.set(maxval);
			context.write(outkey, outval);

		}
	}

	@Override
	public int run(String[] args) throws Exception {
		// 设置Configretion
		Configuration conf = this.getConf();
		// 设置job
		Job job = Job.getInstance();
		job.setJarByClass(MaxWrod3.class);
		// map reduce类
		job.setMapperClass(MaxMapper.class);
		job.setReducerClass(MaxReducer.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(DoubleWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(DoubleWritable.class);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// 设置文件路径
		Path in = new Path(args[0]);
		Path out = new Path(args[1]);
		// 设置hdfs操作对象
		FileSystem fs = FileSystem.get(conf);
		// 绑定文件输出输入目录
		FileInputFormat.addInputPath(job, in);
		FileOutputFormat.setOutputPath(job, out);
		// 自动删除
		if (fs.exists(out)) {
			fs.delete(out, true);
			// 提示
			System.out.println(job.getJobName() + "'s Path Output is deleted");
		}
		// 执行
		boolean con = job.waitForCompletion(true);
		if (con) {
			System.out.println("ok");
		} else {
			System.out.println("file");
		}
		// FileInputFormat.addInputPath(job, path);
		return 0;
	}

	public static void main(String[] args) throws Exception {
		// TODO Auto-generated method stub
		System.exit(ToolRunner.run(new MaxWrod3(), args));
	}
}

第二种求最大值的方法：

我们可以将上面要输出的药品的名字和价格用特殊符号链接成一个val 然后map阶段使用统一的key进行val的传输,这样reduce阶段就不再进行一次cleanup了

package com.hnxy.mr.max;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.hnxy.mr.max.MaxWrod.MaxMapper;
import com.hnxy.mr.max.MaxWrod.MaxReducer;

public class MaxWrod4 extends Configured implements Tool {
	// 定义分隔符
	private static final String SPLIT_STP = "\001";

	public static class MaxMapper extends Mapper<LongWritable, Text, Text, Text> {
		private Text outkey = new Text();
		private Text outval = new Text();
		private Double maxval = 0D;
		private String maxkey = "";
		String[] star = null;

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// 数据 T2018-01-01 001616528 236701 强力VC银翘片 6.0 82.8 69.0
			// 按照\t分割
			star = value.toString().split("\t");
			// 每行正确的数据数组的长度为7 并且部位null
			if (star.length == 7 && null != star) {
				// maxkey 如果小于数组第六为也就是实收金额 那么就maxval就等于这个数
				if (maxval < Double.parseDouble(star[6])) {
					maxval = Double.parseDouble(star[6]);
					// 这时候maxkey就等于数组的第三位也就是药品名称
					maxkey = star[3];
				}
			}

		}

		@Override
		protected void cleanup(Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// 这里为什么要用cleanup，因为cleanup只执行一次，而且是最后一次执行。因为要释放资源
			// 输出最后最大值
			outkey.set("max");
			outval.set(maxkey + SPLIT_STP + maxval);
			context.write(outkey, outval);

		}
	}

	public static class MaxReducer extends Reducer<Text, Text, Text, DoubleWritable> {
		private Text outkey = new Text();
		private DoubleWritable outval = new DoubleWritable();
		private Double maxval = 0D;
		private String maxkey = "";
		private String[] strs = null;

		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Reducer<Text, Text, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {
			// 迭代器遍历所有的实收金额
			for (Text t : values) {
				System.out.println(t);
				strs = t.toString().split(SPLIT_STP);
				// 最后进行判断 将多个map的最大值进行比较
				if (maxval < Double.parseDouble(strs[1])) {
					maxkey = strs[0];
					maxval = Double.parseDouble(strs[1]);
				}
			}
			outkey.set(maxkey);
			outval.set(maxval);
			context.write(outkey, outval);
		}

	}

	@Override
	public int run(String[] args) throws Exception {
		// 设置Configretion
		Configuration conf = this.getConf();
		// 设置job
		Job job = Job.getInstance();
		job.setJarByClass(MaxWrod4.class);
		// map reduce类
		job.setMapperClass(MaxMapper.class);
		job.setReducerClass(MaxReducer.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(DoubleWritable.class);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		// 设置文件路径
		Path in = new Path(args[0]);
		Path out = new Path(args[1]);
		// 设置hdfs操作对象
		FileSystem fs = FileSystem.get(conf);
		// 绑定文件输出输入目录
		FileInputFormat.addInputPath(job, in);
		FileOutputFormat.setOutputPath(job, out);
		// 自动删除
		if (fs.exists(out)) {
			fs.delete(out, true);
			// 提示
			System.out.println(job.getJobName() + "'s Path Output is deleted");
		}
		// 执行
		boolean con = job.waitForCompletion(true);
		if (con) {
			System.out.println("ok");
		} else {
			System.out.println("file");
		}
		// FileInputFormat.addInputPath(job, path);
		return 0;
	}

	public static void main(String[] args) throws Exception {
		// TODO Auto-generated method stub
		System.exit(ToolRunner.run(new MaxWrod4(), args));
	}
}

我还有第三种方法！map阶段使用combiner进行局部聚合省去map阶段的cleanup

但是这里就不写了，因为map() 中输出所有数据，会产生大量的溢写排序合并，效率要比写的两个的低。

今天就到着这里，晚安(￣▽￣)"

爱吃芝麻

发布了95 篇原创文章 · 获赞 0 · 访问量 1073

私信关注

猜你喜欢