package com.dt.spark.topn;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Get the Top 5 orders and their payment amounts from all order logs.
 *
 * @author yuming
 * @mail: [email protected]
 * @weibo: http://www.weibo.com/yumzhang
 */
public class TopN {

    public static class ForTopNMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
        int[] tops;
        int length;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            length = context.getConfiguration().getInt("topN", 5);
            // One extra slot: new values go into tops[0]; after an ascending
            // sort the smallest value lands back in tops[0], ready to be replaced.
            tops = new int[length + 1];
        }

        /**
         * Each mapper computes its own local Top N, which cuts the data
         * shuffled over the network, speeds up the reduce phase, and guards
         * against OOM on massive data sets.
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] data = value.toString().split(",");
            if (4 == data.length) {
                int cost = Integer.parseInt(data[2]);
                tops[0] = cost;
                Arrays.sort(tops); // ascending sort
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Skip tops[0]: it holds the (length + 1)-th largest value.
            for (int i = 1; i < tops.length; i++) {
                context.write(new IntWritable(tops[i]), new IntWritable(tops[i]));
            }
        }
    }

    public static class ForTopNReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        int[] tops;
        int length;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            length = context.getConfiguration().getInt("topN", 5); // default: Top 5
            tops = new int[length + 1];
        }

        @Override
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Iterate over the values so that equal amounts emitted by
            // different mappers each occupy their own Top N slot.
            for (IntWritable ignored : values) {
                tops[0] = key.get();
                Arrays.sort(tops);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Walk the sorted array from largest to smallest, keyed by rank.
            for (int i = length; i > 0; i--) {
                context.write(new IntWritable(length - i + 1), new IntWritable(tops[i]));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.setInt("topN", 5);
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: TopN <in> [<in>...] <out>");
            System.exit(2);
        }
        // set up the job
        Job job = Job.getInstance(conf, "Sorted TopN Application");
        job.setJarByClass(TopN.class);
        // set Mapper and Reducer classes
        job.setMapperClass(ForTopNMapper.class);
        // job.setCombinerClass(ForSortReducer.class);
        job.setReducerClass(ForTopNReducer.class);
        // set output key/value types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        // set input and output paths
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * dataTopN1.txt (Id,custmId,pay,productId)
 * 1,9819,100,121
 * 2,8918,2000,111
 * 3,2813,1234,22
 * 4,9100,10,1101
 * 5,3210,490,111
 * 6,1298,28,1211
 * 7,1010,281,90
 * 8,1818,9000,20
 * ----------------
 * dataTopN.txt
 * 10,2222,10,1000
 * 11,9321,1000,293
 * 12,3881,999,328
 * 13,8328,1000,66
 */
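For reference, this is roughly how the job would be run against the two sample files above; the jar name and HDFS paths here are placeholders, not from the original post:

hadoop jar topn.jar com.dt.spark.topn.TopN /input/dataTopN1.txt /input/dataTopN.txt /output/topn

Given the twelve sample orders, the five largest payments are 9000, 2000, 1234, 1000 and 1000, so (with the duplicate-aware reduce above) the output file should read, rank followed by payment:

1	9000
2	2000
3	1234
4	1000
5	1000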
A picture is worth a thousand words (the original diagram: mappers each producing a local Top N that feeds a single reducer):
Take full advantage of the parallelism of the Map phase: each mapper computes its own local Top N and only then hands that small result to the Reducer. This raises cluster utilization and prevents OOM. There may well be better algorithms; if you have one, feel free to reply to me.
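On that closing point about better algorithms: one common refinement is to keep the local Top N in a bounded min-heap instead of re-sorting an array on every record, dropping the per-record cost from O(N log N) to O(log N). A minimal, self-contained sketch of the idea (my own illustration, not from the original post), using the sample payments above:

import java.util.PriorityQueue;

public class LocalTopN {
    public static void main(String[] args) {
        int n = 5;
        int[] costs = {100, 2000, 1234, 10, 490, 28, 281, 9000, 10, 1000, 999, 1000};
        // Min-heap capped at n elements: the root is always the smallest of
        // the current top n, so each record costs O(log n) instead of a full
        // Arrays.sort as in the mapper above.
        PriorityQueue<Integer> heap = new PriorityQueue<>(n);
        for (int cost : costs) {
            if (heap.size() < n) {
                heap.offer(cost);
            } else if (cost > heap.peek()) {
                heap.poll();      // evict the current minimum
                heap.offer(cost);
            }
        }
        // Drain the heap; prints 1000, 1000, 1234, 2000, 9000 (ascending).
        while (!heap.isEmpty()) {
            System.out.println(heap.poll());
        }
    }
}

The same structure could replace the tops array in both the Mapper and the Reducer; the insert-and-sort trick in the post is simpler to read, while the heap scales better as N grows.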