Hadoop-MapReduce2的几个基本示例

去重处理(Distinct)

类似于db中的select distinct(x) from table , 去重处理甚至比WordCount还要简单,假如我们要对以下文件的内容做去重处理(注:该文件也是后面几个示例的输入参数)

基本上啥也不用做,在map阶段,把每一行的值当成key分发下去,然后在reduce阶段回收上来就可以了.

注:里面用到了一个自己写的类HDFSUtil,可以在 hadoop: hdfs API示例一文中找到.

原理:map阶段完成后,在reduce开始之前,会经过shuffle(排序、分组)的过程,相同的key值会被自动归并到同一组(本例还配置了combiner,在map端先做一次本地合并),reduce阶段对每个key只输出一次,所以自然而然的就去掉了重复.

 
     
   
package yjmyzz.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import yjmyzz.util.HDFSUtil;

import java.io.IOException;


/**
 * MapReduce job that removes duplicate lines from its input.
 *
 * <p>Each input line is emitted as a map output key with an empty value;
 * the reducer then writes every distinct key exactly once.
 */
public class RemoveDup {

    /** Emits each input line as a key paired with {@link NullWritable}. */
    public static class RemoveDupMapper
            extends Mapper<Object, Text, Text, NullWritable> {

        /**
         * Forwards the line unchanged as the output key.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context task context used to emit the output pair
         */
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            final NullWritable empty = NullWritable.get();
            context.write(value, empty);
        }
    }

    /** Writes each distinct key once, ignoring the grouped values. */
    public static class RemoveDupReducer
            extends Reducer<Text, NullWritable, Text, NullWritable> {

        /**
         * Emits the key a single time regardless of how many values were grouped under it.
         *
         * @param key     a distinct input line
         * @param values  placeholders, one per occurrence of the line (unused)
         * @param context task context used to emit the output pair
         */
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            final NullWritable empty = NullWritable.get();
            context.write(key, empty);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args generic Hadoop options followed by one or more input paths and,
     *             last, the output path
     */
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final String[] paths = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (paths.length < 2) {
            System.err.println("Usage: RemoveDup <in> [<in>...] <out>");
            System.exit(2);
        }

        final int outputIndex = paths.length - 1;
        final String outputDir = paths[outputIndex];

        // Delete the output directory first (optional): avoids the
        // "output directory already exists" failure on repeated runs.
        HDFSUtil.deleteFile(conf, outputDir);

        final Job job = Job.getInstance(conf, "RemoveDup");
        job.setJarByClass(RemoveDup.class);
        job.setMapperClass(RemoveDupMapper.class);
        job.setCombinerClass(RemoveDupReducer.class);
        job.setReducerClass(RemoveDupReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Every argument before the last one is an input path.
        int inputIndex = 0;
        while (inputIndex < outputIndex) {
            FileInputFormat.addInputPath(job, new Path(paths[inputIndex]));
            inputIndex++;
        }
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        final boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
     
   

输出:

3) 记录计数(Count)

这个跟WordCount略有不同,类似于Select Count(*) from tables的效果,代码也超级简单,直接拿WordCount改一改就行了

 
     
   
package yjmyzz.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import yjmyzz.util.HDFSUtil;

import java.io.IOException;
import java.util.StringTokenizer;


public class RowCount {

   public static class RowCountMapper
           extends Mapper<Object, Text, Text, IntWritable> {

       private final static IntWritable one = new IntWritable(1);
       private final  static Text countKey = new Text("count");

       public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
               context.write(countKey, one);
       }
   }

   public static class RowCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
       private IntWritable result = new IntWritable();

       public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
           int sum = 0;
           for (IntWritable val : values) {
               sum += val.get();
           }
           result.set(sum);
           context.write(key, result);
       }
   }

   public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length < 2) {
           System.err.println("Usage: RowCount <in> [<in>...] <out>");
           System.exit(2);
       }
       //删除输出目录(可选)
       HDFSUtil.deleteFile(conf, otherArgs[otherArgs.length - 1]);

       Job job = Job.getInstance(conf, "word count");
       job.setJarByClass(RowCount.class);
       job.setMapperClass(RowCountMapper.class);
       job.setCombinerClass(RowCountReducer.class);
       job.setReducerClass(RowCountReducer.class);
       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(IntWritable.class);
       for (int i = 0; i < otherArgs.length - 1; ++i) {
           FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
       }
       FileOutputFormat.setOutputPath(job,
               new Path(otherArgs[otherArgs.length - 1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
   }


} 
     
   

输出: count 11

注:如果只想输出一个数字,不需要"count"这个key,可以改进一下:

 
     
   
package yjmyzz.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import yjmyzz.util.HDFSUtil;

import java.io.IOException;


public class RowCount2 {

   public static class RowCount2Mapper
           extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

       public long count = 0;

       public void map(LongWritable key, Text value, Context context)
               throws IOException, InterruptedException {
           count += 1;
       }

       protected void cleanup(Context context) throws IOException, InterruptedException {
           context.write(new LongWritable(count), NullWritable.get());
       }

   }

   public static class RowCount2Reducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

       public long count = 0;

       public void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
               throws IOException, InterruptedException {
           count += key.get();
       }


       protected void cleanup(Context context) throws IOException, InterruptedException {
           context.write(new LongWritable(count), NullWritable.get());
       }

   }

   public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length < 2) {
           System.err.println("Usage: FindMax <in> [<in>...] <out>");
           System.exit(2);
       }

       //删除输出目录(可选,省得多次运行时,总是报OUTPUT目录已存在)
       HDFSUtil.deleteFile(conf, otherArgs[otherArgs.length - 1]);

       Job job = Job.getInstance(conf, "RowCount2");
       job.setJarByClass(RowCount2.class);
       job.setMapperClass(RowCount2Mapper.class);
       job.setCombinerClass(RowCount2Reducer.class);
       job.setReducerClass(RowCount2Reducer.class);
       job.setOutputKeyClass(LongWritable.class);
       job.setOutputValueClass(NullWritable.class);

       for (int i = 0; i < otherArgs.length - 1; ++i) {
           FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
       }
       FileOutputFormat.setOutputPath(job,
               new Path(otherArgs[otherArgs.length - 1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
   }


} 
     
   

这样输出结果就只有一个数字11了.

注意: 这里context.write(xxx)只能写在cleanup方法中, 该方法在Mapper和Reducer这两个类中都有, 在一个任务的所有map(或reduce)调用全部执行完后,会触发一次cleanup方法. 大家可以尝试下,把context.write(xxx)写在map和reduce方法中试试看,结果会出现多行记录,而不是预期的仅1个数字.

4)求最大值(Max)

 
     
   
package yjmyzz.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import yjmyzz.util.HDFSUtil;

import java.io.IOException;


public class Max {

   public static class MaxMapper
           extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

       public long max = Long.MIN_VALUE;

       public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
           max = Math.max(Long.parseLong(value.toString()), max);
       }

       protected void cleanup(Mapper.Context context) throws IOException, InterruptedException {
           context.write(new LongWritable(max), NullWritable.get());
       }

   }

   public static class MaxReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

       public long max = Long.MIN_VALUE;

       public void reduce(LongWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {

           max = Math.max(max, key.get());

       }


       protected void cleanup(Reducer.Context context) throws IOException, InterruptedException {
           context.write(new LongWritable(max), NullWritable.get());
       }

   }

   public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length < 2) {
           System.err.println("Usage: Max <in> [<in>...] <out>");
           System.exit(2);
       }

       //删除输出目录(可选,省得多次运行时,总是报OUTPUT目录已存在)
       HDFSUtil.deleteFile(conf, otherArgs[otherArgs.length - 1]);

       Job job = Job.getInstance(conf, "Max");
       job.setJarByClass(Max.class);
       job.setMapperClass(MaxMapper.class);
       job.setCombinerClass(MaxReducer.class);
       job.setReducerClass(MaxReducer.class);
       job.setOutputKeyClass(LongWritable.class);
       job.setOutputValueClass(NullWritable.class);

       for (int i = 0; i < otherArgs.length - 1; ++i) {
           FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
       }
       FileOutputFormat.setOutputPath(job,
               new Path(otherArgs[otherArgs.length - 1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
   }


} 
     
   

输出结果:8

如果看懂了刚才的Count2版本的代码,这个自然不用多解释.

5)求和(Sum)

 
  View Code

输出结果:43

Sum与刚才的Max原理如出一辙,不多解释了,依旧利用了cleanup方法

6)求平均值(Avg)

 
  View Code

输出:3.909090909090909

这个稍微要复杂一点,平均值大家都知道=Sum/Count,所以这其实是前面Count与Sum的综合运用而已,思路是在输出的key-value中,用sum做key,用count做value,最终形成{sum,count}的输出,然后在最后的cleanup中,sum/count即得avg,但是有一个要特别注意的地方,由于Mapper与Reducer的output {key,value}类型并不一致,所以96-101行这里,分别设置了Map及Reduce的key,value输出类型,如果没有96-97这二行,100-101这二行会默认把Mapper,Combiner,Reducer这三者的输出类型设置成相同的类型.

7) 改进型的WordCount(按词频倒排)

官网示例WordCount只统计出单词出现的次数,并未按词频做倒排,下面的代码示例实现了该功能

 
     
   
package yjmyzz.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import yjmyzz.util.HDFSUtil;

import java.io.IOException;
import java.util.Comparator;
import java.util.StringTokenizer;
import java.util.TreeMap;


public class WordCount2 {

   public static class TokenizerMapper
           extends Mapper<Object, Text, Text, IntWritable> {

       private final static IntWritable one = new IntWritable(1);
       private Text word = new Text();

       public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
           StringTokenizer itr = new StringTokenizer(value.toString());
           while (itr.hasMoreTokens()) {
               word.set(itr.nextToken());
               context.write(word, one);
           }
       }
   }

   public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

       //定义treeMap来保持统计结果,由于treeMap是按key升序排列的,这里要人为指定Comparator以实现倒排
       private TreeMap<Integer, String> treeMap = new TreeMap<Integer, String>(new Comparator<Integer>() {
           @Override
           public int compare(Integer x, Integer y) {
               return y.compareTo(x);
           }
       });

       public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
           //reduce后的结果放入treeMap,而不是向context中记入结果
           int sum = 0;
           for (IntWritable val : values) {
               sum += val.get();
           }
           if (treeMap.containsKey(sum)){
               String value = treeMap.get(sum) + "," + key.toString();
               treeMap.put(sum,value);
           }
           else {
               treeMap.put(sum, key.toString());
           }
       }

       protected void cleanup(Context context) throws IOException, InterruptedException {
           //将treeMap中的结果,按value-key顺序写入contex中
           for (Integer key : treeMap.keySet()) {
               context.write(new Text(treeMap.get(key)), new IntWritable(key));
           }
       }
   }

   public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length < 2) {
           System.err.println("Usage: wordcount2 <in> [<in>...] <out>");
           System.exit(2);
       }
       //删除输出目录
       HDFSUtil.deleteFile(conf, otherArgs[otherArgs.length - 1]);
       Job job = Job.getInstance(conf, "word count2");
       job.setJarByClass(WordCount2.class);
       job.setMapperClass(TokenizerMapper.class);
       job.setCombinerClass(IntSumReducer.class);
       job.setReducerClass(IntSumReducer.class);
       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(IntWritable.class);
       for (int i = 0; i < otherArgs.length - 1; ++i) {
           FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
       }
       FileOutputFormat.setOutputPath(job,
               new Path(otherArgs[otherArgs.length - 1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
   }


} 
     
   

原理: 依然用到了cleanup,此外为了实现排序,采用了TreeMap这种内置了key排序的数据结构.

这里为了展示更直观,选用了电影<超能陆战队>主题曲的第一段歌词做为输入:

They say we are what we are
But we do not have to be
I am  bad behavior but I do it in the best way
I will be the watcher
Of the eternal flame
I will be the guard dog
of all your fever dreams

原版的WordCount处理完后,结果如下:

But	1
I	4
Of	1
They	1
all	1
am	1
are	2
bad	1
be	3
behavior	1
best	1
but	1
do	2
dog	1
dreams	1
eternal	1
fever	1
flame	1
guard	1
have	1
in	1
it	1
not	1
of	1
say	1
the	4
to	1
watcher	1
way	1
we	3
what	1
will	2
your	1

改进后的WordCount2处理结果如下:

I,the	4
be,we	3
are,do,will	2
But,Of,They,all,am,bad,behavior,best,but,dog,dreams,eternal,fever,flame,guard,have,in,it,not,of,say,to,watcher,way,what,your	1

Hadoop-MapReduce2的几个基本示例

猜你喜欢