Hadoop Notes, Part 3: The WordCount Experiment, Continued


## Introduction ##
The previous post gave a rough overview of Hadoop's Map-Reduce framework. Following the official tutorial, this post extends the WordCount experiment; the additions mainly consist of some job-level control options wired into the Map-Reduce program. Let's take a look.
## Experiment ##
First, the code from the official documentation:

package com.luchi.wordcount;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

public class WordCount2 {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    static enum CountersEnum { INPUT_WORDS }

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    private boolean caseSensitive;
    private Set<String> patternsToSkip = new HashSet<String>();

    private Configuration conf;
    private BufferedReader fis;

    @Override
    public void setup(Context context) throws IOException,
        InterruptedException {
      conf = context.getConfiguration();
      caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
      if (conf.getBoolean("wordcount.skip.patterns", true)) {
        URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
        for (URI patternsURI : patternsURIs) {
          Path patternsPath = new Path(patternsURI.getPath());
          String patternsFileName = patternsPath.getName().toString();
          parseSkipFile(patternsFileName);
        }
      }
    }

    private void parseSkipFile(String fileName) {
      try {
        fis = new BufferedReader(new FileReader(fileName));
        String pattern = null;
        while ((pattern = fis.readLine()) != null) {
          patternsToSkip.add(pattern);
        }
      } catch (IOException ioe) {
        System.err.println("Caught exception while parsing the cached file '"
            + StringUtils.stringifyException(ioe));
      }
    }

    @Override
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      String line = (caseSensitive) ?
          value.toString() : value.toString().toLowerCase();
      for (String pattern : patternsToSkip) {
        line = line.replaceAll(pattern, "");
      }
      StringTokenizer itr = new StringTokenizer(line);

      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
        Counter counter = context.getCounter(CountersEnum.class.getName(),
            CountersEnum.INPUT_WORDS.toString());
        counter.increment(1);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {

      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
    String[] remainingArgs = optionParser.getRemainingArgs();
    if ((remainingArgs.length != 2) && (remainingArgs.length != 4)) {
      System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
      System.exit(2);
    }
    Job job = Job.getInstance(conf, "word count2");
    job.setJarByClass(WordCount2.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    List<String> otherArgs = new ArrayList<String>();
    for (int i=0; i < remainingArgs.length; ++i) {
      if ("-skip".equals(remainingArgs[i])) {
        job.addCacheFile(new Path(remainingArgs[++i]).toUri());
        job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
      } else {
        otherArgs.add(remainingArgs[i]);
      }
    }
    FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));


    int e = job.waitForCompletion(true) ? 0 : 1;
    Counters counters = job.getCounters();
    Counter c=counters.findCounter("com.luchi.wordcount.WordCount2$TokenizerMapper$CountersEnum", "INPUT_WORDS");
    System.out.println(c.getValue());

    System.exit(e);
  }
}

Now let's walk through what this code does.

    Configuration conf = new Configuration();
    GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
    String[] remainingArgs = optionParser.getRemainingArgs();
    if ((remainingArgs.length != 2) && (remainingArgs.length != 4)) {
      System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
      System.exit(2);
    }

This uses the GenericOptionsParser class, whose job is to parse command-line arguments. Its API description reads:

GenericOptionsParser is a utility to parse command line arguments generic to the Hadoop framework. GenericOptionsParser recognizes several standard command line arguments, enabling applications to easily specify a namenode, a jobtracker, additional configuration resources etc.

In other words, GenericOptionsParser is the class the Hadoop framework uses to parse command-line options, and its main purpose is to set Hadoop runtime parameters. The options it recognizes are:

     -conf <configuration file>     specify a configuration file
     -D <property=value>            use value for given property
     -fs <local|namenode:port>      specify a namenode
     -jt <local|jobtracker:port>    specify a job tracker
     -files <comma separated list of files>    specify comma separated
                            files to be copied to the map reduce cluster
     -libjars <comma separated list of jars>   specify comma separated
                            jar files to include in the classpath.
     -archives <comma separated list of archives>    specify comma
             separated archives to be unarchived on the compute machines.

Each option is followed by its description, so I won't go through them one by one. The option this experiment relies on is -D, which sets a Hadoop configuration property for the job (roughly the Hadoop equivalent of setting an environment variable), e.g. -Dwordcount.case.sensitive=false.
The class's getRemainingArgs() method returns the command-line arguments that do not belong to the list above, i.e. the non-Hadoop, application-specific arguments. Its description reads:

Returns an array of Strings containing only application-specific arguments.
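
To make this separation concrete, below is a minimal, self-contained sketch (the argument values are made up for illustration): the -D option is absorbed into the Configuration, while getRemainingArgs() returns only the application-specific arguments.

    import java.util.Arrays;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.GenericOptionsParser;

    public class ParserDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical command line: one Hadoop option plus four application arguments
        String[] fakeArgs = {"-Dwordcount.case.sensitive=false",
                             "/user/input", "/user/output", "-skip", "patterns.txt"};
        GenericOptionsParser parser = new GenericOptionsParser(conf, fakeArgs);

        // the -D option has been consumed into the Configuration ...
        System.out.println(conf.getBoolean("wordcount.case.sensitive", true)); // false
        // ... and only the application-specific arguments are left over
        System.out.println(Arrays.toString(parser.getRemainingArgs()));
        // [/user/input, /user/output, -skip, patterns.txt]
      }
    }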

Once the options have been parsed, the rest of the code is the same as the earlier wordcount experiment, with this difference:

    for (int i=0; i < remainingArgs.length; ++i) {
      if ("-skip".equals(remainingArgs[i])) {
        job.addCacheFile(new Path(remainingArgs[++i]).toUri());
        job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
      } else {
        otherArgs.add(remainingArgs[i]);
      }
    }

This block processes the application args, i.e. the values returned by optionParser.getRemainingArgs(). The new part is that when the arguments contain the -skip option, the path of the pattern.txt file that follows it is added to Hadoop's cache files (the distributed cache). The API documentation for cache files says:

The framework will copy the necessary files to the slave node before any tasks for the job are executed on that node. Its efficiency stems from the fact that the files are only copied once per job and the ability to cache archives which are un-archived on the slaves.

Roughly, the cached files are copied to the nodes that will run the job's tasks before any task executes there; since each file is copied only once per job, this keeps the overhead low and speeds up job execution.
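
As a side note, here is a minimal sketch of an alternative (as far as I can tell equivalent) way to pick the cached files up on the task side: inside setup(), the URIs can be read straight from the task context instead of going through Job.getInstance(conf) as the code below does, because Mapper.Context also exposes getCacheFiles().

    // alternative body for the loop in setup(): read the cache file URIs from the
    // task context; getCacheFiles() returns null when no -skip file was registered
    URI[] patternsURIs = context.getCacheFiles();
    if (patternsURIs != null) {
      for (URI patternsURI : patternsURIs) {
        parseSkipFile(new Path(patternsURI.getPath()).getName());
      }
    }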
In the Mapper class, this code

public void setup(Context context) throws IOException,
        InterruptedException {
      conf = context.getConfiguration();
      caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
      if (conf.getBoolean("wordcount.skip.patterns", true)) {
        URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
        for (URI patternsURI : patternsURIs) {
          Path patternsPath = new Path(patternsURI.getPath());
          String patternsFileName = patternsPath.getName().toString();
          parseSkipFile(patternsFileName);
        }
      }
    }

    private void parseSkipFile(String fileName) {
      try {
        fis = new BufferedReader(new FileReader(fileName));
        String pattern = null;
        while ((pattern = fis.readLine()) != null) {
          patternsToSkip.add(pattern);
        }
      } catch (IOException ioe) {
        System.err.println("Caught exception while parsing the cached file '"
            + StringUtils.stringifyException(ioe));
      }

means the following: when the wordcount.case.sensitive property passed via -D is true, the input text is left as-is; when it is false, the text is converted to lower case. In addition, when a pattern.txt URI has been passed on the command line (via -skip), the file is read in setup() and used as a filter during the map phase: every substring of the input that matches one of the patterns listed in pattern.txt is removed before the words are counted.
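
To see what this filtering does in practice, here is a tiny standalone illustration (the input line and the pattern values are made up; the official tutorial's patterns.txt holds entries such as \. and to): each skip pattern is treated as a regular expression, and every match is deleted from the line before it is tokenized.

    // standalone illustration with made-up values: each pattern is a regex and
    // every match is stripped from the line before the words are counted
    String line = "Hello Hadoop, Goodbye to hadoop.";
    String[] patternsToSkip = {"\\.", "\\,", "to"};
    for (String pattern : patternsToSkip) {
      line = line.replaceAll(pattern, "");
    }
    System.out.println(line);
    // prints: Hello Hadoop Goodbye  hadoop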

Another thing worth mentioning is the Counter class. The map method contains this code:


        Counter counter = context.getCounter(CountersEnum.class.getName(),
            CountersEnum.INPUT_WORDS.toString());
        counter.increment(1);

This counter does exactly what its name says: it is incremented by one for every word processed, and the framework aggregates the values from all map tasks.
Then, after the job has finished (it must be after the job has finished), use this code:

    int e = job.waitForCompletion(true) ? 0 : 1;
    Counters counters = job.getCounters();
    Counter c = counters.findCounter("com.luchi.wordcount.WordCount2$TokenizerMapper$CountersEnum", "INPUT_WORDS");
    System.out.println(c.getValue());

which prints the total number of words. If the counter-reading code is placed before the job has run, you get the following error:

Job in state DEFINE instead of RUNNING

because at that point the job has not been executed yet.
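
Incidentally, instead of hard-coding the fully qualified group name as a string, the same counter can (if I am not mistaken) be looked up through the enum constant itself, because findCounter(Enum<?>) derives the group from the enum's declaring class name and the counter name from the constant, which matches how the mapper registered it. A small sketch:

    // equivalent lookup sketch: resolves to the same group/name pair that the
    // mapper used via context.getCounter(CountersEnum.class.getName(), ...)
    Counter c = job.getCounters()
        .findCounter(TokenizerMapper.CountersEnum.INPUT_WORDS);
    System.out.println(c.getValue());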
That's all for this post.
