MapReduce演练WordCount

1、官方示例代码

安装 Hadoop 后，发行包中默认带有示例代码，其中包括著名的 WordCount。
示例代码路径：hadoop-2.8.5/share/hadoop/mapreduce/sources/

2、重写WordCount

2.1、WordCount类

package test.word;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the word-count MapReduce job.
 *
 * <p>A MapReduce program has two parts: the job configuration (this class) and the
 * business logic, whose map and reduce stages extend {@code Mapper} and {@code Reducer}
 * ({@code WordMapper} and {@code WordReducer} here).
 *
 * <p>The class is a {@link Tool}, so it is meant to be launched through
 * {@link ToolRunner}, which parses the generic Hadoop options into the tool's
 * configuration before calling {@link #run(String[])}.
 */
public class WordCount extends Configured implements Tool {

	/**
	 * Entry point invoked by {@link ToolRunner}; delegates to {@link #singleJob(String[])}.
	 *
	 * @param args command-line arguments: {@code <in> [<in>...] <out>}
	 * @return the exit code produced by {@link #singleJob(String[])}
	 * @throws Exception declared by the {@link Tool} contract
	 */
	@Override
	public int run(String[] args) throws Exception {
		return singleJob(args);
	}

	/**
	 * Runs the tool and terminates the JVM with its exit code ({@code -1} if the
	 * runner itself threw).
	 *
	 * @param args command-line arguments, including any generic Hadoop options
	 */
	public static void main(String[] args) {
		int exitCode = -1;
		try {
			exitCode = ToolRunner.run(new WordCount(), args);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			System.exit(exitCode);
		}
	}

	/**
	 * Configures, submits and waits for a single word-count job.
	 *
	 * <p>All arguments but the last are input paths; the last one is the output path.
	 * An existing output path is deleted recursively before the job is submitted.
	 *
	 * @param args command-line arguments: {@code <in> [<in>...] <out>}
	 * @return {@code 0} if the job succeeded, {@code 1} if it failed, {@code 2} if the
	 *         arguments are malformed, {@code -1} if an exception was raised
	 */
	public int singleJob(String[] args) {
		try {
			// Reuse the configuration injected by ToolRunner so that generic options
			// (-D, -conf, -files, ...) it already parsed are honoured. Fall back to a
			// fresh one when this method is called without going through ToolRunner.
			Configuration conf = getConf();
			if (conf == null) {
				conf = new Configuration();
			}
			// Without an explicit defaultFS, paths resolve against the local file system.
			// NOTE: this always targets the NameNode below, overriding any configured value.
			conf.set("fs.defaultFS", "hdfs://cos6743:9000");
			// The Hadoop user can be chosen at launch time with: java -DHADOOP_USER_NAME=root

			String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
			if (otherArgs.length < 2) {
				System.err.println("Usage: wordcount <in> [<in>...] <out>");
				// Return instead of System.exit so callers other than main() keep control;
				// main() still exits with this same status.
				return 2;
			}

			// Job lets the user configure the job, submit it, control its execution
			// and query its state.
			Job job = Job.getInstance(conf, "word count");
			job.setJarByClass(WordCount.class);

			job.setMapperClass(WordMapper.class);
			// The reducer is reused as combiner.
			job.setCombinerClass(WordReducer.class);
			job.setReducerClass(WordReducer.class);

			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);

			for (int i = 0; i < otherArgs.length - 1; ++i) {
				FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
			}

			// Delete the output path if it already exists.
			Path resultPath = new Path(otherArgs[otherArgs.length - 1]);
			FileSystem fs = resultPath.getFileSystem(conf);
			if (fs.exists(resultPath)) {
				fs.delete(resultPath, true);
				System.out.println("-- resultPath is delete --");
			}
			FileOutputFormat.setOutputPath(job, resultPath);

			// Submit the job, then poll for progress until it completes
			// (true: print the progress to the user).
			boolean flag = job.waitForCompletion(true);
			if (flag) {
				System.out.println("-- job is waitForCompletion --" + flag);
			}
			return flag ? 0 : 1;
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the caller can still observe the interruption.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		}
		// An exception occurred.
		return -1;
	}

}

2.2、WordMapper类

public class WordMapper extends Mapper<Object, Text, Text, IntWritable>{
    //A WritableComparable for ints
	private final static IntWritable one = new IntWritable(1);
	//This class stores text using standard UTF8 encoding. It provides methods to serialize, deserialize, and compare texts at byte level.
	private Text word = new Text();
	
	public void map(Object key, Text value, Context context)  {
		//构造一个用来解析str的StringTokenizer对象。java默认的分隔符是“空格”、“制表符(‘\t’)”、“换行符(‘\n’)”、“回车符(‘\r’)”
		StringTokenizer itr = new StringTokenizer(value.toString());
		while (itr.hasMoreTokens()) {
			word.set(itr.nextToken());
			try {
				context.write(word, one);
			} catch (IOException e) {
				e.printStackTrace();
			} catch (InterruptedException e) {
				e.printStackTrace();
			}
		}
	}
}

2.3、WordReducer类

public class WordReducer
	extends Reducer<Text,IntWritable,Text,IntWritable> {
	    private IntWritable result = new IntWritable();

	    public void reduce(Text key, Iterable<IntWritable> values, Context context)
	    		throws IOException, InterruptedException {
	      int sum = 0;
	      for (IntWritable val : values) {
	        sum += val.get();
	      }
	      result.set(sum);
	      context.write(key, result);
	    }
}