在 Maven 中添加如下依赖配置:
groupId:org.apache.hadoop
artifactId:hadoop-client
version:${hadoop.version}
类声明:继承 Configured 类,并实现 Tool 接口
/**
 * Driver class for the word-count job. It extends {@code Configured} and
 * implements {@code Tool}.
 */
public class WordCount extends Configured implements Tool {
}
实现Tool接口的run方法
/**
 * Configures the word-count job, submits it, and waits for it to finish.
 *
 * <p>The job reads from {@code args[0]}, writes to {@code args[1]}, and
 * compresses its final output with {@code SnappyCodec}.
 *
 * @param args command-line arguments; {@code args[0]} is the input path and
 *             {@code args[1]} is the output path (extra arguments are ignored)
 * @return 0 if the job succeeded, 1 if it failed, 2 if fewer than two
 *         arguments were supplied
 * @throws Exception if the job cannot be set up or executed
 */
public int run(String[] args) throws Exception {
    // Report a usage error instead of failing with ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: " + this.getClass().getSimpleName() + " <input path> <output path>");
        return 2;
    }

    Job job = Job.getInstance(this.getConf(), this.getClass().getSimpleName());
    job.setJarByClass(this.getClass());

    // Input side.
    FileInputFormat.addInputPath(job, new Path(args[0]));

    // Map phase: emits (Text, IntWritable) pairs.
    job.setMapperClass(WordCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Reduce phase: emits (Text, IntWritable) pairs.
    job.setReducerClass(WordCountReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Output side, compressed with Snappy.
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
Mapper类的实现
/**
 * Mapper that splits each input value into tokens (using the default
 * {@link StringTokenizer} delimiters) and emits {@code (token, 1)} for
 * every token.
 */
public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Count emitted for every token; shared because its value never changes. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key reused across writes to avoid allocating one object per token. */
    private final Text word = new Text();

    /**
     * Emits one {@code (token, 1)} pair per token found in {@code value}.
     *
     * @param key     input key (not used)
     * @param value   text to tokenize
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, ONE);
        }
    }
}
Reducer类的实现
/**
 * Reducer that sums all counts received for a key and emits
 * {@code (key, sum)}.
 */
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /** Output value reused across keys to avoid allocating one object per key. */
    private final IntWritable result = new IntWritable();

    /**
     * Adds up every value for {@code key} and writes the total.
     *
     * @param key     the key whose values are being summed
     * @param values  the counts associated with {@code key}
     * @param context sink for the emitted pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
最后是 main 方法
/**
 * Program entry point. Builds a configuration that enables Snappy
 * compression of map output, runs {@code WordCount} through
 * {@code ToolRunner}, and exits the JVM with the returned status code.
 *
 * @param args command-line arguments forwarded to {@code ToolRunner.run}
 * @throws Exception if running the tool fails
 */
public static void main(String[] args) throws Exception {
    final Configuration configuration = new Configuration();

    // Turn on map-output compression and select the Snappy codec for it.
    configuration.setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    configuration.setClass(
            Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    final int exitCode = ToolRunner.run(configuration, new WordCount(), args);
    System.exit(exitCode);
}