一、map
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * WordCount mapper: splits each input line into whitespace-separated tokens
 * and emits a (token, 1) pair for every token.
 *
 * <p>Input key is the byte offset of the line (ignored); input value is the
 * line text. Output is (word, 1) consumed by the reducer/combiner.
 */
public class MR extends Mapper<LongWritable, Text, Text, IntWritable>{
    /**
     * Constant count of 1, shared across all emissions. map() runs once per
     * input record, so reusing one Writable avoids allocating a fresh
     * IntWritable per token (standard Hadoop idiom to reduce GC pressure).
     */
    private static final IntWritable ONE = new IntWritable(1);
    /** Reusable output key; set() is called before each write. */
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // StringTokenizer splits on whitespace by default, matching the
        // original behavior; empty lines simply produce no tokens.
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            context.write(word, ONE);
        }
    }
}
二、Reduce
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * WordCount reducer: sums the per-word counts emitted by the mapper and
 * writes (word, total) to the job output.
 */
public class RE extends Reducer<Text, IntWritable, Text, IntWritable>{
    /**
     * Reusable output value. Reducers run once per distinct key; reusing one
     * Writable avoids a per-key allocation (and restores the intent of the
     * previously commented-out result.set(sum) line).
     */
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text keys, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Accumulate all partial counts for this word. When this class is also
        // used as a combiner, values may already be partial sums > 1.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(keys, result);
    }
}
三、主程序
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Job driver for the WordCount example.
 *
 * <p>Usage: {@code hadoop jar <jar> Runner [<hdfs input path> <hdfs output path>]}
 * Paths default to the original hard-coded HDFS locations when omitted, so
 * existing invocations without arguments keep working.
 */
public class Runner {
    /** Fallback input path, kept for backward compatibility with arg-less runs. */
    private static final String DEFAULT_INPUT = "hdfs://172.16.169.10:9000/wordcount/srcdata/";
    /** Fallback output path; NOTE: must not already exist on HDFS. */
    private static final String DEFAULT_OUTPUT = "hdfs://172.16.169.10:9000/wordcount/output/";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        // Important: identify the jar containing this driver so the cluster
        // nodes can locate and load the job classes.
        job.setJarByClass(Runner.class);
        // Mapper and reducer implementations for this job.
        job.setMapperClass(MR.class);
        job.setReducerClass(RE.class);
        // Key/value types emitted by the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Key/value types of the final job output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Take input/output paths from the command line when supplied
        // (matching the documented `hadoop jar` usage); otherwise fall back
        // to the original hard-coded defaults.
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Submit the job and block until it finishes; propagate success or
        // failure as the process exit code (previously the result was ignored
        // and the driver always exited 0 even when the job failed).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
四、将maven项目在eclipse打成jar包
上传到集群的本地文件系统即可
五、hadoop jar命令运行jar包
六、查看结果
七、问题总结
1、当我写完三个java类的时候发现maven项目M字母下方有个感叹号:这个问题一般是maven库中的jar包异常引起的。为了查看哪些jar包异常,我们可以通过Window-》Show View-》Problems查看当前maven项目的问题列表:其中会提示哪些jar包有问题。只要将本地库中对应的jar包删除后更新项目即可。可能有多个包存在问题,需要重复上述步骤。
2、hadoop jar命令
hadoop jar 【本地jar包绝对路径】 【指定主类】 【hdfs上输入map的文件的路径】 【结果输出路径(hdfs上的路径)】
八、加入combiner
只需要在主程序Runner中加入代码:
job.setCombinerClass(RE.class);//这里使用本例的reducer类RE作为combiner,也可自定义