The first program in most languages is Hello World; in MapReduce, the equivalent first program is word count. The main code is as follows:
package Temperature;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class WordCount {

    /*
     * When a job is submitted to MapReduce, the input file is first divided
     * into splits. Since this is only a test, there is just one split; the
     * framework then breaks the file into lines, producing <key, value>
     * pairs (conceptually like entries in a Python dictionary).
     *
     * Those <key, value> pairs are passed to our custom map below, which
     * emits new <key, value> pairs: the framework split the file by line,
     * and here each line is further split on whitespace.
     */
    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        /*
         * Hadoop type      === Java type
         * BooleanWritable  === boolean
         * ByteWritable     === byte
         * ShortWritable    === short
         * LongWritable     === long
         * Text             === String
         * IntWritable      === int
         * FloatWritable    === float
         * DoubleWritable   === double
         * ArrayWritable    === Array
         * MapWritable      === map
         */
        public void map(LongWritable key, Text value,
                        OutputCollector<Text, IntWritable> output,
                        Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    /*
     * After the map phase produces its <key, value> pairs, the framework
     * sorts them by key. If a Combine function is defined, sorted pairs
     * with the same key are merged at this point (Combine will be covered
     * later and is not explained here). The pairs are then handed to the
     * Reducer, which sorts the received data and groups it into
     * <key, [values]> before passing it to our custom reduce function
     * below; the resulting <key, value> pairs are written to HDFS.
     */
    public static class Reduce extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> values,
                           OutputCollector<Text, IntWritable> output,
                           Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        // Configure the output key and value types
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // Configure the Mapper and Reducer classes
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        // Configure the input and output formats
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Set the input and output paths
        FileInputFormat.setInputPaths(conf, new Path("hdfs://192.168.1.51:9000/input/qixiang_data"));
        FileOutputFormat.setOutputPath(conf, new Path("hdfs://192.168.1.51:9000/output/lzh/3"));

        // Submit the job
        JobClient.runJob(conf);
    }
}
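To make the flow concrete: given an input line "hello world hello", map emits <hello, 1>, <world, 1>, <hello, 1>; after the sort/shuffle the Reducer receives <hello, [1, 1]> and <world, [1]>; reduce sums each list and outputs <hello, 2> and <world, 1>.

The listing above uses the older org.apache.hadoop.mapred API. For comparison, below is a minimal sketch of the same job written against the newer org.apache.hadoop.mapreduce API introduced in Hadoop 0.20; the class name NewApiWordCount and the use of command-line arguments for the input/output paths are illustrative choices, not part of the original.

// A sketch of the same word count on the newer mapreduce API.
// NewApiWordCount and the args-based paths are assumptions for illustration.
package Temperature;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewApiWordCount {

    public static class TokenizerMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split each line on whitespace and emit <word, 1>
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the counts grouped under each word
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "wordcount-new-api");
        job.setJarByClass(NewApiWordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Assuming the class is packaged into a jar named, say, wordcount.jar, the job could be submitted with something like: hadoop jar wordcount.jar Temperature.NewApiWordCount <input path> <output path>.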