一、案例描述
现hdfs中有一文件/input/wordcount/xiaoming.txt,文件中每一单词使用制表符(\t)或者回车符分割,请统计文件中每一个单词的词频,将结果写入/output/wordcount文件夹下。Tips:结果不用按词频排序。
二、代码
- Mapper代码
package com.example.demo.map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Word-count mapper.
 * <p>
 * The first generic parameter must be {@link LongWritable} — it is the byte
 * offset of the current line within the input file; the second is the text of
 * the current line.
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reuse the output key/value across map() calls instead of allocating two
    // fresh objects per token — the standard Hadoop writable-reuse idiom.
    private final Text wordText = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Splits the incoming line on tab characters and emits {@code (word, 1)}
     * for every non-empty token. Words separated by line breaks are handled
     * implicitly, because the input format delivers one line per call.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        for (String word : line.split("\t")) {
            // Consecutive tabs produce empty tokens; do not count them.
            if (word.isEmpty()) {
                continue;
            }
            wordText.set(word);
            context.write(wordText, ONE);
        }
    }
}
- reducer 代码
package com.example.demo.reduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Word-count reducer: sums the 1-counts emitted by the mapper for each word
 * and writes {@code (word, total)}.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Accumulates the counts for a single word.
     * <p>
     * Note: the debug {@code println} that was here printed
     * {@code values.toString()}, which only shows the iterable's object
     * identity — never the actual counts — so it has been removed.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
- 驱动类
package com.example.demo.driver;
import com.example.demo.map.WordCountMap;
import com.example.demo.reduce.WordCountReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.IOException;
@Component
public class WordCountDriver {

    @Autowired
    private Configuration configuration;

    /**
     * Configures and runs the word-count MapReduce job, blocking until it
     * completes.
     *
     * @param args {@code args[0]} = input path, {@code args[1]} = output
     *             directory (must not already exist on HDFS)
     * @throws IllegalArgumentException if fewer than two paths are supplied
     */
    public void wordCountDriver(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("expected <input path> <output path>");
        }
        Job job = Job.getInstance(configuration, "wordcount01");
        // Tell Hadoop which jar contains the job classes.
        job.setJarByClass(WordCountDriver.class);
        // Mapper / Reducer
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);
        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reducer) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input file(s) and output directory
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        // BUG FIX: the original called System.exit(result ? 0 : 1) here, which
        // terminates the whole JVM — fatal when this bean runs inside a Spring
        // context or a JUnit test. Report the outcome without exiting instead.
        if (result) {
            System.out.println("***** ok!!!");
        } else {
            System.out.println("-------------------失败!!!");
        }
    }
}
- Junit测试
package com.example.demo;
import com.example.demo.driver.WordCountDriver;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import java.io.IOException;
/** Integration test: runs the word-count job against fixed HDFS paths. */
@SpringBootTest
public class WordCountTest {

    @Autowired
    private WordCountDriver wordCountDriver;

    @Test
    public void wordCountDriverTest() throws InterruptedException, IOException, ClassNotFoundException {
        // Input directory and output directory, in driver-argument order.
        String[] jobArgs = {"/input/wordcount", "/output/wordcount"};
        wordCountDriver.wordCountDriver(jobArgs);
    }
}
三、运行结果
xiaobai 1
xiaohong 1
xiaoming 2