MapReduce WordCount: word frequency statistics
1. Description of requirements
1. Input file example
2. Demand
Count the number of occurrences of each letter in the data file and output the results as letter–count pairs, for example (a 14).
2. Code implementation
1. Writing ideas
In the map stage, each line of input is read and split on spaces, and each token is emitted to the reduce stage as a (letter, 1) pair. Before reduce runs, the framework's partition/shuffle step groups all pairs that share the same letter into the same partition and passes each group to a reducer. In the reduce stage, the occurrences of each letter within its partition are summed. Finally, the results are written to the specified output folder.
2. Code
Mapper:
package com.worldcount.zqd;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map stage of the word-count job.
 *
 * <p>Reads the input one line at a time (key = byte offset of the line,
 * value = the line text), splits the line on spaces, and emits a
 * (word, 1) pair for every token.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Reusable output key; Hadoop serializes it immediately on write,
    // so a single instance can be recycled across records.
    Text k = new Text();
    // Every token is emitted with a constant count of 1. The original code
    // called l.set(1) on every loop iteration; the value never changes, so
    // it is now initialized exactly once here.
    LongWritable l = new LongWritable(1);

    /**
     * Emits (word, 1) for each space-separated token of the input line.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one line of input text
     * @param context Hadoop context used to emit intermediate pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString(); // one line of input
        // commons-lang StringUtils.split skips empty tokens, so runs of
        // consecutive spaces do not produce empty "words".
        String[] words = StringUtils.split(line, " ");
        for (String word : words) {
            k.set(word);
            context.write(k, l); // emit (word, 1)
        }
    }
}
Reducer:
package com.worldcount.zqd;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce stage of the word-count job.
 *
 * <p>Receives one word together with all of its intermediate counts
 * (each of which is 1 from the map stage, or a partial sum if a
 * combiner ran) and writes the word with its total count.
 */
public class WCreducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    // Reusable output value; recycled across keys to avoid allocation.
    LongWritable l = new LongWritable();

    /**
     * Sums all counts for {@code key} and emits (word, total).
     *
     * @param key     the word being aggregated
     * @param values  all intermediate counts produced for this word
     * @param context Hadoop context used to emit the final pair
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long total = 0; // running sum of occurrences for this word
        for (LongWritable partial : values) {
            total += partial.get();
        }
        l.set(total);
        context.write(key, l);
    }
}
Main program:
package com.worldcount.zqd;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the word-count job.
 *
 * <p>Usage: {@code WCrunner [inputPath [outputPath]]}. When no arguments
 * are given, the original hard-coded demo paths are used, so existing
 * invocations keep working unchanged.
 */
public class WCrunner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WCrunner.class);
        job.setMapperClass(WCMapper.class);
        // The reducer computes an associative, commutative sum, so it can
        // also run as a combiner to shrink map output before the shuffle.
        job.setCombinerClass(WCreducer.class);
        job.setReducerClass(WCreducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Allow paths to be overridden on the command line; fall back to
        // the original hard-coded demo paths for backward compatibility.
        String inputPath = args.length > 0 ? args[0] : "C:\\Users\\Lenovo\\Desktop\\hadoop_mr\\wc_input";
        String outputPath = args.length > 1 ? args[1] : "C:\\Users\\Lenovo\\Desktop\\hadoop_mr\\wc_out";
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        // Propagate job success/failure as the process exit code instead of
        // silently discarding the result of waitForCompletion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The result is as follows: