Word count with Hadoop MapReduce in Java, with commented source code

A word-frequency count: the job counts how many times each word occurs in the input. It is a small exercise for better understanding the ideas behind the Hadoop framework.

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Word-frequency count implemented as a Hadoop MapReduce job.
 *
 * <p>The mapper splits each input value into whitespace-separated words and
 * emits {@code (word, 1)}; the reducer sums the counts per word and emits
 * {@code (word, total)}.
 *
 * <p>Usage: {@code WordCountApp <input path> <output path>}
 */
public class WordCountApp {

    /**
     * Mapper: reads the input one record at a time and splits it into words.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Constant count emitted for every word occurrence; never mutated. */
        private static final LongWritable ONE = new LongWritable(1);

        /** Reused output key, so no new Text is allocated per token. */
        private final Text wordKey = new Text();

        /**
         * Emits {@code (word, 1)} for every non-empty whitespace-separated
         * token in {@code value}.
         *
         * @param key     input key (unused)
         * @param value   the text of one input record
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Content of the current record as a Java string.
            String line = value.toString();

            // Split on any run of whitespace (spaces, tabs, ...), not just a
            // single space, so repeated separators do not yield bogus tokens.
            for (String word : line.split("\\s+")) {
                // A blank line or leading whitespace still produces one empty
                // token; it is not a word and must not be counted.
                if (word.isEmpty()) {
                    continue;
                }
                wordKey.set(word);
                context.write(wordKey, ONE);
            }
        }
    }

    /**
     * Reducer: merges the mapper output by summing the counts of each word.
     * Its input and output types are identical, so main also uses it as the
     * combiner.
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Reused output value, so no new LongWritable is allocated per key. */
        private final LongWritable total = new LongWritable();

        /**
         * Sums all counts received for {@code key} and emits
         * {@code (key, sum)}.
         *
         * @param key     the word
         * @param values  the counts collected for this word
         * @param context sink for the emitted pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // Accumulate every count belonging to the current key.
                sum += value.get();
            }

            // Emit the finished total as a (text, long) pair.
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the word-count job, then exits the JVM with status
     * 0 on success or 1 on failure (2 when the arguments are missing).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output directory
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCountApp <input path> <output path>");
            System.exit(2);
        }

        // Create the configuration instance.
        Configuration configuration = new Configuration();

        // Create the job.
        Job job = Job.getInstance(configuration, "wordcount");
        // Tell Hadoop which jar to ship by naming a class contained in it.
        job.setJarByClass(WordCountApp.class);

        // Input path.
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Mapper settings.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Summation is associative and commutative, so the reducer can also
        // pre-aggregate counts before they are sent to the reducers.
        job.setCombinerClass(MyReducer.class);

        // Reducer settings.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Output directory for the result files.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job, wait for it to finish, and exit with its status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

PS_DaPa · WeChat official account: "Love coding programmer"

Published 32 original articles · 9 likes · 3162 views

Private letter concerns

Word count with Hadoop MapReduce in Java, with commented source code

Guess you like