Conteo de palabras (WordCount) con Hadoop en Java — versión sin comentarios en el código fuente original.

El programa cuenta la frecuencia de cada palabra en un texto. Es el ejemplo clásico para entender la idea del framework Hadoop MapReduce.

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * 词频统计
 */
/**
 * Classic Hadoop MapReduce word-count job: reads text files, splits each
 * line into words, and emits each distinct word with its total occurrence
 * count as (Text, LongWritable) pairs.
 *
 * <p>Usage: {@code WordCountApp <input path> <output path>}
 */
public class WordCountApp {

    /**
     * Mapper: reads one line of the source file at a time and emits
     * (word, 1) for every whitespace-separated token.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        // Reused across map() calls to avoid per-record allocation.
        private final LongWritable one = new LongWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // One line of the input file.
            String line = value.toString();

            // Split on runs of whitespace; plain split(" ") would emit
            // empty tokens for consecutive/leading spaces and count them.
            String[] words = line.split("\\s+");

            // Emit each non-empty word as a (word, 1) key-value pair.
            for (String w : words) {
                if (w.isEmpty()) {
                    continue;
                }
                word.set(w);
                context.write(word, one);
            }
        }
    }

    /**
     * Reducer: sums the per-word counts produced by the mappers.
     * Also usable as a combiner, since summation is associative
     * and commutative.
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // Accumulate all counts observed for this word.
                sum += value.get();
            }

            // Emit the final (word, total count) pair.
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Configures and submits the word-count job, then exits with the
     * job's success status (0 on success, 1 on failure).
     *
     * @param args args[0] = input path, args[1] = output directory
     *             (must not already exist)
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an opaque
        // ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCountApp <input path> <output path>");
            System.exit(2);
        }

        // Create the Hadoop configuration (reads *-site.xml defaults).
        Configuration configuration = new Configuration();

        // Create the job and set the jar via the driver class.
        Job job = Job.getInstance(configuration, "wordcount");
        job.setJarByClass(WordCountApp.class);

        // Input file(s) to process.
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Mapper configuration.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Combiner: local pre-aggregation of counts cuts shuffle traffic;
        // safe because the reduce function is an associative sum.
        job.setCombinerClass(MyReducer.class);

        // Reducer configuration.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Output directory for the results (must not already exist).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit to the cluster (e.g. YARN) and block until completion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Publicado 32 artículos originales · ganado elogios 9 · vistas 3162

Supongo que te gusta

Origin blog.csdn.net/weixin_43501566/article/details/104966890
Recomendado
Clasificación