hadoop案例一(统计文本中单词出现的次数)

一、案例描述

现HDFS中有一文件(/input/wordcount/xiaoming.txt),文件中的单词以制表符(\t)或换行符分隔,请统计文件中每一个单词的词频,并将结果写入/output/wordcount文件夹下。Tips:结果不用按词频排序。

二、代码

  1. Mapper代码
package com.example.demo.map;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the word-count job: emits {@code (word, 1)} for every non-empty,
 * tab-separated token of each input line.
 *
 * <p>The input key is a {@link LongWritable}; with Hadoop's default
 * {@code TextInputFormat} it is the byte offset of the line within the file
 * (not a line number). The input value is the text of the current line.
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Constant count emitted for every occurrence of a word. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Reused output key; {@code context.write} serializes it before the next {@code set}. */
    private final Text wordText = new Text();

    /**
     * Splits one line on tab characters and writes {@code (word, 1)} per token.
     *
     * @param key     position key supplied by the input format (unused)
     * @param value   the current line of text
     * @param context Hadoop context used to emit the intermediate pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split("\t")) {
            // Blank lines and leading/consecutive tabs produce empty tokens;
            // skip them so the empty string is never counted as a word.
            if (word.isEmpty()) {
                continue;
            }
            wordText.set(word);
            context.write(wordText, ONE);
        }
    }
}
  2. Reducer代码
package com.example.demo.reduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


import java.io.IOException;

/**
 * Reducer for the word-count job: sums the per-occurrence counts emitted by
 * the mapper and writes {@code (word, total)}.
 */
public class WordCountReduce extends Reducer<Text, IntWritable,Text,IntWritable> {
    /** Reused output value; {@code context.write} serializes it before the next {@code set}. */
    private final IntWritable total = new IntWritable();

    /**
     * Adds up all counts received for one word and emits the total.
     *
     * <p>The former per-key debug print to standard output was removed: it ran
     * once for every distinct word and is not part of the job's result.
     *
     * @param key     the word being reduced
     * @param values  the counts emitted for this word
     * @param context Hadoop context used to emit the final pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}
  3. 驱动类
package com.example.demo.driver;

import com.example.demo.map.WordCountMap;
import com.example.demo.reduce.WordCountReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;


import java.io.IOException;

/**
 * Spring-managed driver that configures and submits the word-count MapReduce job.
 */
@Component
public class WordCountDriver {
    @Autowired
    private Configuration configuration;

    /**
     * Builds the "wordcount01" job, runs it, and blocks until it finishes.
     *
     * <p>This method no longer calls {@code System.exit}: exiting from inside a
     * Spring bean terminates the hosting JVM (including a test runner). A failed
     * job is reported by throwing {@link IllegalStateException} instead, and a
     * successful job simply returns.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output directory
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws IllegalStateException    if the job completes unsuccessfully
     * @throws IOException              if job setup or submission fails
     * @throws ClassNotFoundException   propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException     if the wait for completion is interrupted
     */
    public void wordCountDriver(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("expected arguments: <input path> <output directory>");
        }

        Job job = Job.getInstance(configuration, "wordcount01");
        // Class used to locate the jar that contains the job.
        job.setJarByClass(WordCountDriver.class);
        // Mapper and reducer implementations.
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        // Intermediate (map output) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reduce output) key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input path(s).
        FileInputFormat.setInputPaths(job,args[0]);
        // Output directory for the results.
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        if (!result) {
            System.out.println("-------------------失败!!!");
            throw new IllegalStateException("MapReduce job 'wordcount01' did not complete successfully");
        }
        // Only report success once the job has actually succeeded.
        System.out.println("***** ok!!!");
    }
}
  4. JUnit测试
package com.example.demo;

import com.example.demo.driver.WordCountDriver;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.IOException;

/**
 * Spring Boot test that launches the word-count job through the injected driver.
 */
@SpringBootTest
public class WordCountTest {
    @Autowired
    private WordCountDriver wordCountDriver;

    /**
     * Runs the driver with {@code /input/wordcount} as the input path and
     * {@code /output/wordcount} as the output directory.
     */
    @Test
    public void wordCountDriverTest() throws InterruptedException, IOException, ClassNotFoundException {
        final String inputPath = "/input/wordcount";
        final String outputPath = "/output/wordcount";
        final String[] jobArgs = {inputPath, outputPath};
        wordCountDriver.wordCountDriver(jobArgs);
    }
}

三、运行结果

xiaobai         1
xiaohong        1
xiaoming        2

猜你喜欢

转载自blog.csdn.net/qq_29012499/article/details/108451823