MapReduce Word Count Example

1. pom.xml configuration

<properties>
    <java.version>1.8</java.version>
    <hadoop.version>2.6.0</hadoop.version>
</properties>

<!-- Maven coordinates for the Hadoop-related jars -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<!-- spring boot hadoop start -->
<dependency>
    <groupId>org.springframework.data</groupId>
    <artifactId>spring-data-hadoop-boot</artifactId>
    <version>2.5.0.RELEASE</version>
</dependency>
<!-- spring boot hadoop end -->
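
Note that the <dependency> elements above belong inside the <dependencies> element of pom.xml, while the <properties> block sits at the top level of the POM.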


2. Prepare the ren.txt file (10,000 lines). The words on each line are separated by spaces and represent, in order: name, gender, age, marital status, income, and position. The file must be uploaded to HDFS.

zhaosi man 78 yh 1676 合伙人
benshan man 68 yh 707 理事
xiaohua man 57 wh 1332 执行监事
lilei man 71 wh 1224 理事
lisi women 93 yh 7577 联络员
liangxiao man 48 wh 4174 执行监事
wangwu women 48 yh 4412 理事

Note: I uploaded the file to /hdfsapi/test/ren.txt in HDFS; remember that it must go to this exact path (the input path used by the job), not an arbitrary folder. You can check whether the upload succeeded through the NameNode web UI at ip:50070.
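
If ren.txt is still on the local disk, it can be uploaded with, for example (assuming the file is in the current directory):

hdfs dfs -mkdir -p /hdfsapi/test
hdfs dfs -put ren.txt /hdfsapi/test/ren.txt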

3. Write the mapper class WordCountMapper

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // the word is used as the output key
    private Text k = new Text();
    // the constant 1 is used as the output value
    private IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // split the line into words on the space character
        String[] words = StringUtils.split(value.toString(), " ");
        for (String w : words) {
            // emit (word, 1) for every word in the line
            k.set(w);
            context.write(k, v);
        }
    }

}
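
With this mapper, for example, the sample line "zhaosi man 78 yh 1676 合伙人" produces the key/value pairs (zhaosi, 1), (man, 1), (78, 1), (yh, 1), (1676, 1) and (合伙人, 1).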

4. Write the reducer class WorldCountReduce

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WorldCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> ite, Context context)
            throws IOException, InterruptedException {
        // add up all the 1s emitted for this word
        int sum = 0;
        for (IntWritable i : ite) {
            sum += i.get();
        }
        context.write(key, new IntWritable(sum));
    }

}
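
Between the map and reduce phases the framework groups the mapper output by key, so, taking just the seven sample lines from step 2, this reducer would be called for the key "man" with the values [1, 1, 1, 1, 1] and would write (man, 5) to the output.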

5. Write the job (driver) class RunJob


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class RunJob {

    public static void main(String[] args) {
        wordCountJob();
    }

    public static void wordCountJob() {
        try {
            Configuration config = new Configuration(true);
            Job job = Job.getInstance(config);
            job.setJarByClass(RunJob.class);
            // name the job
            job.setJobName("ren_count");
            // set the mapper class
            job.setMapperClass(WordCountMapper.class);
            // set the reducer class
            job.setReducerClass(WorldCountReduce.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // input data for the job: /hdfsapi/test/ren.txt in HDFS
            Path input = new Path("/hdfsapi/test/ren.txt");
            FileInputFormat.addInputPath(job, input);
            // input file format
            job.setInputFormatClass(TextInputFormat.class);
            // output directory for the results; it must not already exist
            // (the job fails if it does), so delete it first if present
            Path output = new Path("/mrjob/resultdata/emptest");
            if (output.getFileSystem(config).exists(output)) {
                output.getFileSystem(config).delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job executed successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
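
A common, optional addition that is not in the listing above is to also declare the job's final (reducer) output types explicitly, since the reducer emits Text keys and IntWritable values:

            job.setOutputKeyClass(Text.class);          // optional: final output key type
            job.setOutputValueClass(IntWritable.class); // optional: final output value type

With the default TextOutputFormat used here the job typically runs without these two lines, but declaring them keeps the configuration explicit.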

6. Run the MapReduce job

(1) Package the code and upload it to the server hosting the Hadoop master node

Only the package containing the job classes needs to be built; package it into a jar and upload it to any directory on the server where Hadoop runs (my packaged file is superv.jar).
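
For a standard Maven project the jar can typically be built with:

mvn clean package

The packaged jar is then found under the target/ directory.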

(2) Log in to the server and run the MapReduce job (log in as the hadoop user)

hadoop jar superv.jar com.gsww.asset.superv.mapreduce.RunJob
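
The general form of this command is hadoop jar <jar file> <fully qualified main class>; adjust the class name so that it matches the package in which your RunJob class actually lives.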

(3) View the generated result files with the following command (the path after ls is the job's output directory, which still lives in HDFS and must match the output path set in RunJob)

hdfs dfs -ls /mrjob/resultdata/emptest

(4) Copy the result files from HDFS to the local file system

hdfs dfs -get /mrjob/resultdata/emptest/* ./
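
With the default TextOutputFormat, each line of the resulting part-r-00000 file contains a word and its count separated by a tab. Purely as an illustration, running the job over only the seven sample lines from step 2 would produce lines such as:

48	2
man	5
women	2
yh	4

The counts for the full 10,000-line ren.txt will of course be different.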

Reposted from blog.csdn.net/superGrant/article/details/88237973