Hadoop MapReduce WordCount 快速上手

需求

统计一个文件中出现的各个单词个数。
文件路径:/Users/ylj/demo/input/hello.txt

☁  input  pwd
/Users/ylj/demo/input
☁  input  cat hello.txt
php
java go
scala
php
go go go
python

分析

Mapper

  1. 将mapTask传给我们的Text内容转换成String
  2. 根据空格分割
  3. 输出(key,1)

Reducer

  1. 汇总各个key的个数
  2. 输出(key,sum)

Driver

  1. 获取job对象
  2. 设置jar存储位置
  3. 设置Map 和 Reduce 类
  4. 设置Mapper阶段输出流的key和value类型
  5. 设置最终数据输出的key和value类型
  6. 设置输入路径和输出路径
  7. 提交job

代码

Mapper
package com.yljphp.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map phase of the word-count job.
 * <p>
 * Example input lines:
 * <pre>
 * php java
 * c go
 * scala
 * </pre>
 * Example output records:
 * <pre>
 * php 1
 * java 1
 * c 1
 * go 1
 * scala 1
 * </pre>
 * <p>
 * Type parameters of {@code Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}:
 * <ul>
 * <li>KEYIN    - input key: position of the line in the input</li>
 * <li>VALUEIN  - input value: the text of the line</li>
 * <li>KEYOUT   - output key: a single word</li>
 * <li>VALUEOUT - output value: the constant count 1</li>
 * </ul>
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Constant count emitted for every word occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Reused output key, so no new {@code Text} is allocated per word. */
    private final Text keyOut = new Text();

    /**
     * Splits one input line into words and emits {@code (word, 1)} for each of them.
     *
     * @param key     position of the line in the input (unused)
     * @param value   the line to tokenize
     * @param context sink for the emitted {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1. Get the line as a String.
        String line = value.toString();

        // 2. Split on runs of whitespace so repeated spaces and tabs do not create empty tokens.
        String[] words = line.split("\\s+");

        // 3. Emit (word, 1) for every real word.
        for (String word : words) {
            // A blank line or leading whitespace still yields one empty token; never count it.
            if (word.isEmpty()) {
                continue;
            }
            keyOut.set(word);
            context.write(keyOut, ONE);
        }
    }
}
Reducer
package com.yljphp.mapreduce.wordcount;


import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce phase of the word-count job.
 * <p>
 * Example input records:
 * <pre>
 * java 1
 * java 1
 * go 1
 * scala 1
 * </pre>
 * Example output records:
 * <pre>
 * java 2
 * go 1
 * scala 1
 * </pre>
 * <p>
 * Type parameters of {@code Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}:
 * <ul>
 * <li>KEYIN    - input key: a word</li>
 * <li>VALUEIN  - input value: an int count</li>
 * <li>KEYOUT   - output key: the word</li>
 * <li>VALUEOUT - output value: the total as a long</li>
 * </ul>
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, LongWritable> {

    /** Reused output value, so no new {@code LongWritable} is allocated per key. */
    private final LongWritable valueOut = new LongWritable();

    /**
     * Sums all counts received for one word and emits {@code (word, total)}.
     *
     * @param key     the word being counted
     * @param values  the individual counts emitted for this word
     * @param context sink for the emitted {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // Accumulate in a long: the output type is LongWritable, and an int
        // accumulator would silently overflow for very frequent words.
        long totalNum = 0L;

        // 1. Sum the counts.
        for (IntWritable value : values) {
            totalNum += value.get();
        }

        // 2. Emit the total.
        valueOut.set(totalNum);
        context.write(key, valueOut);
    }
}
Driver
package com.yljphp.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Entry point that configures and runs the word-count MapReduce job.
 * <p>
 * Usage: {@code WordCountDriver <input path> <output path>}
 */
public class WordCountDriver {

    /**
     * Builds the job from {@link WordCountMapper} and {@link WordCountReducer}, runs it,
     * and exits with status 0 on success or 1 on failure. Exits with status 2 when the
     * input and output paths are not both supplied.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(2);
            return;
        }

        System.out.println(args[0]);
        System.out.println(args[1]);

        // 1. Create the job.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");

        // 2. Locate the jar through the driver class.
        job.setJarByClass(WordCountDriver.class);

        // 3. Set the Mapper and Reducer classes.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the key/value types emitted by the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the key/value types of the final output.
        //    WordCountReducer emits LongWritable values, so the declared type must match.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 6. Set the input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job and wait for it to finish, printing progress.
        boolean result = job.waitForCompletion(true);

        System.exit(result ? 0 : 1);
    }
}

输出

☁  output  pwd
/Users/ylj/demo/output
☁  output  cat part-r-00000
go	4
java	1
php	2
python	1
scala	1

注意

不要使用已经废弃的类

比如 org.apache.hadoop.mapred 包下的旧版 API,应改用 org.apache.hadoop.mapreduce 包下的新版 API(本文代码使用的即是后者)。

数据类型

MapReduce内置数据类型都在包org.apache.hadoop.io中。

类型 说明 相似的Java类
BooleanWritable 标准布尔型数值 Boolean
ByteWritable 单字节数值 Byte
DoubleWritable 双精度浮点数值 Double
FloatWritable 浮点数值 Float
IntWritable 整型数值 Integer
LongWritable 长整型数值 Long
Text 使用UTF8格式存储的文本 String
NullWritable 当<key, value>中的key或value为空时使用 null

猜你喜欢

转载自blog.csdn.net/yljphp/article/details/89045598
今日推荐