Mapreduce basics: handwritten local wordcount case

Article Directory

1. Source code

1. WordCountMapper class

package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
    
    

    //新建输出文本对象(输出的key类型)
    private Text text = new Text();
    //新建输出IntWritable对象(输出的value类型)
    private IntWritable intWritable = new IntWritable( 1);


    /**
     * 重写map方法
     * @param key 文本的索引
     * @param value 文本值
     * @param context 上下文对象
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    
    

        //获取拆分后的一行文本

        //mysql mysql value value value
        String line = value.toString();

        //根据分隔符进行单词拆分
        String[] words = line.split( " ");

        //循环创建键值对
        for (String word : words){
    
    

            //输出key值设置
            text.set (word) ;

            //进行map输出
            //igeek igeek -> <igeek ,1> <igeek,1>
            context.write(text,intWritable);
        }

    }
}

2. WordCountReducer class

package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable,Text, IntWritable> {
    
    

    //输出value对象
    private IntWritable valueOut = new IntWritable();


    /**
     * 重写reduce方法
     * @param key 单词值
     * @param values 单词出现的次数集合
     * @param context   上下文对象
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    
    

        //每个单词出现的次数
        int sum= 0;

        //<igeek,(1,1)>
        for (IntWritable value : values){
    
    

        //累计单词出现的数量
            sum += value.get();
        }

        //进行封装
        valueOut.set(sum);

        // reduce输出
        context.write(key, valueOut);


    }
}

3. WordCountDriver class

package org.example.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.example.wordcounttemplate.WordCountMapper;
import org.example.wordcounttemplate.WordCountReducer;

import java.io.IOException;

/**
 * 充当mapreduce任务的客户端，用于提交任务
 */

public class WordCountDriver {
    
    
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    
    
//        1.获取配置信息，获取job对象实例
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);

//        2.关联本Driver得jar路径
        job.setJarByClass(WordCountDriver.class);

//        3.关联map和reduce
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

//        4.设置map得输出kv类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

//        5.设置最终输出得kv类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

//        6.设置输入和输出路径
//        setInputPaths:可传多个输入文件
//         FileInputFormat.setInputPaths(job,new Path(args[0]));
//         FileOutputFormat.setOutputPath(job,new Path(args[1]));
        
        Path inputPath = new Path("D:\\Documentation\\Hello.txt");
        FileInputFormat.setInputPaths(job,inputPath);

        Path outputPath = new Path("D:\\Documentation\\outHello");
        FileOutputFormat.setOutputPath(job,outputPath);

//        7.提交job任务
        boolean result=job.waitForCompletion(true);
        System.out.println(result?"任务提交成功":"任务提交失败");

    }

}

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>mapreduce_demo</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>mapreduce_demo</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.1.3</version>
    </dependency>

  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

Hello.txt test document

mysql mysql value value value
igeek igeek
mysql hive port property
mysql value value value property
mysql value port property
mysql hive port property
mysql value hive hive hive property
mysql hive port property
mysql port property
mysql value hive hive port property
value property
value hive

insert image description here