MapReduce Overview
Execution framework components and execution flow
Reference: 《深入理解大数据》, by 黄宜华 (Huang Yihua)
1. InputFormat
Determines the input format for reading data from files in HDFS.
2. Split
Slices the input into splits; by default a split corresponds to one HDFS block (128 MB), and each split is assigned to one map task.
3. RecordReader
Reads the records of a split one at a time, converts each record into a key-value pair, and feeds it to the map task.
4. Map
Carries the user's business logic: it consumes the input key-value pairs and emits output key-value pairs; output keys may repeat.
5. Combiner
Locally merges the map output, combining pairs that share the same key into a single pair before they leave the map node.
6. Partitioner
Partitions the map output and sends each partition to the appropriate reduce node. The guiding principle is to eliminate data dependencies between reduce nodes, so that every reducer can complete its computation locally and independently (a minimal custom sketch follows this list).
7. Sort
Sorts the data by key: by default a quicksort runs when map output is spilled to file, and a merge sort runs on the reduce side; both sort in ascending order.
8. Reduce
Carries the user's business logic: it consumes the grouped input key-value pairs and emits output key-value pairs; each output key appears only once.
9. OutputFormat
Determines the output format of the result and writes the files to HDFS.
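As a concrete illustration of component 6, here is a minimal custom Partitioner for the Text/IntWritable types used in the WordCount example below. The routing rule (splitting words by their first letter across two reducers) is an invented assumption for illustration only; by default Hadoop uses HashPartitioner.
package com;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// Hypothetical sketch: send words starting with a-m to partition 0 and all
// other keys to partition 1 (assumes the job runs with two reduce tasks).
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String word = key.toString();
        if (numPartitions < 2 || word.isEmpty()) {
            return 0; // nothing to route with a single reducer or an empty key
        }
        char first = Character.toLowerCase(word.charAt(0));
        return (first >= 'a' && first <= 'm') ? 0 : 1;
    }
}
It would be wired into the driver with job.setPartitionerClass(MyPartitioner.class) and job.setNumReduceTasks(2).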
WordCount
The first step into MR: count the occurrences of each word in a text file. Upload the text file to HDFS as the input; the output directory must not already exist, otherwise the job fails with an error.
1. MyMap
package com;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Writable instances are reused across records to avoid per-record allocation
    private IntWritable count = new IntWritable(1);
    private Text w = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 convert the input line to a String
        String line = value.toString();
        // 2 split the line into words
        String[] words = line.split(" ");
        // 3 iterate over the words
        for (String word : words) {
            // 4 wrap the word in a Text
            w.set(word);
            // 5 emit (word, 1)
            context.write(w, count);
        }
    }
}
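For example, given the input line "hello world hello", this mapper emits the pairs (hello, 1), (world, 1), (hello, 1); the framework then sorts and groups them by key before the reduce phase.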
2. MyReduce
package com;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable i = new IntWritable();
    @Override
    protected void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1 running total for this word
        int sum = 0;
        // 2 iterate over the values grouped under this key
        for (IntWritable it : values) {
            // 3 unwrap the int and accumulate
            sum += it.get();
        }
        // 4 wrap the total in an IntWritable
        i.set(sum);
        // 5 emit (word, total count)
        context.write(word, i);
    }
}
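Continuing the example above, the reducer receives the grouped pair (hello, [1, 1]) and emits (hello, 2). Note that i is a reused IntWritable instance, mirroring the object reuse in MyMap.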
3. MyJob
package com;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1 create the configuration object
        Configuration cfg = new Configuration();
        // 2 create the job object from the configuration
        Job job = Job.getInstance(cfg);
        // 3 set the jar, mapper and reducer classes, and the map/reduce output types
        job.setJarByClass(MyJob.class);
        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
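        // Optional step (not in the original driver): because WordCount's
        // reduce is associative and commutative, the reducer can also act as
        // a map-side combiner (component 5 above):
        // job.setCombinerClass(MyReduce.class);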
        // 4 FileInputFormat.addInputPath: where to read the input from
        FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/wc/input"));
        // 5 FileOutputFormat.setOutputPath: where to write the output to
        FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/wc/output"));
        // 6 waitForCompletion: submit the job and wait for MapReduce to finish
        boolean flag = job.waitForCompletion(true);
        // 7 exit the JVM with the job's status code
        System.exit(flag ? 0 : 1);
    }
}
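As noted above, the job fails if the output directory already exists. A common convenience is to delete any previous output before submitting the job; the sketch below does this with the FileSystem API, assuming the same hdfs://master:9000 address used in MyJob. The helper name and placement are illustrative, not part of the original code.
package com;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Hypothetical helper: recursively remove a previous run's output directory
// so FileOutputFormat.setOutputPath does not fail on an existing path.
public class OutputCleaner {
    public static void deleteIfExists(Configuration cfg, String dir) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(dir), cfg);
        Path path = new Path(dir);
        if (fs.exists(path)) {
            fs.delete(path, true); // true = delete recursively
        }
    }
}
In MyJob it would be called before FileOutputFormat.setOutputPath, e.g. OutputCleaner.deleteIfExists(cfg, "hdfs://master:9000/wc/output");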