MapReduce Overview
Execution framework components and execution flow
Reference: 《深入理解大数据》, by 黄宜华 (Huang Yihua)
1. InputFormat
Determines the input format for reading data from files in HDFS.
2. Split
Slices the input into splits; by default a split corresponds to one HDFS block (128 MB), and each split is assigned to one map task.
3. RecordReader
Reads the records of a split one at a time, converts each record into a key-value pair, and feeds it to the map task.
4. Map
Carries the user's business logic: it consumes the input key-value pairs and emits output key-value pairs; output keys may repeat.
5. Combiner
Locally merges the map output, combining pairs that share the same key into a single pair before they leave the map node.
6. Partitioner
Partitions the map output and sends each partition to the appropriate reduce node. The guiding principle is to eliminate data dependencies between reduce nodes, so that every reducer can complete its computation locally and independently (a minimal custom sketch follows this list).
7. Sort
Sorts the data by key: by default a quicksort runs when map output is spilled to file, and a merge sort runs on the reduce side; both sort in ascending order.
8. Reduce
Carries the user's business logic: it consumes the grouped input key-value pairs and emits output key-value pairs; each output key appears only once.
9. OutputFormat
Determines the output format of the result and writes the files to HDFS.
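As a concrete illustration of component 6, here is a minimal custom Partitioner for the Text/IntWritable types used in the WordCount example below. The routing rule (splitting words by their first letter across two reducers) is an invented assumption for illustration only; by default Hadoop uses HashPartitioner.
package com;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// Hypothetical sketch: send words starting with a-m to partition 0 and all
// other keys to partition 1 (assumes the job runs with two reduce tasks).
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String word = key.toString();
        if (numPartitions < 2 || word.isEmpty()) {
            return 0; // nothing to route with a single reducer or an empty key
        }
        char first = Character.toLowerCase(word.charAt(0));
        return (first >= 'a' && first <= 'm') ? 0 : 1;
    }
}
It would be wired into the driver with job.setPartitionerClass(MyPartitioner.class) and job.setNumReduceTasks(2).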
WordCount
The first step into MR: count the occurrences of each word in a text file. Upload the text file to HDFS as the input; the output directory must not already exist, otherwise the job fails with an error.
1. MyMap
package com;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Writable instances are reused across records to avoid per-record allocation
    private IntWritable count = new IntWritable(1);
    private Text w = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 convert the input line to a String
        String line = value.toString();
        // 2 split the line into words
        String[] words = line.split(" ");
        // 3 iterate over the words
        for (String word : words) {
            // 4 wrap the word in a Text
            w.set(word);
            // 5 emit (word, 1)
            context.write(w, count);
        }
    }
}
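For example, given the input line "hello world hello", this mapper emits the pairs (hello, 1), (world, 1), (hello, 1); the framework then sorts and groups them by key before the reduce phase.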
2. MyReduce
package com;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable i = new IntWritable();
    @Override
    protected void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1 running total for this word
        int sum = 0;
        // 2 iterate over the values grouped under this key
        for (IntWritable it : values) {
            // 3 unwrap the int and accumulate
            sum += it.get();
        }
        // 4 wrap the total in an IntWritable
        i.set(sum);
        // 5 emit (word, total count)
        context.write(word, i);
    }
}
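Continuing the example above, the reducer receives the grouped pair (hello, [1, 1]) and emits (hello, 2). Note that i is a reused IntWritable instance, mirroring the object reuse in MyMap.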
3. MyJob
package com;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1 create the configuration object
        Configuration cfg = new Configuration();
        // 2 create the job object from the configuration
        Job job = Job.getInstance(cfg);
        // 3 set the jar, mapper and reducer classes, and the map/reduce output types
        job.setJarByClass(MyJob.class);
        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
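        // Optional step (not in the original driver): because WordCount's
        // reduce is associative and commutative, the reducer can also act as
        // a map-side combiner (component 5 above):
        // job.setCombinerClass(MyReduce.class);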
        // 4 FileInputFormat.addInputPath: where to read the input from
        FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/wc/input"));
        // 5 FileOutputFormat.setOutputPath: where to write the output to
        FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/wc/output"));
        // 6 waitForCompletion: submit the job and wait for MapReduce to finish
        boolean flag = job.waitForCompletion(true);
        // 7 exit the JVM with the job's status code
        System.exit(flag ? 0 : 1);
    }
}
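As noted above, the job fails if the output directory already exists. A common convenience is to delete any previous output before submitting the job; the sketch below does this with the FileSystem API, assuming the same hdfs://master:9000 address used in MyJob. The helper name and placement are illustrative, not part of the original code.
package com;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Hypothetical helper: recursively remove a previous run's output directory
// so FileOutputFormat.setOutputPath does not fail on an existing path.
public class OutputCleaner {
    public static void deleteIfExists(Configuration cfg, String dir) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(dir), cfg);
        Path path = new Path(dir);
        if (fs.exists(path)) {
            fs.delete(path, true); // true = delete recursively
        }
    }
}
In MyJob it would be called before FileOutputFormat.setOutputPath, e.g. OutputCleaner.deleteIfExists(cfg, "hdfs://master:9000/wc/output");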