简介
自己在练习 MR 的时候,重复操作的代码比较多,于是灵机一动:何不自己写一个代理类,把 Mapper、Reducer、Partitioner 等的设置方法封装到一起。
所以就有了下面这个类。
MR 代理类
自己写的 MR 的代理类,能给自己省不少的代码呢!
package org.wang.MR.template;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Fluent helper that wires a Mapper, Reducer and Partitioner into a single
 * Hadoop {@link Job}, using generics so that mismatched key/value types are
 * caught at compile time.
 *
 * <p>Only the runtime <em>class</em> of each component is registered with the
 * job; the instances handed to the setters are kept in fields but are not
 * themselves passed to the job.
 * NOTE(review): the framework presumably instantiates those classes itself, so
 * they need to be constructible without an enclosing instance - confirm for
 * anonymous classes declared inside instance methods.
 *
 * @param <M_IN_K>  mapper input key type
 * @param <M_IN_V>  mapper input value type
 * @param <M_OUT_K> mapper output / reducer input key type
 * @param <M_OUT_V> mapper output / reducer input value type
 * @param <R_OUT_K> reducer output key type
 * @param <R_OUT_V> reducer output value type
 * @param <P_K>     partitioner key type
 * @param <P_V>     partitioner value type
 */
public class MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> {

    @SuppressWarnings("unused")
    private Mapper<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V> mapper;

    @SuppressWarnings("unused")
    private Reducer<M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V> reducer;

    private Partitioner<P_K,P_V> partitioner;

    private final Configuration conf;

    private final Job job;

    /**
     * Creates a template backed by a fresh {@link Configuration} and a new {@link Job}.
     *
     * @throws IOException if the job instance cannot be created
     */
    public MRTemplate() throws IOException {
        conf = new Configuration();
        job = Job.getInstance(conf);
    }

    /**
     * Registers the mapper's class with the job.
     *
     * @param mapper mapper whose runtime class is used as the job's mapper class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> set(Mapper<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V> mapper) {
        this.mapper = mapper;
        job.setMapperClass(mapper.getClass());
        ensureJobJar(mapper.getClass());
        return this;
    }

    /**
     * Registers the reducer's class with the job.
     *
     * @param reducer reducer whose runtime class is used as the job's reducer class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> set(Reducer<M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V> reducer) {
        this.reducer = reducer;
        job.setReducerClass(reducer.getClass());
        ensureJobJar(reducer.getClass());
        return this;
    }

    /**
     * Declares the key/value classes emitted by the map phase.
     *
     * @param class1 map output key class
     * @param class2 map output value class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setMapOut(Class<M_OUT_K> class1, Class<M_OUT_V> class2) {
        job.setMapOutputKeyClass(class1);
        job.setMapOutputValueClass(class2);
        return this;
    }

    /**
     * Declares the key/value classes of the job's final output.
     *
     * @param class1 job output key class
     * @param class2 job output value class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setOut(Class<R_OUT_K> class1, Class<R_OUT_V> class2) {
        job.setOutputKeyClass(class1);
        job.setOutputValueClass(class2);
        return this;
    }

    /**
     * Adds an input path and sets the output path of the job.
     *
     * @param in  input path; added to any input paths already configured
     * @param out output path
     * @return this template, for chaining
     * @throws IllegalArgumentException if a path string cannot be turned into a {@link Path}
     * @throws IOException if the input path cannot be added
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setPath(String in, String out) throws IllegalArgumentException, IOException {
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        return this;
    }

    /**
     * Submits the job, waits for it to finish and then terminates the JVM:
     * exit status 0 when the job succeeded, 1 otherwise. This method therefore
     * never returns normally to its caller.
     *
     * @throws ClassNotFoundException propagated from {@link Job#waitForCompletion(boolean)}
     * @throws IOException propagated from {@link Job#waitForCompletion(boolean)}
     * @throws InterruptedException propagated from {@link Job#waitForCompletion(boolean)}
     */
    public void run() throws ClassNotFoundException, IOException, InterruptedException {
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * @return the partitioner last passed to {@link #setPartitioner(Partitioner)},
     *         or {@code null} if none has been set
     */
    public Partitioner<P_K, P_V> getPartitioner() {
        return partitioner;
    }

    /**
     * Registers the partitioner's class with the job.
     *
     * @param partitioner partitioner whose runtime class is used as the job's partitioner class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setPartitioner(Partitioner<P_K, P_V> partitioner) {
        this.partitioner = partitioner;
        job.setPartitionerClass(partitioner.getClass());
        return this;
    }

    /**
     * Points the job at the jar containing {@code userClass} unless a job jar
     * is already configured. Without a job jar the user's classes may not be
     * found when the job runs on a cluster.
     * The first mapper/reducer class registered wins, so a later call does not
     * override a jar that was already chosen.
     */
    private void ensureJobJar(Class<?> userClass) {
        if (job.getJar() == null) {
            job.setJarByClass(userClass);
        }
    }
}
我的思路是在 MRTemplate 中设置 MapReduce 框架自己的 Mapper、Reducer、Partitioner,并且定义好 Mapper、Reducer、Partitioner 的输出和输入泛型。
除此之外,实现 MRTemplate 的链式调用,可以方便的调用 set 方法设置 Mapper、Reducer、Partitioner 的值。
在设置 Mapper 和 Reducer 的时候,利用 Java 的匿名类来提供 Mapper、Reducer 的实现。
使用
// Example job: for every input line split on a single space, emit
// (column 1, column 2 - column 3); the reducer then writes the integer
// average of those differences for each key.
MRTemplate<Object, Text, Text, IntWritable, Text, Text, Text, IntWritable> mr =
        new MRTemplate<Object, Text, Text, IntWritable, Text, Text, Text, IntWritable>();

// Map phase: key = column 1, value = column 2 minus column 3.
Mapper<Object, Text, Text, IntWritable> diffMapper = new Mapper<Object, Text, Text, IntWritable>() {
    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(" ");
        Text outKey = new Text(fields[1]);
        int diff = Integer.parseInt(fields[2]) - Integer.parseInt(fields[3]);
        context.write(outKey, new IntWritable(diff));
    }
};

// Reduce phase: integer (truncating) average of all values seen for the key.
Reducer<Text, IntWritable, Text, Text> avgReducer = new Reducer<Text, IntWritable, Text, Text>() {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        int seen = 0;
        for (IntWritable v : values) {
            seen++;
            total += v.get();
        }
        context.write(key, new Text(String.valueOf(total / seen)));
    }
};

mr.set(diffMapper)
  .set(avgReducer)
  .setMapOut(Text.class, IntWritable.class)
  .setOut(Text.class, Text.class)
  .setPath("/wyf/doublemr/", "/doublemr_out/")
  .run();
适合的场景
我认为这个只适用于初学的阶段,可以减少因为类型使用错误而耗费的大好时光。