自己封装的MR类

简介

自己在练习 MR 的时候，重复操作的代码比较多，于是灵机一动：何不自己写一个代理类，把 Mapper、Reducer、Partitioner 等的设置封装到一起。
所以就有了下面这个类。

MR 代理类

自己写的 MR 的代理类，能给自己省不少的代码呢！

package org.wang.MR.template;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Fluent wrapper around a Hadoop MapReduce {@link Job} that ties the generic
 * types of the mapper, reducer and partitioner together so that mismatched
 * key/value types are caught at compile time.
 *
 * <p>Every setter returns {@code this}, so a job can be configured in a single
 * call chain and started with {@link #run()}.
 *
 * <p>NOTE(review): only the <em>class</em> of each supplied mapper, reducer or
 * partitioner is registered with the job; the instance itself is merely kept
 * in a field of this template.
 *
 * @param <M_IN_K>  mapper input key type
 * @param <M_IN_V>  mapper input value type
 * @param <M_OUT_K> mapper output (reducer input) key type
 * @param <M_OUT_V> mapper output (reducer input) value type
 * @param <R_OUT_K> reducer output key type
 * @param <R_OUT_V> reducer output value type
 * @param <P_K>     partitioner key type
 * @param <P_V>     partitioner value type
 */
public class MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> {
    @SuppressWarnings("unused")
    private Mapper<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V> mapper;
    @SuppressWarnings("unused")
    private Reducer<M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V> reducer;
    private Partitioner<P_K,P_V> partitioner;
    private final Configuration conf;
    private final Job job;

    /**
     * Creates a template backed by a fresh {@link Configuration} and a new
     * {@link Job} obtained from it.
     *
     * @throws IOException if the job instance cannot be created
     */
    public MRTemplate() throws IOException {
        conf = new Configuration();
        job = Job.getInstance(conf);
    }

    /**
     * Registers the mapper's class with the job.
     *
     * @param mapper mapper whose class is used by the job; must not be null
     * @return this template, for chaining
     * @throws NullPointerException if {@code mapper} is null
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> set(Mapper<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V> mapper) {
        if (mapper == null) {
            throw new NullPointerException("mapper");
        }
        this.mapper = mapper;
        // Point the job at the jar containing the user's code; without this the
        // job jar is never set and user classes may not be found on a cluster.
        job.setJarByClass(mapper.getClass());
        job.setMapperClass(mapper.getClass());
        return this;
    }

    /**
     * Registers the reducer's class with the job.
     *
     * @param reducer reducer whose class is used by the job; must not be null
     * @return this template, for chaining
     * @throws NullPointerException if {@code reducer} is null
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> set(Reducer<M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V> reducer) {
        if (reducer == null) {
            throw new NullPointerException("reducer");
        }
        this.reducer = reducer;
        // Also set the jar here so that a job configured without a custom
        // mapper still gets a job jar.
        job.setJarByClass(reducer.getClass());
        job.setReducerClass(reducer.getClass());
        return this;
    }

    /**
     * Declares the key and value classes emitted by the map phase.
     *
     * @param class1 map output key class
     * @param class2 map output value class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setMapOut(Class<M_OUT_K> class1, Class<M_OUT_V> class2) {
        job.setMapOutputKeyClass(class1);
        job.setMapOutputValueClass(class2);
        return this;
    }

    /**
     * Declares the key and value classes of the job's final output.
     *
     * @param class1 job output key class
     * @param class2 job output value class
     * @return this template, for chaining
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setOut(Class<R_OUT_K> class1, Class<R_OUT_V> class2) {
        job.setOutputKeyClass(class1);
        job.setOutputValueClass(class2);
        return this;
    }

    /**
     * Adds an input path to the job and sets its output path.
     *
     * @param in  input path; added to any input paths already configured
     * @param out output path
     * @return this template, for chaining
     * @throws IllegalArgumentException if a path string cannot be turned into a {@link Path}
     * @throws IOException              if the input path cannot be added
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setPath(String in, String out) throws IllegalArgumentException, IOException {
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        return this;
    }

    /**
     * Submits the job, waits for it to finish, and then terminates the JVM:
     * exit status 0 when the job succeeded, 1 otherwise. This method therefore
     * never returns normally.
     *
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws IOException            propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
     */
    public void run() throws ClassNotFoundException, IOException, InterruptedException {
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }

    /**
     * Returns the partitioner last passed to {@link #setPartitioner}, or
     * {@code null} if none has been set.
     *
     * @return the stored partitioner, possibly null
     */
    public Partitioner<P_K, P_V> getPartitioner() {
        return partitioner;
    }

    /**
     * Registers the partitioner's class with the job.
     *
     * @param partitioner partitioner whose class is used by the job; must not be null
     * @return this template, for chaining
     * @throws NullPointerException if {@code partitioner} is null
     */
    public MRTemplate<M_IN_K,M_IN_V,M_OUT_K,M_OUT_V,R_OUT_K,R_OUT_V,P_K,P_V> setPartitioner(Partitioner<P_K, P_V> partitioner) {
        if (partitioner == null) {
            throw new NullPointerException("partitioner");
        }
        this.partitioner = partitioner;
        job.setPartitionerClass(partitioner.getClass());
        return this;
    }
}

我的思路是在 MRTemplate 中设置 MapReduce 框架自己的 Mapper、Reducer、Partitioner，并且定义好 Mapper、Reducer、Partitioner 的输出和输入泛型。

除此之外，实现 MRTemplate 的链式调用，可以方便的调用 set 方法设置 Mapper、Reducer、Partitioner 的值。

在设置 Mapper 和 Reducer 的时候，利用 Java 的匿名类来提供 Mapper、Reducer 的实现。

使用

// Example: for every input line, emit (column 1, column 2 - column 3), then
// write the integer average of those differences per key.

// Map step: split the line on single spaces and emit the difference keyed by column 1.
Mapper<Object, Text, Text, IntWritable> diffMapper = new Mapper<Object, Text, Text, IntWritable>() {
    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(" ");
        Text outKey = new Text(fields[1]);
        int difference = Integer.parseInt(fields[2]) - Integer.parseInt(fields[3]);
        context.write(outKey, new IntWritable(difference));
    }
};

// Reduce step: integer-average all differences seen for a key.
Reducer<Text, IntWritable, Text, Text> averageReducer = new Reducer<Text, IntWritable, Text, Text>() {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        int seen = 0;
        for (IntWritable difference : values) {
            seen++;
            total += difference.get();
        }
        // Integer division: the average is truncated, as text.
        context.write(key, new Text(Integer.toString(total / seen)));
    }
};

MRTemplate<Object, Text, Text, IntWritable, Text, Text, Text, IntWritable> template =
        new MRTemplate<Object, Text, Text, IntWritable, Text, Text, Text, IntWritable>();

template.set(diffMapper)
        .set(averageReducer)
        .setMapOut(Text.class, IntWritable.class)
        .setOut(Text.class, Text.class)
        .setPath("/wyf/doublemr/", "/doublemr_out/")
        .run();

适合的场景

我认为这个类只适用于初学阶段，可以减少因为类型使用错误而浪费的大好时光。

简介

MR 代理类

使用

适合的场景

猜你喜欢