Running two MapReduce jobs in sequence

The requirement is as follows:

There are three input files, and we need to count how many times each word appears in each file, with the output in a specific format.
Final format: jee    a.txt-->4 b.txt-->2 c.txt-->2
The word comes first, followed by a tab (\t); after that come the per-file counts of that word, with the entries for different files separated by a single space.

We can implement this with two chained MapReduce jobs. The first job emits records of the form jee--a.txt    4 (word--filename, then a tab, then the count).
The second job then splits those records apart and reassembles them into the final format.
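
For example (an illustration, assuming a.txt, b.txt and c.txt contain the word jee four, two and two times respectively), job 1 would produce lines such as:

jee--a.txt    4
jee--b.txt    2
jee--c.txt    2

and job 2 then regroups them by word into the final line:

jee    a.txt-->4 b.txt-->2 c.txt-->2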

Mapper1:

package com.jee.doublejob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class WCMapper1 extends Mapper<LongWritable, Text,Text, IntWritable> {

    private String fileName;

    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the input split handed to this map task
        FileSplit split = (FileSplit) context.getInputSplit();
        // extract the name of the file this split came from
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] items = line.split("\t");
        for(String item : items){
            // key: "word--filename", value: a count of 1
            k.set(item + "--" + fileName);
            context.write(k,v);
        }
    }
}
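
Note that map splits each input line on a tab, so it assumes the words within a line are tab-separated. If the words were separated by arbitrary whitespace instead, the split could be changed to something like:

String[] items = line.split("\\s+");   // split on any run of whitespace instead of a single tab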

Reducer1:

package com.jee.doublejob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class WCReducer1 extends Reducer<Text, IntWritable,Text,IntWritable> {

    private IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<IntWritable> iterator = values.iterator();
        int t = 0;
        while(iterator.hasNext()){
            t += iterator.next().get();
        }
        v.set(t);
        context.write(key,v);
    }
}
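
Since this reducer only sums counts, it could also be registered as a combiner on job 1 to pre-aggregate counts on the map side and reduce shuffle traffic. This would be an optional extra line in the driver, not part of the original code:

job1.setCombinerClass(WCReducer1.class);   // optional: combine counts locally before the shuffle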

Mapper2:

package com.jee.doublejob;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WCMapper2 extends Mapper<LongWritable, Text,Text,Text> {

    private Text k = new Text();
    private Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // a line from job 1 looks like: word--filename<TAB>count
        String[] items = line.split("--");
        k.set(items[0]);                        // key: the word

        String[] items2 = items[1].split("\t");
        v.set(items2[0] + "-->" + items2[1]);   // value: filename-->count

        context.write(k,v);
    }
}
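
Job 1 writes its output through the default TextOutputFormat, which puts a tab between key and value, so each line read by this mapper looks like jee--a.txt\t4. The following is a minimal standalone trace of the parse above (a hypothetical demo class, for illustration only):

package com.jee.doublejob;

// Hypothetical demo: traces how WCMapper2 parses one line of job 1's output.
public class Mapper2ParseDemo {
    public static void main(String[] args) {
        String line = "jee--a.txt\t4";                 // word--filename, a tab, then the count
        String[] items = line.split("--");             // ["jee", "a.txt\t4"]
        String[] items2 = items[1].split("\t");        // ["a.txt", "4"]
        System.out.println(items[0] + "\t" + items2[0] + "-->" + items2[1]);
        // prints: jee    a.txt-->4
    }
}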

Reducer2:

package com.jee.doublejob;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class WCReducer2 extends Reducer<Text,Text,Text,Text> {

    private Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // join the filename-->count entries for this word with single spaces
        StringBuilder temp = new StringBuilder();
        Iterator<Text> iterator = values.iterator();
        while (iterator.hasNext()){
            temp.append(iterator.next().toString());
            if(iterator.hasNext()){
                temp.append(" ");
            }
        }
        v.set(temp.toString());
        context.write(key,v);
    }
}
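
One caveat: MapReduce does not guarantee the order in which values reach the reducer, so the file entries in the final line may not always appear in a fixed order such as a.txt b.txt c.txt. If a deterministic order matters, the entries could be collected and sorted first, as in this hypothetical variant of WCReducer2:

package com.jee.doublejob;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Hypothetical variant of WCReducer2 that sorts the per-file entries so the
// output line is always ordered by file name, e.g. a.txt-->4 b.txt-->2 c.txt-->2.
public class WCReducer2Sorted extends Reducer<Text, Text, Text, Text> {

    private Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        List<String> parts = new ArrayList<>();
        for (Text value : values) {
            parts.add(value.toString());
        }
        Collections.sort(parts);                 // order the entries by file name
        v.set(String.join(" ", parts));
        context.write(key, v);
    }
}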

Driver:

package com.jee.doublejob;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WCDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // first job
        Job job1 = Job.getInstance(new Configuration());

        job1.setJarByClass(WCDriver.class);

        job1.setMapperClass(WCMapper1.class);
        job1.setReducerClass(WCReducer1.class);

        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job1,new Path("d:/Hadoop/input"));
        FileOutputFormat.setOutputPath(job1,new Path("d:/Hadoop/output"));

        boolean b = job1.waitForCompletion(true);

        // if the first job succeeded, run the second job; its input is the first job's output
        if(b){
            Job job2 = Job.getInstance(new Configuration());

            job2.setJarByClass(WCDriver.class);

            job2.setMapperClass(WCMapper2.class);
            job2.setReducerClass(WCReducer2.class);

            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(Text.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(Text.class);

            FileInputFormat.setInputPaths(job2,new Path("d:/Hadoop/output"));
            FileOutputFormat.setOutputPath(job2,new Path("d:/Hadoop/output2"));

            boolean b1 = job2.waitForCompletion(true);
            System.exit(b1 ? 0 : 1);
        }else{
            System.exit(1);
        }

    }
}
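
One practical note: FileOutputFormat fails the job if its output directory already exists, so d:/Hadoop/output and d:/Hadoop/output2 have to be removed between runs. They could also be deleted at the top of main, along these lines (a sketch; it needs an extra import of org.apache.hadoop.fs.FileSystem):

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
fs.delete(new Path("d:/Hadoop/output"), true);    // recursive delete; returns false if the path does not exist
fs.delete(new Path("d:/Hadoop/output2"), true);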
