MapReduce: Custom OutputFormat (11)

Input data: a plain-text log file, one record per line.

Output data: lines that contain "kxj" are written to a dedicated file, and all other lines are written to a second file.
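
A hypothetical illustration, assuming each input line is a site address (these lines are made up for illustration and are not the original sample data):

http://www.baidu.com
http://www.kxj.com
http://www.sina.com
http://www.kxj.com/index

The custom OutputFormat extends FileOutputFormat and only needs to hand back the custom RecordWriter: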

package com.buba.mapreduce.filter;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FilterOutputformat extends FileOutputFormat<Text,NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        return new FilterRecordWriter(taskAttemptContext);
    }
}
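
The RecordWriter does the actual splitting: the constructor opens one output stream per target file, write() routes each record by its content, and close() releases both streams.
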
package com.buba.mapreduce.filter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

public class FilterRecordWriter extends RecordWriter<Text, NullWritable> {

    private FSDataOutputStream kxjOut = null;

    private FSDataOutputStream otherOut = null;

    public FilterRecordWriter(TaskAttemptContext job) {
        Configuration configuration = job.getConfiguration();

        try {
            // Get the file system
            FileSystem fileSystem = FileSystem.get(configuration);

            // Create the output streams for the two target files (hardcoded local paths)
            kxjOut = fileSystem.create(new Path("f:/output5/kxj.log"));

            otherOut = fileSystem.create(new Path("f:/output5/other.log"));

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        // Route by whether the key contains "kxj"
        if (key.toString().contains("kxj")) { // contains "kxj"
            kxjOut.write(key.toString().getBytes());
        } else { // does not contain "kxj"
            otherOut.write(key.toString().getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        if (kxjOut != null) {
            kxjOut.close();
        }
        if (otherOut != null) {
            otherOut.close();
        }
    }
}
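
The Mapper passes every input line straight through as the output key, with NullWritable as the value:
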
package com.buba.mapreduce.filter;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FilterMapper extends Mapper<LongWritable, Text,Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(value,NullWritable.get());
    }
}
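
The Reducer writes each key back out and appends "\r\n", because the custom RecordWriter writes raw bytes without adding a line terminator:
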
package com.buba.mapreduce.filter;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FilterReducer extends Reducer<Text, NullWritable,Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(new Text(key.toString()+"\r\n"),NullWritable.get());
    }
}
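
Finally, the driver wires the Mapper, Reducer, and custom OutputFormat together:
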
package com.buba.mapreduce.filter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FilterDriver {
    public static void main(String[] args)throws Exception {
        // 1. Get the job instance
        Configuration configuration = new Configuration();

        Job job = Job.getInstance(configuration);

        // 2. Set the jar location by the driver class
        job.setJarByClass(FilterDriver.class);

        // 3. Associate the Mapper and Reducer classes
        job.setMapperClass(FilterMapper.class);

        job.setReducerClass(FilterReducer.class);

        // 4. Set the map-stage output key and value types
        job.setMapOutputKeyClass(Text.class);

        job.setMapOutputValueClass(NullWritable.class);

        // 5. Set the final output key and value types
        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(NullWritable.class);

        // Set a custom partitioner (leftover from an earlier partitioner example; not needed here)
        // job.setPartitionerClass(FlowPartitioner.class);

        // The number of reduce tasks must match the number of partitions defined in FlowPartitioner
        //job.setNumReduceTasks(1);

        // Register the custom OutputFormat with the job
        job.setOutputFormatClass(FilterOutputformat.class);


        // 6. Set the input and output paths (the output path is still required: FileOutputFormat
        //    writes the _SUCCESS marker there; the log files go to the paths hardcoded in FilterRecordWriter)
        FileInputFormat.setInputPaths(job,new Path(args[0]));

        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        // 7. Submit the job and wait for completion
        boolean b = job.waitForCompletion(true);

        System.exit(b ? 0 : 1);
    }
}
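
A minimal sketch of submitting the job from the command line. The jar name and directory names below are assumptions, not taken from the original post; note that the output directory passed as args[1] only receives the _SUCCESS marker, since kxj.log and other.log are written to the paths hardcoded in FilterRecordWriter.

hadoop jar filter.jar com.buba.mapreduce.filter.FilterDriver f:/input5 f:/output6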


Reposted from blog.csdn.net/kxj19980524/article/details/89351683