Hadoop MapReduce: the OutputFormat data output mechanism

1. OutputFormat interface and its implementing classes

  OutputFormat is the base class for MapReduce output; every MapReduce output format implements the OutputFormat interface.

  1.1 Text output: TextOutputFormat

    TextOutputFormat is the default output format; it writes each record as a line of text. Its keys and values may be of any type, because TextOutputFormat calls toString() to convert them to strings.
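
    For illustration, here is a minimal sketch of configuring TextOutputFormat explicitly on a job; the demo class name and the separator value are illustrative additions, not part of the original example:

package com.wn.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

public class TextOutputDemo {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // TextOutputFormat writes "key<TAB>value" per record; the separator is configurable
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        Job job = Job.getInstance(conf);
        // Redundant, since TextOutputFormat is the default, but this is the explicit call
        job.setOutputFormatClass(TextOutputFormat.class);
    }
}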

  1.2 SequenceFileOutputFormat

    SequenceFileOutputFormat is a good choice when the output of one MapReduce job will be the input of a subsequent MapReduce job, because its format is compact and compresses easily.
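
    A short sketch of selecting SequenceFileOutputFormat and enabling block compression; the demo class name and output path are made up for illustration:

package com.wn.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

public class SequenceFileOutputDemo {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration());
        // Write binary key/value pairs that a follow-up MapReduce job can consume directly
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // Compress whole blocks of records rather than individual values
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputPath(job, new Path("E:\\output-seq"));
    }
}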

  1.3 Custom OutputFormat

    The user implements the output format to meet custom requirements.

2. Custom OutputFormat

  2.1 Usage scenario

    To take full control of the final output path and output file format, you can customize OutputFormat.

    For example, to have a MapReduce program write its output to different directories depending on the content of the result data, such a flexible output requirement can be met with a custom OutputFormat.

  2.2 Steps to customize OutputFormat

    2.2.1 Define a custom class that extends FileOutputFormat;

    2.2.2 Override getRecordWriter() to return a custom RecordWriter, and put the actual output logic in its write() method (a minimal skeleton is shown below; the full implementation follows in section 3);
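
    As a preview, a minimal skeleton of these two steps; the class name is a placeholder, and the complete implementation for the requirement below is in section 3:

package com.wn.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// Step 1: a custom class that extends FileOutputFormat
public class SkeletonOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
        // Step 2: return a custom RecordWriter whose write() does the actual output
        return new RecordWriter<Text, NullWritable>() {
            @Override
            public void write(Text key, NullWritable value) throws IOException, InterruptedException {
                // the concrete output logic (e.g. routing records to files) goes here
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                // close any streams opened by this writer here
            }
        };
    }
}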

  2.3 Requirement

    Filter an input log file: output the lines that contain the site baidu to a file baidu.log, and the lines that do not contain baidu to a file other.log. Sample input:

http://www.baidu.com
http://www.google.com
http://cn.bing.com
http://www.baidu.com
http://www.sohu.com
http://www.sina.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sindsafa.com
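
Given the filter logic and the sample input above, the two output files would be expected to contain the following (MapReduce sorts the keys, so each file comes out in lexicographic order):

baidu.log:

http://www.baidu.com
http://www.baidu.com

other.log:

http://cn.bing.com
http://www.google.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sina.com
http://www.sindsafa.com
http://www.sohu.com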

3. Custom OutputFormat in practice

  3.1 Write FilterMapper

package com.wn.outputformat;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FilterMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Write the line out as the key, with a NullWritable placeholder value
        context.write(value, NullWritable.get());
    }
}

  3.2 Write FilterReducer

package com.wn.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FilterReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    Text text = new Text();

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Get one line from the key
        String line = key.toString();
        // Append a line terminator
        line = line + "\r\n";
        // Set the output key
        text.set(line);
        // Write once per occurrence so that duplicate input lines are not lost
        for (NullWritable value : values) {
            context.write(text, NullWritable.get());
        }
    }
}

  3.3 Write FilterRecordWriter

package com.wn.outputformat;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

public class FilterRecordWriter extends RecordWriter<Text, NullWritable> {

    FSDataOutputStream atguiguOut = null;
    FSDataOutputStream otherOut = null;

    public FilterRecordWriter(TaskAttemptContext job) {
        // Get the file system
        FileSystem fs;
        try {
            fs = FileSystem.get(job.getConfiguration());
            // Create the output file paths
            Path atguiguPath = new Path("E:\\APTECH\\bigdata\\hadoop\\04\\baidu.log");
            Path otherPath = new Path("E:\\APTECH\\bigdata\\hadoop\\04\\other.log");
            // Create the output streams
            atguiguOut = fs.create(atguiguPath);
            otherOut = fs.create(otherPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
        // Route each line to a different file depending on whether it contains "baidu"
        if (text.toString().contains("baidu")) {
            atguiguOut.write(text.toString().getBytes());
        } else {
            otherOut.write(text.toString().getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Close the resources
        IOUtils.closeStream(atguiguOut);
        IOUtils.closeStream(otherOut);
    }
}

  3.4 Write FilterOutputFormat

package com.wn.outputformat;


import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class FilterOutputFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        return new FilterRecordWriter(taskAttemptContext);
    }
}

  3.5 Write FilterDriver

package com.wn.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FilterDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // The input and output paths need to be set according to the actual paths on your computer
        args = new String[]{"E:\\APTECH\\bigdata\\hadoop\\04\\input", "E:\\APTECH\\bigdata\\hadoop\\04\\output"};
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FilterDriver.class);
        job.setMapperClass(FilterMapper.class);
        job.setReducerClass(FilterReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Set the custom output format component on the job
        job.setOutputFormatClass(FilterOutputFormat.class);

        FileInputFormat.setInputPaths(job,new Path(args[0]));

        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
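
Note that FileOutputFormat.setOutputPath(job, ...) is still required even though FilterRecordWriter creates baidu.log and other.log at its own hard-coded paths: the framework checks that an output directory is configured and writes its job bookkeeping files (such as the _SUCCESS marker) there, so args[1] must point to a directory that does not yet exist.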

 
