1. The OutputFormat interface
OutputFormat is the base class for MapReduce output; every MapReduce output implementation implements the OutputFormat interface.
1.1 Text output: TextOutputFormat
TextOutputFormat is the default output format; it writes each record as a line of text. Its keys and values may be of any type, because TextOutputFormat calls toString() to convert them to strings.
1.2 SequenceFileOutputFormat
SequenceFileOutputFormat produces output well suited as input to a subsequent MapReduce job; it is a good output format because its representation is compact and compresses easily.
1.3 Custom OutputFormat
Implement a custom output format according to the user's requirements.
2. Custom OutputFormat
2.1 Usage scenarios
To gain full control over the final output path and output file format, you can customize OutputFormat.
For example, to make a MapReduce program write different kinds of result data to different directories, such a flexible output requirement can be met with a custom OutputFormat.
2.2 Steps to customize an OutputFormat
2.2.1 Define a class that extends FileOutputFormat.
2.2.2 Implement a RecordWriter, overriding the write() method that emits the data.
2.3 Requirement
Filter the input log: write lines containing "baidu" to the file baidu.log, and lines that do not contain "baidu" to the file other.log.
http://www.baidu.com
http://www.google.com
http://cn.bing.com
http://www.baidu.com
http://www.sohu.com
http://www.sina.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sindsafa.com
3. Custom OutputFormat case in practice
3.1 Write the FilterMapper
package com.wn.outputformat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FilterMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /**
     * Passes each input line through unchanged as the output key, with a
     * null placeholder value. The actual filtering by content happens
     * later, in the custom OutputFormat's RecordWriter.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit the raw line; no transformation is needed at the map stage.
        context.write(value, NullWritable.get());
    }
}
3.2 Write the FilterReducer
package com.wn.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FilterReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    // Reusable output key, re-set on every call to avoid per-record allocation.
    private Text text = new Text();

    /**
     * Appends a line terminator to each incoming line and re-emits it.
     *
     * Fixes two bugs in the original:
     * 1. It read {@code text.toString()} (the empty reusable field) instead
     *    of {@code key.toString()}, so every output line was just "\r\n".
     * 2. It wrote once per key, silently collapsing duplicate input lines;
     *    iterating {@code values} preserves one output line per occurrence.
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Take the actual line content from the key.
        String line = key.toString() + "\r\n";
        text.set(line);
        // Emit once per occurrence so duplicate input lines are kept.
        for (NullWritable ignored : values) {
            context.write(text, NullWritable.get());
        }
    }
}
3.3 Write the FilterRecordWriter
package com.wn.outputformat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
public class FilterRecordWriter extends RecordWriter<Text, NullWritable> {
FSDataOutputStream atguiguOut = null;
FSDataOutputStream otherOut = null;
public FilterRecordWriter (TaskAttemptContext Job) {
// Get file system
the FileSystem FS;
the try {
FS . = the FileSystem GET (job.getConfiguration ());
// create the output file path
Path atguiguPath = new new Path ( " E: Beida Jade Bird \\ \\ \\ big data hadoop \\ baidu.log 04 " );
OtherPath the Path = new new the Path ( " E: \\ APTECH Hadoop \\ \\ \\ large data other.log 04 " );
// create output stream
atguiguOut = fs.create (atguiguPath);
otherOut=fs.create(otherPath);
} catch (IOException e) {
e.printStackTrace ();
}
}
@Override
public void Write (the Text text, NullWritable nullWritable) throws IOException, InterruptedException {
// Analyzing comprising "baidu" output file to a different
IF (text.toString (). the contains ( " baidu " )) {
atguiguOut.write(text.toString().getBytes());
}else{
otherOut.write(text.toString().getBytes());
}
}
@Override
public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
//关闭资源
IOUtils.closeStream(atguiguOut);
IOUtils.closeStream(otherOut);
}
}
3.4 Write the FilterOutputFormat
package com.wn.outputformat;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FilterOutputFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Supplies Hadoop with the custom writer that routes each record to
     * baidu.log or other.log based on its content.
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new FilterRecordWriter(context);
    }
}
3.5 Write the FilterDriver
package com.wn.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FilterDriver {
public static void main (String [] args) throws IOException, a ClassNotFoundException, InterruptedException {
// input and output path needs to be set according to the actual input and output paths on their computer
args = new new String [] { " E: \\ APTECH Hadoop \\ \\ \\ large data INPUT 04 " , " E: \\ APTECH Hadoop \\ \\ \\ big data Output 04 " };
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(FilterDriver.class);
job.setMapperClass(FilterMapper.class);
job.setReducerClass(FilterReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// the output format component own custom settings to the job
job.setOutputFormatClass (FilterOutputFormat. Class );
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}