The RecordWriter's main job: reading and filtering the file contents.
The input log (under D:\DEV_CODE\eclipse_code\hadoopTMP\inputOutputFormat) is filtered on whether each line contains cevent: lines that contain cevent are written to cevent.log, and everything else to other.log (see the paths created in the RecordWriter below).
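For illustration only, with a hypothetical input log such as the following (these sample lines are not from the original):

http://www.cevent.com
http://hadoop.apache.org
http://blog.cevent.cn

the first and third lines would end up in cevent.log and the second in other.log.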
1. RecordWriter
package com.cevent.hadoop.mapreduce.outputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
/**
 * RecordWriter providing the constructor that the custom OutputFormat invokes
 * @author cevent
 * @date 2020-04-14
 */
public class FilterOuputFormatRecordWriter extends RecordWriter<Text, NullWritable> {

    private FSDataOutputStream ceventOutputStream = null;
    private FSDataOutputStream otherOutputStream = null;

    // Called from the custom OutputFormat's getRecordWriter()
    public FilterOuputFormatRecordWriter(TaskAttemptContext job) {
        // 1. Get the job configuration
        Configuration configuration = job.getConfiguration();
        try {
            // 2. Get the file system
            FileSystem fileSystem = FileSystem.get(configuration);
            // 3. Create an output stream for each target file
            ceventOutputStream = fileSystem.create(new Path("D:/DEV_CODE/eclipse_code/hadoopTMP/outputFormat/cevent.log"));
            otherOutputStream = fileSystem.create(new Path("D:/DEV_CODE/eclipse_code/hadoopTMP/outputFormat/other.log"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // Filter: key = one line of input data
    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        // Route the line on whether it contains cevent
        if (key.toString().contains("cevent")) {
            // The Text key must be converted to a String, then to bytes
            ceventOutputStream.write(key.toString().getBytes());
        } else {
            otherOutputStream.write(key.toString().getBytes());
        }
    }
    // End of filtering: close both streams
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        if (ceventOutputStream != null) {
            ceventOutputStream.close();
        }
        if (otherOutputStream != null) {
            otherOutputStream.close();
        }
    }
}
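One detail in write() worth flagging: getBytes() uses the platform default charset, and no line separator is added at this point (the Reducer below appends \r\n instead). A minimal sketch of a variant that makes both choices explicit; StandardCharsets would be an extra import, not part of the original code:

// Sketch (assumption, not the original): write UTF-8 bytes plus an explicit
// separator in the RecordWriter itself, so the Reducer would not need \r\n
ceventOutputStream.write((key.toString() + "\n").getBytes(StandardCharsets.UTF_8));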
2. OutputFormat
package com.cevent.hadoop.mapreduce.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * The output key is one filtered line of Text; no value is needed, hence NullWritable
 * @author cevent
 * @date 2020-04-14
 */
public class FilterOutputFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        // Hand the framework the custom RecordWriter
        return new FilterOuputFormatRecordWriter(job);
    }
}
3. Mapper
package com.cevent.hadoop.mapreduce.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper: passes every input line through unchanged as the key
 * @author cevent
 * @date 2020-04-14
 */
public class FilterOuputFormatMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reusable output key
    private Text kText = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. Read one line
        String line = value.toString();
        // 2. Store it in the output key
        kText.set(line);
        // 3. Write it out
        context.write(kText, NullWritable.get());
    }
}
4. Reducer
package com.cevent.hadoop.mapreduce.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * The reducer's input = the mapper's output.
 * Appends the line break to each output Text.
 * @author cevent
 * @date 2020-04-14
 */
public class FilterOutputFormatReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    private Text keyText = new Text();

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Append a carriage return \r and line feed \n to the key
        keyText.set(key.toString() + "\r\n");
        context.write(keyText, NullWritable.get());
    }
}
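Note that reduce() runs once per distinct key, so duplicate input lines collapse into a single output line here. A sketch of the variant that preserves duplicates by iterating over the grouped values:

keyText.set(key.toString() + "\r\n");
// One NullWritable per original occurrence of the line, so duplicates survive
for (NullWritable value : values) {
    context.write(keyText, NullWritable.get());
}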
5. Driver
package com.cevent.hadoop.mapreduce.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FilterOutputFormatDriver {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(FilterOutputFormatDriver.class);
        job.setMapperClass(FilterOuputFormatMapper.class);
        job.setReducerClass(FilterOutputFormatReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Set the custom output format on the job
        job.setOutputFormatClass(FilterOutputFormat.class);

        // Input path from the first program argument
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Although the OutputFormat is custom, it extends FileOutputFormat, so an
        // output path is still required (it is where the _SUCCESS file is written;
        // without it the framework has no output directory)
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
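The driver takes the input and output paths from the program arguments. For a quick local test in Eclipse, a common alternative is to hard-code them at the top of main(); the paths below are assumptions for illustration, and args[1] must point to a directory that does not exist yet:

// Hypothetical local-test paths (assumed, not from the original)
args = new String[] {
        "D:/DEV_CODE/eclipse_code/hadoopTMP/inputOutputFormat",
        "D:/DEV_CODE/eclipse_code/hadoopTMP/outputFormatLog"
};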