Section 3, MapReduce Advanced: 7. Using a custom OutputFormat to write output to different directories

2.1 Requirement

We have a set of order review data. The requirement: separate the good reviews from the bad ones and write the final data to different directories (the raw data files are in the course materials folder). The review status field of each record (index 9 of the tab-separated line, as the code below reads it) indicates the review type: 0 = good, 1 = neutral, 2 = bad.

The data looks like the following (fields are tab-separated):

1 2018-03-15 22:29:06 2018-03-15 22:29:06 我想再来一个 \N 1 3 hello 来就来吧 0 2018-03-14 22:29:03
2 2018-03-15 22:42:08 2018-03-15 22:42:08 好的 \N 1 1 添加一个吧 说走咱就走啊 0 2018-03-14 22:42:04
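
Note that the split("\t") in MyRecordWriter below reads the status from index 9, i.e. the tenth tab-separated field (the spaces inside the timestamps are part of the field values, not separators). A minimal sketch, not part of the original code, that verifies the field layout against the first sample row:

public class FieldLayoutCheck {
    public static void main(String[] args) {
        String line = "1\t2018-03-15 22:29:06\t2018-03-15 22:29:06\t我想再来一个"
                + "\t\\N\t1\t3\thello\t来就来吧\t0\t2018-03-14 22:29:03";
        String[] fields = line.split("\t");
        System.out.println(fields.length); // 11
        System.out.println(fields[9]);     // "0" -> good review
    }
}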

2.2 Analysis

The crux of the program is that a single MapReduce job must route two kinds of results to different directories depending on the data. Flexible output requirements like this can be met with a custom OutputFormat.

2.3 Implementation

Key points:

1. Accessing external resources (here, the file system) from inside MapReduce.

2. Defining a custom OutputFormat, supplying its RecordWriter, and overriding write(), the method that actually emits each record.

Code:

MyOutputFormatMain:

package cn.itcast.demo4.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyOutputFormatMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), MyOutputFormatMain.class.getSimpleName());
        // job.setJarByClass(MyOutputFormatMain.class); // uncomment when running from a jar on a cluster
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\Study\\BigData\\heima\\stage2\\5、大数据离线第五天\\自定义outputformat\\input"));

        job.setMapperClass(MyOutputFormatMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Plug in the custom OutputFormat. An output path is still required,
        // because MyOutputFormat extends FileOutputFormat.
        job.setOutputFormatClass(MyOutputFormat.class);
        MyOutputFormat.setOutputPath(job, new Path("file:///D:\\Study\\BigData\\heima\\stage2\\5、大数据离线第五天\\自定义outputformat\\output"));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new Configuration(), new MyOutputFormatMain(), args);
        System.exit(run);
    }
}
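
One optional tweak, not in the original code: the mapper below only passes lines through and no reducer class is set, so the job can be made map-only, skipping the shuffle and sort entirely. Adding this line in run() before waitForCompletion() sends map output straight to MyOutputFormat:

// Assumption: the global sort imposed by the default single reducer is not needed here.
job.setNumReduceTasks(0);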


MyOutputFormatMapper:

package cn.itcast.demo4.outputformat;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyOutputFormatMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Pass each line through unchanged; the routing decision is made
        // later, in the custom RecordWriter.
        context.write(value, NullWritable.get());
    }
}

MyOutputFormat:

package cn.itcast.demo4.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyOutputFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        FileSystem fileSystem = FileSystem.get(conf);
        // Open one output file per category: good/neutral reviews go to one
        // folder, bad reviews to another.
        FSDataOutputStream goodComment = fileSystem.create(new Path("file:///D:\\Study\\BigData\\heima\\stage2\\5、大数据离线第五天\\自定义outputformat\\goodComment\\goodComment.txt"));

        FSDataOutputStream badComment = fileSystem.create(new Path("file:///D:\\Study\\BigData\\heima\\stage2\\5、大数据离线第五天\\自定义outputformat\\badComment\\badComment.txt"));

        return new MyRecordWriter(goodComment, badComment);
    }
}
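
One caveat: the two paths above are hardcoded, so if the job ran with several tasks, or a failed task attempt were retried, every attempt would open and overwrite the same two files. A common pattern, sketched here under the assumption of good/ and bad/ subfolder names (this is not the original code), is to derive per-attempt file names under the job's output directory:

@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    FileSystem fileSystem = FileSystem.get(context.getConfiguration());
    // Resolve the directory passed to setOutputPath() in the driver.
    Path outDir = FileOutputFormat.getOutputPath(context);
    // Tag file names with the task attempt id so concurrent attempts don't collide.
    String attempt = context.getTaskAttemptID().toString();
    FSDataOutputStream good = fileSystem.create(new Path(outDir, "good/" + attempt));
    FSDataOutputStream bad = fileSystem.create(new Path(outDir, "bad/" + attempt));
    return new MyRecordWriter(good, bad);
}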

MyRecordWriter:

package cn.itcast.demo4.outputformat;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

public class MyRecordWriter extends RecordWriter<Text, NullWritable> {
    // Two output streams, one per destination folder.
    private FSDataOutputStream goodStream;
    private FSDataOutputStream badStream;

    public MyRecordWriter(FSDataOutputStream goodStream, FSDataOutputStream badStream) {
        this.goodStream = goodStream;
        this.badStream = badStream;
    }

    /**
     * Write one record to the appropriate stream.
     * @param text one line of review data
     * @param nullWritable unused placeholder value
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
        // Example line:
        // 1 2018-03-15 22:29:06 2018-03-15 22:29:06 我想再来一个 \N 1 3 hello 来就来吧 0 2018-03-14 22:29:03
        String[] split = text.toString().split("\t");
        int commentStatus = Integer.parseInt(split[9]);

        if (commentStatus <= 1) {
            // good (0) and neutral (1) reviews
            goodStream.write(text.toString().getBytes());
            goodStream.write("\r\n".getBytes()); // line break
        } else {
            // bad (2) reviews
            badStream.write(text.toString().getBytes());
            badStream.write("\r\n".getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        IOUtils.closeQuietly(goodStream);
        IOUtils.closeQuietly(badStream);
    }
}
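
To see the routing logic in isolation, here is a minimal local smoke test for MyRecordWriter, an illustration added for this writeup rather than part of the original post; the /tmp paths and the abbreviated field values are placeholders:

package cn.itcast.demo4.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class MyRecordWriterSmokeTest {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        MyRecordWriter writer = new MyRecordWriter(
                fs.create(new Path("/tmp/goodComment.txt")),
                fs.create(new Path("/tmp/badComment.txt")));
        // Status field (index 9) is "0", a good review -> goodComment.txt
        writer.write(new Text("1\ta\tb\tc\t\\N\t1\t3\td\te\t0\tf"), NullWritable.get());
        // Status field is "2", a bad review -> badComment.txt
        writer.write(new Text("2\ta\tb\tc\t\\N\t1\t1\td\te\t2\tf"), NullWritable.get());
        writer.close(null); // close(...) only closes the two streams
    }
}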


Reposted from www.cnblogs.com/mediocreWorld/p/11041026.html