// Traffic-statistics job implemented with Hadoop's built-in serialization
// classes (no custom Writable implementation).

package hadoop2;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TrafficSysper {

public static void main(String[] args) throws Exception{
Job job = Job.getInstance(new Configuration(), TrafficSysper.class.getSimpleName());
job.setJarByClass(TrafficSysper.class);

//设置读取目录,从参数中获取
FileInputFormat.setInputPaths(job, args[0]);
job.setMapperClass(MyMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);


//如果输出目录存在,则删除输出目录
Path path = new Path(args[1]);
FileSystem fs = FileSystem.get(new URI(args[1]), new Configuration());
if(fs.exists(path)){
fs.delete(path, true);
}
//设置输出目录,从参数中获取
FileOutputFormat.setOutputPath(job, path);


job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

job.waitForCompletion(true);

}

public static class MyMapper  extends Mapper<LongWritable, Text, Text, Text>{

Text k2 = new Text();
Text v2 = new Text();

@Override
protected void map(LongWritable k1, Text v1,
Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String line = v1.toString();
String[] splited = line.split("\t");

k2.set(splited[1]);
v2.set(splited[6] + "\t" + splited[7] + "\t" +splited[8] + "\t" +splited[9]);
context.write(k2, v2);
}
}

public static class MyReducer extends Reducer<Text, Text, Text, Text>{
Text k3 = new Text();
Text v3 = new Text();
@Override
protected void reduce(Text k2, Iterable<Text> v2s,
Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
Long t1 = 0L;
Long t2 = 0L;
Long t3 = 0L;
Long t4 = 0L;
for (Text v2 : v2s) {
String line = v2.toString();
    String[] splited = line.split("\t");
    t1 += Long.parseLong(splited[0]); 
    t2 += Long.parseLong(splited[1]); 
    t3 += Long.parseLong(splited[2]); 
    t4 += Long.parseLong(splited[3]); 
}
k3.set(k2);
v3.set(t1 + "\t" + t2 + "\t" + t3 + "\t" + t4);
context.write(k3, v3);
}
}

}

// Originally published at: liubiaoxin.iteye.com/blog/2240615