用户流量统计
一、问题描述
现有一hdfs文件/input/phoneCount/phone_date.txt,文件中包含多个用户的网站访问情况。为了计算用户的流量费用,现需要统计出文件中每个用户的总上行流量、下行流量、总流量。结果保存到/output/phoneCount目录下。
phone_date.txt:
13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
13560436666 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
二、问题分析
- 各行的列数不统一(示例数据中有的行是8列,有的是9列或10列)。但是我们用到的列相对位置是固定的:电话号码在第一列,上行流量在倒数第三列,下行流量在倒数第二列。
- 列之间的空格分隔符个数不统一,因此需要使用正则表达式 \s+(一个或多个空白字符)来切分各列。
三、代码
- 自定义Writable类,PhoneFlow
package com.example.demo.FlowBean;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Hadoop {@link Writable} value object carrying the traffic counters of one
 * phone number: upstream, downstream and their sum.
 *
 * <p>Fields are serialized in the order number, upStream, downStream,
 * sumStream; {@link #readFields(DataInput)} reads them back in that same order.
 */
public class PhoneFlow implements Writable {
    private String number;
    private long upStream;
    private long downStream;
    private long sumStream;

    /** No-argument constructor; it is required when the object is deserialized. */
    public PhoneFlow() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param number     phone number the counters belong to
     * @param upStream   upstream traffic
     * @param downStream downstream traffic
     * @param sumStream  total traffic
     */
    public PhoneFlow(String number, long upStream, long downStream, long sumStream) {
        this.number = number;
        this.upStream = upStream;
        this.downStream = downStream;
        this.sumStream = sumStream;
    }

    // ---- serialization -------------------------------------------------

    /**
     * Serializes this object into the data stream.
     *
     * @param dataOutput stream to write the fields to
     * @throws IOException if writing to the stream fails
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.number);
        dataOutput.writeLong(this.upStream);
        dataOutput.writeLong(this.downStream);
        dataOutput.writeLong(this.sumStream);
    }

    /**
     * Restores this object's fields from the data stream. The read order must
     * match the field order used in {@link #write(DataOutput)}.
     *
     * @param dataInput stream to read the fields from
     * @throws IOException if reading from the stream fails
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        number = dataInput.readUTF();
        upStream = dataInput.readLong();
        downStream = dataInput.readLong();
        sumStream = dataInput.readLong();
    }

    // ---- accessors -----------------------------------------------------

    public String getNumber() {
        return this.number;
    }

    public void setNumber(String number) {
        this.number = number;
    }

    public long getUpStream() {
        return this.upStream;
    }

    public void setUpStream(long upStream) {
        this.upStream = upStream;
    }

    public long getDownStream() {
        return this.downStream;
    }

    public void setDownStream(long downStream) {
        this.downStream = downStream;
    }

    public long getSumStream() {
        return this.sumStream;
    }

    public void setSumStream(long sumStream) {
        this.sumStream = sumStream;
    }

    /**
     * Renders the three counters separated by single spaces, with one leading
     * and one trailing space; the phone number is not included.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(" ").append(upStream);
        text.append(" ").append(downStream);
        text.append(" ").append(sumStream);
        text.append(" ");
        return text.toString();
    }
}
- mapper类
package com.example.demo.map;
import com.example.demo.FlowBean.PhoneFlow;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that turns one whitespace-separated log line into a
 * (phone number, {@link PhoneFlow}) pair.
 *
 * <p>Rows do not all have the same number of columns, so the traffic columns
 * are addressed from the end of the row: upstream is the third-from-last
 * column and downstream is the second-from-last. The phone number is the
 * first column.
 */
public class PhoneMapper extends Mapper<LongWritable, Text, Text, PhoneFlow> {

    /** Fewest columns a usable row can have: the phone number plus the three trailing columns. */
    private static final int MIN_COLUMNS = 4;

    /**
     * Parses a single input line and emits its traffic record keyed by phone number.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of the input file
     * @param context sink for the emitted (phone number, flow) pair
     * @throws IOException           if the framework fails to write the output pair
     * @throws InterruptedException  if the task is interrupted while writing
     * @throws NumberFormatException if a traffic column is not a valid long
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Trim first: leading whitespace would otherwise make split() return an
        // empty first element and shift every column by one.
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // blank line, nothing to emit
        }

        String[] columns = line.split("\\s+"); // columns are separated by one or more whitespace characters
        if (columns.length < MIN_COLUMNS) {
            return; // too short to contain the phone number and both traffic columns
        }

        String number = columns[0];
        long upStream = Long.parseLong(columns[columns.length - 3]);
        long downStream = Long.parseLong(columns[columns.length - 2]);

        context.write(new Text(number), new PhoneFlow(number, upStream, downStream, upStream + downStream));
    }
}
- reducer
package com.example.demo.reduce;
import com.example.demo.FlowBean.PhoneFlow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that adds up all {@link PhoneFlow} records sharing the same key
 * and emits a single record with the summed upstream, downstream and total
 * traffic for that key.
 */
public class PhoneReducer extends Reducer<Text, PhoneFlow, Text, PhoneFlow> {

    /**
     * Sums the upstream and downstream counters of every value for {@code key}
     * and writes one aggregated record.
     *
     * @param key     grouping key; its string form is stored as the record's number
     * @param values  all flow records emitted for this key
     * @param context sink for the aggregated (key, flow) pair
     * @throws IOException          if the framework fails to write the output pair
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<PhoneFlow> values, Context context) throws IOException, InterruptedException {
        long totalUp = 0L;
        long totalDown = 0L;
        for (PhoneFlow record : values) {
            totalUp = totalUp + record.getUpStream();
            totalDown = totalDown + record.getDownStream();
        }

        PhoneFlow summary = new PhoneFlow(key.toString(), totalUp, totalDown, totalUp + totalDown);
        context.write(key, summary);
    }
}
- driver类
package com.example.demo.driver;
import com.example.demo.FlowBean.PhoneFlow;
import com.example.demo.map.PhoneMapper;
import com.example.demo.map.WordCountMap;
import com.example.demo.reduce.PhoneReducer;
import com.example.demo.reduce.WordCountReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.IOException;
@Component
/**
 * Spring component that configures and submits the phone traffic MapReduce
 * job ({@link PhoneMapper} + {@link PhoneReducer}).
 */
@Component
public class PhoneCountDriver {
    @Autowired
    private Configuration configuration;

    /**
     * Builds the job, runs it to completion and terminates the JVM with exit
     * code 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output directory
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws IOException              if the job cannot be created or submitted
     * @throws ClassNotFoundException   if a job class cannot be resolved
     * @throws InterruptedException     if waiting for the job is interrupted
     */
    public void phoneCountDriver(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("expected arguments: <input path> <output directory>");
        }

        Job job = Job.getInstance(configuration, "phoneCount");
        // Class used to locate the jar that contains the job
        job.setJarByClass(PhoneCountDriver.class);
        // Mapper and reducer
        job.setMapperClass(PhoneMapper.class);
        job.setReducerClass(PhoneReducer.class);
        // Mapper output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PhoneFlow.class);
        // Reducer (final) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PhoneFlow.class);
        // Input file
        FileInputFormat.setInputPaths(job, args[0]);
        // Output directory for the results
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        // Report success only when the job actually succeeded.
        if (result) {
            System.out.println("***** ok!!!");
        } else {
            System.out.println("-------------------失败!!!");
        }
        System.exit(result ? 0 : 1);
    }
}
四、运行结果
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 3597 25635 29232
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548