Hadoop案例二(用户流量统计)

用户流量统计

一、问题描述

现有一hdfs文件/input/phoneCount/phone_date.txt,文件中包含多个用户的网站访问情况。为了计算用户的流量费用,现需要统计出文件中每个用户的总上行流量、下行流量、总流量。结果保存到/output/phoneCount目录下。
phone_date.txt:

  13726230503    00-FD-07-A4-72-B8:CMCC    120.196.100.82    i02.c.aliimg.com        24    27    2481    24681    200
  13826544101    5C-0E-8B-C7-F1-E0:CMCC    120.197.40.4            4    0    264    0    200
  13926435656    20-10-7A-28-CC-0A:CMCC    120.196.100.99            2    4    132    1512    200
  13926251106    5C-0E-8B-8B-B1-50:CMCC    120.197.40.4            4    0    240    0    200
  18211575961    94-71-AC-CD-E6-18:CMCC-EASY    120.196.100.99    iface.qiyi.com    视频网站    15    12   1527  2106 200
  84138413    5C-0E-8B-8C-E8-20:7DaysInn    120.197.40.4    122.72.52.12        20    16    4116    1432    200
  13560439658    C4-17-FE-BA-DE-D9:CMCC    120.196.100.99            18    15    1116    954    200
  15920133257    5C-0E-8B-C7-BA-20:CMCC    120.197.40.4    sug.so.360.cn    信息安全    20    20    3156    2936    200
  13719199419    68-A1-B7-03-07-B1:CMCC-EASY    120.196.100.82            4    0    240    0    200
  13660577991    5C-0E-8B-92-5C-20:CMCC-EASY    120.197.40.4    s19.cnzz.com    站点统计    24    9    6960    690    200
  15013685858    5C-0E-8B-C7-F7-90:CMCC    120.197.40.4    rank.ie.sogou.com    搜索引擎    28    27    3659    3538  200
  15989002119    E8-99-C4-4E-93-E0:CMCC-EASY    120.196.100.99    www.umeng.com    站点统计    3    3    1938    180  200
    13560439658    C4-17-FE-BA-DE-D9:CMCC    120.196.100.99            15    9    918    4938    200
    13480253104    5C-0E-8B-C7-FC-80:CMCC-EASY    120.197.40.4            3    3    180    180    200
    13602846565    5C-0E-8B-8B-B6-00:CMCC    120.197.40.4    2052.flash2-http.qq.com    综合门户    15    12    1938    2910    200
    13922314466    00-FD-07-A2-EC-BA:CMCC    120.196.100.82    img.qfc.cn        12    12    3008    3720    200
13502468823    5C-0A-5B-6A-0B-D4:CMCC-EASY    120.196.100.99   y0.ifengimg.com  综合门户    57   102   7335 110349    200
18320173382    84-25-DB-4F-10-1A:CMCC-EASY  120.196.100.99 input.shouji.sogou.com  搜索引擎  21  18   9531    2412    200
13925057413 00-1F-64-E1-E6-9A:CMCC    120.196.100.55    t3.baidu.com    搜索引擎    69    63    11058    48243    200
    13760778710    00-FD-07-A4-7B-08:CMCC    120.196.100.82            2    2    120    120    200
    13560436666    00-FD-07-A4-72-B8:CMCC    120.196.100.82    i02.c.aliimg.com        24    27    2481    24681    200
    13560436666    C4-17-FE-BA-DE-D9:CMCC    120.196.100.99            18    15    1116    954    200

二、问题分析

  • 有的行是9列,有的是10列,长度不统一。但是我们用到的列相对位置是固定的。电话号码在第一列,上行流量在倒数第三列,下行流量在倒数第二列。
  • 列之间的空格分隔符个数不统一。

三、代码

  1. 自定义Writable类,PhoneFlow
package com.example.demo.FlowBean;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Hadoop {@link Writable} value object holding the traffic figures recorded
 * for one phone number: upstream, downstream and their total.
 *
 * <p>{@link #write(DataOutput)} and {@link #readFields(DataInput)} must handle
 * the fields in the same order, otherwise deserialized values are scrambled.
 */
public class PhoneFlow implements Writable {
    private String number;
    private long upStream;
    private long downStream;
    private long sumStream;

    /** No-argument constructor; needed when the object is deserialized. */
    public PhoneFlow(){}

    /**
     * Creates a fully populated record.
     *
     * @param number     phone number the figures belong to
     * @param upStream   upstream traffic
     * @param downStream downstream traffic
     * @param sumStream  total traffic; stored as given, not recomputed here
     */
    public PhoneFlow(String number, long upStream, long downStream,long sumStream) {
        this.number = number;
        this.upStream = upStream;
        this.downStream = downStream;
        this.sumStream=sumStream;
    }

    /**
     * Serializes this object to the data stream.
     *
     * @param dataOutput stream to write the fields to
     * @throws IOException if writing to the stream fails
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // writeUTF cannot encode null, so an instance whose number was never
        // set is written with an empty number instead of failing with an NPE.
        dataOutput.writeUTF(number == null ? "" : number);
        dataOutput.writeLong(upStream);
        dataOutput.writeLong(downStream);
        dataOutput.writeLong(sumStream);
    }

    public String getNumber() {
        return number;
    }

    public void setNumber(String number) {
        this.number = number;
    }

    public long getUpStream() {
        return upStream;
    }

    public void setUpStream(long upStream) {
        this.upStream = upStream;
    }

    public long getDownStream() {
        return downStream;
    }

    public void setDownStream(long downStream) {
        this.downStream = downStream;
    }

    public long getSumStream() {
        return sumStream;
    }

    public void setSumStream(long sumStream) {
        this.sumStream = sumStream;
    }

    /**
     * Restores this object's fields from the data stream.
     * The read order must match the field order used by {@link #write(DataOutput)}.
     *
     * @param dataInput stream to read the fields from
     * @throws IOException if reading from the stream fails
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.number=dataInput.readUTF();
        this.upStream=dataInput.readLong();
        this.downStream=dataInput.readLong();
        this.sumStream=dataInput.readLong();
    }

    /**
     * Renders the three traffic figures separated by spaces.
     * The phone number itself is not part of the returned text.
     */
    @Override
    public String toString() {
        return " "+upStream+"  "+downStream+"  "+sumStream+"  ";
    }
}
  2. mapper类
package com.example.demo.map;

import com.example.demo.FlowBean.PhoneFlow;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that turns each input line into a (phone number, {@link PhoneFlow}) pair.
 *
 * <p>Columns are separated by one or more whitespace characters. The phone
 * number is taken from the first column, the upstream traffic from the third
 * column counted from the end, and the downstream traffic from the second
 * column counted from the end, so rows with a different number of middle
 * columns are handled uniformly.
 */
public class PhoneMapper extends Mapper<LongWritable, Text, Text, PhoneFlow> {
    /** Fewest columns a usable row can have: number, upstream, downstream and one trailing column. */
    private static final int MIN_COLUMNS = 4;

    /**
     * Parses one line and emits the phone number with its traffic figures.
     * Blank lines are ignored; rows with too few columns or non-numeric
     * traffic values are skipped and counted in the
     * {@code PhoneMapper/MALFORMED_LINES} counter.
     *
     * @param key     input key supplied by the framework (not used)
     * @param value   one line of the input file
     * @param context used to emit the output pair and to update the counter
     * @throws IOException          if emitting the pair fails
     * @throws InterruptedException if the task is interrupted while emitting
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Trim first: split() produces an empty leading token when the line
        // starts with whitespace, which would shift the phone number to index 1
        // on some lines and leave it at index 0 on others.
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;
        }
        String[] columns = line.split("\\s+"); // split on one or more whitespace characters
        if (columns.length < MIN_COLUMNS) {
            context.getCounter("PhoneMapper", "MALFORMED_LINES").increment(1);
            return;
        }

        String phoneNumber = columns[0];
        long upStream;
        long downStream;
        try {
            upStream = Long.parseLong(columns[columns.length - 3]);
            downStream = Long.parseLong(columns[columns.length - 2]);
        } catch (NumberFormatException e) {
            // A row whose traffic columns are not numbers is skipped and counted.
            context.getCounter("PhoneMapper", "MALFORMED_LINES").increment(1);
            return;
        }

        PhoneFlow phone = new PhoneFlow(phoneNumber, upStream, downStream, upStream + downStream);
        context.write(new Text(phoneNumber), phone);
    }
}
  3. reducer
package com.example.demo.reduce;

import com.example.demo.FlowBean.PhoneFlow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer that adds up all traffic records sharing one phone number and
 * emits a single {@link PhoneFlow} holding the totals.
 */
public class PhoneReducer extends Reducer<Text, PhoneFlow,Text,PhoneFlow> {
    /**
     * Sums the upstream and downstream values of every record for the key and
     * writes one result whose total is the sum of both.
     *
     * @param key     phone number shared by all records in {@code values}
     * @param values  traffic records emitted for that phone number
     * @param context used to emit the aggregated record
     * @throws IOException          if emitting the result fails
     * @throws InterruptedException if the task is interrupted while emitting
     */
    @Override
    protected void reduce(Text key, Iterable<PhoneFlow> values, Context context) throws IOException, InterruptedException {
        long totalUp = 0L;
        long totalDown = 0L;
        for (PhoneFlow record : values) {
            totalUp = totalUp + record.getUpStream();
            totalDown = totalDown + record.getDownStream();
        }

        PhoneFlow aggregated = new PhoneFlow(key.toString(), totalUp, totalDown, totalUp + totalDown);
        context.write(key, aggregated);
    }
}
  4. driver类
package com.example.demo.driver;

import com.example.demo.FlowBean.PhoneFlow;
import com.example.demo.map.PhoneMapper;
import com.example.demo.map.WordCountMap;
import com.example.demo.reduce.PhoneReducer;
import com.example.demo.reduce.WordCountReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;

/**
 * Spring component that configures and submits the phone traffic MapReduce job.
 */
@Component
public class PhoneCountDriver {
    @Autowired
    private Configuration configuration;

    /**
     * Builds the job, runs it, reports the outcome on standard output and then
     * terminates the JVM with exit code 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output directory
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws IOException              if the job cannot be created or submitted
     * @throws ClassNotFoundException   if a job class cannot be loaded
     * @throws InterruptedException     if waiting for the job is interrupted
     */
    public void phoneCountDriver(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("expected arguments: <input path> <output directory>");
        }

        // The job name previously read "wordcount01", a leftover from another example.
        Job job = Job.getInstance(configuration, "phoneCount");
        // Set the driver class for the job
        job.setJarByClass(PhoneCountDriver.class);
        // Set the Mapper and Reducer
        job.setMapperClass(PhoneMapper.class);
        job.setReducerClass(PhoneReducer.class);

        // Set the Mapper output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PhoneFlow.class);
        // Set the Reducer output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PhoneFlow.class);
        // Set the input file
        FileInputFormat.setInputPaths(job,args[0]);
        // Set the output directory for the result
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        // Report success only when the job actually succeeded.
        if (result) {
            System.out.println("***** ok!!!");
        } else {
            System.out.println("-------------------失败!!!");
        }
        System.exit(result?0:1);
    }
}

四、运行结果

13480253104      180  180  360
13502468823      7335  110349  117684
13560436666      3597  25635  29232
13560439658      2034  5892  7926
13602846565      1938  2910  4848
13660577991      6960  690  7650
13719199419      240  0  240
13726230503      2481  24681  27162
13760778710      120  120  240
13826544101      264  0  264
13922314466      3008  3720  6728
13925057413      11058  48243  59301
13926251106      240  0  240
13926435656      132  1512  1644
15013685858      3659  3538  7197
15920133257      3156  2936  6092
15989002119      1938  180  2118
18211575961      1527  2106  3633
18320173382      9531  2412  11943
84138413         4116  1432  5548

猜你喜欢

转载自blog.csdn.net/qq_29012499/article/details/108460624