hadoop排序流量统计代码详解

网上已经有了很多成熟的教程,但是对于不懂MapReduce相关概念的新手理解起来均有些困难。
博主读了一天代码,终于将代码理解了,特此给大家分享


一、带详细注释的代码

//第一部分是导入各种包,没什么好讲的
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.WritableComparable;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * MapReduce job that lists phone traffic records ordered by total flow, largest first.
 *
 * <p>The class bundles everything the job needs: the composite key type
 * ({@link FlowWritable}), the mapper, the reducer, the {@link Tool} implementation that wires
 * them into a {@link Job} ({@link FlowSortRunner}), and the {@code main} entry point.
 */
public class PhoneFlowDriver {

    /**
     * Map-output key carrying one traffic record: phone number, upstream flow, downstream flow
     * and their sum.
     *
     * <p>The natural ordering is: total flow descending, then phone number, upstream flow and
     * downstream flow ascending. The tie-breakers make the ordering a proper total order, so
     * {@code compareTo} returns 0 only for records whose fields are all equal; {@code equals}
     * and {@code hashCode} are consistent with it.
     */
    static class FlowWritable implements WritableComparable<FlowWritable> {

        private String phonenum;  // phone number
        private long upflow;      // upstream flow
        private long downflow;    // downstream flow
        private long sumflow;     // upflow + downflow, kept in sync by the constructor and setters

        /**
         * Creates an empty record; needed so an instance can be created first and then filled
         * in through {@link #readFields(DataInput)}.
         */
        public FlowWritable() {}

        /**
         * Creates a record and derives the total flow from the two directions.
         *
         * @param phonenum phone number
         * @param upflow   upstream flow
         * @param downflow downstream flow
         */
        public FlowWritable(String phonenum, long upflow, long downflow) {
            this.phonenum = phonenum;
            this.upflow = upflow;
            this.downflow = downflow;
            this.sumflow = upflow + downflow;
        }

        /**
         * Serializes the record as: phone number (modified UTF-8), upstream, downstream, total.
         *
         * @throws IOException if the underlying stream fails
         */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(this.phonenum);
            dataOutput.writeLong(this.upflow);
            dataOutput.writeLong(this.downflow);
            dataOutput.writeLong(this.sumflow);
        }

        /**
         * Restores the record from the layout produced by {@link #write(DataOutput)}.
         *
         * @throws IOException if the underlying stream fails
         */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.phonenum = dataInput.readUTF();
            this.upflow = dataInput.readLong();
            this.downflow = dataInput.readLong();
            this.sumflow = dataInput.readLong();
        }

        /** Returns the tab-separated form: phone, upstream, downstream, total. */
        @Override
        public String toString() {
            return this.phonenum + "\t" + this.upflow + "\t" + this.downflow + "\t" + this.sumflow;
        }

        public String getPhone() {
            return phonenum;
        }

        public void setPhone(String phone) {
            this.phonenum = phone;
        }

        public long getUp() {
            return upflow;
        }

        /** Sets the upstream flow and refreshes the total so it never goes stale. */
        public void setUp(long up) {
            this.upflow = up;
            this.sumflow = this.upflow + this.downflow;
        }

        public long getDown() {
            return downflow;
        }

        /** Sets the downstream flow and refreshes the total so it never goes stale. */
        public void setDown(long down) {
            this.downflow = down;
            this.sumflow = this.upflow + this.downflow;
        }

        public long getSum() {
            return sumflow;
        }

        /**
         * Orders records by total flow, largest first; ties are broken by phone number,
         * upstream flow and downstream flow (ascending).
         *
         * <p>Returning 0 for fully equal records (and only for those) keeps the
         * {@link Comparable} contract intact: {@code a.compareTo(b)} and {@code b.compareTo(a)}
         * always have opposite signs.
         */
        @Override
        public int compareTo(FlowWritable o) {
            // Arguments swapped on purpose: larger totals sort first.
            int bySum = Long.compare(o.sumflow, this.sumflow);
            if (bySum != 0) {
                return bySum;
            }
            int byPhone = comparePhones(this.phonenum, o.phonenum);
            if (byPhone != 0) {
                return byPhone;
            }
            int byUp = Long.compare(this.upflow, o.upflow);
            if (byUp != 0) {
                return byUp;
            }
            return Long.compare(this.downflow, o.downflow);
        }

        /** Null-safe string comparison; a missing phone number sorts before any present one. */
        private static int comparePhones(String a, String b) {
            if (a == null) {
                return b == null ? 0 : -1;
            }
            if (b == null) {
                return 1;
            }
            return a.compareTo(b);
        }

        /** Two records are equal when all four fields match (consistent with compareTo). */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof FlowWritable)) {
                return false;
            }
            FlowWritable other = (FlowWritable) obj;
            return this.sumflow == other.sumflow
                    && this.upflow == other.upflow
                    && this.downflow == other.downflow
                    && (this.phonenum == null
                            ? other.phonenum == null
                            : this.phonenum.equals(other.phonenum));
        }

        /** Field-based hash so equal records hash alike regardless of object identity. */
        @Override
        public int hashCode() {
            int result = phonenum == null ? 0 : phonenum.hashCode();
            result = 31 * result + (int) (upflow ^ (upflow >>> 32));
            result = 31 * result + (int) (downflow ^ (downflow >>> 32));
            result = 31 * result + (int) (sumflow ^ (sumflow >>> 32));
            return result;
        }
    }

    /**
     * Turns each tab-separated input line into a {@link FlowWritable} key with an empty value.
     *
     * <p>All record data travels in the key, so the value is the {@link NullWritable}
     * placeholder; the ordering of the output then follows from the key's {@code compareTo}.
     */
    public static class PhoneFlowMapper extends Mapper<LongWritable, Text, FlowWritable, NullWritable> {

        // NOTE(review): column positions are kept exactly as in the original code;
        // verify them against the actual input layout.
        private static final int PHONE_FIELD = 1;
        private static final int UP_FIELD = 7;
        private static final int DOWN_FIELD = 8;

        private static final String COUNTER_GROUP = "PhoneFlow";
        private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

        /**
         * Parses one input line and emits it as a key.
         *
         * <p>Lines with too few columns (including blank lines) or non-numeric flow columns are
         * skipped and counted in the {@code PhoneFlow/MALFORMED_LINES} counter instead of
         * failing the task.
         *
         * @param key     input key supplied by the input format (not used)
         * @param value   one line of input text
         * @param context used to emit the record and to update the counter
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] tables = value.toString().split("\t");
            if (tables.length <= DOWN_FIELD) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            long upflow;
            long downflow;
            try {
                upflow = Long.parseLong(tables[UP_FIELD]);
                downflow = Long.parseLong(tables[DOWN_FIELD]);
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            context.write(new FlowWritable(tables[PHONE_FIELD], upflow, downflow), NullWritable.get());
        }
    }

    /**
     * Writes every record it receives straight to the output; the framework delivers keys to
     * the reducer already sorted by {@link FlowWritable#compareTo(FlowWritable)}.
     */
    public static class PhoneFlowReducer extends Reducer<FlowWritable, NullWritable, FlowWritable, NullWritable> {

        /**
         * Emits the key once per grouped value, so input records that are identical in every
         * field (and therefore share one reduce call) are all preserved in the output.
         */
        @Override
        protected void reduce(FlowWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            for (NullWritable ignored : values) {
                context.write(key, NullWritable.get());
            }
        }
    }

    /** Configures and runs the sort job; meant to be launched through {@link ToolRunner}. */
    public static class FlowSortRunner extends Configured implements Tool {

        /**
         * Builds the job and waits for it to finish.
         *
         * @param args {@code args[0]} is the input path, {@code args[1]} the output path
         * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
         * @throws Exception if job setup or execution fails
         */
        @Override
        public int run(String[] args) throws Exception {
            if (args == null || args.length < 2) {
                System.err.println("Usage: PhoneFlowDriver <input path> <output path>");
                return 2;
            }

            // Use the configuration injected by ToolRunner so generic command-line options
            // take effect; fall back to a fresh one when run() is invoked without it.
            Configuration conf = getConf();
            if (conf == null) {
                conf = new Configuration();
            }

            Job job = Job.getInstance(conf);

            job.setJarByClass(PhoneFlowDriver.class);
            job.setMapperClass(PhoneFlowMapper.class);
            job.setReducerClass(PhoneFlowReducer.class);

            // Key/value types emitted by the mapper.
            job.setMapOutputKeyClass(FlowWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            // Key/value types of the final output.
            job.setOutputKeyClass(FlowWritable.class);
            job.setOutputValueClass(NullWritable.class);

            FileInputFormat.setInputPaths(job, new Path(args[0]));

            // An existing output directory is deleted (recursively) before the job starts.
            // Resolve the filesystem from the path itself so a non-default scheme works too.
            Path output = new Path(args[1]);
            FileSystem fs = output.getFileSystem(conf);
            if (fs.exists(output)) {
                fs.delete(output, true);
            }

            FileOutputFormat.setOutputPath(job, output);

            return job.waitForCompletion(true) ? 0 : 1;
        }

    }

    /** Program entry point: runs the job through ToolRunner and exits with its status. */
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new Configuration(), new FlowSortRunner(), args);
        System.exit(status);
    }
}


二、测试情况(含测试文件分享)

这个是原始数据

1363157985066   13726230503 00-FD-07-A4-72-B8:CMCC  120.196.100.82  i02.c.aliimg.com        24  27  2481    24681   200
1363157995052   13826544101 5C-0E-8B-C7-F1-E0:CMCC  120.197.40.4            4   0   264 0   200
1363157991076   13926435656 20-10-7A-28-CC-0A:CMCC  120.196.100.99          2   4   132 1512    200
1363154400022   13926251106 5C-0E-8B-8B-B1-50:CMCC  120.197.40.4            4   0   240 0   200
1363157993044   18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99  iface.qiyi.com  视频网站    15  12  1527    2106    200
1363157995074   84138413    5C-0E-8B-8C-E8-20:7DaysInn  120.197.40.4    122.72.52.12        20  16  4116    1432    200
1363157993055   13560439658 C4-17-FE-BA-DE-D9:CMCC  120.196.100.99          18  15  1116    954 200
1363157995033   15920133257 5C-0E-8B-C7-BA-20:CMCC  120.197.40.4    sug.so.360.cn   信息安全    20  20  3156    2936    200
1363157983019   13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82          4   0   240 0   200
1363157984041   13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4    s19.cnzz.com    站点统计    24  9   6960    690 200
1363157973098   15013685858 5C-0E-8B-C7-F7-90:CMCC  120.197.40.4    rank.ie.sogou.com   搜索引擎    28  27  3659    3538    200
1363157986029   15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99  www.umeng.com   站点统计    3   3   1938    180 200
1363157992093   13560439658 C4-17-FE-BA-DE-D9:CMCC  120.196.100.99          15  9   918 4938    200
1363157986041   13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4            3   3   180 180 200
1363157984040   13602846565 5C-0E-8B-8B-B6-00:CMCC  120.197.40.4    2052.flash2-http.qq.com 综合门户    15  12  1938    2910    200
1363157995093   13922314466 00-FD-07-A2-EC-BA:CMCC  120.196.100.82  img.qfc.cn      12  12  3008    3720    200
1363157982040   13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99  y0.ifengimg.com 综合门户    57  102 7335    110349  200
1363157986072   18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99  input.shouji.sogou.com  搜索引擎    21  18  9531    2412    200
1363157990043   13925057413 00-1F-64-E1-E6-9A:CMCC  120.196.100.55  t3.baidu.com    搜索引擎    69  63  11058   48243   200
1363157988072   13760778710 00-FD-07-A4-7B-08:CMCC  120.196.100.82          2   2   120 120 200
1363157985066   13726238888 00-FD-07-A4-72-B8:CMCC  120.196.100.82  i02.c.aliimg.com        24  27  2481    24681   200
1363157993055   13560436666 C4-17-FE-BA-DE-D9:CMCC  120.196.100.99          18  15  1116    954 200

这个是分析后的数据
这里写图片描述

猜你喜欢

转载自www.cnblogs.com/umbrellalalalala/p/9273767.html