大数据入门——hadoop排序流量统计代码详解

网上已经有很多成熟的教程,但对于不了解MapReduce相关概念的新手来说,理解起来都有些困难。
博主读了一天代码,终于把这段代码读懂了,特此分享给大家。


一、带详细注释的代码

//第一部分是导入各种包,没什么好讲的
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.WritableComparable;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

//PhoneFlowDriver类中有FlowWritable,PhoneFlowMapper,PhoneFlowReducer,FlowSortRunner和main
/**
 * MapReduce job that emits one line per input record, ordered by total traffic
 * (upstream + downstream) from largest to smallest.
 *
 * <p>The whole record is packed into the map output key ({@link FlowWritable}) so that
 * the framework's sort on map output keys performs the ordering; the value is
 * {@link NullWritable} because nothing else needs to be carried.
 */
public class PhoneFlowDriver {

    /**
     * Map output key holding a phone number with its upstream, downstream and total
     * traffic. Its natural ordering is "largest total first".
     */
    static class FlowWritable implements WritableComparable<FlowWritable> {

        private String phonenum;  // phone number
        private long upflow;      // upstream traffic
        private long downflow;    // downstream traffic
        private long sumflow;     // total traffic, kept equal to upflow + downflow

        /** No-argument constructor; the fields are filled in later by {@link #readFields}. */
        public FlowWritable() {}

        /**
         * Builds a record from a phone number and its two traffic counters; the total
         * is derived from them.
         *
         * @param phonenum phone number
         * @param upflow   upstream traffic
         * @param downflow downstream traffic
         */
        public FlowWritable(String phonenum, long upflow, long downflow) {
            this.phonenum = phonenum;
            this.upflow = upflow;
            this.downflow = downflow;
            this.sumflow = upflow + downflow;
        }

        /** Serializes the four fields in the order phone, up, down, sum. */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(this.phonenum);
            dataOutput.writeLong(this.upflow);
            dataOutput.writeLong(this.downflow);
            dataOutput.writeLong(this.sumflow);
        }

        /** Restores the four fields in the same order {@link #write} emitted them. */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.phonenum = dataInput.readUTF();
            this.upflow = dataInput.readLong();
            this.downflow = dataInput.readLong();
            this.sumflow = dataInput.readLong();
        }

        /** Returns the tab-separated output line: phone, up, down, sum. */
        @Override
        public String toString() {
            return this.phonenum + "\t" + this.upflow + "\t" + this.downflow + "\t" + this.sumflow;
        }

        public String getPhone() {
            return phonenum;
        }

        public void setPhone(String phone) {
            this.phonenum = phone;
        }

        public long getUp() {
            return upflow;
        }

        /** Sets the upstream traffic and refreshes the total so it never goes stale. */
        public void setUp(long up) {
            this.upflow = up;
            this.sumflow = this.upflow + this.downflow;
        }

        public long getDown() {
            return downflow;
        }

        /** Sets the downstream traffic and refreshes the total so it never goes stale. */
        public void setDown(long down) {
            this.downflow = down;
            this.sumflow = this.upflow + this.downflow;
        }

        public long getSum() {
            return sumflow;
        }

        /**
         * Orders records by total traffic, largest first. Ties are broken by phone
         * number, then upstream, then downstream traffic, so only records whose fields
         * are all equal compare as 0. Returning 0 for equal records (instead of always
         * -1 or 1) is required by the {@link Comparable} contract.
         */
        @Override
        public int compareTo(FlowWritable o) {
            // Arguments are swapped on purpose: a larger total must sort earlier.
            int bySum = Long.compare(o.sumflow, this.sumflow);
            if (bySum != 0) {
                return bySum;
            }
            int byPhone = comparePhones(this.phonenum, o.phonenum);
            if (byPhone != 0) {
                return byPhone;
            }
            int byUp = Long.compare(this.upflow, o.upflow);
            if (byUp != 0) {
                return byUp;
            }
            return Long.compare(this.downflow, o.downflow);
        }

        /** Null-safe string comparison; a null phone number sorts before any non-null one. */
        private static int comparePhones(String a, String b) {
            if (a == null) {
                return b == null ? 0 : -1;
            }
            if (b == null) {
                return 1;
            }
            return a.compareTo(b);
        }

        /** Two records are equal when all four fields match (consistent with compareTo). */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof FlowWritable)) {
                return false;
            }
            FlowWritable other = (FlowWritable) obj;
            return comparePhones(this.phonenum, other.phonenum) == 0
                    && this.upflow == other.upflow
                    && this.downflow == other.downflow
                    && this.sumflow == other.sumflow;
        }

        @Override
        public int hashCode() {
            int result = phonenum == null ? 0 : phonenum.hashCode();
            result = 31 * result + Long.hashCode(upflow);
            result = 31 * result + Long.hashCode(downflow);
            result = 31 * result + Long.hashCode(sumflow);
            return result;
        }
    }


    /**
     * Turns each tab-separated input line into a {@link FlowWritable} key.
     *
     * <p>The first two type parameters are the input key/value types (the original
     * author notes the key is the line's starting offset and the value is the line
     * text); the last two are the output key/value types.
     */
    public static class PhoneFlowMapper extends Mapper<LongWritable, Text, FlowWritable, NullWritable> {

        private static final int PHONE_INDEX = 1;
        // NOTE(review): in the 11-column sample rows published with this code, columns 7
        // and 8 look like "downstream packet count" and "upstream bytes"; upstream and
        // downstream bytes appear to be columns 8 and 9. Indices are left as the author
        // wrote them -- confirm against the real input layout before changing.
        private static final int UP_FLOW_INDEX = 7;
        private static final int DOWN_FLOW_INDEX = 8;

        private static final String COUNTER_GROUP = "PhoneFlow";
        private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

        /**
         * Called once per input line. Lines that have too few columns or non-numeric
         * traffic fields are skipped and counted instead of failing the whole task.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String[] fields = value.toString().split("\t");
            if (fields.length <= DOWN_FLOW_INDEX) {
                // Covers blank lines and truncated rows.
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            String phonenum = fields[PHONE_INDEX].trim();
            long upflow;
            long downflow;
            try {
                upflow = Long.parseLong(fields[UP_FLOW_INDEX].trim());
                downflow = Long.parseLong(fields[DOWN_FLOW_INDEX].trim());
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            // The key already carries phone, up, down and total, so the value is a
            // NullWritable placeholder. Ordering comes from FlowWritable.compareTo.
            context.write(new FlowWritable(phonenum, upflow, downflow), NullWritable.get());
        }
    }

    /**
     * Passes the already-sorted keys through to the output.
     */
    public static class PhoneFlowReducer extends Reducer<FlowWritable, NullWritable, FlowWritable, NullWritable> {

        /**
         * Writes the key once for every value in the group, so that input records
         * whose fields are all identical (and therefore compare as equal) are each
         * kept in the output rather than collapsed into one line.
         */
        @Override
        protected void reduce(FlowWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            for (NullWritable value : values) {
                context.write(key, value);
            }
        }
    }

    /**
     * Configures and submits the sort job.
     */
    public static class FlowSortRunner extends Configured implements Tool {

        /**
         * Builds the job from the given arguments and waits for it to finish.
         *
         * @param args {@code args[0]} is the input path, {@code args[1]} the output path;
         *             an existing output path is deleted recursively before the job starts
         * @return 0 when the job succeeds, 1 when it fails, 2 when arguments are missing
         */
        @Override
        public int run(String[] args) throws Exception {

            if (args == null || args.length < 2) {
                System.err.println("Usage: PhoneFlowDriver <input path> <output path>");
                return 2;
            }

            // Use the configuration supplied through ToolRunner so generic options are
            // honoured; fall back to a fresh one when run() is invoked directly.
            Configuration conf = getConf();
            if (conf == null) {
                conf = new Configuration();
            }

            Job job = Job.getInstance(conf);

            job.setJarByClass(PhoneFlowDriver.class);
            job.setMapperClass(PhoneFlowMapper.class);
            job.setReducerClass(PhoneFlowReducer.class);

            // Map output key/value types.
            job.setMapOutputKeyClass(FlowWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            // Final output key/value types.
            job.setOutputKeyClass(FlowWritable.class);
            job.setOutputValueClass(NullWritable.class);

            // A single reducer is what makes the output one globally ordered list.
            job.setNumReduceTasks(1);

            FileInputFormat.setInputPaths(job, new Path(args[0]));

            // Remove a pre-existing output directory; resolve the file system from the
            // path itself so a fully qualified URI is handled as well.
            Path output = new Path(args[1]);
            FileSystem fs = output.getFileSystem(conf);
            if (fs.exists(output)) {
                fs.delete(output, true);
            }

            FileOutputFormat.setOutputPath(job, output);

            return job.waitForCompletion(true) ? 0 : 1;
        }

    }

    /** Program entry point: runs {@link FlowSortRunner} through ToolRunner and exits with its status. */
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new Configuration(), new FlowSortRunner(), args);
        System.exit(status);
    }
}



二、测试情况(含测试数据分享)

这是原始数据(每行一条记录,各字段之间以制表符分隔):

1363157985066 	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157995052 	13826544101	5C-0E-8B-C7-F1-E0:CMCC	120.197.40.4			4	0	264	0	200
1363157991076 	13926435656	20-10-7A-28-CC-0A:CMCC	120.196.100.99			2	4	132	1512	200
1363154400022 	13926251106	5C-0E-8B-8B-B1-50:CMCC	120.197.40.4			4	0	240	0	200
1363157993044 	18211575961	94-71-AC-CD-E6-18:CMCC-EASY	120.196.100.99	iface.qiyi.com	视频网站	15	12	1527	2106	200
1363157995074 	84138413	5C-0E-8B-8C-E8-20:7DaysInn	120.197.40.4	122.72.52.12		20	16	4116	1432	200
1363157993055 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200
1363157995033 	15920133257	5C-0E-8B-C7-BA-20:CMCC	120.197.40.4	sug.so.360.cn	信息安全	20	20	3156	2936	200
1363157983019 	13719199419	68-A1-B7-03-07-B1:CMCC-EASY	120.196.100.82			4	0	240	0	200
1363157984041 	13660577991	5C-0E-8B-92-5C-20:CMCC-EASY	120.197.40.4	s19.cnzz.com	站点统计	24	9	6960	690	200
1363157973098 	15013685858	5C-0E-8B-C7-F7-90:CMCC	120.197.40.4	rank.ie.sogou.com	搜索引擎	28	27	3659	3538	200
1363157986029 	15989002119	E8-99-C4-4E-93-E0:CMCC-EASY	120.196.100.99	www.umeng.com	站点统计	3	3	1938	180	200
1363157992093 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			15	9	918	4938	200
1363157986041 	13480253104	5C-0E-8B-C7-FC-80:CMCC-EASY	120.197.40.4			3	3	180	180	200
1363157984040 	13602846565	5C-0E-8B-8B-B6-00:CMCC	120.197.40.4	2052.flash2-http.qq.com	综合门户	15	12	1938	2910	200
1363157995093 	13922314466	00-FD-07-A2-EC-BA:CMCC	120.196.100.82	img.qfc.cn		12	12	3008	3720	200
1363157982040 	13502468823	5C-0A-5B-6A-0B-D4:CMCC-EASY	120.196.100.99	y0.ifengimg.com	综合门户	57	102	7335	110349	200
1363157986072 	18320173382	84-25-DB-4F-10-1A:CMCC-EASY	120.196.100.99	input.shouji.sogou.com	搜索引擎	21	18	9531	2412	200
1363157990043 	13925057413	00-1F-64-E1-E6-9A:CMCC	120.196.100.55	t3.baidu.com	搜索引擎	69	63	11058	48243	200
1363157988072 	13760778710	00-FD-07-A4-7B-08:CMCC	120.196.100.82			2	2	120	120	200
1363157985066 	13726238888	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157993055 	13560436666	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200

这是分析后的数据,我只截了运行结果的图片,可以看到数据已经按照总流量降序排好了。
(原文此处为运行结果截图)

发布了36 篇原创文章 · 获赞 41 · 访问量 2万+

猜你喜欢

转载自blog.csdn.net/umbrellalalalala/article/details/80888019