[Big Data] A MapReduce Case Study (Hadoop Serialization)

FlowBean

     Implement the Writable serialization interface

     Encapsulate upstream flow, downstream flow, and total flow

Mapper

      Read one line of input

      Split the line into fields

      Extract the phone number

      Extract the upstream and downstream flow and wrap them in a bean

      Emit the phone number as the key (type Text) and the bean (type FlowBean) as the value

Reducer

      Create variables for the upstream and downstream totals

      Sum the upstream and downstream flow across all values (FlowBean) that share the same key

      Create a bean object wrapping the total upstream and downstream flow

      Emit the phone number as the key (type Text) and the bean (type FlowBean) as the value

Dataset (fields: id, phone number, IP address, optional visited domain, upstream bytes, downstream bytes, HTTP status; the domain column is missing in some records, which is why the mapper indexes fields from the end of the line):

1 13736230513 192.196.100.1 WWw.atguigu.com 2481 24681 200
2 13846544121 192.196.100.2 264 0 200
3 13956435636 192.196.100.3 132 1512 200
4 13966251146 192.168.100.1 240 0 404
5 18271575951 192.168.100.2 www.atguigu.com 1527 2106 200
6 84188413 192.168.100.3 www.atguigu.com 4116 1432 200
7 13590439668 192.168.100.4 1116 954 200
8 15910133277 192.168.100.5 www.hao123.com 3156 2936 200
9 13729199489 192.168.100.6 240 0 200
10 13630577991 192.168.100.7 www.shouhu.com 6960 690 200
11 15043685818 192.168.100.8 www.baidu.com 3659 3538 200
12 15959002129 192.168.100.9 www.atguigu.com 1938 180 500
13 13560439638 192.168.100.10 918 4938 200
14 13470253144 192.168.100.11 180 180 200
15 13682846555 192.168.100.12 www.qq.com 1938 2910 200
16 13992314666 192.168.100.13 www.gaga.com 3008 3720 200
17 13509468723 192.168.100.14 Www.qinghua.com 7335 110349 404
18 18390173782 192.168.100.15 www.sogou.com 9531 2412 200
19 13975057813 192.168.100.16 www.baidu.com 11058 48243 200
20 13768778790 192.168.100.17 120 120 200
21 13568436656 192.168.100.18 www.alibaba.com 2481 24681 200
22 13568436656 192.168.100.19 1116 954 200


The FlowBean class that encapsulates the data:

package hdfs_demo.telFlow;

import org.apache.hadoop.io.WritableComparable;  // Writable alone supports serialization; WritableComparable also supports sorting

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// To be serializable in Hadoop, FlowBean must implement Writable (or
// WritableComparable if it may ever be used as a sortable key).
public class FlowBean implements WritableComparable<FlowBean> {
    private long upFlow;
    private long downFlow;
    private long sumFlow;

    // A no-argument constructor is required for Hadoop's reflective deserialization
    public FlowBean() {}

    // Set both flows and compute the total
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    // Serialization: write the fields to the output stream
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

    // Deserialization: read the fields back in exactly the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upFlow = dataInput.readLong();
        downFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }

    // FlowBean is only used as a value in this job, so no ordering is needed
    @Override
    public int compareTo(FlowBean o) {
        return 0;
    }
}
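The comment on readFields matters: fields must be read back in exactly the order write emitted them, or every field after the first mismatch is corrupted. A minimal round-trip sketch (a hypothetical standalone main, not part of the original post) that exercises write and readFields through plain Java streams:

package hdfs_demo.telFlow;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Hypothetical helper to sanity-check FlowBean's serialization round trip.
public class FlowBeanRoundTripTest {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean();
        original.set(2481L, 24681L);

        // Serialize with write(DataOutput)
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize with readFields(DataInput)
        FlowBean restored = new FlowBean();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Expected output: 2481	24681	27162
        System.out.println(restored);
    }
}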

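The stubbed compareTo is enough here because FlowBean is only a value. If it were later used as a key to rank phones by total traffic (a common follow-up exercise, not implemented in this post), a sketch of a descending order might look like:

    // Hypothetical ordering for a sort-by-total-flow job: descending sumFlow.
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.getSumFlow(), this.getSumFlow());
    }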
Mapper stage:

Split the data:

package hdfs_demo.telFlow;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Mapper<input key, input value, output key, output value>
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // Read one line of input
        String data = value.toString();
        // Split the line on spaces (the sample dataset is space-separated)
        String[] telInfos = data.split(" ");
        // Extract the phone number; Text is Hadoop's serializable String type
        Text phone = new Text(telInfos[1]);

        // Wrap the upstream and downstream flow in a bean
        FlowBean bean = new FlowBean();

        // Index from the end of the line, because the domain column is optional
        long upFlow = Long.parseLong(telInfos[telInfos.length - 3]);   // upstream flow
        long downFlow = Long.parseLong(telInfos[telInfos.length - 2]); // downstream flow
        bean.set(upFlow, downFlow);                                    // also computes the total

        // Emit (phone, bean)
        context.write(phone, bean);
    }
}
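Because some records omit the domain column, indexing from the end of the split array is what keeps the parse correct for both line shapes. A small hypothetical check (not in the original post) illustrating this:

// Hypothetical quick check of the field-extraction logic on both line shapes.
public class ParseCheck {
    public static void main(String[] args) {
        String withDomain    = "1 13736230513 192.196.100.1 www.atguigu.com 2481 24681 200";
        String withoutDomain = "2 13846544121 192.196.100.2 264 0 200";
        for (String line : new String[]{withDomain, withoutDomain}) {
            String[] f = line.split(" ");
            // phone is always field 1; flows are always the 3rd- and 2nd-to-last fields
            System.out.println(f[1] + " up=" + f[f.length - 3] + " down=" + f[f.length - 2]);
        }
    }
}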

Reducer stage:

package hdfs_demo.telFlow;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Reducer<input key, input value, output key, output value>
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    // Reused across keys to avoid allocating a new bean per call
    private FlowBean result = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

        // key    -- phone number, e.g. new Text("13568436656")
        // values -- all FlowBeans emitted for that phone number

        long sumUpFlow = 0;    // upstream total for this phone number
        long sumDownFlow = 0;  // downstream total

        // Accumulate the upstream and downstream flow of every bean
        for (FlowBean bean : values) {
            sumUpFlow += bean.getUpFlow();
            sumDownFlow += bean.getDownFlow();
        }
        // Wrap the totals (set also computes the overall total)
        result.set(sumUpFlow, sumDownFlow);
        // One output record per phone number: total up, total down, and grand total
        context.write(key, result);
    }
}
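Since FlowReducer's input and output types match and summation is associative, it could plausibly also serve as a combiner to shrink shuffle traffic. A hedged one-line addition to the driver (my assumption, not part of the original post), placed after setReducerClass:

        // Assumed optimization: run FlowReducer as a map-side combiner.
        job.setCombinerClass(FlowReducer.class);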

Driver stage:

package hdfs_demo.telFlow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FlowDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Create the configuration object
        Configuration conf = new Configuration();
        // Create the job
        Job job = Job.getInstance(conf, "telFlowCount");

        // The class that carries the job jar
        job.setJarByClass(FlowDriver.class);

        // Set the mapper and reducer
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        // Map output types: Text, FlowBean
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // Reduce output types: Text, FlowBean
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Input path
        FileInputFormat.setInputPaths(job, new Path("G:\\idea-workspace\\hdfs_java_api\\Resource\\telinfo.txt"));
        // Output path for the reducer's results
        FileOutputFormat.setOutputPath(job, new Path("G:\\idea-workspace\\hdfs_java_api\\Resource\\result"));

        // Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);

        System.out.println(b);
    }
}
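One practical note: MapReduce refuses to start if the output directory already exists, so re-running this driver fails until the result folder is deleted. A hedged sketch of a pre-run cleanup inside main(), before setting the output path (my addition, using the standard FileSystem API; requires import org.apache.hadoop.fs.FileSystem):

        // Assumed pre-run cleanup: delete the output directory if it exists,
        // otherwise waitForCompletion fails with FileAlreadyExistsException.
        Path output = new Path("G:\\idea-workspace\\hdfs_java_api\\Resource\\result");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);   // recursive delete, so re-runs don't fail
        }
        FileOutputFormat.setOutputPath(job, output);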

Output (as a spot check, 13568436656 appears in two input records: up 2481 + 1116 = 3597, down 24681 + 954 = 25635, total 29232, which matches its line below):

13470253144	180	180	360
13509468723	7335	110349	117684
13560439638	918	4938	5856
13568436656	3597	25635	29232
13590439668	1116	954	2070
13630577991	6960	690	7650
13682846555	1938	2910	4848
13729199489	240	0	240
13736230513	2481	24681	27162
13768778790	120	120	240
13846544121	264	0	264
13956435636	132	1512	1644
13966251146	240	0	240
13975057813	11058	48243	59301
13992314666	3008	3720	6728
15043685818	3659	3538	7197
15910133277	3156	2936	6092
15959002129	1938	180	2118
18271575951	1527	2106	3633
18390173782	9531	2412	11943
84188413	4116	1432	5548

Reposted from blog.csdn.net/Qmilumilu/article/details/104652714