Hadoop-job作业执行流程+MapReduce实现手机流量统计

1.分析题目要求可知，需要对一个用户手机流量文件进行统计，我们想要得到的结果是**<key:手机号，value：[上传量、下载量、总和]>**
在这里插入图片描述

2.首先我们要准备数据文件，上传到HDFS，熟悉hdfs常用的shell命令

3.InputFormat阶段：我们必须从HDFS中读取文件，读取到的每条记录也是<key，value>型的结构。这里key是该行行首在文件中的字节偏移量，也就是这一行之前的所有内容一共占了多少个字节（例如第一行的key为0）；value就是这一行的全部内容。
4.接着进入map阶段，map进行局部运算
5.每进行一次map运算，都会将结果交给shuffle，shuffle阶段就会对结果进行排序分组。
6.执行完所有的map，shuffle排序分组以后，将结果以<key, value>形式传给reduce，进行汇总运算。
7.手机流量统计job作业流程伪代码：
在这里插入图片描述
8.将数据文件上传到HDFS中

#在HDFS下创建目录
[root@Cluster00 ~]# hdfs dfs -mkdir /phonedata 
#将本地文件上传到HDFS下的指定目录
[root@Cluster00 ~]# hdfs dfs -put data.log /phonedata

在这里插入图片描述
9.引入相关Maven依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>hadoop-phonedata</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <hadoop.version>2.7.3</hadoop.version>
    </properties>

    <dependencies>
        <!--hadoop公共依赖-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!--hadoop hdfs 依赖-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>


        <!--map reduce-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>

    </dependencies>



    <build>

        <!--扩展maven的插件wagon ssh插件-->
        <extensions>
            <extension>
                <groupId>org.apache.maven.wagon</groupId>
                <artifactId>wagon-ssh</artifactId>
                <version>2.8</version>
            </extension>
        </extensions>

        <plugins>
            <!-- 在打包插件中指定main class 信息 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <outputDirectory>${basedir}/target</outputDirectory>
                    <archive>
                        <manifest>
                            <mainClass>com.wenxin.phonedata.PhoneDataJob</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>

            <!--使用wagon ssh 插件-->
<!--            <plugin>-->
<!--                <groupId>org.codehaus.mojo</groupId>-->
<!--                <artifactId>wagon-maven-plugin</artifactId>-->
<!--                <version>1.0</version>-->
<!--                <configuration>-->
<!--                    <fromFile>target/${project.build.finalName}.jar</fromFile>-->
<!--                    <url>scp://root:[email protected]/root</url>-->
<!--                    &lt;!&ndash;自动执行对应脚本&ndash;&gt;-->
<!--                    <commands>-->
<!--                        &lt;!&ndash; 通过sh 执行shell脚本文件 &ndash;&gt;-->
<!--                        <command>nohup /root/hadoop-2.7.3/bin/hadoop jar ${project.build.finalName}.jar > /root/mapreduce.out 2>&amp;1 &amp; </command>-->
<!--                    </commands>-->
<!--                    <displayCommandOutputs>true</displayCommandOutputs>-->
<!--                </configuration>-->
<!--            </plugin>-->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>7</source>
                    <target>7</target>
                </configuration>
            </plugin>

        </plugins>

    </build>

</project>

10.MapReduce开发源代码

package com.wenxin.phonedata;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

//job作业
public class PhoneDataJob  extends Configured implements Tool {
    
    

    public static void main(String[] args) throws Exception {
    
    
        //执行job作业任务类的对象是谁
        ToolRunner.run(new PhoneDataJob(),args);
    }
    //执行job作业
    public int run(String[] strings) throws Exception {
    
    
        //创建job作业对象
        Job job = Job.getInstance(getConf());
        job.setJarByClass(PhoneDataJob.class);
        //1.设置inputFormat
        job.setInputFormatClass(TextInputFormat.class);
        //支持设置一个hdfs系统目录  job作业在执行时会将这个目录中所有文件参与本次job作业计算
        TextInputFormat.addInputPath(job,new Path("/phonedata/data.log"));

        //2.设置map
        job.setMapperClass(PhoneDataMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PhoneWritable.class);

        //3.设置shuffle 自动处理
        job.setNumReduceTasks(4);//设置reduce数量
        job.setPartitionerClass(PhoneCodePartitioner.class); //自定义分区
        job.setCombinerClass(PhoneDataReduce.class); //开启合并  map端输出的数据预先执行一次reduce 从而减少map端局部数据大小

        //4.设置reduce
        job.setReducerClass(PhoneDataReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PhoneWritable.class);


        //5.设置output format
        job.setOutputFormatClass(TextOutputFormat.class);
        Path res = new Path("/phonedata/result");
        FileSystem fileSystem = FileSystem.get(getConf());
        if(fileSystem.exists(res))fileSystem.delete(res,true);
        TextOutputFormat.setOutputPath(job, res); //保证output format输出结果目录必须不存在
        //6.提交job作业
        //job.submit();
        boolean status = job.waitForCompletion(true);
        System.out.println("phone data status = " + status);
        return 0;
    }


    //map
    public static class PhoneDataMap extends Mapper<LongWritable, Text,Text,PhoneWritable>{
    
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    
    
            Counter counter = context.getCounter("PhoneDataMapGroup", "PhoneDataMap");
            counter.increment(1);//自增1
            String[] values = value.toString().split("\t");
            String phoneCode = values[1];
            String ups = values[6];
            String downs = values[7];
            PhoneWritable phoneWritable = new PhoneWritable();
            phoneWritable.setUploads(Integer.valueOf(ups));
            phoneWritable.setDownloads(Integer.valueOf(downs));
            phoneWritable.setTotals(0);
            context.write(new Text(phoneCode),phoneWritable);
        }
    }
    //reduce
    public static class PhoneDataReduce extends Reducer<Text,PhoneWritable,Text,PhoneWritable>{
    
    
        @Override
        protected void reduce(Text key, Iterable<PhoneWritable> values, Context context) throws IOException, InterruptedException {
    
    
            Counter counter = context.getCounter("PhoneDataReduceGroup", "PhoneDataReduce");
            counter.increment(1);//自增1
            int ups = 0;
            int downs =0;
            for (PhoneWritable value : values) {
    
    
                ups += value.getUploads();
                downs+=value.getDownloads();
            }
            PhoneWritable phoneWritable = new PhoneWritable(ups,downs,(ups+downs));
            context.write(key,phoneWritable);
        }
    }
}

在这里插入图片描述
11.将项目打包（.jar的形式）上传到hadoop

#两个命令选择使用一个即可，运行上传的jar包
[root@Cluster00 ~]# hadoop jar hadoop-phonedata-1.0-SNAPSHOT.jar
[root@Cluster00 ~]# yarn jar hadoop-phonedata-1.0-SNAPSHOT.jar

12.查看结果
结果保存在输出目录 /phonedata/result 下的 part-r-00000 ~ part-r-00003 文件中（本例设置了4个reduce任务，每个reduce任务输出一个文件；输出目录可以自己定义）

[root@Cluster00 ~]# hdfs dfs -cat /phonedata/result/*

在这里插入图片描述

Hadoop-job作业执行流程+MapReduce实现手机流量统计

猜你喜欢