10天Hadoop快速突击（4）——MapReduce应用案例

开发MapReduce应用程序

一、单词计数

1.实例描述

计算出文件中每个单词的频数。要求输出结果按照单词的字母顺序进行排序。每个单词和其频数占一行，单词和频数之间有间隔。

比如，输入一个文本文件，内容如下：

hello world

hello hadoop

hello mapreduce

对应上面给出的输入样例，其输出样例为：

hadoop 1

hello 3

mapreduce 1

world 1

2.设计思路

将文件内容切分成单词，然后将所有相同的单词聚集在一起，最后计算单词出现的次数并输出。Map阶段完成由输入数据到单词切分的工作，shuffle阶段完成相同单词的聚集和分发工作（这个过程是MapReduce的默认过程，不用具体配置），Reduce阶段负责接收所有单词并计算其频数。MapReduce中传递的数据都是<key，value>形式的，并且shuffle排序聚集分发都是按照key值进行的，Reduce的输入为Map输出聚集后的结果，即<key，value-list>。

3.程序代码

package com.company;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import javax.security.auth.login.AppConfigurationEntry;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

public class WordCount {
    //继承Mapper接口，设置map的输入类型为<Object,Text>
    //输出类型为<Text,IntWritable>
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        //one表示单词出现一次
        private final static IntWritable one = new IntWritable(1);
        //word用于存储切下的单词
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());//对输入的行切词
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken()); //切下的单词存入word
                context.write(word, one);
            }
        }
    }

    //继承Reducer接口，设置Reduce的输入类型为<Text,IntWritable>
    //输出类型为<Text,IntWritable>
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        //result记录单词的频数
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterator<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            //对获取的<key, value-list>计算value的和
            for (Iterator<IntWritable> it = values; it.hasNext(); ) {
                IntWritable val = it.next();
                sum += val.get();
            }
            //将频数设置到result中
            result.set(sum);
            //收集结果
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // write your code here
        Configuration conf = new Configuration();
        //检查原型命令
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage:wordcount <int> <out>");
            System.exit(2);
        }
        //配置作业名
        Job job = new Job(conf, "word count");
        //配置作业的各个类
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }


}

在调试阶段遇到如下问题;

4.程序执行

运行条件：将WordCount.java文件放在Hadoop安装目录下，并在目录下创建输入目录input，目录下有输入文件file1、file2。其中：

file1的内容是：

hello world

file2的内容是：

hello hadoop

hello mapreduce

准备好之后在命令行输入以下命令执行(因版本问题，部分命令已失效)：

1）在集群上创建输入文件夹

bin/hadoop fs -mkdir wordcount_input

2）上传本地目录input下文件名以file开头的文件到集群上的wordcount_input目录下：

bin/hadoop fs -put input/file* wordcount_input

3）编译WordCount.java程序，将结果放入当前目录的WordCount目录下：

javac -classpath hadoop-1.0.1-core.jar:lib/commons-cli-1.2.jar -d WordCount WordCount.java

4）将编译结果打成jar包

jar -cvf wordcount.jar -C WordCount .

5）在集群上运行WordCount程序，以wordcount_input目录作为输入目录，wordcount_output目录作为输出目录：

bin/hadoop jar wordcount.jar com.company.WordCount wordcount_input wordcount_output

6）查看输出结果

bin/hadoop fs -cat wordcount_output/part-r-00000

5.代码数据流

首先在MapReduce程序启动阶段，JobTracker先将Job的输入文件分割到每个Map Task上。

接下来MapReduce启动Job，每个MapTask在启动之后会接收到自己所分配的输入数据。

Combiner相当于将结果先局部进行合并，这样能够降低网络压力，提高效率。

shuffle过程，对Map的输出进行排序合并，并根据Reduce数量对Map的输出进行分割，将结果交给对应的Reduce。

Reduce接收到如上的输入之后，对每个<key，value-list>进行处理，计算每个单词也就是key的出现总数。

最后输出单词和对应的频数，形成整个MapReduce的输出。

二、数据去重

1.实例描述

对数据文件中的数据进行去重。数据文件的每行都是一个数据。

2.设计思路

数据去重的最终目的是让原始数据中出现次数超过一次的数据在输出文件中只出现一次。具体就是Reduce的输入应该以数据作为key，而对value-list则没有要求。当Reduce接收到一个<key，value-list>时就直接将key复制到输出的key中，并将value设置成空值。在MapReduce流程中，Map的输出<key,value>经过shuffle过程聚集成<key，value-list>后会被交给Reduce。所以Map阶段完成的任务就是在采用Hadoop默认的作业输入方式之后，将value设置成key，并直接输出。Map中的结果经过shuffle过程之后被交给Reduce。在Reduce阶段，不管每个key有多少个value，都直接将输入的key复制为输出的key，并输出就可以了。

3.程序代码

package com.company;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.Iterator;

public class Dedup {
    //map将输入中的value复制到输出数据的key上，并直接输出
    public static class Map extends Mapper<Object, Text, Text, Text> {
        private static Text line = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            line = value;
            context.write(line, new Text(""));
        }
    }

    //reduce将输入中的key复制到输出数据的key上，并直接输出
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterator<Text> values, Context context) throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage:wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Data Deduplication");
        job.setJarByClass(Dedup.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

三、排序

1.实例描述

对输入文件中的数据进行排序。输入文件中的每行内容均为一个数字，即一个数据。要求在输出中每行有两个间隔的数字，其中第二个数字代表原始数据，第一个数字代表这个原始数据在原始数据集中的位次。

样例输入：

file1:

2

32

654

32

15

756

65223

file2:

5956

22

650

92

file3:

26

54

6

样例输出：

1 2

2 6

3 15

4 22

5 26

6 32

7 32

8 54

9 92

10 650

11 654

12 756

13 5956

14 65223

2.设计思路

在MapReduce过程中就有排序，可以利用这个默认的排序。

首先要了解MapReduce过程中的默认排序规则。它是按照key值进行排序，如果key为封装int的IntWritable类型，那么MapReduce按照数字大小对key排序；如果key为封装String的Text类型，那么MapReduce按照字典顺序对字符串排序。需要注意的是，Reduce自动排序的数据仅仅是发送到自己所在节点的数据，使用默认的排序并不能保证全局的顺序，因为在排序前还有一个partition的过程，默认无法保证分割后各个Reduce上的数据整体是有序的。所以要想使用默认的排序过程，还必须定义自己的Partition类，保证执行Partition过程之后所有Reduce上的数据在整体上是有序的，然后再对局部Reduce上的数据进行默认排序，这样才能保证所有数据有序。

所以，具体思路为：首先应该使用封装int的IntWritable型数据结构，也就是将读入的数据在Map中转化为IntWritable型，然后作为key值输出（value任意）；其次需要重写partition类，保证整体有序，具体做法是用输入数据的最大值除以系统partition数量的商作为分割数据的边界增量，也就是说分割数据的边界为此商的1倍、2倍至numPartitions-1倍，这样就能保证执行partition后的数据是整体有序的；然后Reduce获得<key,value-list>之后，根据value-list中元素的个数决定将输入的key作为value输出的次数，输出的key是一个全局变量，用于统计当前key的位次。

需要注意的是，该程序中没有配置Combiner，也就是说在MapReduce过程中不使用Combiner，这主要是因为使用Map和Reduce就已经能够完成任务了。

3.程序代码

package com.company;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.Iterator;

/**
 * Sort job: reads one integer per line and writes {@code <rank, value>} pairs in ascending
 * order of value.
 *
 * <p>Usage: {@code Sort <in> <out>}.
 *
 * <p>NOTE(review): the rank counter lives in each reducer, so ranks are only globally
 * meaningful when the job runs with a single reduce task — confirm against the deployment.
 */
public class Sort {
    /** Mapper: parses each line as an int and emits it as the key. */
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> {
        // Reused output key holding the parsed number.
        private static IntWritable data = new IntWritable();
        // Placeholder value; only the number of values per key matters to the reducer.
        private static final IntWritable ONE = new IntWritable(1);

        /**
         * Emits {@code <number, 1>} for a line containing one integer.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of input, expected to hold a single integer
         * @param context used to emit the parsed number
         * @throws NumberFormatException if a non-blank line is not a valid integer
         */
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();
            // Blank lines carry no data; parsing them would throw.
            if (line.length() == 0) {
                return;
            }
            data.set(Integer.parseInt(line));
            // Without this write the job produces no output at all.
            context.write(data, ONE);
        }
    }

    /**
     * Reducer: writes the key once per element of its value list, paired with a running
     * rank held in {@code linenum}.
     */
    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        // Rank of the next value to be written; starts at 1.
        private static IntWritable linenum = new IntWritable(1);

        /**
         * Emits {@code <rank, key>} once for every occurrence of {@code key}.
         *
         * <p>The second parameter must be {@code Iterable}; an {@code Iterator} parameter
         * would not override {@code Reducer.reduce} and this method would never run.
         *
         * @param key     a number from the input
         * @param values  one element per occurrence of the number
         * @param context used to emit the ranked output
         */
        @Override
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            for (IntWritable ignored : values) {
                context.write(linenum, key);
                linenum = new IntWritable(linenum.get() + 1);
            }
        }
    }

    /**
     * Range partitioner: splits the value range {@code [0, MAX_NUMBER]} into
     * {@code numPartitions} equal-width intervals so that every key in partition {@code i}
     * is smaller than every key in partition {@code i + 1}.
     */
    public static class Partition extends Partitioner<IntWritable, IntWritable> {
        // Largest value expected in the input (taken from the sample data).
        private static final int MAX_NUMBER = 65223;

        /**
         * Returns the partition for a key.
         *
         * <p>Keys outside the expected range are clamped into the first or last partition
         * instead of returning -1, which the framework rejects as an illegal partition.
         *
         * @param key           the number being routed
         * @param value         unused
         * @param numPartitions number of reduce tasks
         * @return a partition id in {@code [0, numPartitions - 1]}
         */
        @Override
        public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
            int bound = MAX_NUMBER / numPartitions + 1;
            int keynumber = key.get();
            if (keynumber <= 0) {
                return 0;
            }
            int partition = keynumber / bound;
            return partition >= numPartitions ? numPartitions - 1 : partition;
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     * Exits with status 2 when the input/output paths are not both given.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage:sort <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Sort");
        job.setJarByClass(Sort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setPartitionerClass(Partition.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        // The first argument is the INPUT path. Fully qualified because this file does not
        // import FileInputFormat.
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

四、单表关联

1.实例描述

实例中给出child-parent表，要求输出grandchild-grandparent表。

样例输入：

file:

child parent

Tom Lucy

Tom Jack

Jone Lucy

Jone Jack

Lucy Mary

Lucy Ben

Jack Alice

Jack Jesse

Terry Alice

Terry Jesse

Philip Terry

Philip Alma

Mark Terry

Mark Alma

样例输出：

file:

grandchild grandparent

Tom Alice

Tom Jesse

Jone Alice

Jone Jesse

Tom Mary

Tom Ben

Jone Mary

Jone Ben

Philip Alice

Philip Jesse

Mark Alice

Mark Jesse

2.设计思路

首先要考虑如何实现表的自连接，其次是连接列的设置，最后是结果的整理。

MapReduce的shuffle过程会将key值相同的数据聚集在一起，所以可以将Map结果的key值设置成待连接的列，然后列中相同的值自然就会连接在一起了。

要连接的是左表的parent列和右表的child列，且左表和右表是同一个表，所以在Map阶段将读入数据分割成child和parent之后，会将parent设置为key，child设置为value进行输出，作为左表；再将同一对child和parent中的child设置成key，parent设置成value进行输出，作为右表。

为了区分输出中的左右表，需要在输出的value中再加上左右表信息，这样在Map的结果中就形成了左表和右表，然后在shuffle过程中完成连接。

在Reduce接收到的连接结果中，每个key的value-list就包含了grandchild和grandparent关系。

取出每个key的value-list进行解析，将左表中的child放入一个数组，右表中的parent放入一个数组，然后对两个数组求笛卡尔积就是最后的结果了。

3.程序代码

package com.company;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.Iterator;

/**
 * Single-table self join: derives a grandchild-grandparent table from a child-parent table.
 *
 * <p>Usage: {@code STjoin <in> <out>}.
 */
public class STjoin {
    // 0 until the header row has been written by a reducer in this JVM.
    // NOTE(review): being static, this is per JVM, not per job — with several reduce
    // tasks the header may appear in more than one output file; confirm this is acceptable.
    public static int time = 0;

    /**
     * Mapper: splits each line into child and parent and emits the pair twice — keyed by
     * parent (left table, tag "1") and keyed by child (right table, tag "2"). The tag is
     * the first field of the value so the reducer can tell the two tables apart.
     */
    public static class Map extends Mapper<Object, Text, Text, Text> {
        /**
         * Emits both join orientations for one {@code "child parent"} line.
         * Blank lines, lines without a space and the {@code child ...} header are skipped.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of the child-parent table
         * @param context used to emit the tagged records
         */
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();
            int split = line.indexOf(' ');
            // No separator means there is no child/parent pair on this line.
            if (split < 0) {
                return;
            }
            String childname = line.substring(0, split);
            String parentname = line.substring(split + 1).trim();
            // Skip the header row.
            if (childname.compareTo("child") == 0) {
                return;
            }
            String payload = "+" + childname + "+" + parentname;
            // Left table: keyed by parent.
            context.write(new Text(parentname), new Text("1" + payload));
            // Right table: keyed by child.
            context.write(new Text(childname), new Text("2" + payload));
        }
    }

    /**
     * Reducer: for one person (the key), collects that person's children from left-table
     * records and parents from right-table records, then emits every child/parent
     * combination as a grandchild/grandparent pair.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        /**
         * Joins the tagged records of one key.
         *
         * @param key     the person linking the two generations
         * @param values  records of the form {@code "<tag>+<child>+<parent>"}
         * @param context used to emit {@code <grandchild, grandparent>}
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Header row, written once.
            if (time == 0) {
                context.write(new Text("grandchild"), new Text("grandparent"));
                time++;
            }
            // Growable lists: a key may have any number of children or parents.
            java.util.List<String> grandchild = new java.util.ArrayList<String>();
            java.util.List<String> grandparent = new java.util.ArrayList<String>();
            for (Text value : values) {
                String record = value.toString();
                // The child name starts at index 2, right after "<tag>+".
                int secondPlus = record.indexOf('+', 2);
                // Skip empty or malformed records.
                if (record.length() == 0 || secondPlus < 0) {
                    continue;
                }
                char relationtype = record.charAt(0);
                String childname = record.substring(2, secondPlus);
                String parentname = record.substring(secondPlus + 1);
                if (relationtype == '1') {
                    // Left table: the key is the parent, so the child is a grandchild candidate.
                    grandchild.add(childname);
                } else {
                    // Right table: the key is the child, so the parent is a grandparent candidate.
                    grandparent.add(parentname);
                }
            }
            // Cartesian product; empty when either side is missing.
            for (String gc : grandchild) {
                for (String gp : grandparent) {
                    context.write(new Text(gc), new Text(gp));
                }
            }
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     * Exits with status 2 when the input/output paths are not both given.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage:stjoin <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "single table join");
        job.setJarByClass(STjoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

五、多表关联

1.实例描述

多表关联与单表关联类似，也是通过对原始数据进行一定的处理，从其中挖掘出关心的信息。

输入是两个文件，一个代表工厂表，包含工厂名列和地址编号列；另一个代表地址表，包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系，输出工厂名-地址名表。

输入样例：

factory:

factoryname addressID

Beijing Red Star 1

Shenzhen Thunder 3

Guangzhou Honda 2

Beijing Rising 1

Guangzhou Development Bank 2

Tencent 3

Bank of Beijing 1

address:

addressID addressname

1 Beijing

2 Guangzhou

3 Shenzhen

4 Xian

输出样例：

factoryname addressname

Bank of Beijing Beijing

Beijing Red Star Beijing

Beijing Rising Beijing

Guangzhou Development Bank Guangzhou

Guangzhou Honda Guangzhou

Shenzhen Thunder Shenzhen

Tencent Shenzhen

2.设计思路

Map识别出输入的行属于哪个表之后，对其进行分割，将连接的列值保存在key中，另一列和左右表标志保存在value中，然后输出。Reduce拿到连接结果后，解析value内容，根据标志将左右表内容分开存放，然后求笛卡尔积，最后直接输出。

3.程序代码

package com.company;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.Iterator;

/**
 * Two-table join: joins a factory table ({@code factoryname addressID}) with an address
 * table ({@code addressID addressname}) on the address ID and outputs factoryname-addressname.
 *
 * <p>Usage: {@code MTjoin <in> <out>}.
 */
public class MTjoin {
    // 0 until the header row has been written by a reducer in this JVM.
    // NOTE(review): being static, this is per JVM, not per job — with several reduce
    // tasks the header may appear in more than one output file; confirm this is acceptable.
    public static int time = 0;

    /**
     * Mapper: recognises which table a line belongs to, then emits the address ID as key and
     * the other column, prefixed with a table tag, as value ("1+" factory, "2+" address).
     */
    public static class Map extends Mapper<Object, Text, Text, Text> {
        /**
         * Emits one tagged record per data line.
         *
         * <p>A line starting with a digit is treated as an address row and split at its first
         * space; any other line is treated as a factory row and split at its last space, so
         * names may contain spaces and IDs may have several digits. Blank lines, header rows
         * and lines without a space are skipped.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of either table
         * @param context used to emit {@code <addressID, tag+name>}
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();
            // Header line of an input file (or a blank line): nothing to emit.
            if (line.length() == 0 || line.contains("factoryname") || line.contains("addressID")) {
                return;
            }
            char first = line.charAt(0);
            // Inclusive bounds: '0' and '9' are digits too.
            boolean startsWithDigit = first >= '0' && first <= '9';
            if (startsWithDigit) {
                // Right table: "<addressID> <addressname>".
                int split = line.indexOf(' ');
                if (split < 0) {
                    return;
                }
                String addressId = line.substring(0, split);
                String addressName = line.substring(split + 1).trim();
                context.write(new Text(addressId), new Text("2+" + addressName));
            } else {
                // Left table: "<factoryname> <addressID>".
                int split = line.lastIndexOf(' ');
                if (split < 0) {
                    return;
                }
                String factoryName = line.substring(0, split).trim();
                String addressId = line.substring(split + 1);
                context.write(new Text(addressId), new Text("1+" + factoryName));
            }
        }
    }

    /**
     * Reducer: separates the records of one address ID by table tag and emits every
     * factory/address combination.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        /**
         * Joins the tagged records of one address ID.
         *
         * @param key     the address ID
         * @param values  records of the form {@code "1+<factoryname>"} or {@code "2+<addressname>"}
         * @param context used to emit {@code <factoryname, addressname>}
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // First row of the output file, written once.
            if (time == 0) {
                context.write(new Text("factoryname"), new Text("addressname"));
                time++;
            }
            // Growable lists: an address may host any number of factories.
            java.util.List<String> factory = new java.util.ArrayList<String>();
            java.util.List<String> address = new java.util.ArrayList<String>();
            for (Text value : values) {
                String record = value.toString();
                // A valid record has at least the tag and the '+' separator.
                if (record.length() < 2) {
                    continue;
                }
                String name = record.substring(2);
                if (record.charAt(0) == '1') {
                    // Left table.
                    factory.add(name);
                } else {
                    // Right table.
                    address.add(name);
                }
            }
            // Cartesian product; empty when either side is missing.
            for (String f : factory) {
                for (String a : address) {
                    context.write(new Text(f), new Text(a));
                }
            }
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     * Exits with status 2 when the input/output paths are not both given.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage:mtjoin <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "multiple table join");
        job.setJarByClass(MTjoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

六、参考资料

《Hadoop实战》第二版

10天Hadoop快速突击（4）——MapReduce应用案例

开发MapReduce应用程序

猜你喜欢