纪念自己的第一个MapReduce例子-WordCount

版权声明:本文为博主原创文章,转载请说明出处 https://blog.csdn.net/u010002184/article/details/88286907

如下:

import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WordCountTest {

    private static Logger logger = LoggerFactory.getLogger(WordCountTest.class);

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            logger.info("Map,key:{},value:{},CurrentKey:{},CurrentValue:{}",
                    key.toString(), value.toString(), context.getCurrentKey().toString(), context.getCurrentValue().toString());
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                logger.info("word:{},one;{}", word.toString(), one.get());
            }
            logger.info("Map处理结束");
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
        ) throws IOException, InterruptedException {
            logger.info("Reduce,key:{},CurrentKey:{},CurrentValue:{}",
                    key.toString(), context.getCurrentKey().toString(), context.getCurrentValue().toString());

            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
                logger.info("key:{},val:{},sum:{}", key, val, sum);
            }
            result.set(sum);
            context.write(key, result);
            logger.info("Reduce处理结束");
        }
    }

    public static void main(String[] args) throws Exception {
        deleteDir(new File(args[1]));

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountTest.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }


    private static boolean deleteDir(File dir) {
        if (dir.isDirectory()) {
            String[] children = dir.list();
            //递归删除目录中的子目录下
            for (int i = 0; i < children.length; i++) {
                boolean success = deleteDir(new File(dir, children[i]));
                if (!success) {
                    return false;
                }
            }
        }
        // 目录此时为空,可以删除
        return dir.delete();
    }
}

程序运行参数(program arguments):

输入文件的路径,输出的目录路径,要配置在program arguments中,中间用英文空格分隔。

日志

2019-03-07 13:58:42,269 INFO [org.apache.hadoop.mapreduce.Job] -  map 0% reduce 0%
2019-03-07 13:59:10,113 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: file:/D:/gitdownload6/flow_log/wireless_parser/src/main/resources/WordCountTestInput/input.log:0+61
2019-03-07 13:59:10,190 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
2019-03-07 13:59:10,236 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584)
2019-03-07 13:59:10,236 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100
2019-03-07 13:59:10,236 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080
2019-03-07 13:59:10,236 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600
2019-03-07 13:59:10,236 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600
2019-03-07 14:01:00,744 INFO [...WordCountTest] - Map,key:0,value:hello world,CurrentKey:0,CurrentValue:hello world
2019-03-07 14:01:01,740 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:01:02,741 INFO [...WordCountTest] - word:hello,one;1
2019-03-07 14:01:03,140 INFO [org.apache.hadoop.mapreduce.Job] -  map 14% reduce 0%
2019-03-07 14:01:06,438 INFO [...WordCountTest] - word:world,one;1
2019-03-07 14:01:16,450 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:01:33,962 INFO [...WordCountTest] - Map处理结束
2019-03-07 14:01:34,512 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:02:35,051 INFO [...WordCountTest] - Map,key:13,value:hello java,CurrentKey:13,CurrentValue:hello java
2019-03-07 14:02:39,761 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:02:45,425 INFO [org.apache.hadoop.mapreduce.Job] -  map 27% reduce 0%
2019-03-07 14:02:46,556 INFO [...WordCountTest] - word:hello,one;1
2019-03-07 14:02:47,527 INFO [...WordCountTest] - word:java,one;1
2019-03-07 14:02:47,922 INFO [...WordCountTest] - Map处理结束
2019-03-07 14:02:50,231 INFO [...WordCountTest] - Map,key:25,value:hello python,CurrentKey:25,CurrentValue:hello python
2019-03-07 14:02:51,246 INFO [...WordCountTest] - word:hello,one;1
2019-03-07 14:02:51,865 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:02:52,054 INFO [...WordCountTest] - word:python,one;1
2019-03-07 14:02:52,490 INFO [org.apache.hadoop.mapreduce.Job] -  map 43% reduce 0%
2019-03-07 14:02:52,491 INFO [...WordCountTest] - Map处理结束
2019-03-07 14:02:53,978 INFO [...WordCountTest] - Map,key:39,value:hello php,CurrentKey:39,CurrentValue:hello php
2019-03-07 14:02:55,095 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:02:55,095 INFO [...WordCountTest] - word:hello,one;1
2019-03-07 14:02:55,918 INFO [org.apache.hadoop.mapreduce.Job] -  map 55% reduce 0%
2019-03-07 14:02:55,920 INFO [...WordCountTest] - word:php,one;1
2019-03-07 14:02:56,354 INFO [...WordCountTest] - Map处理结束
2019-03-07 14:02:58,095 INFO [...WordCountTest] - Map,key:50,value:hello scala,CurrentKey:50,CurrentValue:hello scala
2019-03-07 14:02:58,290 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:02:59,126 INFO [...WordCountTest] - word:hello,one;1
2019-03-07 14:02:59,424 INFO [org.apache.hadoop.mapreduce.Job] -  map 67% reduce 0%
2019-03-07 14:03:00,183 INFO [...WordCountTest] - word:scala,one;1
2019-03-07 14:03:00,803 INFO [...WordCountTest] - Map处理结束
2019-03-07 14:03:01,668 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:03:46,532 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > map
2019-03-07 14:03:54,387 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > sort
2019-03-07 14:03:54,953 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > sort
2019-03-07 14:04:13,143 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output
2019-03-07 14:04:13,143 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output
2019-03-07 14:04:13,144 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 98; bufvoid = 104857600
2019-03-07 14:04:13,144 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396(104857584); kvend = 26214360(104857440); length = 37/6553600
2019-03-07 14:06:48,036 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map > sort
2019-03-07 14:06:51,918 INFO [...WordCountTest] - Reduce,key:hello,CurrentKey:hello,CurrentValue:1
2019-03-07 14:06:53,377 INFO [...WordCountTest] - key:hello,val:1,sum:1
2019-03-07 14:06:54,290 INFO [...WordCountTest] - key:hello,val:1,sum:2
2019-03-07 14:06:55,083 INFO [...WordCountTest] - key:hello,val:1,sum:3
2019-03-07 14:06:55,835 INFO [...WordCountTest] - key:hello,val:1,sum:4
2019-03-07 14:06:56,755 INFO [...WordCountTest] - key:hello,val:1,sum:5
2019-03-07 14:06:57,809 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:07:00,463 INFO [...WordCountTest] - Reduce,key:java,CurrentKey:java,CurrentValue:1
2019-03-07 14:07:01,369 INFO [...WordCountTest] - key:java,val:1,sum:1
2019-03-07 14:07:02,551 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:07:06,245 INFO [...WordCountTest] - Reduce,key:php,CurrentKey:php,CurrentValue:1
2019-03-07 14:07:07,058 INFO [...WordCountTest] - key:php,val:1,sum:1
2019-03-07 14:07:08,125 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:07:10,643 INFO [...WordCountTest] - Reduce,key:python,CurrentKey:python,CurrentValue:1
2019-03-07 14:07:11,460 INFO [...WordCountTest] - key:python,val:1,sum:1
2019-03-07 14:07:12,963 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:07:16,042 INFO [...WordCountTest] - Reduce,key:scala,CurrentKey:scala,CurrentValue:1
2019-03-07 14:07:17,019 INFO [...WordCountTest] - key:scala,val:1,sum:1
2019-03-07 14:07:18,892 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:07:22,786 INFO [...WordCountTest] - Reduce,key:world,CurrentKey:world,CurrentValue:1
2019-03-07 14:07:23,771 INFO [...WordCountTest] - key:world,val:1,sum:1
2019-03-07 14:07:26,351 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:08:40,783 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0
2019-03-07 14:10:15,654 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local43104531_0001_m_000000_0 is done. And is in the process of committing
2019-03-07 14:10:15,683 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map
2019-03-07 14:10:15,684 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local43104531_0001_m_000000_0' done.
2019-03-07 14:10:17,387 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 0%
2019-03-07 14:10:17,718 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local43104531_0001_m_000000_0
2019-03-07 14:10:18,652 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Map task executor complete.
2019-03-07 14:10:18,785 INFO [org.apache.hadoop.yarn.util.ProcfsBasedProcessTree] - ProcfsBasedProcessTree currently is supported only on Linux.
2019-03-07 14:10:18,851 INFO [org.apache.hadoop.mapred.Task] -  Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@af988b5
2019-03-07 14:10:18,862 INFO [org.apache.hadoop.mapred.Merger] - Merging 1 sorted segments
2019-03-07 14:10:18,881 INFO [org.apache.hadoop.mapred.Merger] - Down to the last merge-pass, with 1 segments left of total size: 64 bytes
2019-03-07 14:10:18,885 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 
2019-03-07 14:10:18,927 INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords
2019-03-07 14:10:22,129 INFO [...WordCountTest] - Reduce,key:hello,CurrentKey:hello,CurrentValue:5
2019-03-07 14:10:24,742 INFO [...WordCountTest] - key:hello,val:5,sum:5
2019-03-07 14:10:25,350 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2019-03-07 14:10:25,540 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 77%
2019-03-07 14:10:25,756 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:10:28,992 INFO [...WordCountTest] - Reduce,key:java,CurrentKey:java,CurrentValue:1
2019-03-07 14:10:30,033 INFO [...WordCountTest] - key:java,val:1,sum:1
2019-03-07 14:10:31,279 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:10:32,786 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2019-03-07 14:10:34,215 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 82%
2019-03-07 14:10:34,470 INFO [...WordCountTest] - Reduce,key:php,CurrentKey:php,CurrentValue:1
2019-03-07 14:10:35,345 INFO [...WordCountTest] - key:php,val:1,sum:1
2019-03-07 14:10:36,800 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:10:39,351 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2019-03-07 14:10:40,580 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 88%
2019-03-07 14:10:40,581 INFO [...WordCountTest] - Reduce,key:python,CurrentKey:python,CurrentValue:1
2019-03-07 14:10:41,775 INFO [...WordCountTest] - key:python,val:1,sum:1
2019-03-07 14:10:43,503 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:10:48,121 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2019-03-07 14:10:49,769 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 94%
2019-03-07 14:10:50,124 INFO [...WordCountTest] - Reduce,key:scala,CurrentKey:scala,CurrentValue:1
2019-03-07 14:10:51,822 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2019-03-07 14:10:51,824 INFO [...WordCountTest] - key:scala,val:1,sum:1
2019-03-07 14:10:53,323 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 99%
2019-03-07 14:10:56,006 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:10:59,868 INFO [...WordCountTest] - Reduce,key:world,CurrentKey:world,CurrentValue:1
2019-03-07 14:11:00,766 INFO [...WordCountTest] - key:world,val:1,sum:1
2019-03-07 14:11:01,967 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
2019-03-07 14:11:01,968 INFO [...WordCountTest] - Reduce处理结束
2019-03-07 14:11:03,191 INFO [org.apache.hadoop.mapreduce.Job] -  map 100% reduce 100%

end

猜你喜欢

转载自blog.csdn.net/u010002184/article/details/88286907