Running Hadoop WordCount V1.0 and V2.0 in IDEA


Prerequisite: set up a single-node Hadoop 3.1.2 installation on Windows.

Run IDEA as administrator.

Add the Maven dependencies. hadoop-client already pulls in hadoop-mapreduce-client-jobclient transitively, but unless the latter is declared explicitly, the job logs will not be printed to the IDEA console:

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
    <version>3.1.2</version>
</dependency>

Add log4j.properties to the resources folder:

log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.Target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[%p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%m%n

Copy hdfs-site.xml and core-site.xml into the resources folder as well.
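With those two files on the classpath, a plain new Configuration() picks the settings up automatically. A quick way to verify this before running a job (a throwaway sketch; the class name is made up):

import org.apache.hadoop.conf.Configuration;

public class ConfCheck {
    public static void main(String[] args) {
        // new Configuration() loads core-site.xml from the classpath
        Configuration conf = new Configuration();
        // should print the fs.defaultFS value configured in core-site.xml, e.g. an hdfs:// URI
        System.out.println(conf.get("fs.defaultFS"));
    }
}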

Project structure:
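Roughly, the layout is as follows (a sketch only; the module and package names are inferred from the jar commands and class names later in this post):

mapreduce-test
├── pom.xml
└── src
    └── main
        ├── java
        │   └── com
        │       └── shpun
        │           ├── wordcount1   (WordCount1, WordCountMapper1, WordCountReducer1)
        │           └── wordcount2   (WordCount2, WordCountMapper2, WordCountReducer2)
        └── resources
            ├── core-site.xml
            ├── hdfs-site.xml
            └── log4j.properties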

1. WordCount V1.0

map1

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);

    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // read one line of input
        String line = value.toString();
        // split the line on whitespace
        StringTokenizer stringTokenizer = new StringTokenizer(line);
        // emit a count of 1 for every token
        while(stringTokenizer.hasMoreTokens()){
            word.set(stringTokenizer.nextToken());
            context.write(word, one);
        }
    }
}

reduce1

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // sum the counts for this key
        int sum = 0;
        for(IntWritable intWritable : values){
            sum += intWritable.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
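As a concrete illustration, take a hypothetical input file with the two lines

Hello World Bye World
Hello Hadoop Goodbye Hadoop

The map step emits (word, 1) pairs, the combiner/reducer sums them per word, and the job writes (key and count tab-separated):

Bye	1
Goodbye	1
Hadoop	2
Hello	2
World	2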

WordCount V1.0 driver. With the paths hard-coded it can be run directly in IDEA; alternatively, the paths can be passed in as program arguments (see the commented-out lines):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount1 {

    public static void main( String[] args ) {
        // loads hdfs-site.xml and core-site.xml from the classpath
        Configuration conf = new Configuration();

        try{
            Job job = Job.getInstance(conf,"WordCount V1.0");

            job.setJarByClass(WordCount1.class);

            job.setMapperClass(WordCountMapper1.class);
            job.setCombinerClass(WordCountReducer1.class);
            job.setReducerClass(WordCountReducer1.class);

            // output key/value types for the job; usable here because the mapper and reducer emit the same types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // HDFS input and output paths
            FileInputFormat.addInputPath(job, new Path("/hdfsTest/input"));
            FileOutputFormat.setOutputPath(job, new Path("/hdfsTest/output"));

            // or take the paths from the program arguments
            //FileInputFormat.addInputPath(job, new Path(args[0]));
            //FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // or use local Windows directories
            //FileInputFormat.setInputPaths(job, "D:\\hadoop-test\\input");
            //FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop-test\\output"));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }catch (Exception e){
            e.printStackTrace();
        }
    }
}
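One note on setOutputKeyClass/setOutputValueClass above: they describe both the map output and the final output here only because the mapper and reducer emit the same types. If the map output types differed, they would have to be declared separately; a minimal sketch (not needed for this job):

// map output types, when they differ from the final (reduce) output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// final output types written by the reducer
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);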

It can also be packaged as a jar and run with arguments; note that the fully qualified class name (package included) must be given:

hadoop jar mapreduce-test-1.0-SNAPSHOT.jar com.shpun.wordcount1.WordCount1 /hdfsTest/input /hdfsTest/output
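The jar itself comes out of the Maven build; assuming the default packaging, for example:

mvn clean package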

2. WordCount V2.0

map2

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {

    enum MapperCounterEnums{
        INPUT_WORDS
    }

    private static final IntWritable one = new IntWritable(1);

    private Text word = new Text();

    // whether word matching is case sensitive
    private boolean caseSensitive;
    // patterns to strip from the input
    private Set<String> patternsToSkip = new HashSet<>();

    private Configuration conf;
    private BufferedReader bufferedReader;

    /**
     * Called by the MapReduce framework exactly once per task, before any map() calls,
     * to do one-off initialization of variables and resources.
     * 1. Read wordcount.case.sensitive from the configuration into caseSensitive.
     * 2. Read wordcount.skip.patterns; if it is true, add every cached file to the skip patterns.
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        conf = context.getConfiguration();
        // whether to treat words case-sensitively
        caseSensitive = conf.getBoolean("wordcount.case.sensitive",true);
        // whether pattern filtering is enabled
        if(conf.getBoolean("wordcount.skip.patterns", false)){
            // read the cache files registered via job.addCacheFile() in the driver
            URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
            for(URI patternsURI : patternsURIs){
                Path patternsPath = new Path(patternsURI.getPath());
                String fileName = patternsPath.getName();
                parseSkipFile(fileName);
            }
        }
    }

    /**
     * Read the named file line by line and add each line to the skip set.
     * @param fileName
     */
    private void parseSkipFile(String fileName){
        try{
            // the cached file is localized into the task's working directory,
            // so the bare file name is enough to open it
            bufferedReader = new BufferedReader(new FileReader(fileName));
            String patternLine;
            // add every line of the file to the skip set
            while((patternLine = bufferedReader.readLine()) != null){
                patternsToSkip.add(patternLine);
            }
        }catch (IOException e){
            e.printStackTrace();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();

        // strip each skip pattern from the line
        for(String pattern : patternsToSkip){
            line = line.replaceAll(pattern,"");
        }

        StringTokenizer stringTokenizer = new StringTokenizer(line);
        while(stringTokenizer.hasMoreTokens()){
            word.set(stringTokenizer.nextToken());
            context.write(word, one);

            // custom counter: the enum class name is the counter group, the enum constant is the counter name
            Counter counter = context.getCounter(MapperCounterEnums.class.getName(),MapperCounterEnums.INPUT_WORDS.toString());
            counter.increment(1);
        }
    }
}

reduce2

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer2 extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for(IntWritable value : values){
            sum += value.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

WordCount V2.0 driver, which adds -D generic options and an optional -skip pattern file:

import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount2 {
   
    public static void main( String[] args ) {
        // loads hdfs-site.xml and core-site.xml from the classpath
        Configuration conf = new Configuration();
        
        try{
            // let GenericOptionsParser consume generic options such as -D and keep the remaining arguments
            GenericOptionsParser genericOptionsParser = new GenericOptionsParser(conf,args);
            String[] remainingArgs = genericOptionsParser.getRemainingArgs();

            // For: $ bin/hadoop jar wc.jar WordCount2 /user/joe/wordcount/input /user/joe/wordcount/output
            // getRemainingArgs() returns:
            // /user/joe/wordcount/input /user/joe/wordcount/output

            // For: $ bin/hadoop jar wc.jar WordCount2 -Dwordcount.case.sensitive=false /user/joe/wordcount/input /user/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt
            // -Dwordcount.case.sensitive=false turns off case sensitivity, and getRemainingArgs() returns:
            // /user/joe/wordcount/input /user/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt
            if(remainingArgs.length != 2 && remainingArgs.length != 4){
                System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
                System.exit(2);
            }

            Job job = Job.getInstance(conf,"WordCount V2.0");
            job.setJarByClass(WordCount2.class);

            job.setMapperClass(WordCountMapper2.class);
            job.setCombinerClass(WordCountReducer2.class);
            job.setReducerClass(WordCountReducer2.class);

            // output key/value types for the job; usable here because the mapper and reducer emit the same types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            List<String> otherArgList = new ArrayList<>();
            for(int i = 0;i < remainingArgs.length;i++){
                if("-skip".equals(remainingArgs[i])){
                    // turn the pattern file path into a URI (scheme://authority/path)
                    URI patternURI = new Path(remainingArgs[++i]).toUri();
                    // register it with the distributed cache
                    job.addCacheFile(patternURI);
                    // tell the mapper that pattern filtering is enabled
                    job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
                }else{
                    otherArgList.add(remainingArgs[i]);
                }
            }

            FileInputFormat.addInputPath(job, new Path(otherArgList.get(0)));
            FileOutputFormat.setOutputPath(job, new Path(otherArgList.get(1)));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }catch (Exception e){
            e.printStackTrace();
        }
    }
}
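If you also want to read the INPUT_WORDS counter programmatically once the job has finished, instead of only seeing it in the console log, the end of the driver could be adjusted along these lines (a sketch, not part of the original code; it assumes WordCount2 and WordCountMapper2 live in the same package):

boolean completed = job.waitForCompletion(true);
// the counter group is the enum class name, the counter name is the enum constant
long inputWords = job.getCounters()
        .findCounter(WordCountMapper2.MapperCounterEnums.class.getName(),
                WordCountMapper2.MapperCounterEnums.INPUT_WORDS.toString())
        .getValue();
System.out.println("INPUT_WORDS = " + inputWords);
System.exit(completed ? 0 : 1);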

To supply the arguments when running in IDEA, enter them in Program arguments, separated by spaces.
It can also be packaged as a jar and run:

hadoop jar mapreduce-test-1.0-SNAPSHOT.jar com.shpun.wordcount2.WordCount2 /hdfsTest/input /hdfsTest/output
hadoop jar mapreduce-test-1.0-SNAPSHOT.jar com.shpun.wordcount2.WordCount2 -Dwordcount.case.sensitive=false /hdfsTest/input /hdfsTest/output
hadoop jar mapreduce-test-1.0-SNAPSHOT.jar com.shpun.wordcount2.WordCount2 -Dwordcount.case.sensitive=false /hdfsTest/input /hdfsTest/output -skip /hdfsTest/skip/skipInput.txt
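The skip file contains one pattern per line, and since the mapper removes matches with replaceAll, each line is treated as a regular expression. A hypothetical skipInput.txt that strips periods, commas and the word "to" could look like:

\.
\,
to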

3. Pitfalls

  • In WordCount V1.0, when switching between the commented-out HDFS and local Windows paths while running in IDEA, run Maven Clean and then Compile, and then Rebuild Project. Otherwise the path type can be detected incorrectly: the HDFS path gets treated as a local path, or the Windows path as an HDFS path. See the command below.
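For example, from the command line (assuming the standard Maven lifecycle), followed by Build > Rebuild Project in IDEA:

mvn clean compile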

