My environment: Windows 7 (64-bit), Hadoop 2.8, a Hadoop cluster of three virtual machines (one NameNode and two DataNodes), IDEA 2016.1, Maven 3.9, Java 1.7.
Setting up IDEA with Maven is very simple: just follow the wizard to create a new Maven project.
Below is the pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>hadoop.test</groupId>
    <artifactId>hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>2.8.0</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</project>
First, make sure the cluster is up and running properly.
Then copy the Hadoop distribution from the cluster to a directory on Windows, create a HADOOP_HOME environment variable pointing to it, and add its bin directory to PATH.
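Before going further, it helps to confirm the environment from Java itself. A minimal sketch: on Windows, Hadoop's shell utilities resolve the home directory from the HADOOP_HOME environment variable, falling back to the hadoop.home.dir JVM property, and expect winutils.exe under its bin directory (the class and method names here are my own, not part of Hadoop):

```java
import java.io.File;

public class CheckHadoopEnv {
    // Resolve the Hadoop home the way Hadoop does on Windows:
    // the HADOOP_HOME environment variable first, then the
    // hadoop.home.dir JVM property as a fallback.
    static String resolveHadoopHome(String envValue, String propValue) {
        return envValue != null ? envValue : propValue;
    }

    public static void main(String[] args) {
        String home = resolveHadoopHome(System.getenv("HADOOP_HOME"),
                                        System.getProperty("hadoop.home.dir"));
        System.out.println("HADOOP_HOME = " + home);
        if (home != null) {
            // winutils.exe must sit in %HADOOP_HOME%\bin for job submission
            // from Windows to work
            System.out.println("winutils.exe present: "
                    + new File(home, "bin" + File.separator + "winutils.exe").exists());
        }
    }
}
```

If the second line prints false, the Windows extension below has not been installed yet.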
Download the Windows extension:
http://files.cnblogs.com/files/longshiyVip/hadoop2.6%28x64%29V0.2.zip
This build targets Hadoop 2.6 (64-bit); it gave me no problems with Hadoop 2.8.
After decompressing, overwrite the files into the bin directory and copy hadoop.dll into System32. Then copy the relevant configuration files from the cluster into the project's resources directory, keeping them consistent with the cluster.
Among them, log4j.properties is required; otherwise no log output will be printed.
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.Target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
log4j.rootLogger=INFO, console
Now start writing the code.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public int run(String[] strings) throws Exception {
        try {
            Configuration conf = new Configuration();
            // Cluster configuration files copied into the resources directory
            conf.addResource("/core-site.xml");
            conf.addResource("/hdfs-site.xml");
            conf.addResource("/mapred-site.xml");
            conf.addResource("/yarn-site.xml");
            // The jar built by mvn clean install; required for remote submission
            conf.set("mapreduce.job.jar", "c:\\study\\java\\hadooptest\\target\\hadoop-1.0-SNAPSHOT.jar");
            conf.set("mapreduce.framework.name", "yarn");
            conf.set("yarn.resourcemanager.hostname", "master128");
            conf.set("fs.defaultFS", "hdfs://master128:9000");
            // Needed when submitting from Windows to a Linux cluster
            conf.set("mapreduce.app-submission.cross-platform", "true");

            Job job = Job.getInstance(conf);
            job.setJarByClass(WordCount.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            job.setMapperClass(WcMapper.class);
            job.setReducerClass(WcReducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            FileInputFormat.setInputPaths(job, "hdfs://master128:9000/zxq/input");
            FileOutputFormat.setOutputPath(job, new Path("hdfs://master128:9000/zxq/output"));
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0;
    }

    public static class WcMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emits the whole line as the key, so identical lines are counted;
            // tokenize the line here instead to count individual words.
            String mVal = value.toString();
            context.write(new Text(mVal), new LongWritable(1));
        }
    }

    public static class WcReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable lVal : values) {
                sum += lVal.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new WordCount(), args);
    }
}
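Note that WcMapper above writes the entire input line as the key, so the job counts identical lines rather than individual words. A sketch of the per-line tokenization a classic word count would do instead, shown as plain Java 7 outside the MapReduce API (the class and method names are my own):

```java
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class LineTokenizer {
    // What a classic WordCount map() body does with each input line:
    // split it on whitespace and count every token, instead of
    // emitting the whole line as a single key.
    static Map<String, Long> countWords(String line) {
        Map<String, Long> counts = new HashMap<String, Long>();
        StringTokenizer st = new StringTokenizer(line);
        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            Long prev = counts.get(word);
            counts.put(word, prev == null ? 1L : prev + 1L);
        }
        return counts;
    }

    public static void main(String[] args) {
        System.out.println(countWords("to be or not to be"));
    }
}
```

In the real mapper the loop body would be context.write(new Text(word), new LongWritable(1)) and the reducer would do the summing.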
In the JVM startup parameters you need to add the Hadoop user name: -DHADOOP_USER_NAME=hadoop (fill in your own user); otherwise a security/access error will be reported.
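If you would rather not touch the run configuration, the same effect can be achieved in code, because Hadoop's UserGroupInformation also accepts HADOOP_USER_NAME as a JVM system property; the catch is that it must be set before the first HDFS/YARN call. A small sketch (the user name hadoop is an assumption from my cluster, and ensureUser is my own helper):

```java
public class SetHadoopUser {
    // Keep an explicitly configured user name; otherwise fall back to
    // the cluster user (here "hadoop" - adjust to your own setup).
    static String ensureUser(String current) {
        return current == null ? "hadoop" : current;
    }

    public static void main(String[] args) {
        // Equivalent to the -DHADOOP_USER_NAME=hadoop JVM argument; must run
        // before any HDFS/YARN access, i.e. before UserGroupInformation
        // initializes the login user.
        System.setProperty("HADOOP_USER_NAME",
                ensureUser(System.getProperty("HADOOP_USER_NAME")));
        System.out.println("HADOOP_USER_NAME = "
                + System.getProperty("HADOOP_USER_NAME"));
    }
}
```

Calling this at the very top of main() in the job driver is enough; setting the property after a FileSystem has already been obtained has no effect.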
The jar setting is mandatory:
conf.set("mapreduce.job.jar", "c:\\study\\java\\hadooptest\\target\\hadoop-1.0-SNAPSHOT.jar");
The following configuration should be filled in according to your own setup, mainly the host name (or IP), the port, and the input and output paths.
Configuration conf = new Configuration();
conf.addResource("/core-site.xml");
conf.addResource("/hdfs-site.xml");
conf.addResource("/mapred-site.xml");
conf.addResource("/yarn-site.xml");
conf.set("mapreduce.job.jar", "c:\\study\\java\\hadooptest\\target\\hadoop-1.0-SNAPSHOT.jar");
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "master128");
conf.set("fs.defaultFS", "hdfs://master128:9000");
conf.set("mapreduce.app-submission.cross-platform", "true");

Job job = Job.getInstance(conf);
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
job.setMapperClass(WcMapper.class);
job.setReducerClass(WcReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, "hdfs://master128:9000/zxq/input");
FileOutputFormat.setOutputPath(job, new Path("hdfs://master128:9000/zxq/output"));
For the jar I wrote the absolute path of the artifact produced by mvn clean install.
I ran into many problems while setting up this environment; between reading articles online and experimenting myself, I stepped into quite a few pits.
1. The Windows plug-in should match your own Hadoop version where possible (as noted above, the 2.6 build happened to work for me with 2.8).
2. Connection problems: input and output paths should carry the host and port before the path (host:port/path); Hadoop parses out the host and port to reach the NameNode.
3. Security/access problems: you need to log in as a Hadoop user, and the easiest way is to add the JVM startup parameter -DHADOOP_USER_NAME=hadoop.
There are other methods online, such as renaming your Windows user to match the Hadoop user, or changing the permissions of the HDFS files.
To change permissions, use the HDFS command line on the corresponding directory: hadoop fs -chmod 777 /user, where /user is the path the file will be uploaded to; this differs by situation. For example, if the upload path is hdfs://namenode/user/xxx.doc, that command works as-is; if it is hdfs://namenode/java/xxx.doc, then run hadoop fs -chmod 777 /java (creating the /java directory in HDFS first) or hadoop fs -chmod 777 / to adjust the permissions of the root directory.
Following the code above, put some files under input to serve as the WordCount input:
hadoop dfs -put wordCount.txt /zxq/input
Then start the run.
10:16:09,529 INFO  RMProxy:123 - Connecting to ResourceManager at master128/172.23.132.84:8032
10:16:09,786 WARN  JobResourceUploader:64 - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
10:16:09,924 INFO  FileInputFormat:289 - Total input files to process : 1
10:16:09,980 INFO  JobSubmitter:200 - number of splits:1
10:16:10,496 INFO  JobSubmitter:289 - Submitting tokens for job: job_1509588776406_0004
10:16:10,674 INFO  YarnClientImpl:296 - Submitted application application_1509588776406_0004
10:16:10,699 INFO  Job:1345 - The url to track the job: http://master128:8088/proxy/application_1509588776406_0004/
10:16:10,700 INFO  Job:1390 - Running job: job_1509588776406_0004
10:16:15,835 INFO  Job:1411 - Job job_1509588776406_0004 running in uber mode : false
10:16:15,839 INFO  Job:1418 - map 0% reduce 0%
10:16:21,069 INFO  Job:1418 - map 100% reduce 0%
10:16:26,122 INFO  Job:1418 - map 100% reduce 100%
10:16:26,162 INFO  Job:1429 - Job job_1509588776406_0004 completed successfully
10:16:26,286 INFO  Job:1436 - Counters: 49
    File System Counters
        FILE: Number of bytes read=363
        FILE: Number of bytes written=273713
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=257
        HDFS: Number of bytes written=162
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=2508
        Total time spent by all reduces in occupied slots (ms)=2528
        Total time spent by all map tasks (ms)=2508
        Total time spent by all reduce tasks (ms)=2528
        Total vcore-milliseconds taken by all map tasks=2508
        Total vcore-milliseconds taken by all reduce tasks=2528
        Total megabyte-milliseconds taken by all map tasks=2568192
        Total megabyte-milliseconds taken by all reduce tasks=5177344
    Map-Reduce Framework
        Map input records=21
        Map output records=21
        Map output bytes=315
        Map output materialized bytes=363
        Input split bytes=110
        Combine input records=0
        Combine output records=0
        Reduce input groups=18
        Reduce shuffle bytes=363
        Reduce input records=21
        Reduce output records=18
        Spilled Records=42
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=451
        CPU time spent (ms)=2930
        Physical memory (bytes) snapshot=487813120
        Virtual memory (bytes) snapshot=4467601408
        Total committed heap usage (bytes)=455606272
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=147
    File Output Format Counters
        Bytes Written=162
And that's it.