Hadoop Java API
Warm-up WordCount
New Maven Project
Configure pom.xml according to your ZooKeeper and Hadoop versions. You can check the ZooKeeper version with echo stat | nc localhost 2181.
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.aidata</groupId> <artifactId>bigdata</artifactId> <version>1.0-SNAPSHOT</version> <properties> <hadoop-version>3.0.0</hadoop-version> <zookeeper-version>3.4.5</zookeeper-version> </properties> <dependencies> <dependency> <groupId>org.apache.zookeeper</groupId> <artifactId>zookeeper</artifactId> <version>${zookeeper-version}</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-client</artifactId> <version>${hadoop-version}</version> </dependency> </dependencies> <build> <plugins> <plugin> <artifactId>maven-assembly-plugin</artifactId> <version>2.3</version> <configuration> <classifier>dist</classifier> <appendAssemblyId>true</appendAssemblyId> <descriptorRefs> <descriptor>jar-with-dependencies</descriptor> </descriptorRefs> </configuration> <executions> <execution> <id>make-assembly</id> <phase>package</phase> <goals> <goal>single</goal> </goals> </execution> </executions> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>3.6.2</version> <configuration> <source>1.8</source> <target>1.8</target> <encoding>UTF-8</encoding> </configuration> </plugin> </plugins> </build> </project>
Upload three tab-delimited word files to HDFS:
hdfs dfs -put wc_tes* /input/wc
Write the MapReduce program:
package com.aidata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMRJob {

    // Map phase
    /**
     * Input key/value types:
     *   LongWritable: byte offset of the input line
     *   Text: the input line itself
     * Output key/value types:
     *   Text: the word
     *   IntWritable: the count (always 1 here)
     */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split("\t");
            for (String word : words) {
                // emit (word, 1)
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    // Reduce phase
    /**
     * Input key/value types:
     *   Text: the word
     *   IntWritable: a partial count
     * Output key/value types:
     *   Text: the word
     *   IntWritable: the total count
     */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // word -> {1,1,1,...}
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        // 1. Configure the job
        Configuration conf = new Configuration();

        // 2. Create the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountMRJob.class);

        // 3. Wire up the job's execution flow
        // 3.1 HDFS input path to process
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // 3.2 Map phase
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);          // map output key type
        job.setMapOutputValueClass(IntWritable.class); // map output value type

        // 3.3 Reduce phase
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);             // reduce output key type
        job.setOutputValueClass(IntWritable.class);    // reduce output value type

        // 3.4 Output path for the results
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 4. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
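One optimization worth knowing about, not present in the code above: because summing counts is associative and commutative, the reducer can double as a combiner, pre-aggregating on the map side and shrinking the shuffle. A one-line addition to the driver, alongside the step 3.3 settings:

// Optional: pre-aggregate map output locally before the shuffle.
// Safe here because WordCountReducer's input and output types are identical.
job.setCombinerClass(WordCountReducer.class);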
Run the package goal from IDEA's Maven panel; the jar is written to the target directory (refresh the project if target does not show up in IDEA).
Upload the jar to the cluster and run it:
hadoop jar bigdata-1.0-SNAPSHOT.jar com.aidata.mapreduce.WordCountMRJob /input/wc/ /output/wc
Check the output directory:
hdfs dfs -ls /output/wc
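To see the counts themselves (with the default single reducer, TextOutputFormat writes them to a part-r-00000 file):

hdfs dfs -cat /output/wc/part-r-*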
If you are using LZO
For example, if LZO is installed on CDH and you want to use it:
Install lzop:
yum install lzop
Copy the LZO jar to your local machine. I am using CDH 6.3.1, where the jar is located at:
/opt/cloudera/parcels/GPLEXTRAS-6.3.1-1.gplextras6.3.1.p0.1470567/lib/hadoop/lib/hadoop-lzo-0.4.15-cdh6.3.1.jar
The same three tab-delimited word files as before.
Compress the files:
lzop -v wc*.txt
Upload them to HDFS:
hdfs dfs -put wc*.txt.lzo /input
Indexing
An LZO-compressed file is splittable only if it has an index, so we have to create the index for each LZO file manually. Without the index, the whole file is read as a single split.
hadoop jar /opt/cloudera/parcels/GPLEXTRAS-6.3.1-1.gplextras6.3.1.p0.1470567/lib/hadoop/lib/hadoop-lzo-0.4.15-cdh6.3.1.jar com.hadoop.compression.lzo.DistributedLzoIndexer /input/
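DistributedLzoIndexer runs as a MapReduce job and writes a .index file next to each .lzo file it processes. Afterwards the input directory should contain pairs like the following (file names assumed from the wc*.txt pattern above):

/input/wc_test1.txt.lzo
/input/wc_test1.txt.lzo.index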
In IDEA, copy the LZO jar into the project's resources directory, then right-click it and choose Add as Library.
Because this is a third-party jar that is not in any Maven repository, Maven also has to be told where it is. During packaging, maven-compiler-plugin compiles the sources, but it does not know the location of the third-party jar, so the build fails with "package xxx does not exist". The fix is to point the compiler at the directory holding the jar:
<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-compiler-plugin</artifactId>
    <version>3.6.2</version>
    <configuration>
        <source>1.8</source>
        <target>1.8</target>
        <compilerArguments>
            <extdirs>${project.basedir}/src/main/resources</extdirs>
        </compilerArguments>
        <encoding>UTF-8</encoding>
    </configuration>
</plugin>
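An alternative sketch, if you would rather declare the jar as a regular dependency than pass extdirs: a system-scoped dependency pointing at the bundled jar. The coordinates below are the ones hadoop-lzo is conventionally published under; verify them against the pom inside the jar before relying on them.

<dependency>
    <groupId>com.hadoop.gplcompression</groupId>
    <artifactId>hadoop-lzo</artifactId>
    <version>0.4.15-cdh6.3.1</version>
    <scope>system</scope>
    <!-- assumed location: the jar copied into src/main/resources as above -->
    <systemPath>${project.basedir}/src/main/resources/hadoop-lzo-0.4.15-cdh6.3.1.jar</systemPath>
</dependency>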
The MapReduce program changes slightly:
package com.aidata.mapreduce;

import com.hadoop.compression.lzo.LzopCodec;
import com.hadoop.mapreduce.LzoTextInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMRJob {

    // Map phase
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split("\t");
            for (String word : words) {
                // emit (word, 1)
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    // Reduce phase
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // word -> {1,1,1,...}
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        // 1. Configure the job
        Configuration conf = new Configuration();

        // 2. Create the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountMRJob.class);

        // Read LZO-compressed input (split-aware when an index is present)
        job.setInputFormatClass(LzoTextInputFormat.class);

        // Compress the reduce output with LZO
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class);

        // 3. Wire up the job's execution flow
        // 3.1 HDFS input path to process
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // 3.2 Map phase
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);          // map output key type
        job.setMapOutputValueClass(IntWritable.class); // map output value type

        // 3.3 Reduce phase
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);             // reduce output key type
        job.setOutputValueClass(IntWritable.class);    // reduce output value type

        // 3.4 Output path for the results
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 4. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Run the program
hadoop jar bigdata-1.0-SNAPSHOT.jar com.aidata.mapreduce.WordCountMRJob /input/ /output
If the program does not set

FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class);

then the reduce output is written uncompressed, and the output format has to be specified some other way.
If neither the LZO input format nor the output compression is configured in the program, both can be supplied as -D parameters at the beginning of the command-line arguments:
hadoop jar myjar.jar \
    -D mapred.reduce.tasks=2 \
    -D mapreduce.job.inputformat.class=com.hadoop.mapreduce.LzoTextInputFormat \
    -D mapred.output.compress=true \
    -D mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec \
    /input /output
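Note that -D generic options are only picked up if the driver parses them with GenericOptionsParser, which ToolRunner does automatically (the command above also assumes the jar's manifest names a main class; otherwise put the class name right after the jar). A minimal sketch of the driver rewritten as a Tool; WordCountTool is a new name introduced here, reusing the mapper and reducer from WordCountMRJob:

package com.aidata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already contains any -D overrides parsed by ToolRunner
        Job job = Job.getInstance(getConf());
        job.setJarByClass(WordCountTool.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // reuse the mapper and reducer defined in WordCountMRJob
        job.setMapperClass(WordCountMRJob.WordCountMapper.class);
        job.setReducerClass(WordCountMRJob.WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner strips the -D options out of args before calling run(),
        // so args[0] and args[1] in run() are still the input and output paths
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}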
In CDH, the number of reduce tasks can also be configured in Cloudera Manager.
Web site log analysis project