Hadoop (4): Small Practice Projects

Hadoop Java API

 

 

Warm-up: WordCount

New Maven Project

Configure pom.xml according to your ZooKeeper and Hadoop versions. The ZooKeeper version can be checked with  echo stat | nc localhost 2181
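The Hadoop version can be checked on a cluster node in the same way (assuming the hadoop client is on the PATH):

hadoop version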

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.aidata</groupId>
    <artifactId>bigdata</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <hadoop-version>3.0.0</hadoop-version>
        <zookeeper-version>3.4.5</zookeeper-version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>${zookeeper-version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop-version}</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.3</version>
                <configuration>
                    <classifier>dist</classifier>
                    <appendAssemblyId>true</appendAssemblyId>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>

Upload three tab-delimited word files to HDFS:

hdfs dfs -put wc_tes* /input/wc
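For reference, each input file is just words separated by tabs, one group per line; a hypothetical wc_test1.txt (contents made up for illustration, tab-separated) could look like:

hadoop	hive	spark
hadoop	hbase	spark
hive	flink	hadoop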

The MapReduce program:

package com.aidata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMRJob {

    // Map phase

    /**
     * Input key/value types:
     * LongWritable: byte offset of the line in the input file
     * Text: the line of input data
     *
     * Output key/value types:
     * Text: output key type (a word)
     * IntWritable: output value type (the count 1)
     */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] words = line.split("\t");

            for (String word : words) {
                // word 1
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    // Reduce phase

    /**
     * Input key/value types:
     * Text: input key type (a word)
     * IntWritable: input value type (a single count)
     *
     * Output key/value types:
     * Text: output key type (the word)
     * IntWritable: output value type (the summed count)
     */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // word {1,1,1,...}

            int sum = 0;

            for (IntWritable value : values) {
                sum += value.get();
            }

            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        // 1. Configure the job
        Configuration conf = new Configuration();
        Job job = null;

        // 2. Create the job
        try {
            job = Job.getInstance(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        job.setJarByClass(WordCountMRJob.class);

        // 3. Add the execution flow to the job

        // 3.1 HDFS path of the files to process
        Path path = new Path(args[0]);

        try {
            // add the input path to the job
            FileInputFormat.addInputPath(job, path);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // 3.2 Configure the map phase
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);          // map output key type
        job.setMapOutputValueClass(IntWritable.class); // map output value type

        // 3.3 Configure the reduce phase
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);             // reduce output key type
        job.setOutputValueClass(IntWritable.class);    // reduce output value type

        // 3.4 Set the job's output path
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, output);

        // 4. Submit the job and wait for it to complete
        try {
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}

Click package in the Maven panel to build the project; the jar will be placed in the target directory. (If IDEA does not show the target directory, refresh the project view.)
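The same build can be done from the command line; a minimal sketch, assuming Maven is installed locally:

mvn clean package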

 

Upload the jar to the cluster and run it:

 hadoop jar bigdata-1.0-SNAPSHOT.jar com.aidata.mapreduce.WordCountMRJob /input/wc/ /output/wc

Check the output directory:

hdfs dfs -ls /output/wc
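To look at the result itself (assuming the default output file name of a single reducer), cat the part file:

hdfs dfs -cat /output/wc/part-r-00000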

If you are using LZO

For example, if LZO is installed on your CDH cluster and you want to use it, proceed as follows.

Install lzop:

yum install lzop

Copy the LZO jar to your local machine. I use CDH 6.3.1, where the hadoop-lzo jar is located at:

/opt/cloudera/parcels/GPLEXTRAS-6.3.1-1.gplextras6.3.1.p0.1470567/lib/hadoop/lib/hadoop-lzo-0.4.15-cdh6.3.1.jar

Prepare three tab-delimited word files.

Compress the files:

lzop -v wc*.txt

Upload them to HDFS:

hdfs dfs -put wc*.txt.lzo /input

Indexing

An LZO-compressed file can only be split if it has an index, so we need to build the index for each .lzo file manually. Without the index, each LZO file is processed as a single split.

hadoop jar /opt/cloudera/parcels/GPLEXTRAS-6.3.1-1.gplextras6.3.1.p0.1470567/lib/hadoop/lib/hadoop-lzo-0.4.15-cdh6.3.1.jar com.hadoop.compression.lzo.DistributedLzoIndexer /input/
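After the indexer finishes, each .lzo file should have an index file next to it (named <file>.lzo.index, going by hadoop-lzo's usual naming, which I have not re-verified here); you can check with:

hdfs dfs -ls /input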

 

In IDEA, put the LZO jar into the resources directory and click Add as Library.

Since this is a third-party jar, you also need to make Maven aware of it, otherwise Maven will not recognize it.

During packaging Maven compiles with the maven-compiler-plugin, but because the third-party jar lives only inside the project, the plugin does not know where it is and fails with "Package xxx does not exist". The fix:

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <compilerArguments>
                        <extdirs>${project.basedir}/src/main/resources</extdirs>
                    </compilerArguments>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
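An alternative to the extdirs trick (not what this post does) is to install the jar into your local Maven repository and declare it as a normal dependency; a sketch, where the groupId/artifactId/version are whatever coordinates you choose:

mvn install:install-file \
  -Dfile=hadoop-lzo-0.4.15-cdh6.3.1.jar \
  -DgroupId=com.hadoop.gplcompression \
  -DartifactId=hadoop-lzo \
  -Dversion=0.4.15-cdh6.3.1 \
  -Dpackaging=jar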

The MapReduce program changes slightly:

package com.aidata.mapreduce;

import com.hadoop.compression.lzo.LzopCodec;
import com.hadoop.mapreduce.LzoTextInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMRJob {

    // Map phase

    public static class WordCountMapper extends Mapper<LongWritable,Text, Text, IntWritable>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] words = line.split("\t");

            for(String word : words){
                //word 1
                context.write(new Text(word),new IntWritable(1));
            }
        }
    }
    // Reduce phase


    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // word {1,1,1,...}

            int sum = 0;

            for(IntWritable value : values){
                sum += value.get();
            }

            context.write(key,new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        // 1. Configure the job
        Configuration conf = new Configuration();
        Job job = null;

        // 2. Create the job
        try {
            job = Job.getInstance(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        job.setJarByClass(WordCountMRJob.class);
        job.setInputFormatClass(LzoTextInputFormat.class);
        // compress the reduce output with the LZO codec
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class);

        // 3. Add the execution flow to the job

        // 3.1 HDFS path of the files to process
        Path path = new Path(args[0]);

        try {
            // add the input path to the job
            FileInputFormat.addInputPath(job, path);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // 3.2 Configure the map phase
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);          // map output key type
        job.setMapOutputValueClass(IntWritable.class); // map output value type

        // 3.3 Configure the reduce phase
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);             // reduce output key type
        job.setOutputValueClass(IntWritable.class);    // reduce output value type

        // 3.4 Set the job's output path
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, output);

        // 4. Submit the job and wait for it to complete
        try {
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}

Run the program

hadoop jar bigdata-1.0-SNAPSHOT.jar com.aidata.mapreduce.WordCountMRJob /input/ /output

If the program does not contain

FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class);

then the output compression format has to be specified when the job is submitted.

If the LZO input and output formats are not configured in the program at all, both can be set on the command line with -D parameters:

hadoop jar myjar.jar \
 -D mapred.reduce.tasks=2 \
 -D mapreduce.job.inputformat.class=com.hadoop.mapreduce.LzoTextInputFormat \
 -D mapred.output.compress=true \
 -D mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec \
 /input /output
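Note that these -D generic options are only picked up if the driver goes through GenericOptionsParser/ToolRunner. A minimal sketch of wiring the driver up as a Tool, reusing the WordCountMapper and WordCountReducer defined above (this is my addition, not the original code, and the class name is made up):

package com.aidata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountToolJob extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already carries any -D options parsed from the command line
        Job job = Job.getInstance(getConf());
        job.setJarByClass(WordCountToolJob.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(WordCountMRJob.WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(WordCountMRJob.WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner strips the generic options out of args before calling run()
        System.exit(ToolRunner.run(new Configuration(), new WordCountToolJob(), args));
    }
}

With a driver like this, the -D options in the command above are applied to the job's Configuration before run() is called.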

 

The number of reduce tasks can also be set in the CDH configuration.
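In the job code, the same thing can be set per job with a single call; a small sketch (my addition, not in the original program):

// ask for two reducers; overrides the cluster default for this job only
job.setNumReduceTasks(2);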

 

 

 

Web site log analysis project

 
