MapReduce Experiment - Data Cleaning - Stage 1

Description of the fields in the result file:

ip: 106.39.41.166 (ip address, later mapped to a city)

date: 10/Nov/2016:00:01:02 +0800 (date)

day: 10 (day of the month)

traffic: 54 (traffic)

type: video (type: "video" for a video, "article" for an article)

id: 8701 (id of the video or article above)
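For reference, the parse() helpers in the stage-one code below assume Nginx/Apache-style access-log lines. The following shape is an assumption reconstructed from the field description and the string handling in the parser; the request path is elided, so the exact URL layout is not confirmed by the original post:

106.39.41.166 - - [10/Nov/2016:00:01:02 +0800] "GET /... HTTP/1.1" 200 54

The parser only relies on the "- -" separator, the bracketed timestamp ending in +0800, the quoted request, and the two trailing fields (status and traffic).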

Experiment requirements:

1. Data cleaning: clean the data as required below, and import the cleaned data into a Hive data warehouse.

The cleaning is done in two stages:

(1) First stage: extract the required information from the raw log, for example:

ip:    199.30.25.88

time:  10/Nov/2016:00:01:03 +0800

traffic:  62

article: article/11325

video: video/3235

(2) Second stage: refine the information extracted in the first stage:

ip ---> city (looked up from the ip)

date --> time: 2016-11-10 00:01:03

day: 10

traffic: 62

type: article/video

id: 11325

(3) Hive table structure:

create table data(ip string, time string, day string, traffic bigint, type string, id string)
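The records produced by the MapReduce job below are tab-separated, so in practice the table needs a matching row format. A minimal sketch, assuming tab-delimited text storage (the delimiter and storage clauses are assumptions; the columns follow the structure above):

create table data(
    ip string,
    time string,
    day string,
    traffic bigint,
    type string,
    id string
)
row format delimited fields terminated by '\t'
stored as textfile;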

2. Data processing (a sketch of the corresponding Hive queries follows this list):

· Count the Top 10 most visited videos/articles (video/article)

· Count the Top 10 most popular courses by city (ip)

· Count the Top 10 most popular courses by traffic (traffic)

3. Data visualization: export the statistics into a MySQL database and present them through a graphical display.
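The stage-one code below only performs the cleaning. As a sketch of the statistics step, assuming the cleaned data sits in the table structure shown above (the table and column names come from that structure; the queries themselves are not part of the original stage-one post):

-- Top 10 most visited videos/articles
select type, id, count(*) as visits
from data
group by type, id
order by visits desc
limit 10;

-- Top 10 by total traffic
select type, id, sum(traffic) as total_traffic
from data
group by type, id
order by total_traffic desc
limit 10;

The Top 10 by city needs the ip-to-city mapping from the second cleaning stage, which this stage does not yet produce.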

 

Stage one:

 

/*
 * The log file is first cleaned with MapReduce, and the result is then imported into Hive.
 */
package classtest3;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Result {

    // original time format used in the log, e.g. 10/Nov/2016:00:01:02
    public static final SimpleDateFormat FORMAT =
            new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);

    // target time format, e.g. 2016-11-10 00:01:02
    public static final SimpleDateFormat dateformat1 =
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // parse the log timestamp into a Date
    private Date parseDateFormat(String string) {
        Date parse = null;
        try {
            parse = FORMAT.parse(string);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parse;
    }

    // clean one log line into a string array: ip, time, url, status, traffic
    public String[] parse(String line) {
        String ip = parseIP(line);            // ip
        String time = parseTime(line);        // time
        String url = parseURL(line);          // url
        String status = parseStatus(line);    // status code
        String traffic = parseTraffic(line);  // traffic

        return new String[] { ip, time, url, status, traffic };
    }

    // traffic: the second field after the last double quote
    private String parseTraffic(String line) {
        final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
        String traffic = trim.split(" ")[1];
        return traffic;
    }

    // status: the first field after the last double quote
    private String parseStatus(String line) {
        final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
        String status = trim.split(" ")[0];
        return status;
    }

    // url: the request between the first and last double quote
    private String parseURL(String line) {
        final int first = line.indexOf("\"");
        final int last = line.lastIndexOf("\"");
        String url = line.substring(first + 1, last);
        return url;
    }

    // time: the part between "[" and "+0800]", converted to the target format
    private String parseTime(String line) {
        final int first = line.indexOf("[");
        final int last = line.indexOf("+0800]");
        String time = line.substring(first + 1, last).trim();
        Date date = parseDateFormat(time);
        return dateformat1.format(date);
    }

    // ip: everything before the "- -" separator
    private String parseIP(String line) {
        String ip = line.split("- -")[0].trim();
        return ip;
    }

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // convert the input text to a String
            Text outputValue = new Text();
            String line = value.toString();
            Result aa = new Result();
            StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");

            // process each line separately
            while (tokenizerArticle.hasMoreElements()) {

                String stra = tokenizerArticle.nextToken();
                String[] newstr = aa.parse(stra);

                // strip the leading "GET /" or "POST /" from the request
                if (newstr[2].startsWith("GET /")) {
                    newstr[2] = newstr[2].substring("GET /".length());
                } else if (newstr[2].startsWith("POST /")) {
                    newstr[2] = newstr[2].substring("POST /".length());
                }

                // strip the trailing " HTTP/1.1"
                if (newstr[2].endsWith(" HTTP/1.1")) {
                    newstr[2] = newstr[2].substring(0, newstr[2].length() - " HTTP/1.1".length());
                }

                // split the remaining path and keep only records with four segments
                String[] words = newstr[2].split("/");

                if (words.length == 4) {
                    outputValue.set(newstr[0] + "\t" + newstr[1] + "\t" + words[0] + "\t"
                            + words[1] + "\t" + words[2] + "\t" + words[3] + "\t" + "0");
                    context.write(outputValue, new IntWritable(1));
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        // reduce: sum the counts for each cleaned record
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {

            int sum = 0;
            Iterator<IntWritable> iterator = values.iterator();
            while (iterator.hasNext()) {
                sum += iterator.next().get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

//        conf.set("mapred.jar", "Namecount.jar");
//
//        String[] ioArgs = new String[] { "name", "name_out" };
//
//        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs(); // I/O paths
//
//        // validate the input/output paths
//        if (otherArgs.length != 2) {
//            System.err.println("Usage: Score Average <in> <out>");
//            System.exit(2);
//        }

        Job job = Job.getInstance(conf);
        job.setJarByClass(Result.class);

        // set the Map, Combine and Reduce classes
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        // set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // the input is split into splits and read by a RecordReader implementation
        job.setInputFormatClass(TextInputFormat.class);

        // a RecordWriter implementation writes the output
        job.setOutputFormatClass(TextOutputFormat.class);

        // set the input and output directories
        FileInputFormat.addInputPath(job, new Path("hdfs://192.168.57.128:9000/MyMapReduce/classtest3/name.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.57.128:9000/MyMapReduce/classtest3/test1result"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Hive import statement:

load data inpath '/MyMapReduce/classtest3/test1result/part-r-00000' into table acc_log;
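A quick sanity check after the load (assuming the acc_log table has already been created with tab-delimited columns matching the job output):

select * from acc_log limit 10;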

 

 
