MapReduce data cleansing

The complete code for the data cleansing job is shown below.
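The job expects comma-separated input records with six fields in the order ip, time, day, traffic, type, id, where the time uses the Apache-log style d/MMM/yyyy:HH:mm:ss. A hypothetical input line (field values invented for illustration):

    10.0.0.1,18/Sep/2013:06:49:57,20130918,5678,video,12345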

AccessLogWritable Code:
package cn.dataClean.mr.mapper;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Custom type implementing the Writable serialization interface for the cleaned data
 *
 * @author Lenovo
 */
public class AccessLogWritable implements Writable {

    private String ip;
    private String time;
    private String day;
    private String traffic;
    private String type;
    private String id;

    // Hadoop instantiates Writable values reflectively during deserialization,
    // so a no-argument constructor is required
    public AccessLogWritable() {
    }

    public AccessLogWritable(String ip, String time, String day, String traffic, String type, String id) {
        this.ip = ip;
        this.time = time;
        this.day = day;
        this.traffic = traffic;
        this.type = type;
        this.id = id;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public String getTraffic() {
        return traffic;
    }

    public void setTraffic(String traffic) {
        this.traffic = traffic;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    /**
     * Hadoop calls this method when deserializing
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.ip = in.readUTF();
        this.time = in.readUTF();
        this.day = in.readUTF();
        this.traffic = in.readUTF();
        this.type = in.readUTF();
        this.id = in.readUTF();
    }

    /**
     * Hadoop calls this method when serializing
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(ip);
        out.writeUTF(time);
        out.writeUTF(day);
        out.writeUTF(traffic);
        out.writeUTF(type);
        out.writeUTF(id);
    }

}
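Note that the final output is written by the default TextOutputFormat, which calls toString() on the value; since AccessLogWritable above does not override it, the output file would show the default object representation. A minimal sketch of an override one might add (tab-separated, field order matching the constructor; not part of the original code):

    // Possible toString() override so TextOutputFormat writes readable fields (a sketch)
    @Override
    public String toString() {
        return ip + "\t" + time + "\t" + day + "\t" + traffic + "\t" + type + "\t" + id;
    }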

 

AccessLogCleanMapper Code:
package cn.dataClean.mr.mapper;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper class for the data cleansing job
 *
 * @author Lenovo
 */
public class AccessLogCleanMapper extends Mapper<LongWritable, Text, Text, AccessLogWritable> {

    // Source and target time formats, used to convert the timestamp
    public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); // original time format
    public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); // desired time format

    private String ip;
    private String time;
    private String day;
    private String traffic;
    private String type;
    private String id;
    private static Date parse;

    // Override the map method
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();

        String[] fields = line.split(",");

//        if (fields == null || fields.length < 6) { // malformed record: warn and skip
//            System.out.println("data error");
//            return;
//        }

        // Assign the parsed fields
        Date date = parseDateFormat(fields[1]);
        ip = fields[0];
        time = dateformat1.format(date);
        day = fields[2];
        traffic = fields[3];
        type = fields[4];
        id = fields[5];

//        System.out.println(time);
        // Package the fields and emit them
        context.write(new Text(ip), new AccessLogWritable(ip, time, day, traffic, type, id));

    }

    // Helper method that converts the original time string into a Date
    private static Date parseDateFormat(String string) {
        try {
            parse = FORMAT.parse(string);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parse;
    }

//    // Release resources
//    @Override
//    protected void cleanup(Context context) throws IOException, InterruptedException {
//        parse = null;
//    }

}
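As a quick check of the time conversion performed by FORMAT and dateformat1, here is a minimal standalone sketch (the sample timestamp is invented):

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class TimeFormatDemo {
    public static void main(String[] args) throws ParseException {
        SimpleDateFormat source = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        SimpleDateFormat target = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date date = source.parse("18/Sep/2013:06:49:57"); // hypothetical timestamp from a log line
        System.out.println(target.format(date));          // prints: 2013-09-18 06:49:57
    }
}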

 

AccessLogCleanReduce Code:
package cn.dataClean.mr.mapper;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class AccessLogCleanReduce extends Reducer<Text, AccessLogWritable, Text, AccessLogWritable> {

    String ip;
    String time;
    String day;
    String traffic;
    String type;
    String id;

    // Override the reduce method
    @Override
    protected void reduce(Text key, Iterable<AccessLogWritable> values, Context context)
            throws IOException, InterruptedException {
        for (AccessLogWritable value : values) {
            ip = value.getIp();
            time = value.getTime();
            day = value.getDay();
            traffic = value.getTraffic();
            type = value.getType();
            id = value.getId();
            System.out.println(time);
        }

        
        context.write(key, new AccessLogWritable(ip, time, day, traffic, type, id));
    }

}
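Note that this reduce method overwrites its fields on every iteration, so only the last record seen for each IP key reaches the output. If every cleaned record should be kept instead, one could emit inside the loop; a sketch of that variant:

    @Override
    protected void reduce(Text key, Iterable<AccessLogWritable> values, Context context)
            throws IOException, InterruptedException {
        for (AccessLogWritable value : values) {
            // Copy fields into a fresh object because Hadoop reuses the value instance
            context.write(key, new AccessLogWritable(value.getIp(), value.getTime(),
                    value.getDay(), value.getTraffic(), value.getType(), value.getId()));
        }
    }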

 

AccessLogCleanJob Code:
package cn.dataClean.mr.mapper;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AccessLogCleanJob {

    public static void main(String[] args) throws Exception {

//         // Check that the command-line arguments are valid
//         if (args == null || args.length < 2) {
//         System.err.println("Parameter Errors! Usage: <inputPath...> <outputPath>");
//         System.exit(-1);
//         }

        Job job = Job.getInstance();
        job.setJobName("AccessLogCleanJob");
        job.setJarByClass(AccessLogCleanJob.class);

        // Mapper configuration
        job.setMapperClass(AccessLogCleanMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AccessLogWritable.class);

        // Reducer configuration
        job.setReducerClass(AccessLogCleanReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AccessLogWritable.class);

        // Input and output file paths
        FileInputFormat.setInputPaths(job,
                new Path("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/result.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/Result"));

        // Run the job and report the result
        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
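The commented-out argument check in main suggests the paths were once meant to come from the command line. A sketch of that variant, assuming the check is re-enabled, replaces the hardcoded HDFS paths with:

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));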

 
