Cleaning Data with MapReduce
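
The input is assumed to be comma-separated access-log lines with six fields — ip, time, day, traffic, type, and id — where the time field uses the format d/MMM/yyyy:HH:mm:ss (a hypothetical example line: 106.39.41.166,18/Sep/2013:06:49:45,20130918,1024,video,8701). The job parses each line, normalizes the timestamp to yyyy-MM-dd HH:mm:ss, and writes the cleaned records back to HDFS.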

The code is as follows:

AccessLogWritable code:
package cn.dataClean.mr.mapper;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Custom type used during data cleaning; implements Hadoop's Writable
 * serialization interface.
 * 
 * @author Lenovo
 *
 */
public class AccessLogWritable implements Writable {

    private String ip;
    private String time;
    private String day;
    private String traffic;
    private String type;
    private String id;

    // Hadoop creates Writable instances reflectively during deserialization,
    // so a public no-arg constructor is required once the parameterized
    // constructor below is defined.
    public AccessLogWritable() {
    }

    public AccessLogWritable(String ip, String time, String day, String traffic, String type, String id) {
        this.ip = ip;
        this.time = time;
        this.day = day;
        this.traffic = traffic;
        this.type = type;
        this.id = id;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public String getTraffic() {
        return traffic;
    }

    public void setTraffic(String traffic) {
        this.traffic = traffic;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    /**
     * Called by the Hadoop framework during deserialization; fields must be
     * read in the same order they were written.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.ip = in.readUTF();
        this.time = in.readUTF();
        this.day = in.readUTF();
        this.traffic = in.readUTF();
        this.type = in.readUTF();
        this.id = in.readUTF();
    }

    /**
     * Called by the Hadoop framework during serialization.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(ip);
        out.writeUTF(time);
        out.writeUTF(day);
        out.writeUTF(traffic);
        out.writeUTF(type);
        out.writeUTF(id);
    }

    /**
     * TextOutputFormat writes values via toString(), so without this override
     * the output file would contain object identifiers instead of the record.
     */
    @Override
    public String toString() {
        return ip + "," + time + "," + day + "," + traffic + "," + type + "," + id;
    }

}
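A quick way to sanity-check the Writable implementation is a round-trip through an in-memory byte stream, mirroring what Hadoop does when shuffling map output. This is a minimal sketch (not from the original post); the sample field values are hypothetical, and it relies on the no-arg constructor defined above:

package cn.dataClean.mr.mapper;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class AccessLogWritableRoundTrip {

    public static void main(String[] args) throws IOException {
        // Hypothetical sample record
        AccessLogWritable original = new AccessLogWritable("106.39.41.166", "2013-09-18 06:49:45",
                "20130918", "1024", "video", "8701");

        // Serialize to an in-memory byte stream
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize into a fresh instance created via the no-arg constructor
        AccessLogWritable copy = new AccessLogWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy); // expected: 106.39.41.166,2013-09-18 06:49:45,20130918,1024,video,8701
    }
}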
AccessLogCleanMapper code:
package cn.dataClean.mr.mapper;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper class used for data cleaning
 * 
 * @author Lenovo
 *
 */
public class AccessLogCleanMapper extends Mapper<LongWritable, Text, Text, AccessLogWritable> {

    // Source and target time formats, used to convert the timestamp field
    public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); // source time format
    public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); // target time format
    
    private String ip;
    private String time;
    private String day;
    private String traffic;
    private String type;
    private String id;

    // Override the Mapper's map method
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();

        String[] fields = line.split(",");

        if (fields.length < 6) { // malformed record: report it and skip
            System.out.println("Malformed record: " + line);
            return;
        }

        // Convert the timestamp and unpack the remaining fields
        Date date = parseDateFormat(fields[1]);
        if (date == null) { // unparseable timestamp: skip the record
            return;
        }
        ip = fields[0];
        time = dateformat1.format(date);
        day = fields[2];
        traffic = fields[3];
        type = fields[4];
        id = fields[5];

        // Package the fields and emit the cleaned record, keyed by IP
        context.write(new Text(ip), new AccessLogWritable(ip, time, day, traffic, type, id));

    }

    // Helper that parses the source time format; returns null when the
    // timestamp cannot be parsed
    private static Date parseDateFormat(String string) {
        Date parse = null;
        try {
            parse = FORMAT.parse(string);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parse;
    }
}
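For reference, the timestamp conversion the mapper performs can be exercised on its own; a minimal standalone sketch (the sample timestamp is hypothetical):

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class DateFormatDemo {

    public static void main(String[] args) throws Exception {
        SimpleDateFormat source = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        SimpleDateFormat target = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        Date date = source.parse("18/Sep/2013:06:49:45"); // hypothetical log timestamp
        System.out.println(target.format(date));          // prints: 2013-09-18 06:49:45
    }
}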
AccessLogCleanReduce code:
package cn.dataClean.mr.mapper;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer class for data cleaning: emits every cleaned record under its key
 */
public class AccessLogCleanReduce extends Reducer<Text, AccessLogWritable, Text, AccessLogWritable> {

    // Override the Reducer's reduce method
    @Override
    protected void reduce(Text key, Iterable<AccessLogWritable> values, Context context)
            throws IOException, InterruptedException {
        // Hadoop reuses a single Writable instance while iterating over the
        // values, so copy the fields into a fresh object before emitting each
        // record; writing once after the loop would keep only the last record
        // per key
        for (AccessLogWritable value : values) {
            context.write(key, new AccessLogWritable(value.getIp(), value.getTime(), value.getDay(),
                    value.getTraffic(), value.getType(), value.getId()));
        }
    }

}
AccessLogCleanJob code:
package cn.dataClean.mr.mapper;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AccessLogCleanJob {

    public static void main(String[] args) throws Exception {

//        // Sanity-check the command-line arguments (unused while the paths below are hardcoded)
//        if (args == null || args.length < 2) {
//            System.err.println("Parameter Errors! Usage: <inputPath...> <outputPath>");
//            System.exit(-1);
//        }

        Job job = Job.getInstance();
        job.setJobName("AccessLogCleanJob");
        job.setJarByClass(AccessLogCleanJob.class);

        // Map-side settings
        job.setMapperClass(AccessLogCleanMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AccessLogWritable.class);

        // Reduce-side settings
        job.setReducerClass(AccessLogCleanReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AccessLogWritable.class);

        // HDFS input and output paths (the output directory must not already exist)
        FileInputFormat.setInputPaths(job,
                new Path("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/result.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/Result"));

        // Submit the job and report completion status
        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
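The commented-out argument check suggests a more flexible variant: reading the paths from the command line instead of hardcoding them. A minimal sketch of the two lines that would change:

        // Take the paths from the command line once the argument check is re-enabled
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

The packaged job could then be launched with: hadoop jar <jar> cn.dataClean.mr.mapper.AccessLogCleanJob <inputPath> <outputPath>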

Reposted from www.cnblogs.com/yandashan666/p/11853695.html