The code is as follows:
AccessLogWritable code:
package cn.dataClean.mr.mapper;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Custom Writable type serialized between the map and reduce phases
 * of the data-cleaning job.
 *
 * @author Lenovo
 */
public class AccessLogWritable implements Writable {

    private String ip;
    private String time;
    private String day;
    private String traffic;
    private String type;
    private String id;

    // Hadoop instantiates Writables reflectively during deserialization,
    // so a no-arg constructor is required; without it the reduce side fails.
    public AccessLogWritable() {
    }

    public AccessLogWritable(String ip, String time, String day, String traffic, String type, String id) {
        this.ip = ip;
        this.time = time;
        this.day = day;
        this.traffic = traffic;
        this.type = type;
        this.id = id;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public String getTraffic() {
        return traffic;
    }

    public void setTraffic(String traffic) {
        this.traffic = traffic;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    /**
     * Called by Hadoop when deserializing; the field order must
     * exactly match write().
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.ip = in.readUTF();
        this.time = in.readUTF();
        this.day = in.readUTF();
        this.traffic = in.readUTF();
        this.type = in.readUTF();
        this.id = in.readUTF();
    }

    /**
     * Called by Hadoop when serializing.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(ip);
        out.writeUTF(time);
        out.writeUTF(day);
        out.writeUTF(traffic);
        out.writeUTF(type);
        out.writeUTF(id);
    }

    /**
     * TextOutputFormat writes values via toString(); without this override
     * the output file would contain object hash codes instead of records.
     */
    @Override
    public String toString() {
        return ip + "," + time + "," + day + "," + traffic + "," + type + "," + id;
    }
}
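The write()/readFields() pair can be sanity-checked without a cluster by round-tripping an instance through Hadoop's in-memory buffers. This is a minimal sketch, assuming the class above; the test class name and the sample field values are made up for illustration and are not part of the original project:

package cn.dataClean.mr.mapper;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

// Hypothetical round-trip check for AccessLogWritable (not in the original code).
public class AccessLogWritableRoundTrip {

    public static void main(String[] args) throws Exception {
        AccessLogWritable original = new AccessLogWritable(
                "10.0.0.1", "2016-11-10 00:01:02", "10", "243", "video", "1"); // sample values

        // Serialize into an in-memory buffer.
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Deserialize into a fresh instance, as Hadoop does between map and reduce.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        AccessLogWritable copy = new AccessLogWritable();
        copy.readFields(in);

        System.out.println(copy); // should print the same comma-separated record
    }
}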
AccessLogCleanMapper code:
package cn.dataClean.mr.mapper;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper for the data-cleaning job.
 *
 * @author Lenovo
 */
public class AccessLogCleanMapper extends Mapper<LongWritable, Text, Text, AccessLogWritable> {

    // Source and target timestamp formats, used to convert the time field
    public static final SimpleDateFormat FORMAT =
            new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); // source format
    public static final SimpleDateFormat dateformat1 =
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); // target format

    private String ip;
    private String time;
    private String day;
    private String traffic;
    private String type;
    private String id;

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(",");
        if (fields.length < 6) {
            // Malformed record: report it and skip instead of crashing the task
            System.out.println("bad record: " + line);
            return;
        }

        // Populate the fields, converting the timestamp to the target format
        Date date = parseDateFormat(fields[1]);
        if (date == null) {
            return; // unparseable timestamp, skip the record
        }
        ip = fields[0];
        time = dateformat1.format(date);
        day = fields[2];
        traffic = fields[3];
        type = fields[4];
        id = fields[5];

        // Emit the cleaned record keyed by IP
        context.write(new Text(ip), new AccessLogWritable(ip, time, day, traffic, type, id));
    }

    // Helper that parses the source timestamp format; returns null on failure
    private static Date parseDateFormat(String string) {
        try {
            return FORMAT.parse(string);
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }
}
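The timestamp conversion is the only non-trivial transformation in the mapper. A standalone sketch of the same two SimpleDateFormat patterns, with a made-up sample value rather than a line from the real log:

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

// Standalone illustration of the mapper's timestamp conversion.
public class TimeFormatDemo {
    public static void main(String[] args) throws Exception {
        SimpleDateFormat source = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        SimpleDateFormat target = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        Date d = source.parse("10/Nov/2016:00:01:02"); // sample value, not from the real log
        System.out.println(target.format(d));          // prints: 2016-11-10 00:01:02
    }
}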
AccessLogCleanReduce code:
package cn.dataClean.mr.mapper;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class AccessLogCleanReduce extends Reducer<Text, AccessLogWritable, Text, AccessLogWritable> {

    String ip;
    String time;
    String day;
    String traffic;
    String type;
    String id;

    @Override
    protected void reduce(Text key, Iterable<AccessLogWritable> values, Context context)
            throws IOException, InterruptedException {
        // Iterate over all records for this IP; the fields end up holding the
        // last record's values, so exactly one record per key is written out
        for (AccessLogWritable value : values) {
            ip = value.getIp();
            time = value.getTime();
            day = value.getDay();
            traffic = value.getTraffic();
            type = value.getType();
            id = value.getId();
            System.out.println(time); // debug output
        }
        context.write(key, new AccessLogWritable(ip, time, day, traffic, type, id));
    }
}
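Note that this reducer deduplicates by key: of all log lines sharing an IP, only the last one survives. If the intent were to keep every cleaned record, a variant that writes inside the loop would do it; this is a sketch of that alternative, not the original behavior:

@Override
protected void reduce(Text key, Iterable<AccessLogWritable> values, Context context)
        throws IOException, InterruptedException {
    // Emit every record; context.write serializes immediately, so Hadoop's
    // reuse of the value object across iterations is not a problem here.
    for (AccessLogWritable value : values) {
        context.write(key, value);
    }
}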
AccessLogCleanJob code:
package cn.dataClean.mr.mapper;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AccessLogCleanJob {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("AccessLogCleanJob");
        job.setJarByClass(AccessLogCleanJob.class);

        // Map settings
        job.setMapperClass(AccessLogCleanMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(AccessLogWritable.class);

        // Reduce settings
        job.setReducerClass(AccessLogCleanReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AccessLogWritable.class);

        // Input and output paths (hardcoded to the test HDFS instance)
        FileInputFormat.setInputPaths(job,
                new Path("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/result.txt"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/Result"));

        // Submit the job and report the result
        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
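MapReduce refuses to start if the output directory already exists, so with the hardcoded paths above the job fails on every run after the first. A common guard deletes the previous output before submission; this is a sketch using the standard FileSystem API, with a hypothetical helper class name, and is not part of the original code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper; call it in main() before FileOutputFormat.setOutputPath(...).
public class OutputPathCleaner {
    public static void deleteIfExists(String uri) throws Exception {
        Path outputPath = new Path(uri);
        FileSystem fs = FileSystem.get(outputPath.toUri(), new Configuration());
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true); // recursive delete of the previous run's output
        }
    }
}

Usage in this job would be a single call: OutputPathCleaner.deleteIfExists("hdfs://192.168.57.128:9000/MyMapReduce/AccessLogClean/Result");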