web log preprocessing
1. Requirements: Identify and split out each field of every record in the web access log, remove illegal records from the log, and generate the various kinds of filtered access-request data required by the KPI statistics.
2. Implementation code:
a) Define a bean to record each data field in the log data
/**
 * Holds the fields extracted from a single web access log record, plus a flag
 * telling whether the record should be kept.
 *
 * <p>All log fields are stored as plain strings exactly as they were set.
 * {@link #toString()} renders the bean as one line whose columns are separated
 * by the {@code \001} control character.
 */
public class WebLogBean {

    /** Column separator used by {@link #toString()}. */
    private static final String FIELD_SEPARATOR = "\001";

    /** IP address of the client. */
    private String remote_addr;

    /** Client user name; a value of "-" is to be ignored. */
    private String remote_user;

    /** Access time (and time zone) of the request. */
    private String time_local;

    /** Requested URL and HTTP protocol. */
    private String request;

    /** Request status; 200 means success. */
    private String status;

    /** Size of the body content sent to the client. */
    private String body_bytes_sent;

    /** Page from which the request was linked. */
    private String http_referer;

    /** Information about the client browser. */
    private String http_user_agent;

    /** Whether the record is considered legal; a new bean starts out valid. */
    private boolean valid = true;

    // ---- accessors -------------------------------------------------------

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    /**
     * Renders the bean as a single delimited line.
     *
     * @return the valid flag followed by the eight log fields in declaration
     *         order, joined by {@code \001}; unset fields appear as the text
     *         {@code null}
     */
    @Override
    public String toString() {
        return valid
                + FIELD_SEPARATOR + remote_addr
                + FIELD_SEPARATOR + remote_user
                + FIELD_SEPARATOR + time_local
                + FIELD_SEPARATOR + request
                + FIELD_SEPARATOR + status
                + FIELD_SEPARATOR + body_bytes_sent
                + FIELD_SEPARATOR + http_referer
                + FIELD_SEPARATOR + http_user_agent;
    }
}
b) Define a parser to parse and filter the original records of web access logs
import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; public class WebLogParser { static SimpleDateFormat sd1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US); static SimpleDateFormat sd2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); public static WebLogBean parser(String line) { WebLogBean webLogBean = new WebLogBean(); String[] arr = line.split(" "); if (arr.length > 11) { webLogBean.setRemote_addr(arr[0]); webLogBean.setRemote_user(arr[1]); webLogBean.setTime_local(parseTime(arr[3].substring(1))); webLogBean.setRequest(arr[6]); webLogBean.setStatus(arr[8]); webLogBean.setBody_bytes_sent(arr[9]); webLogBean.setHttp_referer(arr[10]); if (arr.length > 12) { webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]); } else { webLogBean.setHttp_user_agent(arr[11]); } if (Integer.parseInt(webLogBean.getStatus()) >= 400) {// greater than 400, HTTP error webLogBean.setValid(false); } } else { webLogBean.setValid(false); } return webLogBean; } public static String parseTime(String dt) { String timeString = ""; try { Date parse = sd1.parse(dt); timeString = sd2.format(parse); } catch (ParseException e) { e.printStackTrace (); } return timeString; } public static void main(String[] args) { WebLogParser wp = new WebLogParser(); String parseTime = wp.parseTime("18/Sep/2013:06:49:48"); System.out.println(parseTime); } }
c) MapReduce program
import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WeblogPreProcess { static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> { Text k = new Text(); NullWritable v = NullWritable.get(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); WebLogBean webLogBean = WebLogParser.parser(line); //You can insert a static resource filter (.....) /*WebLogParser.filterStaticResource(webLogBean);*/ if (!webLogBean.isValid()) return; k.set(webLogBean.toString()); context.write(k, v); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(WeblogPreProcess.class); job.setMapperClass(WeblogPreProcessMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.setInputPaths(job, new Path("C:/wordcount/weblog/input")); FileOutputFormat.setOutputPath(job, new Path("C:/wordcount/weblog/output")); job.waitForCompletion(true); } }