Hadoop Cluster MapReduce Practice (9)

Web Log Preprocessing

1. Requirements:
Identify and segment each field of the web access log records, remove invalid records from the log, and generate the various filtered data sets required by the KPI statistics.
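
For reference, a raw access log record in the widely used "combined" format looks roughly like the following hypothetical line (one record per line, fields separated by spaces):

194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"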


2. Implementation code:

a) Define a bean to hold each field of a log record

public class WebLogBean {
	
    private String remote_addr;// The client IP address
    private String remote_user;// The client user name; "-" when not provided
    private String time_local;// The access time and time zone
    private String request;// The requested URL
    private String status;// The HTTP status code of the request; 200 means success
    private String body_bytes_sent;// The size of the response body sent to the client
    private String http_referer;// The referer, i.e. the page from which the request was linked
    private String http_user_agent;// Information about the client browser (user agent)

    private boolean valid = true;// Whether the record is valid
       
	public String getRemote_addr() {
		return remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getRemote_user() {
		return remote_user;
	}

	public void setRemote_user(String remote_user) {
		this.remote_user = remote_user;
	}

	public String getTime_local() {
		return time_local;
	}

	public void setTime_local(String time_local) {
		this.time_local = time_local;
	}

	public String getRequest() {
		return request;
	}

	public void setRequest(String request) {
		this.request = request;
	}

	public String getStatus() {
		return status;
	}

	public void setStatus(String status) {
		this.status = status;
	}

	public String getBody_bytes_sent() {
		return body_bytes_sent;
	}

	public void setBody_bytes_sent(String body_bytes_sent) {
		this.body_bytes_sent = body_bytes_sent;
	}

	public String getHttp_referer() {
		return http_referer;
	}

	public void setHttp_referer(String http_referer) {
		this.http_referer = http_referer;
	}

	public String getHttp_user_agent() {
		return http_user_agent;
	}

	public void setHttp_user_agent(String http_user_agent) {
		this.http_user_agent = http_user_agent;
	}

	public boolean isValid() {
		return valid;
	}

	public void setValid(boolean valid) {
		this.valid = valid;
	}
        
	@Override
	public String toString() {
		// Fields are joined with the '\001' (Ctrl-A) character, which rarely appears in
		// log content and is also Hive's default field delimiter
		StringBuilder sb = new StringBuilder();
		sb.append(this.valid);
		sb.append("\001").append(this.remote_addr);
		sb.append("\001").append(this.remote_user);
		sb.append("\001").append(this.time_local);
		sb.append("\001").append(this.request);
		sb.append("\001").append(this.status);
		sb.append("\001").append(this.body_bytes_sent);
		sb.append("\001").append(this.http_referer);
		sb.append("\001").append(this.http_user_agent);
		return sb.toString();
	}
}

b) Define a parser that parses the raw web access log records and marks invalid ones

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class WebLogParser {

	static SimpleDateFormat sd1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);

	static SimpleDateFormat sd2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

	public static WebLogBean parser(String line) {
		WebLogBean webLogBean = new WebLogBean();
		String[] arr = line.split(" ");
		if (arr.length > 11) {
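			// The field indices below assume the common nginx/Apache "combined" log format
			// split on single spaces, e.g.:
			//   ip - user [time zone] "method url protocol" status bytes "referer" "user agent ..."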
			webLogBean.setRemote_addr(arr[0]);
			webLogBean.setRemote_user(arr[1]);
			webLogBean.setTime_local(parseTime(arr[3].substring(1)));
			webLogBean.setRequest(arr[6]);
			webLogBean.setStatus(arr[8]);
			webLogBean.setBody_bytes_sent(arr[9]);
			webLogBean.setHttp_referer(arr[10]);

			if (arr.length > 12) {
				webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]);
			} else {
				webLogBean.setHttp_user_agent(arr[11]);
			}
			if (Integer.parseInt(webLogBean.getStatus()) >= 400) {// status codes >= 400 indicate an HTTP error
				webLogBean.setValid(false);
			}
		} else {
			webLogBean.setValid(false);
		}
		return webLogBean;
	}

	public static String parseTime(String dt) {

		String timeString = "";
		try {
			Date parse = sd1.parse(dt);
			timeString = sd2.format(parse);

		} catch (ParseException e) {
			e.printStackTrace();
		}

		return timeString;
	}

	public static void main(String[] args) {
		String parseTime = WebLogParser.parseTime("18/Sep/2013:06:49:48");
		System.out.println(parseTime);
	}
}
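
For a quick sanity check outside of Hadoop, a small test driver along the following lines can be used. The log line is a hypothetical sample in the combined format (not taken from any particular dataset), and WebLogBean and WebLogParser are assumed to be on the classpath:

public class WebLogParserDemo {

	public static void main(String[] args) {
		// Hypothetical combined-format log line, used only for illustration
		String line = "194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "
				+ "\"GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1\" 304 0 "
				+ "\"-\" \"Mozilla/4.0 (compatible;)\"";

		WebLogBean bean = WebLogParser.parser(line);

		// Expected: valid = true, followed by the '\001'-delimited record from WebLogBean.toString()
		System.out.println("valid = " + bean.isValid());
		System.out.println(bean);
	}
}

Note that the '\001' separators are non-printing characters, so the fields of the printed record may appear to run together in a console.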

c) The MapReduce program

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WeblogPreProcess {

	static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
		Text k = new Text();
		NullWritable v = NullWritable.get();

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String line = value.toString();
			WebLogBean webLogBean = WebLogParser.parser(line);
			// An optional static resource filter could be applied here, e.g.:
			// WebLogParser.filterStaticResource(webLogBean);
			// (a possible sketch of such a method is shown after the driver code below)
			if (!webLogBean.isValid())
				return;
			k.set(webLogBean.toString());
			context.write(k, v);
		}
	}

	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(WeblogPreProcess.class);

		job.setMapperClass(WeblogPreProcessMapper.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// Local test paths; on a real cluster these would typically be HDFS paths, e.g. taken from args
		FileInputFormat.setInputPaths(job, new Path("C:/wordcount/weblog/input"));
		FileOutputFormat.setOutputPath(job, new Path("C:/wordcount/weblog/output"));

		boolean success = job.waitForCompletion(true);
		System.exit(success ? 0 : 1);

	}
}
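
The mapper above contains a commented-out call to WebLogParser.filterStaticResource(...), which is not defined in this article. Purely as an illustration, a minimal sketch of such a method, assuming a hand-picked (hypothetical) set of static resource path prefixes, could be added to the WebLogParser class:

	// Hypothetical static-resource prefixes; the actual list depends on the site and the KPI rules
	private static final java.util.Set<String> STATIC_PREFIXES = new java.util.HashSet<String>(
			java.util.Arrays.asList("/js/", "/css/", "/img/", "/images/", "/wp-content/"));

	// Marks a record as invalid when its requested URL points to a static resource,
	// so that the mapper's isValid() check drops it
	public static void filterStaticResource(WebLogBean bean) {
		String request = bean.getRequest();
		if (request == null) {
			return;
		}
		for (String prefix : STATIC_PREFIXES) {
			if (request.startsWith(prefix)) {
				bean.setValid(false);
				return;
			}
		}
	}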
