Hadoop MapReduce开发--对输入日志数据进行统计

该案例对输入日志数据进行统计:
要求:区别统计GET和POST URL访问量

测试数据:

127.0.0.1 - - [03/Jul/2014:23:36:38 +0800] "GET /course/detail/3.htm HTTP/1.0" 200 38435 0.038
182.131.89.195 - - [03/Jul/2014:23:37:43 +0800] "GET / HTTP/1.0" 301 - 0.000
127.0.0.1 - - [03/Jul/2014:23:38:38 +0800] "POST /service/notes/addViewTimes_23.htm HTTP/1.0" 200 2 0.003
127.0.0.1 - - [03/Jul/2014:23:39:38 +0800] "GET /html/notes/20140617/779.htm HTTP/1.0" 200 69539 0.046
127.0.0.1 - - [03/Jul/2014:23:43:38 +0800] "GET /html/notes/20140318/24.htm HTTP/1.0" 200 67171 0.049
127.0.0.1 - - [03/Jul/2014:23:43:59 +0800] "POST /service/notes/addViewTimes_779.htm HTTP/1.0" 200 1 0.003
127.0.0.1 - - [03/Jul/2014:23:45:38 +0800] "GET / HTTP/1.0" 200 70044 0.060
127.0.0.1 - - [03/Jul/2014:23:46:38 +0800] "GET /course/list/73.htm HTTP/1.0" 200 12125 0.010
127.0.0.1 - - [03/Jul/2014:23:46:58 +0800] "GET /html/notes/20140609/542.htm HTTP/1.0" 200 94971 0.077
127.0.0.1 - - [03/Jul/2014:23:48:38 +0800] "POST /service/notes/addViewTimes_24.htm HTTP/1.0" 200 2 0.003
127.0.0.1 - - [03/Jul/2014:23:48:48 +0800] "POST /service/detail/addViewTimes_542.htm HTTP/1.0" 200 2 0.003
127.0.0.1 - - [03/Jul/2014:23:49:48 +0800] "GET /notes/index-top-3.htm HTTP/1.0" 200 53494 0.041
127.0.0.1 - - [03/Jul/2014:23:50:38 +0800] "GET /html/notes/20140609/544.htm HTTP/1.0" 200 183694 0.076
127.0.0.1 - - [03/Jul/2014:23:53:38 +0800] "POST /service/notes/addViewTimes_544.htm HTTP/1.0" 200 2 0.004
127.0.0.1 - - [03/Jul/2014:23:54:38 +0800] "GET /html/notes/20140620/900.htm HTTP/1.0" 200 151770 0.054
127.0.0.1 - - [03/Jul/2014:23:57:38 +0800] "GET /html/notes/20140620/872.htm HTTP/1.0" 200 52373 0.034
127.0.0.1 - - [03/Jul/2014:23:58:38 +0800] "POST /service/notes/addViewTimes_900.htm HTTP/1.0" 200 2 0.003
127.0.0.1 - - [03/Jul/2014:23:59:38 +0800] "GET / HTTP/1.0" 200 70044 0.057

mapper

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Maps one access-log line to a {@code ("<METHOD> <URL>", 1)} pair.
 *
 * <p>Only GET and POST requests are counted. Lines that are too short, that
 * carry another method, or whose request field cannot be parsed are skipped
 * instead of being emitted under an empty key or failing the task.
 */
public class LogMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Constant count emitted for every recognised request. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Lines shorter than this (after trimming) are ignored. */
    private static final int MIN_LINE_LENGTH = 20;

    /** Reused output key; the framework serialises it on each write. */
    private final Text outKey = new Text();

    /**
     * Emits {@code ("<METHOD> <URL>", 1)} for a GET/POST log line.
     *
     * @param key     byte offset of the line in the input (unused)
     * @param value   one raw log line
     * @param context job context used to emit the pair
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.length() < MIN_LINE_LENGTH) {
            return;
        }
        String request = handlerStr(line);
        if (request.isEmpty()) {
            // Not a parsable GET/POST request: do not emit an empty key.
            return;
        }
        outKey.set(request);
        context.write(outKey, ONE);
    }

    /**
     * Extracts {@code "<METHOD> <URL>"} from the quoted request field of a log line,
     * e.g. {@code "GET /course/detail/3.htm"} from
     * {@code ... "GET /course/detail/3.htm HTTP/1.0" 200 ...}.
     *
     * @param line a trimmed log line
     * @return the method and URL, or an empty string when the line has no quoted
     *         GET/POST request followed by an {@code HTTP/} protocol token
     */
    private String handlerStr(String line) {
        // The request is the first double-quoted field of the line.
        int quote = line.indexOf('"');
        if (quote < 0) {
            return "";
        }
        int start = quote + 1;
        // Anchor on the method right after the quote so "GET"/"POST" inside a URL is not matched.
        if (!line.startsWith("GET ", start) && !line.startsWith("POST ", start)) {
            return "";
        }
        // Accept any protocol version (HTTP/1.0, HTTP/1.1, ...), searching only after the method.
        int end = line.indexOf(" HTTP/", start);
        if (end < 0) {
            return "";
        }
        return line.substring(start, end).trim();
    }
}

reducer

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Sums the per-request counts emitted by the mapper and writes
 * {@code ("<METHOD> <URL>", total)}.
 */
public class LogReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Adds up all partial counts for one request key.
     *
     * @param key     the {@code "<METHOD> <URL>"} string
     * @param values  partial counts for that key
     * @param context job context used to emit the total
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable count : values) {
            // Add the value as-is; adding an extra 1 per value would double every total.
            sum += count.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the log-statistics job.
 *
 * <p>Counts URL hits from an access log, keeping GET and POST requests separate.
 * Each output record is: request method, URL, hit count.
 */
public class JobMain {
    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path;
     *             an existing output path is deleted recursively before the job runs
     * @throws IOException            if the filesystem or job submission fails
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Usage: LogStatistics<input path> <output path>");
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        // NOTE(review): "N" is not read by the LogMapper/LogReducer in this file — confirm it is still needed.
        conf.setInt("N", 5);
        Job job = Job.getInstance(conf, "LOG job");
        // Must be a class from the user jar: Hadoop's own Job.class would select the
        // Hadoop jar, and the mapper/reducer classes would not be shipped to the cluster.
        job.setJarByClass(JobMain.class);

        job.setMapperClass(LogMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(LogReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        Path outDirPath = new Path(args[1]);
        // Resolve the filesystem from the path itself so a fully qualified output URI
        // on a non-default filesystem is handled correctly.
        FileSystem fs = outDirPath.getFileSystem(conf);
        if (fs.exists(outDirPath)) {
            // The job fails if the output directory already exists, so remove it first.
            fs.delete(outDirPath, true);
        }
        FileOutputFormat.setOutputPath(job, outDirPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

结果:

GET /    3
GET /course/detail/3.htm    1
GET /course/list/73.htm    1
GET /html/notes/20140318/24.htm    1
GET /html/notes/20140609/542.htm    1
GET /html/notes/20140609/544.htm    1
GET /html/notes/20140617/779.htm    1
GET /html/notes/20140620/872.htm    1
GET /html/notes/20140620/900.htm    1
GET /notes/index-top-3.htm    1
POST /service/detail/addViewTimes_542.htm    1
POST /service/notes/addViewTimes_23.htm    1
POST /service/notes/addViewTimes_24.htm    1
POST /service/notes/addViewTimes_544.htm    1
POST /service/notes/addViewTimes_779.htm    1
POST /service/notes/addViewTimes_900.htm    1

猜你喜欢

转载自blog.csdn.net/fengzhif0001/article/details/86301131
今日推荐