MapReduce program to count the number of PVs (page views)

PV (page view) is the number of times a page is viewed. It is usually the main indicator for measuring an online news channel, a website, or even a single piece of online news, and it is one of the most commonly used metrics for evaluating website traffic. Monitoring the trend of a site's PV and analyzing the reasons for its changes is a task many webmasters perform regularly. The "page" in page view generally refers to an ordinary HTML page, and also includes dynamically generated HTML content such as PHP and JSP pages. Each HTML content request from the browser is counted as one PV, and these requests accumulate into the total PV count.

One: Write the WebLogPVMapper class code

package com.huadian.weblogpv;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WebLogPVMapper extends Mapper <LongWritable, Text, Text, IntWritable>{

    private Text        outputKey = new Text(  );
    private IntWritable outputValue = new IntWritable( 1 );
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the content of each line
        String line = value.toString();
        String[] items = line.split("\t");

        /**
         * (1) Each record has 36 fields. If the array length after splitting is less than 36,
         *     the record is dirty data and can be discarded.
         * (2) If the URL (index 1) is empty ("", null, "null"), discard the record.
         * The province id is at index 23.
         * Output: (provinceId, 1)
         */
        if (items.length >= 36) {
            if (StringUtils.isBlank(items[1])) {
                return;
            }
            outputKey.set(items[23]);
            context.write(outputKey, outputValue);

        } else {
            return;
        }
    } 
}
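
As the comment above describes, each log record is expected to be one line of 36 tab-separated fields, with the requested URL at index 1 and the province id at index 23. The short standalone sketch below (not part of the job) repeats the same split-and-filter logic on a completely made-up record so the field positions are easy to see; the URL and province id values are hypothetical.

import java.util.Arrays;

// Standalone sketch of the mapper's field-extraction logic (illustration only).
// The record below is hypothetical: 36 tab-separated fields, URL at index 1, province id at index 23.
public class WebLogFieldSketch {
    public static void main(String[] args) {
        String[] fields = new String[36];
        Arrays.fill(fields, "-");
        fields[1] = "http://example.com/news/1.html";   // URL field, must not be blank
        fields[23] = "110000";                          // province id field

        String line = String.join("\t", fields);
        String[] items = line.split("\t");
        if (items.length >= 36 && !items[1].trim().isEmpty()) {
            // This is the (provinceId, 1) pair the mapper would write to the context
            System.out.println(items[23] + "\t" + 1);
        }
    }
}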

Two: Write the WebLogPVMapReduce code

package com.huadian.weblogpv;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;



public class WebLogPVMapReduce extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {

        // 2. Create the job
        Job job = Job.getInstance( this.getConf(), "WebLogPVMapReduce" );
        // Set the main class that runs the job
        job.setJarByClass( WebLogPVMapReduce.class );

        // Configure the job
        // a. input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath);

        // b. map
        job.setMapperClass( WebLogPVMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( IntWritable.class );

        job.setNumReduceTasks( 2 );

        // c. reduce
        job.setReducerClass( WebLogPvReducer.class);
        job.setOutputKeyClass( Text.class  );
        job.setOutputValueClass( IntWritable.class );

        // d. output
        Path outputPath = new Path( args[1] );

        // If the output directory already exists, delete it first
        FileSystem hdfs = FileSystem.get( this.getConf() );
        if(hdfs.exists(outputPath )){
            hdfs.delete( outputPath,true );
        }
        FileOutputFormat.setOutputPath( job,outputPath );

        // Step 4: submit the job and wait for it to finish
        boolean isSuccess = job.waitForCompletion( true );

        return isSuccess ? 0 : 1;
    }


    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        // public static int run(Configuration conf, Tool tool, String[] args)
        try {
           int  status =  ToolRunner.run( configuration,new WebLogPVMapReduce(),args );
           System.exit( status );
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
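
Once the three classes are packaged into a jar, the job can be submitted with the standard hadoop jar command, passing the input and output paths as args[0] and args[1]; the jar name and HDFS paths below are placeholders:

hadoop jar weblogpv.jar com.huadian.weblogpv.WebLogPVMapReduce /user/hadoop/weblog/input /user/hadoop/weblog/output

Because the reducer simply sums IntWritable ones, it could also be registered as a combiner with job.setCombinerClass( WebLogPvReducer.class ) to cut down the data shuffled from map to reduce; that call is an optional optimization and is not part of the code above.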

Three: Write the WebLogPvReducer code

package com.huadian.weblogpv;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


public class WebLogPvReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable outputValue = new IntWritable(  );
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
       // key: province id; values: <1,1,1,1,...>
        int sum = 0;
        for (IntWritable value:values) {
            sum+= value.get();
        }
        outputValue.set( sum );
        context.write( key,outputValue );
    }
}
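
Since the driver sets job.setNumReduceTasks( 2 ), the result is written as two files in the output directory (part-r-00000 and part-r-00001), each holding tab-separated (provinceId, PV count) pairs, for example (the values here are made up for illustration):

110000	2351
440000	1874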

 
