A MapReduce program to count the number of UVs

UV (Unique Visitor) is the number of distinct users who visit a site within one day, identified by cookies; each computer client that accesses the website counts as one visitor, so it can be understood as the number of computers that access the site. The website identifies a visiting computer by its cookies: if a user changes IP address but does not clear cookies and visits the same website again, the UV count does not change, whereas if the user does not keep cookies, clears them, or switches to another device, the count increases by 1. Multiple visits by the same client between 00:00 and 24:00 are counted as a single visitor.
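To make the counting rule concrete, here is a minimal, self-contained Java sketch (independent of the MapReduce job below, with hypothetical visitor ids) that counts UVs by deduplicating cookie-based guids with a HashSet; the reducer in step 3 applies the same idea per city:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class UVExample {
    public static void main(String[] args) {
        // Hypothetical visitor ids (cookie-based guids) collected during one day;
        // "guid1" appears twice but contributes only once to the UV count.
        List<String> guids = Arrays.asList("guid1", "guid1", "guid2", "guid3");
        Set<String> uniqueVisitors = new HashSet<String>(guids);
        System.out.println("UV = " + uniqueVisitors.size()); // prints: UV = 3
    }
}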

1. Writing the WebLogUVMapper class file

package com.huadian.webloguvs;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WebLogUVMapper extends Mapper<LongWritable, Text, Text, Text> {

    private Text outputKey = new Text();
    private Text outputValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the content of each line on tabs
        String line = value.toString();
        String[] items = line.split("\t");

        /**
         * (1) A valid line has 36 fields; if the array has fewer than 36 elements
         *     after the split, the record is dirty data and is discarded.
         * (2) If the visitor id (guid) field is blank ("", null, "null"), the record is discarded.
         * Output: (cityId, guid), i.e. (items[24], items[5])
         */
        if (items.length >= 36) {
            if (StringUtils.isBlank(items[5])) {
                return;
            }
            outputKey.set(items[24]);
            outputValue.set(items[5]);
            context.write(outputKey, outputValue);

        } else {
            return;
        }
    }
}
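Note that, unlike a PV (page view) job that would emit (cityId, 1), this mapper emits the visitor id itself as the value, i.e. (cityId, guid); the reducer needs the individual guids so that repeated visits by the same visitor can be deduplicated.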

2. Writing the WebLogUVMapReduce class file

package com.huadian.webloguvs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class WebLogUVMapReduce extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {

        // Create the job
        Job job = Job.getInstance( this.getConf(), "WebLogUVMapReduce" );
        // Set the main class of the job
        job.setJarByClass( WebLogUVMapReduce.class );

        // Configure the job
        // a. input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath);

        // b. map
        job.setMapperClass( WebLogUVMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( Text.class );

        job.setNumReduceTasks( 2 );

        // c. reduce
        job.setReducerClass( WebLogUVReducer.class );
        job.setOutputKeyClass( Text.class  );
        job.setOutputValueClass( Text.class );

        // d. output
        Path outputPath = new Path( args[1] );

        // If the output directory already exists, delete it first
        FileSystem hdfs = FileSystem.get( this.getConf() );
        if(hdfs.exists(outputPath )){
            hdfs.delete( outputPath,true );
        }
        FileOutputFormat.setOutputPath( job,outputPath );

        // Submit the job
        boolean isSuccess = job.waitForCompletion( true );

        return isSuccess ? 0 : 1;
    }


    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        ///public static int run(Configuration conf, Tool tool, String[] args)
        try {
           int  status =  ToolRunner.run( configuration,new WebLogUVMapReduce(),args );
           System.exit( status );
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
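Once the three classes are compiled and packaged into a jar (the jar name and HDFS paths below are only placeholders), the job can be submitted in the usual way, with the input directory as args[0] and the output directory as args[1]:

hadoop jar weblog-uv.jar com.huadian.webloguvs.WebLogUVMapReduce /input/weblog /output/webloguv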

3. Writing the WebLogUVReducer class file

package com.huadian.webloguvs;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;


public class WebLogUVReducer extends Reducer<Text,Text,Text,Text> {
    private Text outputValue = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws InterruptedException, IOException {
        // key: city id; values: <guid1, guid1, guid2, guid3, ...>
        // Hadoop reuses the same Text object for every value in the iterator,
        // so copy the content (here as a String) before adding it to the set.
        Set<String> set = new HashSet<String>();
        for (Text value : values) {
            set.add(value.toString());
        }
        // The number of distinct guids is the UV count for this city
        outputValue.set(String.valueOf(set.size()));
        context.write(key, outputValue);
    }
}
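Because the driver sets the number of reduce tasks to 2, the results are written to two files (part-r-00000 and part-r-00001) under the output directory; each line contains a city id and its UV count separated by a tab. Every city's count is complete in whichever file it lands in, since the default hash partitioner sends all values for a given city to the same reducer.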

 


Origin blog.csdn.net/qq_41934990/article/details/81610607