UV (Unique Visitor) counts the number of distinct users who visit a site in one day, identified by cookies; each client machine accessing the website counts as one visitor. It can be understood as the number of distinct computers accessing the website: the site identifies a visiting machine through its cookies. If a user changes IP address but does not clear cookies and visits the same site again, the UV count does not change. If the user visits without saved cookies, clears cookies, or switches devices, the count increases by 1. Multiple visits from the same client between 00:00 and 24:00 count as only one visitor.
1. Writing the WebLogUVMapper class file
package com.huadian.webloguvs; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class WebLogUVMapper extends Mapper <LongWritable, Text, Text, Text>{ private Text outputKey = new Text( ); private Text outputValue = new Text( ); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //分割每一行内容, String line = value.toString(); String[] items = line.split( "\t" ); / ** * (1) One has 36 fields. If the length of the array is less than 36 after the split, this data is dirty data and can be discarded * (2) If the URL is empty, the record discards the subscript 1, "", null, "null" * City subscript 23 * Output (city Id, 1) * / if (items.length> = 36) { if (StringUtils.isBlank (items [5])) { return; } outputKey .set (items [24]); outputValue.set (items [5]); context.write (outputKey, outputValue); } else { return; } } } |
2. Writing the WebLogUVMapReduce class file
package com.huadian.webloguvs; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class WebLogUVMapReduce extends Configured implements Tool { @Override public int run(String[] args) throws Exception { //2、创建job Job job = Job.getInstance( this.getConf(), "WebLogUVMapReduce" ); //设置job运行的主类 job.setJarByClass( WebLogUVMapReduce.class ); //设置Job //a、input Path inputPath = new Path( args[0] ); FileInputFormat.setInputPaths( job, inputPath); //b、map job.setMapperClass( WebLogUVMapper.class ); job.setMapOutputKeyClass( Text.class ); job.setMapOutputValueClass( Text.class ); job.setNumReduceTasks( 2 ); //c、reduce job.setReducerClass( WebLogUVReducer.class ); job.setOutputKeyClass( Text.class ); job.setOutputValueClass( Text.class ); //d、output Path outputPath = new Path( args[1] ); //如果输出目录存在,先删除 FileSystem hdfs = FileSystem.get( this.getConf() ); if(hdfs.exists(outputPath )){ hdfs.delete( outputPath,true ); } FileOutputFormat.setOutputPath( job,outputPath ); //第四步,提交job boolean isSuccess = job.waitForCompletion( true ); return isSuccess?0:1 ; } public static void main(String[] args) { Configuration configuration = new Configuration(); ///public static int run(Configuration conf, Tool tool, String[] args) try { int status = ToolRunner.run( configuration,new WebLogUVMapReduce(),args ); System.exit( status ); } catch (Exception e) { e.printStackTrace(); } } } |
3. Writing the WebLogUVReducer class file
package com.huadian.webloguvs; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; public class WebLogUVReducer extends Reducer<Text,Text,Text,Text> { private Text outputValue = new Text( ); @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws InterruptedException, IOException { //key :城市; value:<guid1,guid1,guid2,guid3> Set<Text> set = new HashSet<Text>(); for (Text value:values) { set.add(value); } /*Iterator<Text> iterator = set.iterator(); Text text = null; while (iterator.hasNext()){ text = iterator.next(); }*/ outputValue.set(String.valueOf(set.size())); context.write( key,outputValue ); } } |