PV (page view) is the number of page views. It is usually the main indicator for measuring an online news channel, a website, or even a single piece of online news, and it is one of the most commonly used metrics for evaluating website traffic. Monitoring the trend of a website's PV and analyzing the reasons behind its changes are tasks many webmasters perform regularly. The "page" in Page Views generally refers to an ordinary HTML web page, and also includes dynamically generated HTML content such as PHP and JSP pages. Each HTML content request from a browser is counted as one PV, and these requests gradually accumulate into the total PV count.
1. Write the WebLogPVMapper class
package com.huadian.weblogpv;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the web-log PV (page view) job.
 * <p>
 * Input: one tab-separated log line per record (offset, line).
 * Output: (provinceId, 1) for every valid page-view record.
 */
public class WebLogPVMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Number of tab-separated fields a well-formed log record must have. */
    private static final int EXPECTED_FIELD_COUNT = 36;
    /** Index of the request URL field. */
    private static final int URL_INDEX = 1;
    /** Index of the province-id field. */
    private static final int PROVINCE_INDEX = 23;

    // Reused output objects to avoid per-record allocation (standard Hadoop idiom).
    private final Text outputKey = new Text();
    private final IntWritable outputValue = new IntWritable(1);

    /**
     * Splits each log line on tabs and emits (provinceId, 1).
     * <p>
     * Dirty data is discarded:
     * (1) records with fewer than 36 fields;
     * (2) records whose URL (index 1) is empty, null, or the literal string "null".
     *
     * @param key     byte offset of the line in the input split (unused)
     * @param value   one raw log line
     * @param context Hadoop context used to emit (province, 1) pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line into its tab-separated fields.
        // Note: the original code used "\ t", which is an invalid escape; "\t" is correct.
        String line = value.toString();
        String[] items = line.split("\t");

        // Fewer than 36 fields => dirty data, discard.
        if (items.length < EXPECTED_FIELD_COUNT) {
            return;
        }

        // Discard records whose URL is "", null, or the literal string "null"
        // (isBlank covers null/""/whitespace; "null" must be checked explicitly).
        String url = items[URL_INDEX];
        if (StringUtils.isBlank(url) || "null".equals(url)) {
            return;
        }

        outputKey.set(items[PROVINCE_INDEX]);
        context.write(outputKey, outputValue);
    }
}
2. Write the WebLogPVMapReduce driver class
package com.huadian.weblogpv; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class WebLogPVMapReduce extends Configured implements Tool { @Override public int run(String[] args) throws Exception { //2、创建job Job job = Job.getInstance( this.getConf(), "WebLogUVMapReduce" ); //设置job运行的主类 job.setJarByClass( WebLogPVMapReduce.class ); //设置Job //a、input Path inputPath = new Path( args[0] ); FileInputFormat.setInputPaths( job, inputPath); //b、map job.setMapperClass( WebLogPVMapper.class ); job.setMapOutputKeyClass( Text.class ); job.setMapOutputValueClass( IntWritable.class ); job.setNumReduceTasks( 2 ); //c、reduce job.setReducerClass( WebLogPvReducer.class); job.setOutputKeyClass( Text.class ); job.setOutputValueClass( IntWritable.class ); //d、output Path outputPath = new Path( args[1] ); //如果输出目录存在,先删除 FileSystem hdfs = FileSystem.get( this.getConf() ); if(hdfs.exists(outputPath )){ hdfs.delete( outputPath,true ); } FileOutputFormat.setOutputPath( job,outputPath ); //第四步,提交job boolean isSuccess = job.waitForCompletion( true ); return isSuccess?0:1 ; } public static void main(String[] args) { Configuration configuration = new Configuration(); ///public static int run(Configuration conf, Tool tool, String[] args) try { int status = ToolRunner.run( configuration,new WebLogPVMapReduce(),args ); System.exit( status ); } catch (Exception e) { e.printStackTrace(); } } } |
3. Write the WebLogPvReducer class
package com.huadian.weblogpv;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer for the web-log PV job: sums the 1-counts emitted by the mapper
 * for each province, producing (provinceId, totalPv).
 */
public class WebLogPvReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output object to avoid per-group allocation (standard Hadoop idiom).
    private final IntWritable outputValue = new IntWritable();

    /**
     * Sums all counts for one province key.
     * <p>
     * Fix: the original throws clause listed {@code IOException} twice.
     *
     * @param key     province id
     * @param values  iterable of 1s, e.g. &lt;1, 1, 1, 1&gt;
     * @param context Hadoop context used to emit (province, sum)
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        outputValue.set(sum);
        context.write(key, outputValue);
    }
}