Statistical Analysis of Website Behavior Logs

I. Development Environment

(1) Development environment:
Windows + JDK 1.8 + Hadoop 2.9.2 + Eclipse + Linux
(2) Required background knowledge:
HDFS, MapReduce, Hive, basic regular expressions, user profiling, etc.
(3) Development time: January 2019

II. Project Approach

(1) Upload the collected log data to HDFS.
(2) Profile the site by tagging its key metrics: extract the fields that best describe those metrics, then use them to evaluate, tune, and optimize the site's performance and load.
(3) Clean the data according to the feature fields identified during profiling.
(4) Analyze the site's page views, bounce rate, HTTP connection status, total traffic per IP, and so on.

III. Implementation

(1) Uploading the raw data to HDFS

1. Raw data format
Each record is one line of a standard access log, for example:

27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
2. Upload to HDFS:

hadoop dfs -put ./access_2015_03_30.log   /
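Alternatively, the upload can be done from Java with the Hadoop FileSystem API (the same API the cleaning job below uses to delete its output directory). This is only an illustrative sketch, not part of the original project; it assumes the NameNode address that is hard-coded in the cleaning job, and the UploadLog class name is made up for the example:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative helper, not part of the original project.
public class UploadLog {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Same NameNode address as INPUT_PATH/OUT_PATH in the cleaning job below.
		FileSystem fs = FileSystem.get(new URI("hdfs://192.168.56.30:9000"), conf);
		// Copy the local log file to the HDFS root directory.
		fs.copyFromLocalFile(new Path("./access_2015_03_30.log"), new Path("/"));
		fs.close();
	}
}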

(2) Data cleaning (first pass)

1. Process the raw data with a regular expression and extract the fields of interest.
The fields I use to profile the site are:

ip, time, timeArea, request, url, state, dataSize

public String[] parser(String line) {
	// Example input line:
	// 27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
	Pattern compile = Pattern.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - \\[(.*) ([-+][0-9]{1,4})\\] \"([A-Z]{1,4}) (.*) HTTP/1.1\" ([0-9]*) ([0-9]*)");
	Matcher matcher = compile.matcher(line);

	if (matcher.find()) {
		String ip = matcher.group(1);
		String time = matcher.group(2);
		String timeArea = matcher.group(3);
		String request = matcher.group(4);
		String url = matcher.group(5);
		String state = matcher.group(6);
		String dataSize = matcher.group(7);

		// Convert "30/Mar/2015:17:38:20" into "2015-03-30 17:38:20".
		SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		try {
			Date date = sdf1.parse(time);
			time = sdf.format(date);
		} catch (ParseException e) {
			e.printStackTrace();
		}

		return new String[]{ip, time, timeArea, request, url, state, dataSize};
	}

	// No match: callers treat an empty array as an incomplete record.
	return new String[]{};
}
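To sanity-check the regular expression and the date conversion, a small JUnit test can be run against the sample line. This is an illustrative sketch I added, not part of the original project; it assumes the parser class is data.LongParser (as in the complete listing below) and relies on the junit dependency already declared in the pom.xml:

package data;

import static org.junit.Assert.assertEquals;

import org.junit.Test;

// Illustrative test, not part of the original project.
public class LongParserTest {

	@Test
	public void parsesSampleAccessLogLine() {
		String line = "27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
		String[] fields = new LongParser().parser(line);

		assertEquals(7, fields.length);
		assertEquals("27.19.74.143", fields[0]);                  // ip
		assertEquals("2015-03-30 17:38:20", fields[1]);           // time, reformatted
		assertEquals("+0800", fields[2]);                         // timeArea
		assertEquals("GET", fields[3]);                           // request
		assertEquals("/static/image/common/faq.gif", fields[4]);  // url
		assertEquals("200", fields[5]);                           // state
		assertEquals("1127", fields[6]);                          // dataSize
	}
}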

(3) Data cleaning (second pass)

1. Use MapReduce to clean the extracted fields a second time: filter out requests for static resource files and drop incomplete records.

public class clean {
	static final String INPUT_PATH = "hdfs://192.168.56.30:9000/access_2015_03_30.log";
	static final String OUT_PATH = "hdfs://192.168.56.30:9000/user/hive/warehouse/t1";

	public static void main(String[] args) throws Exception {
		// Quick sanity check: parse one sample line and print the extracted fields.
		String str = "27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
		System.out.println("========================== data cleaning ============================");
		String[] parser = new LongParser().parser(str);
		for (int i = 0; i < parser.length; i++) {
			System.out.println("field " + (i + 1) + " : " + parser[i]);
		}
		System.out.println("========================== data cleaning ============================");

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, clean.class.getSimpleName());
		job.setJarByClass(clean.class); // required when the job is submitted from a jar

		// Input format
		FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
		job.setInputFormatClass(TextInputFormat.class);

		// Map output types
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Text.class);

		// Reduce output types (the reducer emits Text keys and NullWritable values)
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// Output format; delete the output directory first if it already exists
		deleteOutDir(conf, OUT_PATH);
		FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
		job.setOutputFormatClass(TextOutputFormat.class);

		job.waitForCompletion(true);
	}

	private static void deleteOutDir(Configuration conf, String outDir) throws IOException, URISyntaxException {
		FileSystem fileSystem = FileSystem.get(new URI(outDir), conf);
		if (fileSystem.exists(new Path(outDir))) {
			fileSystem.delete(new Path(outDir), true);
		}
	}

	public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();

			// Skip requests for static resource files
			if (line.contains(".gif") || line.contains(".jpg") || line.contains(".png")
					|| line.contains(".css") || line.contains(".js")) {
				return;
			}

			// Skip incomplete records (lines that did not match the log pattern)
			String[] parser = new LongParser().parser(line);
			if (parser.length != 7) {
				return;
			}

			// Tab-separated, matching the field delimiter of the Hive table t1
			Text text = new Text();
			text.set(parser[0] + "\t" + parser[1] + "\t" + parser[2] + "\t" + parser[3] + "\t"
					+ parser[4] + "\t" + parser[5] + "\t" + parser[6]);
			context.write(key, text);
		}
	}

	public static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable> {

		@Override
		protected void reduce(LongWritable key, Iterable<Text> values,
				Context context) throws IOException, InterruptedException {
			// Drop the byte-offset key and write only the cleaned records
			for (Text value : values) {
				context.write(value, NullWritable.get());
			}
		}
	}

}
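Before pointing Hive at the output directory, it can help to spot-check a few cleaned records directly from HDFS. The following is only an illustrative sketch: part-r-00000 is the usual name of a single reducer's output file and may differ in your run, and the PeekOutput class is not part of the original project.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative helper for spot-checking the job output.
public class PeekOutput {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(new URI("hdfs://192.168.56.30:9000"), conf);
		// Default name of a single reducer's output file; adjust if yours differs.
		Path part = new Path("/user/hive/warehouse/t1/part-r-00000");
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(part)))) {
			String record;
			int shown = 0;
			while ((record = reader.readLine()) != null && shown < 5) {
				// ip \t time \t timeArea \t request \t url \t state \t dataSize
				System.out.println(record);
				shown++;
			}
		}
		fs.close();
	}
}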

(4) Analyzing the data with Hive

1. Create the table

create table t1(ip String,
time String,
timeArea String,
request String,
url String,
state String,
dataSize int
)row format delimited fields terminated by "\t";

2. Page views (PV): total number of requests

select count(1) as PV from t1;

3. Unique visitors (UV): distinct IPs

select count(distinct ip) as UV from t1;

4. Users (IPs) that viewed only one page before leaving

select count(1) from t1 group by ip having count(1)=1;

5. Total number of users that viewed only one page before leaving

select count(1) from (select count(1) from t1 group by ip having count(1)=1) nums;

6. Number of page views per IP

select ip,count(1) as nums from t1 group by ip;

7. Bounce rate

select sum(case when a.nums=1 then 1 else 0 end)/sum(1)
from(select count(1) as nums from t1 group by ip) a;

Result: 7348 / 21645 = 0.33947793947793947
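In other words, the bounce rate here is the fraction of IPs that made exactly one request out of all distinct IPs:

$$ \text{bounce rate} = \frac{\#\{\text{IPs with exactly one request}\}}{\#\{\text{distinct IPs}\}} = \frac{7348}{21645} \approx 0.3395 $$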

Bounce rate (rounded): round()

select round(sum(case when a.nums=1 then 1 else 0 end)/sum(1)*100,2)
from(select count(1) as nums from t1 group by ip) a;

Result: 33.95

Bounce rate (formatted as a string): concat()

select concat(round(sum(case when a.nums=1 then 1 else 0 end)/sum(1)*100,2),"%")
from(select count(1) as nums from t1 group by ip) a;

Result: 33.95%

8. Top 100 IPs by number of requests

select ip,count(1) as nums from t1 group by ip sort by nums desc limit 100;

9. Requests per time zone

select timeArea,count(1) from t1 group by timeArea;

10. Page hotspots (most requested URLs)

select url,count(1) as nums from t1 group by url sort by nums desc limit 100;

11. HTTP status code distribution

select state,count(1) as nums from t1 group by state;

12. Total traffic per IP

select ip,sum(dataSize) as totalSize from t1 group by ip sort by totalSize desc limit 100;

IV. Summary

I learned a lot from this project. The hardest part for me was the regular expression: I had rarely used regular expressions before and had never studied them properly, so when I actually needed one I was stuck and had to learn on the spot. Through this project I gained a new appreciation for regular expressions, a much better understanding of user profiling and how to tag an entity with descriptive labels, a solid review of MapReduce, and, most importantly, a real improvement in my command of Hive. It was a very rewarding exercise.

V. Complete Code

(1) pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>Clean</groupId>
  <artifactId>clean</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.2.0</version>
    </dependency>

    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>D:/java/jdk1.8/lib/tools.jar</systemPath>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.2.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.2.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.2.7</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.2.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/log4j/log4j -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/junit/junit -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>

  </dependencies>

</project>

(2) Initial cleaning: extracting the required fields

package data;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LongParser {

	public String[] parser(String line) {
		// Example input line:
		// 27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
		Pattern compile = Pattern.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - \\[(.*) ([-+][0-9]{1,4})\\] \"([A-Z]{1,4}) (.*) HTTP/1.1\" ([0-9]*) ([0-9]*)");
		Matcher matcher = compile.matcher(line);

		if (matcher.find()) {
			String ip = matcher.group(1);
			String time = matcher.group(2);
			String timeArea = matcher.group(3);
			String request = matcher.group(4);
			String url = matcher.group(5);
			String state = matcher.group(6);
			String dataSize = matcher.group(7);

			// Convert "30/Mar/2015:17:38:20" into "2015-03-30 17:38:20".
			SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
			SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
			try {
				Date date = sdf1.parse(time);
				time = sdf.format(date);
			} catch (ParseException e) {
				e.printStackTrace();
			}

			return new String[]{ip, time, timeArea, request, url, state, dataSize};
		}

		// No match: callers treat an empty array as an incomplete record.
		return new String[]{};
	}

}

(3) Data cleaning: filtering out static resource files and incomplete records

package data;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class clean {
	static final String INPUT_PATH = "hdfs://192.168.56.30:9000/access_2015_03_30.log";
	static final String OUT_PATH = "hdfs://192.168.56.30:9000/user/hive/warehouse/t1";

	public static void main(String[] args) throws Exception {
		// Quick sanity check: parse one sample line and print the extracted fields.
		String str = "27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
		System.out.println("========================== data cleaning ============================");
		String[] parser = new LongParser().parser(str);
		for (int i = 0; i < parser.length; i++) {
			System.out.println("field " + (i + 1) + " : " + parser[i]);
		}
		System.out.println("========================== data cleaning ============================");

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, clean.class.getSimpleName());
		job.setJarByClass(clean.class); // required when the job is submitted from a jar

		// Input format
		FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
		job.setInputFormatClass(TextInputFormat.class);

		// Map output types
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(LongWritable.class);
		job.setMapOutputValueClass(Text.class);

		// Reduce output types (the reducer emits Text keys and NullWritable values)
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// Output format; delete the output directory first if it already exists
		deleteOutDir(conf, OUT_PATH);
		FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
		job.setOutputFormatClass(TextOutputFormat.class);

		job.waitForCompletion(true);
	}

	private static void deleteOutDir(Configuration conf, String outDir) throws IOException, URISyntaxException {
		FileSystem fileSystem = FileSystem.get(new URI(outDir), conf);
		if (fileSystem.exists(new Path(outDir))) {
			fileSystem.delete(new Path(outDir), true);
		}
	}

	public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();

			// Skip requests for static resource files
			if (line.contains(".gif") || line.contains(".jpg") || line.contains(".png")
					|| line.contains(".css") || line.contains(".js")) {
				return;
			}

			// Skip incomplete records (lines that did not match the log pattern)
			String[] parser = new LongParser().parser(line);
			if (parser.length != 7) {
				return;
			}

			// Tab-separated, matching the field delimiter of the Hive table t1
			Text text = new Text();
			text.set(parser[0] + "\t" + parser[1] + "\t" + parser[2] + "\t" + parser[3] + "\t"
					+ parser[4] + "\t" + parser[5] + "\t" + parser[6]);
			context.write(key, text);
		}
	}

	public static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable> {

		@Override
		protected void reduce(LongWritable key, Iterable<Text> values,
				Context context) throws IOException, InterruptedException {
			// Drop the byte-offset key and write only the cleaned records
			for (Text value : values) {
				context.write(value, NullWritable.get());
			}
		}
	}

}

Reposted from blog.csdn.net/qq_41919792/article/details/106911078