golang 流量统计系统视频总结（二）

总体流程

在这里插入图片描述

解析用户访问行为日志部分
代码实现：

package main

import (
	"bufio"
	"crypto/md5"
	"encoding/hex"
	"flag"
	"github.com/mediocregopher/radix.v2/pool"
	"github.com/mgutz/str"
	"github.com/sirupsen/logrus"
	"io"
	"net/url"
	"os"
	"strconv"
	"strings"
	"time"
)

// Markers used to locate the interesting parts of an access-log line or of a
// reported page URL.
const (
	// HANDLE_DIG is searched for in a raw log line; the query string of the
	// tracking request starts right after it. The leading space is part of
	// the marker.
	HANDLE_DIG = " /dig?"
	// HANDLE_MOVIE identifies a detail ("movie") page URL.
	HANDLE_MOVIE = "/movie/"
	// HANDLE_LIST identifies a list page URL.
	HANDLE_LIST = "/list/"
	// HANDLE_HTML is the suffix that terminates the resource id in a URL.
	HANDLE_HTML = ".html"
)

// cmdParams holds the values collected from the command-line flags.
// main builds it with a positional literal, so the field order matters.
type cmdParams struct{
	logFilePath string // path of the access log to analyse (flag "logFilePath")
	routineNum int // number of log-consumer goroutines (flag "routineNum"); also used to size the channels and the redis pool
}

// digData stores the key fields extracted from one access-log line.
// It is filled with a positional literal in cutLogFetchData, so the field
// order matters.
type digData struct {
	time string // reported time; passed on to formatUrl and getTime
	url string // reported page URL; classified by formatUrl
	refer string // reported referrer; hashed together with ua to simulate a uid
	ua string // "ua" query parameter (presumably the user agent); hashed together with refer to simulate a uid
}

// urlData describes one user visit as it travels through the pv/uv channels.
// logConsumer builds it with a positional literal, so the field order matters.
type urlData struct{
	data digData // fields parsed from the log line
	uid string // simulated user id: hex-encoded MD5 of refer+ua
	unode urlNode // record produced by formatUrl for the visited URL
}

// urlNode is the data node to be recorded, comparable to a single row in a
// database. formatUrl builds it with positional literals, so the field order
// matters.
type urlNode struct{
	unType string    // page type: detail page ("movie"), list page ("list") or home page ("home")
	unRid int    // resource ID
	unUrl string    // URL of the current page
	unTime string   // time at which the page was visited
}

// storageBlock holds one pv/uv statistics item waiting to be written to redis.
// The counters build it with positional literals, so the field order matters.
type storageBlock struct{
	counterType string // "pv" or "uv"; used as the prefix of the redis keys
	storageModel string // redis command that dataStorage issues for this item (the counters set "ZINCRBY")
	unode urlNode // the visited page this item counts
}

// log is the logger shared by the whole program. It starts out writing to
// stdout; main redirects it to the runtime log file when that file can be
// opened.
var log = logrus.New()

// init sets the logger defaults: debug verbosity and stdout as destination.
//
// An earlier draft also dialed a single package-level redis client here
// (redis.Dial("tcp", "localhost:6379")). That approach was abandoned: the
// program now creates a connection pool in main and hands it to the
// goroutines as a parameter.
func init() {
	log.SetLevel(logrus.DebugLevel)
	log.Out = os.Stdout
}

// main wires the log-analysis pipeline together:
//
//	log file -> logChannel -> logConsumer x routineNum -> pvChannel / uvChannel
//	         -> pvCounter / uvCounter -> storageChannel -> dataStorage -> redis
//
// Flags:
//
//	-logFilePath  access log to analyse
//	-routineNum   number of log-consumer goroutines
//	-l            file that receives this program's own runtime log
func main() {
	// Collect the command-line parameters.
	logFilePath := flag.String("logFilePath", "/User/Pangee/Public/nginx/log/dig.log", "target log file path")
	routineNum := flag.Int("routineNum", 5, "consumer number by go routine")
	l := flag.String("l", "/tmp/log", "runtime log file")
	flag.Parse()

	params := cmdParams{*logFilePath, *routineNum}

	// Redirect the runtime log to the requested file. O_APPEND keeps the
	// output of earlier runs instead of overwriting it from offset 0. If the
	// file cannot be opened we keep logging to stdout and say why.
	logFd, err := os.OpenFile(*l, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err == nil {
		log.Out = logFd
		defer logFd.Close()
	} else {
		log.Warningf("open runtime log file %s failed, logging to stdout: %v", *l, err)
	}
	log.Infoln("exec strat.")
	// Infof, not Infoln: the message contains formatting verbs.
	log.Infof("params:log filepath = %s,routineNum = %d", params.logFilePath, params.routineNum)

	// Channels that carry the data from one pipeline stage to the next.
	var logChannel = make(chan string, 3*params.routineNum)         // raw log lines waiting to be parsed
	var pvChannel = make(chan urlData, params.routineNum)           // visits for the pv counter
	var uvChannel = make(chan urlData, params.routineNum)           // visits for the uv counter
	var storageChannel = make(chan storageBlock, params.routineNum) // statistics waiting to be stored

	// Redis connection pool shared by the uv counter and the storage stage.
	redisPool, err := pool.New("tcp", "localhost:6379", 2*params.routineNum)
	if err != nil {
		// Fatalln logs and then exits the process, so nothing after it runs.
		log.Fatalln("redis pool created fail:", err)
	}
	// Ping periodically so the pooled connections do not sit idle.
	go func() {
		for {
			redisPool.Cmd("PING")
			time.Sleep(3 * time.Second)
		}
	}()

	// Producer: reads the log file and writes its lines into logChannel.
	go readFileLineByLine(params, logChannel)

	// Parsers: read logChannel and feed pvChannel and uvChannel.
	for i := 0; i < params.routineNum; i++ {
		go logConsumer(logChannel, pvChannel, uvChannel)
	}

	// Counters: read pvChannel / uvChannel and feed storageChannel.
	go pvCounter(pvChannel, storageChannel)
	go uvCounter(uvChannel, storageChannel, redisPool)

	// Storage: reads storageChannel and writes to redis.
	go dataStorage(storageChannel, redisPool)

	// Placeholder until the program is turned into a daemon: keep the main
	// goroutine alive so the workers can run.
	time.Sleep(1000 * time.Second)
}
// readFileLineByLine follows the access log at params.logFilePath and sends
// every complete line (including its trailing '\n') to logChannel.
//
// When the end of the file is reached it sleeps for three seconds and polls
// again, so lines appended later are picked up. Nothing is sent while the
// reader is merely idling at EOF, and the bytes of a trailing line that is
// not yet newline-terminated are held back until the newline arrives, so a
// half-written record is never forwarded as two fragments.
//
// It returns an error when the file cannot be opened or when a read fails
// with anything other than io.EOF; otherwise it never returns.
func readFileLineByLine(params cmdParams, logChannel chan string) error {
	fd, err := os.Open(params.logFilePath)
	if err != nil {
		log.Warningf("ReadFileLineByLine can't open file: %s", params.logFilePath)
		return err
	}
	defer fd.Close()

	// Progress is reported every progressStep lines; a non-positive
	// routineNum disables the report instead of dividing by zero.
	progressStep := 1000 * params.routineNum

	count := 0
	pending := "" // start of a line whose '\n' has not been written yet
	bufferRead := bufio.NewReader(fd)
	for {
		chunk, err := bufferRead.ReadString('\n')
		if err == nil {
			logChannel <- pending + chunk
			pending = ""
			count++
			if progressStep > 0 && count%progressStep == 0 {
				log.Infof("ReadLineByLine line: %d", count)
			}
			continue
		}

		// ReadString returns whatever it read before the error; keep it so
		// the line can be completed by a later read.
		pending += chunk

		if err != io.EOF {
			// The caller starts this function with `go` and drops the return
			// value, so the failure is logged here as well.
			log.Warningf("ReadFileLineByLine read error: %v", err)
			return err
		}

		// The whole file has been consumed: rest, then look for new data.
		time.Sleep(3 * time.Second)
		log.Infof("ReadFileLineByLine wait,readLine: %d", count)
	}
}

// logConsumer reads raw log lines from logChannel, parses each one, and
// forwards the resulting urlData to both pvChannel and uvChannel.
//
// Lines for which cutLogFetchData yields an empty digData (no tracking
// request found, or an unparsable one) are skipped, so they are not counted
// as home-page visits. The function returns nil once logChannel is closed and
// drained.
func logConsumer(logChannel chan string, pvChannel, uvChannel chan urlData) error {
	for logStr := range logChannel {
		// Cut the log line and pull out the reported tracking data.
		data := cutLogFetchData(logStr)
		if data == (digData{}) {
			continue
		}

		// Simulated user id: MD5(refer + ua), hex encoded.
		sum := md5.Sum([]byte(data.refer + data.ua))
		uid := hex.EncodeToString(sum[:])

		// Further parsing work (JSON etc.) can be added here.
		uData := urlData{data, uid, formatUrl(data.url, data.time)}
		pvChannel <- uData
		uvChannel <- uData
	}
	return nil
}

// cutLogFetchData extracts the tracking data from one access-log line.
//
// It looks for the query string between HANDLE_DIG and the following "HTTP/"
// marker, parses it as k=v&k=v pairs, and returns the "time", "url", "refer"
// and "ua" values. An empty digData is returned when either marker is missing
// or the query string cannot be parsed.
func cutLogFetchData(logStr string) digData {
	logStr = strings.TrimSpace(logStr)
	pos1 := str.IndexOf(logStr, HANDLE_DIG, 0)
	if pos1 == -1 {
		return digData{}
	}
	pos1 += len(HANDLE_DIG) // skip past the marker itself
	pos2 := str.IndexOf(logStr, "HTTP/", pos1)
	if pos2 == -1 {
		return digData{}
	}
	// Trim the separator that precedes "HTTP/" so it does not become part of
	// the last query value.
	d := strings.TrimSpace(str.Substr(logStr, pos1, pos2-pos1))

	// Prepend a dummy origin so the query string can be parsed as a URL.
	urlInfo, err := url.Parse("http://localhost/?" + d)
	if err != nil {
		return digData{}
	}
	data := urlInfo.Query()
	// Keyed fields: digData declares url before refer, so a positional
	// literal is easy to get wrong.
	return digData{
		time:  data.Get("time"),
		url:   data.Get("url"),
		refer: data.Get("refer"),
		ua:    data.Get("ua"),
	}
}

// pvCounter counts page views: every visit received on pvChannel is wrapped
// in a "pv" storageBlock that asks for a ZINCRBY and is sent to
// storageChannel. It returns once pvChannel is closed and drained.
func pvCounter(pvChannel chan urlData, storageChannel chan storageBlock) {
	for visit := range pvChannel {
		storageChannel <- storageBlock{
			counterType:  "pv",
			storageModel: "ZINCRBY",
			unode:        visit.unode,
		}
	}
}
// uvCounter counts unique visitors. For every visit received on uvChannel it
// adds the uid to a per-day redis HyperLogLog; only when redis reports the
// uid as new (PFADD replies 1) is a "uv" storageBlock sent to storageChannel.
//
// The redis pool is passed in as a parameter rather than read from a
// package-level client, which was not usable from inside the goroutine; a
// connection pool is the recommended way to share this kind of resource.
//
// PFADD accepts only a key and elements, so the one-day expiry is applied
// with a separate EXPIRE command. A failed PFADD is logged and the visit is
// skipped; a failed EXPIRE is logged but the visit is still counted.
func uvCounter(uvChannel chan urlData, storageChannel chan storageBlock, redisPool *pool.Pool) {
	for data := range uvChannel {
		// De-duplicate users with a redis HyperLogLog.
		hyperLogLogKey := "uv_hpll_" + getTime(data.data.time, "day")

		ret, err := redisPool.Cmd("PFADD", hyperLogLogKey, data.uid).Int()
		if err != nil {
			log.Warningln("UvCounter check redis hyperloglog failed, ", err)
			continue
		}
		if ret != 1 {
			// This uid has already been seen.
			continue
		}

		if err := redisPool.Cmd("EXPIRE", hyperLogLogKey, 86400).Err; err != nil {
			log.Warningln("UvCounter set redis hyperloglog expire failed, ", err)
		}

		sItem := storageBlock{"uv", "ZINCRBY", data.unode}
		storageChannel <- sItem
	}
}

// dataStorage drains storageChannel and records every item in redis.
//
// Each item is written to six keys: the three time dimensions (day, hour,
// minute) for the whole site, and the same three dimensions for the item's
// page type (movie, list, home). Like peeling an onion in reverse, a visit to
// a page also counts for every level above it. The command named by
// block.storageModel is run against each key with an increment of 1 for the
// item's resource id. A reply that is an error or not positive is logged.
func dataStorage(storageChannel chan storageBlock, redisPool *pool.Pool) {
	dimensions := []string{"day", "hour", "min"}

	for block := range storageChannel {
		node := block.unode
		sitePrefix := block.counterType + "_"
		// Site-wide keys first, then the keys of this page type.
		scopes := []string{sitePrefix, sitePrefix + node.unType + "_"}

		setKeys := make([]string, 0, len(scopes)*len(dimensions))
		for _, scope := range scopes {
			for _, dim := range dimensions {
				setKeys = append(setKeys, scope+dim+"_"+getTime(node.unTime, dim))
			}
		}

		for _, key := range setKeys {
			ret, err := redisPool.Cmd(block.storageModel, key, 1, node.unRid).Int()
			if err != nil || ret <= 0 {
				log.Errorln("DataStorage redis storage error.", block.storageModel, key, node.unRid)
			}
		}
	}
}

// formatUrl classifies a visited URL and builds the record to be stored.
//
// The page types are checked in order of expected traffic: detail page
// ("movie"), then list page ("list"); everything else is treated as the home
// page with resource id 1. Add further page types here as needed.
//
// t is the visit time and is copied into the record unchanged.
func formatUrl(url, t string) urlNode {
	if id, ok := resourceID(url, HANDLE_MOVIE); ok {
		return urlNode{"movie", id, url, t}
	}
	if id, ok := resourceID(url, HANDLE_LIST); ok {
		return urlNode{"list", id, url, t}
	}
	return urlNode{"home", 1, url, t}
}

// resourceID extracts the numeric id located between marker and HANDLE_HTML
// in url. ok reports whether marker occurs in url at all. When the marker is
// present but the id is missing or not a number, the id is 0, so the page is
// still classified by its marker.
func resourceID(url, marker string) (id int, ok bool) {
	start := str.IndexOf(url, marker, 0)
	if start == -1 {
		return 0, false
	}
	start += len(marker)

	end := str.IndexOf(url, HANDLE_HTML, start)
	if end == -1 {
		return 0, true
	}

	id, err := strconv.Atoi(str.Substr(url, start, end-start))
	if err != nil {
		return 0, true
	}
	return id, true
}

// getTime returns a Unix timestamp, as a decimal string, for the current time
// cut down to the granularity named by timeType: "day", "hour" or "min".
//
// The cut is made by formatting time.Now() with a layout that stops at the
// requested unit and parsing the result back, so the local wall-clock value
// is re-read as UTC. An unrecognised timeType selects an empty layout.
//
// NOTE(review): logtime is never read; the result always derives from the
// current time, not from the time recorded in the log. Confirm whether that
// is intended.
func getTime(logtime, timeType string) string {
	layouts := map[string]string{
		"day":  "2006-01-02",
		"hour": "2006-01-02 15",
		"min":  "2006-01-02 15:04",
	}
	layout := layouts[timeType]

	stamp := time.Now().Format(layout)
	truncated, _ := time.Parse(layout, stamp)
	return strconv.FormatInt(truncated.Unix(), 10)
}

基本流程：

1.通过命令行收集用户输入的参数：logFilePath(要分析的日志所在路径)、routineNum(想要设定的用于解析日志的goroutine数目)、l(运行时日志存放的路径)，引入logrus包进行运行时的日志记录
2.根据用户定义的routineNum，初始化一些channel，用于数据传递
logChannel用于传递待解析的原始日志行，pvChannel用于pv统计，uvChannel用于uv统计，storageChannel用于统计数据转存redis
3.使用radix.v2/pool包维持redis连接池，每3秒ping一下redis
4.生成一个goroutine，逐行读取日志，将读取到的内容写入logChannel中
5.根据用户指定的routineNum创建一组日志处理的goroutine，其将从logChannel中读一行日志数据进行解析，解析过程包括：

切割日志字符串，抠出打点上报的数据(从一行访问日志中解析出time,refer,url,ua,写入digData结构体中并返回)

	data := cutLogFetchData( logStr )

uid,模拟生成uid，MD5(refer+ua)，利用了crypto/md5，encoding/hex这两个包

	hasher := md5.New()
	hasher.Write( []byte( data.refer + data.ua ) )
	uid := hex.EncodeToString( hasher.Sum(nil) )

之后对digData中的url进行解析（调用formatUrl()），提取出用户访问的资源类型（‘movie’、‘list’、‘home’），并且与url的访问时间一起写入到urlNode这个结构体中，然后将解析日志数据后返回的结构体(digData)、uid、解析url后返回的结构体（urlNode）写入urlData这个结构体中，然后将该结构体写入到pvChannel，uvChannel

	uData :=  urlData{data, uid,formatUrl(data.url,data.time)}
	pvChannel<-uData
	uvChannel<-uData

6.创建一个goroutine用于pv统计，循环读取pvChannel中的内容，将统计类型(pv)，存储时要使用的Redis命令(ZINCRBY)，以及pvChannel中的unode结构体，构造成storageBlock结构体，并写入storageChannel中
7.创建一个goroutine用于uv统计，循环读取uvChannel中的内容，并根据从uvChannel中读取到的结构体(urlData)里面的uid，利用redis的HyperLogLog进行用户去重，然后将统计类型(uv)，存储时要使用的Redis命令(ZINCRBY)，以及uvChannel中的unode结构体，构造成storageBlock结构体，并写入storageChannel中
8.创建一个goroutine，循环读取storageChannel中的数据，构造不同的key，然后利用redis进行统计和储存
9.最后为了让程序顺利跑起来，在主线程中设置了睡眠1000s

涉及的点

logrus包的使用
bufio的使用
参考链接 bufio的解析
mgutz/str包的使用
radix.v2/pool包的使用
MD5生成(crypto/md5，encoding/hex)

参考链接 golang md5

	hasher := md5.New()
	hasher.Write( []byte( data.refer + data.ua ) )
	uid := hex.EncodeToString( hasher.Sum(nil) )

这里直接对一串字符串计算MD5。其中通过md5.New()初始化一个MD5对象，其实它是一个hash.Hash对象。函数原型为 func New() hash.Hash 。该对象实现了hash.Hash接口的Sum方法：计算出校验和。需要注意的是，crypto/md5包里还有一个同名的包级函数，其函数原型为 func Sum(data []byte) [Size]byte ，官方Manual中的描述 " Sum returns the MD5 checksum of the data. " 说的是这个包级函数，它确实是直接对data计算校验和；而这里调用的是hash.Hash对象的Sum方法，两者很容易混淆。

通过翻阅源码可以看到，hash.Hash的Sum方法并不是对传入的参数进行校验计算，而是对hash.Hash对象内部存储的内容（即之前通过Write写入的数据）进行校验和计算，然后将结果追加到参数的后面形成一个新的byte切片。因此通常的使用方法就是将参数置为nil。Sum方法的描述如下：

// Sum appends the current hash to b and returns the resulting slice.
// It does not change the underlying hash state.
Sum(b []byte) []byte

该方法返回一个byte切片；当参数为nil时，对于MD5来说就是一个长度为Size的切片，即128bit、16字节。

可参考链接 Golang计算MD5

然后 encoding/hex包是实现十六进制编码和解码用的，参考链接：腾讯云

二进制是Binary，即bin
八进制是Octal，即oct
十进制为Decimal，即dec
十六进制为Hexadecimal，即hex

golang的时间戳

import ( 
    "fmt" 
    "time"
) 
   
// main prints the current time four ways: the default time.Time form, a
// formatted string, a Unix timestamp in seconds, and one in nanoseconds.
func main() {
    now := time.Now()
    fmt.Println(now)

    formatted := time.Now().Format("2006-01-02 15:04:05")
    fmt.Println(formatted)

    seconds := time.Now().Unix() // seconds
    fmt.Println(seconds)

    nanos := time.Now().UnixNano() // nanoseconds
    fmt.Println(nanos)
}

关于golang中的奇怪时间 2006-01-02 15:04:05:
仔细观察这个日期，06年，1月2日下午3点4分5秒，查阅相关资料还有 -7时区，Monday，数字1~7都有了，而且都不重复。
其实日期为 2006-01-02T15:04:05Z07:00，每个数字都有意义
数字与含义的对应关系如下：
1 → 月（01）
2 → 日（02）
3 → 时（15，即下午3点）
4 → 分（04）
5 → 秒（05）
6 → 年（2006）
7 → 时区（-07:00）
道理其实跟别的语言的 YYYY-mm-dd 一样，只不过go用了这个特殊的包含1~7的时间而已。

可以看这里 stackoverflow

以及这里简书

url 解析query

d := str.Substr(logStr, pos1, pos2-pos1)
//将截取到的k=v&k=v的形式的字符串转换
urlInfo, err := url.Parse("http://localhost/?"+d)    //要拼接成完整的网址是因为这个方法只认完整的网址才解析
if err != nil {
	return digData{}
}
data := urlInfo.Query()
return digData{
	data.Get("time"),
	data.Get("refer"),
	data.Get("url"),
	data.Get("ua"),
}