小项目一---Python日志分析

日志分析

概述

分析的前提

半结构化数据

文本分析

提取数据

一、空格分隔

# Print every whitespace-separated token of the log file, one token per line.
with open('xxx.log') as f:
    for field in (token for line in f for token in line.split()):
        print(field)

# Sample access-log line. The backslash at the end of each physical line is a
# line continuation inside the literal, so the value is one single line.
logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu\
=3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou\
.com/search/spider.html)"'''

fields = []   # final list of logical fields
flag = False  # True while the tokens of a [...] or "..." group are being re-joined
tmp = ''      # text of the group collected so far

# A plain split() breaks values such as "GET /020/media.html?menu=3 HTTP/1.1"
# into several tokens, so a flag tracks whether we are inside such a group.
for field in logs.split():
    if flag:
        # Inside a group, e.g. the "+0800]" part of [19/Feb/2013:10:23:29 +0800].
        if field.endswith((']', '"')):
            # Closing token: drop the delimiter and emit the whole group.
            fields.append(tmp + " " + field[:-1])
            tmp = ''
            flag = False
        else:
            tmp = tmp + " " + field
    elif field.startswith(('[', '"')):
        if field.endswith((']', '"')):
            # Opening and closing delimiter on the same token, e.g. "-".
            fields.append(field.strip('[]"'))
        else:
            # Only the opening delimiter so far, e.g. [19/Feb/2013:10:23:29;
            # the rest of the group arrives in the following tokens.
            tmp += field[1:]
            flag = True
    else:
        # Ordinary token without any [] or "" delimiter.
        fields.append(field)

类型转换

import datetime

def convert_time(timestr):
    """Parse an access-log timestamp such as '19/Feb/2013:10:23:29 +0800'.

    Returns the datetime produced by ``strptime``; the ``%z`` directive
    consumes the UTC offset at the end of the string.
    """
    layout = '%d/%b/%Y:%H:%M:%S %z'
    parsed = datetime.datetime.strptime(timestr, layout)
    return parsed

# The function above can also be written as an anonymous (lambda) function.
# Shown for illustration only: the expression is not bound to any name here.
lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')

请求信息的解析

def get_request(request:str):
    """Split a request line like 'GET /index.html HTTP/1.1' into a dict.

    The whitespace-separated parts are paired, in order, with the keys
    'method', 'url' and 'protocol'; pairing stops at the shorter side.
    """
    keys = ('method', 'url', 'protocol')
    parts = request.split()
    return {key: part for key, part in zip(keys, parts)}

# The function above corresponds to the following anonymous (lambda) function.
# Shown for illustration only: the expression is not bound to any name here.
lambda request:dict(zip(['method','url','protocol'],request.split()))

映射

 1 import datetime
 2 logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu\
 3 =3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou\
 4 .com/search/spider.html)"'''
 5 
 6 def convert_time(timestr):
 7     return datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
 8 
 9 # lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
10 
11 def get_request(request:str):
12     return dict(zip(['method','url','protocol'],request.split()))
13 
14 # lambda request:dict(zip(['method','url','protocol'],request.split()))
15 
16 names = ('remote','','','datetime','request','status','length','','useragent')
17 ops = (None,None,None,convert_time,get_request,int,int,None,None)
18 
19 def  extract(line):
20     fields = []
21     flag = False
22     tmp = ''
23 
24     #"GET /020/media.html?menu=3 HTTP/1.1"
25     for field in logs.split():
26         if not flag and (field.startswith('[') or field.startswith('"')):
27             if field.endswith(']') or field.endswith('"'):#处理首尾均有[]的字符串
28                 fields.append(field.strip('[]"'))
29             # 处理只有左中括号的字符串，但是该字符串应该与接下类的某一段含有右括号的字符拼接起来[19/Feb/2013:10:23:29
30             else:#
31                 tmp += field[1:]
32                 flag = True
33             continue
34         #处理[19/Feb/2013:10:23:29 +0800]中的+0800]
35         if flag:
36             if field.endswith(']') or field.endswith('"'):
37                 tmp += " " + field[:-1]
38                 fields.append(tmp)
39                 tmp = ''
40                 flag = False
41             else:
42                 tmp +=" " + field
43             continue
44 
45         fields.append(field)#直接加入不带有[]""的字符串
46 
47 # print(fields)
48     info = {}
49     for i,field in enumerate(fields):
50         name = names[i]
51         op = ops[i]
52         if op:
53             info[name] = (op(field))
54     return info
55 
56 print(extract(logs))

二、正则表达式提取

# Positional-group version: names[i] and ops[i] describe the i-th capture group.
# The pattern is a raw string so that \d, \w, \S and \[ reach the regex engine
# without relying on Python passing unknown escape sequences through.
pattern = r'''([\d.]{7,}) - - \[([/\w +:]+)\] "(\w+) (\S+) ([\w/\d.]+)" (\d+) (\d+) .+ "(.+)"'''
# One name per capture group (8 groups): the pattern captures method, url and
# protocol separately, so there is no combined 'request' group.
names = ('remote','datetime','method','url','protocol','status','length','useragent')
# Converter per group; None means the matched text is kept as a string.
ops = (None,lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),None,None,None,int,int,None)

# Named-group version. The pattern is assembled from adjacent raw-string
# literals: a backslash continuation inside one literal would pull the next
# line's leading indentation into the pattern and stop it from matching.
pattern = (
    r'(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[/\w +:]+)\] '
    r'"(?P<method>\w+) (?P<url>\S+) (?P<protocol>[\w/\d.]+)" '
    r'(?P<status>\d+) (?P<length>\d+) .+ "(?P<useragent>.+)"'
)
# Converters keyed by group name; groups not listed here stay strings.
ops = {
    'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
    'status':int,
    'length':int
}

import datetime
import re
logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"'''
# Raw string so that \d, \w, \S and \[ reach the regex engine without relying
# on Python passing unknown escape sequences through.
pattern = r'''(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[\w/ +:]+)\] "(?P<method>\w+) (?P<url>\S+) (?P<protocol>[\w/\d.]+)" (?P<status>\d+) (?P<length>\d+) .+ "(?P<useragent>.+)"'''

# Converters keyed by group name; groups not listed here stay strings.
ops = {
    'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),
    'status':int,
    'length':int
}

# Compiled once at module level so extract() does not recompile per line.
regex = re.compile(pattern)


def extract(line):
    """Parse one access-log line into a dict keyed by the pattern's group names.

    Values for 'datetime', 'status' and 'length' are converted via ``ops``;
    all other groups are returned as the matched strings.

    Raises:
        ValueError: if *line* does not match the log pattern.
    """
    matcher = regex.match(line)
    if matcher is None:
        # regex.match() returns None on failure; report that explicitly
        # instead of failing later with an AttributeError on None.
        raise ValueError('log line does not match pattern: {!r}'.format(line))
    return {
        key: ops[key](value) if key in ops else value
        for key, value in matcher.groupdict().items()
    }


print(extract(logs))

异常处理

小项目一---Python日志分析

猜你喜欢