日志分析
概述
分析的前提
半结构化数据
文本分析
提取数据
一、空格分隔
with open('xxx.log')as f: for line in f: for field in line.split(): print(field)
#注意这里拼接的一些技巧 logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu\ =3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou\ .com/search/spider.html)"''' fields = [] flag = False tmp = '' #注意拼接"GET /020/media.html?menu=3 HTTP/1.1"这种字符串需借助标记变量! for field in logs.split(): if not flag and (field.startswith('[') or field.startswith('"')): if field.endswith(']') or field.endswith('"'):#处理首尾均有[]的字符串 fields.append(field.strip('[]"')) # 处理只有左中括号的字符串,但是该字符串应该与接下类的某一段含有右括号的字符拼接起来[19/Feb/2013:10:23:29 else:# tmp += field[1:] flag = True continue #处理[19/Feb/2013:10:23:29 +0800]中的+0800] if flag: if field.endswith(']') or field.endswith('"'): tmp += " " + field[:-1] fields.append(tmp) tmp = '' flag = False else: tmp +=" " + field continue fields.append(field)#直接加入不带有[]""的字符串
类型转换
import datetime def convert_time(timestr): return datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z') #若上面的函数可简写成匿名函数形式 lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z')
请求信息的解析
def get_request(request:str): return dict(zip(['method','url','protocol'],request.split())) #上面的函数对应为如下匿名函数 lambda request:dict(zip(['method','url','protocol'],request.split()))
映射
1 import datetime 2 logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu\ 3 =3 HTTP/1.1" 200 16691 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou\ 4 .com/search/spider.html)"''' 5 6 def convert_time(timestr): 7 return datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z') 8 9 # lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z') 10 11 def get_request(request:str): 12 return dict(zip(['method','url','protocol'],request.split())) 13 14 # lambda request:dict(zip(['method','url','protocol'],request.split())) 15 16 names = ('remote','','','datetime','request','status','length','','useragent') 17 ops = (None,None,None,convert_time,get_request,int,int,None,None) 18 19 def extract(line): 20 fields = [] 21 flag = False 22 tmp = '' 23 24 #"GET /020/media.html?menu=3 HTTP/1.1" 25 for field in logs.split(): 26 if not flag and (field.startswith('[') or field.startswith('"')): 27 if field.endswith(']') or field.endswith('"'):#处理首尾均有[]的字符串 28 fields.append(field.strip('[]"')) 29 # 处理只有左中括号的字符串,但是该字符串应该与接下类的某一段含有右括号的字符拼接起来[19/Feb/2013:10:23:29 30 else:# 31 tmp += field[1:] 32 flag = True 33 continue 34 #处理[19/Feb/2013:10:23:29 +0800]中的+0800] 35 if flag: 36 if field.endswith(']') or field.endswith('"'): 37 tmp += " " + field[:-1] 38 fields.append(tmp) 39 tmp = '' 40 flag = False 41 else: 42 tmp +=" " + field 43 continue 44 45 fields.append(field)#直接加入不带有[]""的字符串 46 47 # print(fields) 48 info = {} 49 for i,field in enumerate(fields): 50 name = names[i] 51 op = ops[i] 52 if op: 53 info[name] = (op(field)) 54 return info 55 56 print(extract(logs))
二、正则表达式提取
pattern = '''([\d.]{7,}) - - \[([/\w +:]+)\] "(\w+) (\S+) ([\w/\d.]+)" (\d+) (\d+) .+ "(.+)"''' names = ('remote','datetime','request','method','url','ptorocol','status','length','useragent') ops = (None,lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'),None,None,None,int,int,None)
pattern = '''(?P<remote>[\d.]{7,}) - - \[([/\w +:]+)\] \ "(?P<method>\w+) (?P<url>\S+) (?P<protocol>[\w/\d.]+)"\ (?P<status>\d+) (?P<length>\d+) .+ "(?PM<useragent>.+)"''' ops = { 'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'), 'status':int, 'length':int }
import datetime import re logs = '''138.60.212.153 - - [19/Feb/2013:10:23:29 +0800] "GET /020/media.html?menu=3 HTTP/1.1" 200 16997 "-" "Mozilla/5.0 (compatible; EasouSpider; +http://www.easou.com/search/spider.html)"''' pattern = '''(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[\w/ +:]+)\] "(?P<method>\w+) (?P<url>\S+) (?P<protocol>[\w/\d.]+)" (?P<status>\d+) (?P<length>\d+) .+ "(?P<useragent>.+)"''' ops = { 'datetime': lambda timestr:datetime.datetime.strptime(timestr,'%d/%b/%Y:%H:%M:%S %z'), 'status':int, 'length':int } regex = re.compile(pattern) def extract(line): matcher = regex.match(line) info = {k:ops.get(k,lambda x:x)(v) for k,v in matcher.groupdict().items()} return info print(extract(logs))
异常处理