python解析nginx access日志

使用 Python 从 Kafka 消费 nginx access 日志并逐行解析,按分钟聚合后通过 Django ORM 把统计结果写入数据库。

#coding=utf-8
import ast
import os
import sys
import time
import traceback

from kafka import KafkaConsumer


class nginx_tracker(object):
    """Aggregate nginx access-log lines from Kafka into per-minute statistics.

    Lines are read from the ``nginx-log-detector`` topic by :meth:`consumer`,
    parsed and bucketed by the minute of their log timestamp in
    :meth:`process_log`, and persisted through the ``nginx_log`` model by
    :meth:`write_db` each time a line from a newer minute shows up.

    NOTE(review): ``gettime`` and ``nginx_log`` are used below but are neither
    defined nor imported in the visible part of this file; they have to be
    supplied by the surrounding module -- confirm before running.
    """

    # (exclusive upper bound in milliseconds, stats key) in ascending order.
    # Anything at or above the last bound is counted as 'response_time_ge_10000'.
    _LATENCY_BUCKETS = (
        (300, 'response_time_300'),
        (500, 'response_time_500'),
        (1000, 'response_time_1000'),
        (3000, 'response_time_3000'),
        (5000, 'response_time_5000'),
        (10000, 'response_time_10000'),
    )

    # (stats key, nginx_log column) for every plain additive counter.
    _COUNTER_COLUMNS = (
        ('pv', 'pv'),
        ('2xx', 'code_2xx'),
        ('3xx', 'code_3xx'),
        ('4xx', 'code_4xx'),
        ('5xx', 'code_5xx'),
        ('other', 'code_other'),
        ('response_time_300', 'response_time_300'),
        ('response_time_500', 'response_time_500'),
        ('response_time_1000', 'response_time_1000'),
        ('response_time_3000', 'response_time_3000'),
        ('response_time_5000', 'response_time_5000'),
        ('response_time_10000', 'response_time_10000'),
        ('response_time_ge_10000', 'response_time_ge_10000'),
    )

    def __init__(self):
        # Per-minute statistics keyed by the minute's epoch timestamp.
        self.data = {}

        self.applog_kafka_prd = [
            '192.168.1.100:9092',
        ]

        self.applog_zookeeper_prd = [
            '192.168.1.100:2181',
        ]
        # Messages seen since the last flush (used for progress output only).
        self.msg_num = 0
        # Minute (epoch seconds) of the newest log line processed so far.
        self.now = 0
        # Wall-clock minute for which progress was last printed.
        self.count_time = 0
        # Not used anywhere in the visible code.
        self.pretime = 0

    @staticmethod
    def _new_minute_stats():
        """Return a zeroed statistics record for one minute."""
        return {
            'pv': 0,
            # A set keeps the per-line uniqueness check O(1); only its size
            # (the UV count) is ever read back.
            'ip_list': set(),
            '2xx': 0,
            '3xx': 0,
            '4xx': 0,
            '5xx': 0,
            'other': 0,
            'request_time': 0,
            'response_time_300': 0,
            'response_time_500': 0,
            'response_time_1000': 0,
            'response_time_3000': 0,
            'response_time_5000': 0,
            'response_time_10000': 0,
            'response_time_ge_10000': 0,
            'detail_5xx': [],
            'detail_request_time': [],
        }

    def consumer(self):
        """Consume the Kafka topic forever, passing each message to process_log.

        Prints the running message counter once per wall-clock minute.
        """
        broker_list = self.applog_kafka_prd
        topic = "nginx-log-detector"
        kafka = KafkaConsumer(
            topic,
            metadata_broker_list=broker_list,
            group_id='nginx_monitor2',
            auto_commit_enable=True,
            auto_commit_interval_ms=5000,
            auto_offset_reset='smallest',
        )
        # Expected log line format, e.g.:
        #   118.249.144.105 - - [05/Apr/2016:09:06:29 +0800] "GET /ywr/http/common/Login?hostSessionId=1401cbdebe7a42db99e66319f782cea8 HTTP/1.1" "200" 73 "-" "Dalvik/1.6.0 (Linux; U; Android 4.4.2; SM-N9002 Build/KOT49H)" "-" "" "0.79" 172.29.4.183:30074 730 465 "0.380"
        while True:
            for m in kafka:
                self.msg_num += 1
                self.process_log(m.value)
                kafka.task_done(m)
                now_time = time.time()
                now_time -= now_time % 60
                if self.count_time < now_time:
                    self.count_time = now_time
                    # Single-argument print(...) behaves the same on Python 2.
                    print(gettime(time.time()) + " " + str(self.msg_num))

    def process_log(self, msg):
        """Parse one access-log line and fold it into the per-minute statistics.

        The line's timestamp is truncated to the minute. When it is newer than
        the newest minute seen so far, everything buffered is flushed with
        :meth:`write_db` and the buffer is reset before this line is counted.

        Any ``Exception`` raised while parsing or flushing is printed together
        with the offending line and then dropped, so one bad line cannot stop
        the consumer. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
        not swallowed.

        :param msg: one raw access-log line.
        """
        try:
            # Convert the log time to an epoch timestamp so minutes compare
            # numerically. The "+0800" offset is not parsed and time.mktime
            # interprets the value as local time.
            stamp = msg.split(' ', 4)[3].replace("[", "")
            msg_time = int(time.mktime(time.strptime(stamp, "%d/%b/%Y:%H:%M:%S")))
            msg_time -= msg_time % 60

            # A newer minute has started: persist what is buffered, start over.
            if msg_time > self.now:
                print("%s %s write_db %d" % (gettime(time.time()), gettime(msg_time), self.msg_num))
                self.write_db()
                self.now = msg_time
                self.data = {}
                self.msg_num = 0

            # Extract every field before touching the counters so that a
            # malformed line cannot leave a half-updated record behind.
            remote_ip = msg.split(" ", 1)[0]
            # Status code is the second double-quoted field; slicing instead
            # of indexing means an empty status lands in 'other'.
            status_class = msg.split('"', 4)[3][:1]
            # Last field is the quoted request time in seconds -> milliseconds.
            request_time = float(msg.split()[-1].replace('"', '')) * 1000

            if msg_time not in self.data:
                self.data[msg_time] = self._new_minute_stats()
            stats = self.data[msg_time]

            stats['pv'] += 1
            stats['ip_list'].add(remote_ip)
            if status_class in ('2', '3', '4', '5'):
                stats[status_class + "xx"] += 1
            else:
                stats['other'] += 1
            # Running total; write_db turns it into an average.
            stats['request_time'] += request_time

            for upper_ms, bucket in self._LATENCY_BUCKETS:
                if request_time < upper_ms:
                    stats[bucket] += 1
                    break
            else:
                # >= 10000 ms; exactly 10000 previously fell into no bucket.
                stats['response_time_ge_10000'] += 1

            if status_class == '5':
                stats['detail_5xx'].append(msg)
            if request_time > 3000:
                stats['detail_request_time'].append(msg)
        except Exception:
            traceback.print_exc()
            print(msg)

    def write_db(self):
        """Persist every buffered minute through the ``nginx_log`` model.

        A minute that already has a row (late-arriving data) is merged into
        it: counters are added, the average request time is re-weighted by
        page views, and the detail samples are concatenated and capped at 10.
        Otherwise a new row is created.
        """
        for each_time, info in self.data.items():
            # Late data for a minute that was already written: merge into it.
            if nginx_log.objects.filter(time=each_time):
                now_result = nginx_log.objects.get(time=each_time)
                values = {
                    column: info[key] + getattr(now_result, column)
                    for key, column in self._COUNTER_COLUMNS
                }
                # PV-weighted mean of the stored average and the new total.
                values['request_time'] = (
                    (info['request_time'] + now_result.pv * now_result.request_time)
                    / (info['pv'] + now_result.pv)
                )
                # The detail columns hold str(list) of raw request lines.
                # They used to be parsed with eval(), which would execute
                # whatever expression is stored in the row; literal_eval
                # only accepts Python literals.
                values['detail_5xx'] = str(
                    (ast.literal_eval(now_result.detail_5xx) + info['detail_5xx'])[0:10]
                )
                values['detail_request_time'] = str(
                    (ast.literal_eval(now_result.detail_request_time)
                     + info['detail_request_time'])[0:10]
                )
                # uv is left untouched: only the count is stored, so late IPs
                # cannot be de-duplicated against the ones already counted.
                nginx_log.objects.filter(time=each_time).update(**values)
            else:
                # First data for this minute: insert a new row.
                values = {
                    column: info[key] for key, column in self._COUNTER_COLUMNS
                }
                values['time'] = each_time
                values['uv'] = len(info['ip_list'])
                values['request_time'] = info['request_time'] / info['pv']
                values['detail_5xx'] = str(info['detail_5xx'][0:10])
                values['detail_request_time'] = str(info['detail_request_time'][0:10])
                nginx_log.objects.create(**values)

if __name__ == "__main__":
    # Script entry point: build the tracker and block in its consume loop.
    tracker = nginx_tracker()
    tracker.consumer()

猜你喜欢

转载自blog.csdn.net/kong2030/article/details/81327362