使用 Python 解析 Nginx 访问日志

环境说明

python3+

pip install geoip2==2.9.0

nginx日志配置成json格式,配置如下:

# JSON access-log format consumed by the analysis script.
# escape=json (nginx >= 1.11.8) makes nginx emit valid JSON escapes for quotes,
# backslashes and control characters inside variables such as $request and
# $http_user_agent; without it such lines cannot be parsed as JSON.
# Each key is emitted exactly once ("body_bytes_sent" was previously duplicated).
log_format json_log escape=json '{ "time": "$time_local", '
                         '"remote_addr": "$remote_addr", '
                         '"remote_user": "$remote_user", '
                         '"body_bytes_sent": "$body_bytes_sent", '
                         '"request_time": "$request_time", '
                         '"status": "$status", '
                         '"request": "$request", '
                         '"request_method": "$request_method", '
                         '"http_referrer": "$http_referer", '
                         '"http_x_forwarded_for": "$http_x_forwarded_for", '
                         '"http_user_agent": "$http_user_agent"}';
配置日志成json格式

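生成的日志如下(每行一个 JSON 对象,以下为示例,仅列出部分字段):

{ "time": "29/Nov/2018:10:00:00 +0800", "remote_addr": "192.168.1.10", "status": "200", "body_bytes_sent": "612", "request_method": "GET", ... }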

配置脚本

#encoding: utf-8
import os
import sys
import json
from datetime import datetime
from geoip2.database import Reader

# Path of the access log to analyse, taken from the first command-line argument.
# NOTE: this is evaluated at import time, so loading the module without an
# argument raises IndexError.
logfile = sys.argv[1]
# Directory containing this script; stat_region() resolves the GeoIP database
# file relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def stat_days(infile):
    """
    Aggregate per-day statistics from a JSON-lines Nginx access log.

    Every line of ``infile`` is expected to be a JSON object containing at
    least the keys ``time`` (Nginx ``$time_local``, for example
    ``29/Nov/2018:10:00:00 +0800``), ``remote_addr``, ``status`` and
    ``body_bytes_sent``.  Lines that are not valid JSON, or whose fields are
    missing or malformed, are skipped.

    :param infile: path of the UTF-8 encoded log file
    :return: list of ``(day, stats)`` tuples sorted by ``day`` (``YYYY-MM-DD``);
             ``stats`` is ``{'hits': int, 'vistors': {ip: count},
             'status': {code: count}, 'bytes': int}``
    """
    day_data = {}
    with open(infile, 'r', encoding="utf-8") as fhandler:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in fhandler:
            try:
                dict_line = json.loads(line)

                # $time_local ends with a UTC offset (" +0800"); only the local
                # timestamp before the space is needed to derive the day.
                _stamp = dict_line['time'].split(' ', 1)[0]
                _day = datetime.strptime(_stamp, '%d/%b/%Y:%H:%M:%S').strftime('%Y-%m-%d')

                # Read every field before touching the counters so that a bad
                # record cannot leave a half-updated entry behind.
                _ip = dict_line['remote_addr']
                _status = dict_line['status']
                _sent = dict_line['body_bytes_sent']
                # Non-numeric sizes (e.g. "-") count as zero bytes.
                _bytes = int(_sent) if _sent.isdigit() else 0

                _stat = day_data.setdefault(
                    _day, {'hits': 0, 'vistors': {}, 'status': {}, 'bytes': 0})
                _stat['hits'] += 1
                _stat['vistors'][_ip] = _stat['vistors'].get(_ip, 0) + 1
                _stat['status'][_status] = _stat['status'].get(_status, 0) + 1
                _stat['bytes'] += _bytes
            except (ValueError, KeyError, TypeError, AttributeError):
                # Best effort: ignore lines that are not well-formed records.
                # json.JSONDecodeError is a ValueError subclass.
                continue
    return sorted(day_data.items(), key=lambda x: x[0])


def stat_total(days):
    """
    Merge per-day statistics into a single overall summary.

    :param days: iterable of ``(day, stats)`` pairs, where ``stats`` has the
                 keys ``hits``, ``bytes``, ``vistors`` and ``status``
    :return: dict with the same four keys, holding the summed hit count,
             summed byte count, per-IP totals and per-status totals
    """
    hits = 0
    traffic = 0
    visitors = {}
    statuses = {}

    for _, stats in days:
        hits += stats['hits']
        traffic += stats['bytes']

        # Fold this day's per-IP counters into the running totals.
        for addr, count in stats['vistors'].items():
            visitors[addr] = visitors.get(addr, 0) + count

        # Same for the per-status-code counters.
        for code, count in stats['status'].items():
            statuses[code] = statuses.get(code, 0) + count

    return {'hits': hits, 'vistors': visitors, 'status': statuses, 'bytes': traffic}
            


def stat_region(total_data):
    """
    Count hits per geographic region using the MaxMind city database.

    Each IP in ``total_data['vistors']`` is looked up in
    ``<BASE_DIR>/db/GeoLite2-City.mmdb`` and its hit count is added to a
    ``"<country>/<city>"`` bucket built from the zh-CN names (an empty string
    is used for a name that is not available).  IPs whose lookup fails are
    reported with ``print`` and skipped.

    :param total_data: summary dict containing a ``'vistors'`` mapping of
                       IP address to hit count
    :return: dict mapping region name to the total number of hits
    """
    region_data = {}

    # Open the MaxMind mmdb file.
    geoip2_reader = Reader(os.path.join(BASE_DIR, 'db', 'GeoLite2-City.mmdb'))
    try:
        for _ip, _cnt in total_data['vistors'].items():
            try:
                _city = geoip2_reader.city(_ip)

                # NOTE: to report only IPs located in China, skip entries whose
                # country zh-CN name is not '中国' at this point.
                _city_name = '{}/{}'.format(_city.country.names.get('zh-CN', ''), _city.city.names.get('zh-CN', ''))

                # Accumulate the hits of every IP resolved to this region.
                region_data[_city_name] = region_data.get(_city_name, 0) + _cnt
            except Exception as err:
                # Best effort: a failed lookup must not abort the whole report.
                print(err)
    finally:
        # Always release the database file, even if the loop is interrupted.
        geoip2_reader.close()
    return region_data


def formatSize(bytes):
    """
    Render a byte count as a human-readable string in K, M or G.

    The value is converted to float and divided by 1024 until it drops below
    1024 or the largest unit (G) is reached; the result is formatted without
    rounding, e.g. ``formatSize(2048) == "2.0 K"``.

    :param bytes: number of bytes (anything accepted by ``float()``)
    :return: string of the form ``"<value> <unit>"``
    """
    size = float(bytes) / 1024
    unit = "K"
    # Step up one unit at a time while the value still reaches 1024.
    for larger_unit in ("M", "G"):
        if size >= 1024:
            size = size / 1024
            unit = larger_unit
        else:
            break
    return "{} {}".format(size, unit)


def main(infile):
    """
    Analyse the access log at ``infile`` and print a text report.

    The report contains the overall hit count, number of distinct IPs and
    total traffic, followed by the top 15 regions, the top 15 client IPs and
    the hit count of every status code.
    """
    # Collect all statistics first; printing starts only afterwards.
    per_day = stat_days(infile)
    totals = stat_total(per_day)
    regions_by_hits = sorted(stat_region(totals).items(), key=lambda x: x[1], reverse=True)
    status_counts = totals['status']

    visitors = totals['vistors']
    hit_count = totals['hits']
    distinct_ips = len(visitors)
    ips_by_hits = sorted(visitors.items(), key=lambda x: x[1], reverse=True)
    traffic_text = formatSize(totals['bytes'])

    summary_template = """
    总访问量: {}
    总IP数: {}
    总流量: {}

    """
    print(summary_template.format(hit_count, distinct_ips, traffic_text))

    print('\n-------Top 15 地区访问分布-------')
    for region_name, region_hits in regions_by_hits[:15]:
        print("{}:{}".format(region_name, region_hits))

    print('\n-------Top 15 ip访问-------')
    for address, address_hits in ips_by_hits[:15]:
        print("{}         {}".format(address, address_hits))

    print('\n-------状态码情况-------')
    for status_code in status_counts:
        print("{}   {}".format(status_code, status_counts[status_code]))


# Run the report only when executed as a script, using the log path that was
# read from sys.argv at module level.
if __name__ == "__main__":
    main(logfile)
logganalysis.py

猜你喜欢

转载自www.cnblogs.com/sellsa/p/10058790.html