Python 网站访问日志分析

作业题目:
网站访问日志分析

作业需求:

基本需求 90%
1 统计本日志文件的总pv、uv
2 列出全天每小时的pv、uv数
3 列出top 10 uv的IP地址,以及每个ip的pv点击数
4 列出top 10 访问量最多的页面及每个页面的访问量
5 列出访问来源的设备列表及每个设备的访问量

import re
from collections import Counter

def uv(data):
    """Return every IPv4-looking token found in *data*.

    Args:
        data: Text to scan, typically one access-log line.

    Returns:
        A list of dotted-quad strings in order of appearance, or an empty
        list when there is none. Octets are not range-checked, so a token
        such as "999.1.1.1" is also returned.
    """
    # Raw string: in a plain literal "\d" and "\." are invalid escape
    # sequences, which Python 3.12+ reports as a SyntaxWarning.
    return re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", data)

# Hour-of-day extraction that feeds the per-hour pv statistics.
def hour(data, year="2019"):
    """Return the two-digit hour that follows each "<year>:" marker in *data*.

    Args:
        data: Text to scan, typically one access-log line.
        year: Year that precedes the hour in the timestamp. Defaults to
            "2019", the only year the original pattern recognised.

    Returns:
        A list of two-character hour strings in order of appearance, or an
        empty list when no "<year>:HH" sequence is present.
    """
    # re.escape keeps the year literal; the raw string avoids the invalid
    # "\d" escape that a plain literal would contain.
    pattern = re.escape(str(year)) + r"[:](\d{2})"
    return re.findall(pattern, data)

def html(data):
    """Return the HTTP request portion(s) found in *data*.

    A hit starts at a word followed by " /" (for example "GET /") and runs
    greedily to the last "HTTP/<digit>.<digit>" on the same line.

    Args:
        data: Text to scan, typically one access-log line.

    Returns:
        A list of matched strings such as "GET /index.html HTTP/1.1", or an
        empty list when nothing matches.
    """
    # The version dot is escaped so only a literal "." is accepted; the
    # unescaped form also matched strings like "HTTP/1x1".
    return re.findall(r"\w+ /.*HTTP/\d\.\d", data)

def facility(data):
    """Extract client device descriptors from *data*.

    Each hit begins at "Mozilla/" and extends lazily to the first ")" that
    follows it.

    Returns:
        A list of the matched strings in order of appearance; empty when
        *data* contains no such sequence.
    """
    device_pattern = re.compile("Mozilla/.*?[)]")
    return device_pattern.findall(data)

# Module-level containers filled while the log is parsed and reported on.

# One entry per parsed log record.
pv_all_ip = []  # first IP of every record (its length is the total pv)
uv_ip = []  # distinct IPs in first-seen order (its length is the total uv)
uv_time = []  # click count of each distinct IP
pv_time = {}  # IP -> number of records from that IP
uv_hour = {}  # hour -> number of records in that hour (a pv count, despite the name)
pv_hour = []  # hour string of every record

# Requested pages.
html_visit = []  # requested path of every record that has a request line
html_visit_dic = {}  # requested path -> number of visits

# Client devices.
facility_lst = []  # device string of every record that has one
facility_dic = {}  # device string -> number of visits

# Single pass over the log: every line that carries both an IP address and a
# timestamp hour counts as one page view and feeds the per-record lists.
seen_ips = set()  # O(1) first-occurrence test; uv_ip keeps the first-seen order

# An explicit encoding avoids depending on the platform default, and
# errors="replace" keeps one undecodable byte from aborting the whole run.
# NOTE(review): assumes the log is ASCII/UTF-8 text - confirm for this file.
with open("网站访问日志.txt", "r", encoding="utf-8", errors="replace") as f1:
    for line in f1:  # stream the file instead of loading it with readlines()
        uvv = uv(line)
        pvv_hour = hour(line)
        # Skip lines that are not complete records. Indexing pvv_hour[0] on a
        # line without a timestamp used to raise IndexError; dropping the
        # whole line keeps pv_hour and pv_all_ip index-aligned.
        if not uvv or not pvv_hour:
            continue
        pv_hour.append(pvv_hour[0])

        devices = facility(line)
        if devices:
            facility_lst.append(devices[0])

        requests = html(line)
        if requests:
            # A match looks like "GET /path HTTP/1.1"; field 1 is the path.
            html_visit.append(requests[0].split(" ")[1])

        client_ip = uvv[0]
        if client_ip not in seen_ips:
            seen_ips.add(client_ip)
            uv_ip.append(client_ip)
        pv_all_ip.append(client_ip)

# Overall totals: one pv per parsed record, one uv per distinct IP.
total_pv = len(pv_all_ip)
total_uv = len(uv_ip)
print(f"本日志的总pv数为{total_pv}")
print(f"本日志的总uv数为{total_uv}")

# Per-hour pv and uv report.
# pv_hour[i] and pv_all_ip[i] come from the same record, so pairing them
# groups the IPs by hour directly. Slicing pv_all_ip from the first index of
# an hour is only correct when all records of that hour are contiguous.
ips_by_hour = {}
for ss, ip in zip(pv_hour, pv_all_ip):
    ips_by_hour.setdefault(ss, []).append(ip)

# Hours are zero-padded two-digit strings, so sorted() is chronological;
# iterating a set of strings has no stable order.
for ss in sorted(ips_by_hour):
    hour_ips = ips_by_hour[ss]
    uv_hour[ss] = len(hour_ips)  # despite the name, this is the hour's pv count
    print(f"第{ss}小时的pv数为{uv_hour[ss]}")
    print(f"第{ss}小时的uv数有{len(set(hour_ips))}")

# Click count of every IP, plus the ten IPs with the most clicks.
# Counter tallies all IPs in one pass, replacing a list.count() scan per IP.
ip_hits = Counter(pv_all_ip)
pv_time.update(ip_hits)  # IP -> clicks, in first-seen order
uv_time.extend(ip_hits.values())

# most_common(10) yields each IP exactly once. Matching the top ten counts
# back against the dict printed every tied IP once per tied count.
for k, tt in ip_hits.most_common(10):
    print(f"top10 uv的ip地址分别是:{k},点击数为:{tt}")
print("每个ip的点击数为:",pv_time)

# Visit count of every requested page, plus the ten most visited pages.
# Counter tallies all pages in one pass, replacing a list.count() scan per page.
page_hits = Counter(html_visit)
html_visit_dic.update(page_hits)  # page -> visits
top_pages = page_hits.most_common(10)
html_visit_time = [visits for _, visits in top_pages]  # top counts, descending

# Each page is printed exactly once. Matching the top ten counts back against
# the dict printed every tied page once per tied count.
for k1, tt in top_pages:
    print(f"top10 访问量最多的页面及分别是:{k1},访问量为:{tt}")

# Visit count per client device.
# Counter tallies all devices in one pass, replacing a list.count() scan per
# distinct device.
device_hits = Counter(facility_lst)
facility_dic.update(device_hits)  # device -> visits

# most_common() lists devices from most to fewest visits, so the output order
# no longer depends on set iteration order.
for i, visits in device_hits.most_common():
    print(f"{i}设备的访问量为{visits}")


猜你喜欢

转载自blog.csdn.net/m0_50481455/article/details/112180746