作业题目:
网站访问日志分析
作业需求:
基本需求 90%
1 统计本日志文件的总pv、uv
2 列出全天每小时的pv、uv数
3 列出top 10 uv的IP地址,以及每个ip的pv点击数
4 列出top 10 访问量最多的页面及每个页面的访问量
5 列出访问来源的设备列表及每个设备的访问量
import re
from collections import Counter, defaultdict
def uv(data):
    """Return every dotted-quad (IPv4-looking) token found in ``data``.

    Args:
        data: Text to scan, typically one line of the access log.

    Returns:
        A list of the matched strings in order of appearance; an empty
        list when nothing matches.  Octets are not range-checked, so a
        token such as ``999.1.1.1`` is also returned.
    """
    # Raw string: "\d" and "\." inside a normal literal are invalid escape
    # sequences and raise a SyntaxWarning on recent Python versions.
    return re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", data)
# Hour extraction used for the per-hour PV statistics.
def hour(data, year="2019"):
    """Return the two digits that follow ``<year>:`` for every match in ``data``.

    In a timestamp such as ``21/Mar/2019:09:15:01`` the two digits right
    after ``2019:`` are the hour, so this yields ``["09"]``.

    Args:
        data: Text to scan, typically one line of the access log.
        year: Year that precedes the colon.  Defaults to ``"2019"``, the
            value that used to be hard-coded; it is converted with
            ``str()`` so an int is accepted as well.

    Returns:
        A list of two-character digit strings in order of appearance;
        empty when nothing matches.
    """
    # Raw string avoids the invalid "\d" escape; re.escape keeps any regex
    # metacharacter in ``year`` literal.
    pattern = rf"{re.escape(str(year))}:(\d{{2}})"
    return re.findall(pattern, data)
def html(data):
    """Return the request-line fragments (``METHOD /path HTTP/x.y``) in ``data``.

    Args:
        data: Text to scan, typically one line of the access log.

    Returns:
        A list of matched strings; empty when nothing matches.  Each match
        starts with a run of word characters followed by `` /`` and, because
        ``.*`` is greedy, extends to the last ``HTTP/<digit>.<digit>`` on the
        line.
    """
    # Raw string avoids invalid escapes, and the version separator is now an
    # escaped literal dot; the previous bare "." accepted any character
    # (e.g. "HTTP/1x1").
    return re.findall(r"\w+ /.*HTTP/\d\.\d", data)
def facility(data):
    """Return the ``Mozilla/...)`` user-agent prefixes found in ``data``.

    Each match starts at ``Mozilla/`` and stops at the first closing
    parenthesis after it (non-greedy), e.g.
    ``Mozilla/5.0 (Windows NT 10.0; Win64; x64)``.

    Args:
        data: Text to scan, typically one line of the access log.

    Returns:
        A list of matched strings in order of appearance; empty when the
        text contains no such prefix.
    """
    agent_prefix = re.compile("Mozilla/.*?[)]")
    return agent_prefix.findall(data)
# ---------------------------------------------------------------------------
# Access-log report: total PV/UV, per-hour PV/UV, top-10 IPs, top-10 pages
# and per-device request counts.
#
# Relies on the extractor functions uv(), hour(), html() and facility()
# defined earlier in this file.  A line counts as one page view (PV) when it
# contains at least one IP-looking token; the first such token on the line is
# used as the client IP.  Lines without one are ignored.
# ---------------------------------------------------------------------------

# Raw per-request data collected while reading the log.
pv_all_ip = []     # client IP of every counted request (one entry per PV)
pv_hour = []       # hour string of every counted request that had a timestamp
html_visit = []    # requested path of every counted request with a request line
facility_lst = []  # user-agent prefix of every counted request that had one
hour_ips = defaultdict(set)  # hour string -> distinct client IPs in that hour

# errors="replace" keeps a stray undecodable byte from aborting the whole run;
# the fields parsed here are plain ASCII.
with open("网站访问日志.txt", "r", encoding="utf-8", errors="replace") as f1:
    # Iterate the file lazily instead of loading it all with readlines().
    for line in f1:
        ips = uv(line)
        if not ips:
            continue
        client_ip = ips[0]
        pv_all_ip.append(client_ip)

        # A line may carry an IP but no matching timestamp; it still counts
        # towards the totals but is left out of the hourly breakdown instead
        # of raising IndexError.
        hours = hour(line)
        if hours:
            pv_hour.append(hours[0])
            hour_ips[hours[0]].add(client_ip)

        devices = facility(line)
        if devices:
            facility_lst.append(devices[0])

        requests = html(line)
        if requests:
            # "METHOD /path HTTP/x.y" -> the second token is the path.
            html_visit.append(requests[0].split(" ")[1])

# Per-IP statistics.  Counter is built in one pass and keeps first-seen order.
ip_counter = Counter(pv_all_ip)
pv_time = dict(ip_counter)        # IP -> PV count
uv_ip = list(pv_time)             # distinct IPs, first-seen order
uv_time = list(pv_time.values())  # PV counts, parallel to uv_ip

print(f"本日志的总pv数为{len(pv_all_ip)}")
print(f"本日志的总uv数为{len(uv_ip)}")

# Per-hour statistics.  uv_hour keeps its historical name but, as before,
# holds the PV count of each hour; the distinct IPs per hour are in hour_ips,
# which stays correct even when the log lines are not ordered by time.
uv_hour = dict(Counter(pv_hour))
for ss in sorted(uv_hour):
    print(f"第{ss}小时的pv数为{uv_hour[ss]}")
    print(f"第{ss}小时的uv数有{len(hour_ips[ss])}")

# Top 10 IPs by PV count.  most_common() yields each IP exactly once, so tied
# counts no longer produce duplicated output lines.
for k, tt in ip_counter.most_common(10):
    print(f"top10 uv的ip地址分别是:{k},点击数为:{tt}")
print("每个ip的点击数为:", pv_time)

# Top 10 most requested pages.
page_counter = Counter(html_visit)
html_visit_dic = dict(page_counter)  # path -> request count
top_pages = page_counter.most_common(10)
html_visit_time = [count for _, count in top_pages]  # top counts, descending
for k1, tt in top_pages:
    print(f"top10 访问量最多的页面及分别是:{k1},访问量为:{tt}")

# Requests per device (user-agent prefix), most frequent first.
device_counter = Counter(facility_lst)
facility_dic = dict(device_counter)  # user-agent prefix -> request count
for i, device_hits in device_counter.most_common():
    print(f"{i}设备的访问量为{device_hits}")