引
帮朋友写了个爬虫,出于不知名的原因直接把html页面给我,0.0 ,那更方便,直接解析了。
思路
流程:读html文件,解析html文件,获取数据,写入excel。
过程及结果示意图
1)html和页面图片
2)列表,要写进excel的内容:爬取访客ID,地域,停留时间,搜索词,旺铺行为和网站行为。
3)html
4)excel效果图
代码
#coding:utf-8
import re
import os
from bs4 import BeautifulSoup as bs
import xlwt
class Visitor():
    """One visitor record parsed from the exported HTML report.

    Plain data holder; the parsing loop fills most fields in after
    construction.
    """

    def __init__(self, vid=0, country='', staySecond=0,
                 keywords=None, viewNum=0, viewAddress='',
                 webAction='', minisiteAction='', vtime=''):
        # Bug fix: the original wrote `self.vid = vid,` (trailing comma)
        # for almost every field, turning each attribute into a 1-tuple.
        self.vid = vid                      # visitor id
        self.country = country              # buyer country / region
        self.staySecond = staySecond        # stay time on the page
        # Avoid the mutable-default-argument pitfall while keeping the
        # original default value of ['',].
        self.keywords = ['', ] if keywords is None else keywords
        self.viewNum = viewNum
        self.viewAddress = viewAddress
        # These two were accepted as parameters but never assigned,
        # so a bare Visitor() lacked the attributes entirely.
        self.webAction = webAction          # website actions text
        self.minisiteAction = minisiteAction  # minisite actions text
        self.vtime = vtime
visitors = []
# --- Read and parse every exported html (.txt) file ---
directory = 'e:\\python-workspace\\wang\\'  # folder holding the exported html files
for root, dirs, files in os.walk(directory):
    for file_name in files:
        # Bug fix: re.match('.*.txt', ...) left the dot unescaped, so it
        # also matched names like 'footxtbar'; a real suffix test is what
        # was intended.
        if not file_name.endswith('.txt'):
            continue
        filename = root + '\\' + file_name
        print('正在处理:' + filename)
        # NOTE(review): no explicit encoding — relies on the Windows
        # locale default (likely gbk for these Chinese pages); confirm
        # before adding encoding='utf-8'.
        with open(filename, 'r') as f:
            file = f.read()
        # Parse the html
        soup = bs(file, "html.parser")
        for vs in soup.find_all(attrs={'name': 'visitor-check'}):
            v = Visitor()
            # Visitor id / region / stay time come straight from attributes
            v.vid = vs.attrs['visitorid']
            v.country = vs.attrs['buyercountry']
            v.staySecond = vs.attrs['staysecond']
            # The search words are escaped html stored inside an attribute,
            # so they need a second (and third) parse pass.
            wordsHtml = vs.attrs['totalwords']
            wordsoup = bs(wordsHtml, "html.parser")
            v.keywords = ['', '', '', '', '']
            if wordsHtml != '-' and wordsHtml != '':
                wordsHtml = wordsoup.div['data-text']
                keysoup = bs(wordsHtml, "html.parser")
                keywordsTags = keysoup.find_all('div')
                # Keep at most the first 5 search words
                for index, tag in enumerate(keywordsTags[:5]):
                    # Robustness: tag.string is None for nested markup;
                    # store '' so later string concatenation cannot crash.
                    v.keywords[index] = tag.string or ''
            # Minisite / website actions live in following cells
            minisite = vs.find_next(attrs={'class': 'td-minisite-active'})
            v.minisiteAction = minisite.get_text()
            website = vs.find_next(attrs={'class': 'td-website-active'})
            v.webAction = website.get_text()
            visitors.append(v)
# --- Write the collected visitors to Excel ---
heads = ['编号', '访客', '地区', '停留时间',
         '常用搜索词',
         '旺铺行为', '网站行为']
workbook = xlwt.Workbook()
sheet = workbook.add_sheet('sheet')
# Header row (row 0)
for col in range(len(heads)):
    sheet.write(0, col, heads[col])
# Bug fix: the original iterated range(1, len(visitors)) with a separate
# vindex counter, which silently dropped the LAST visitor. enumerate
# starting at row 1 writes every record and removes the counter.
for row, tempv in enumerate(visitors, start=1):
    sheet.write(row, 0, row)               # running number
    sheet.write(row, 1, tempv.vid)         # visitor id
    sheet.write(row, 2, tempv.country)     # country / region
    sheet.write(row, 3, tempv.staySecond)  # stay time
    # All keywords go into one cell, one per line (same format as the
    # original loop produced); `or ''` guards against None entries so
    # the concatenation cannot crash.
    wordstring = ''.join((w or '') + '\n' for w in tempv.keywords)
    sheet.write(row, 4, wordstring)
    sheet.write(row, 5, tempv.minisiteAction)  # minisite actions
    sheet.write(row, 6, tempv.webAction)       # website actions
workbook.save('e:\\python-workspace\\wang\\wgsb.xls')  # output file path
注:
1)读文件用with open和f.read()
2)BeautifulSoup的用法可以查询API
3)写Excel用xlwt库