python - BeautifulSoup解析html页面

帮朋友写了个爬虫,出于不知名的原因直接把html页面给我,0.0 ,那更方便,直接解析了。

思路

流程:读html文件,解析html文件,获取数据,写入excel。

过程及结果示意图

1)html和页面图片
在这里插入图片描述
2)列表,要写进excel的内容:爬访客ID,地域,停留时间,搜索词,旺铺行为和网站行为。
在这里插入图片描述
3)html
在这里插入图片描述
4)excel效果图
在这里插入图片描述

代码

#coding:utf-8
import re
import os
from bs4 import BeautifulSoup as bs
import xlwt

class Visitor():
    """One visitor record parsed from the saved HTML page.

    Fixes over the original:
    - trailing commas (``self.vid = vid,``) accidentally stored 1-tuples
      instead of plain values for most attributes;
    - the ``webAction`` and ``minisiteAction`` parameters were accepted
      but never assigned to the instance;
    - ``keywords`` used a mutable default argument shared across calls.
    """
    def __init__(self, vid=0, country='', staySecond=0,
                 keywords=None, viewNum=0, viewAddress='',
                 webAction='', minisiteAction='', vtime=''):
        self.vid = vid                  # visitor id
        self.country = country          # visitor region/country
        self.staySecond = staySecond    # stay time in seconds
        # fresh list per instance; default mirrors the original ['',]
        self.keywords = [''] if keywords is None else keywords
        self.viewNum = viewNum          # number of views
        self.viewAddress = viewAddress  # viewed address
        self.webAction = webAction      # website behaviour (was dropped before)
        self.minisiteAction = minisiteAction  # mini-site behaviour (was dropped before)
        self.vtime = vtime              # visit time

visitors = []

# -- Read and parse every saved-HTML (.txt) file under the folder --------
directory = 'e:\\python-workspace\\wang\\'  # folder holding the .txt html dumps

# Compile once and anchor the pattern. The original '.*.txt' left the dot
# unescaped and unanchored, so names like 'footxt' or 'a.txt.bak' matched.
txt_pattern = re.compile(r'.*\.txt$')

for root, dirs, files in os.walk(directory):
    for file_name in files:
        if not txt_pattern.match(file_name):
            continue
        filename = os.path.join(root, file_name)  # portable path join
        print('正在处理:' + filename)
        # NOTE(review): opened with the platform-default encoding, as the
        # original did — confirm whether the dumps are utf-8 or gbk and
        # pass encoding= explicitly if decoding errors appear.
        with open(filename, 'r') as f:
            file = f.read()

        # Parse the HTML: each visitor row is an element carrying
        # name="visitor-check" whose attributes hold the data we need.
        soup = bs(file, "html.parser")
        for vs in soup.find_all(attrs={'name': 'visitor-check'}):
            v = Visitor()
            # visitor id
            v.vid = vs.attrs['visitorid']
            # region
            v.country = vs.attrs['buyercountry']
            # stay time
            v.staySecond = vs.attrs['staysecond']

            # Common search keywords: the 'totalwords' attribute contains
            # escaped HTML, hence the nested re-parsing below.
            wordsHtml = vs.attrs['totalwords']
            wordsoup = bs(wordsHtml, "html.parser")
            v.keywords = ['', '', '', '', '']
            if wordsHtml != '-' and wordsHtml != '':
                wordsHtml = wordsoup.div['data-text']
                keysoup = bs(wordsHtml, "html.parser")
                keywordsTags = keysoup.find_all('div')
                # keep at most the first 5 keywords
                for index, tag in enumerate(keywordsTags[:5]):
                    v.keywords[index] = tag.string

            # mini-site behaviour cell
            minisite = vs.find_next(attrs={'class': 'td-minisite-active'})
            v.minisiteAction = minisite.get_text()

            # website behaviour cell
            website = vs.find_next(attrs={'class': 'td-website-active'})
            v.webAction = website.get_text()

            visitors.append(v)

# -- Write the collected visitors to an Excel sheet ----------------------
heads = ['编号', '访客', '地区', '停留时间',
         '常用搜索词',
         '旺铺行为', '网站行为']

workbook = xlwt.Workbook()
sheet = workbook.add_sheet('sheet')

# header row (row 0)
for col, head in enumerate(heads):
    sheet.write(0, col, head)

# One spreadsheet row per visitor. enumerate(..., start=1) fixes the
# original off-by-one: range(1, len(visitors)) silently dropped the
# last visitor from the output.
for row, tempv in enumerate(visitors, start=1):
    sheet.write(row, 0, row)                  # sequence number
    sheet.write(row, 1, tempv.vid)            # visitor id
    sheet.write(row, 2, tempv.country)        # region
    sheet.write(row, 3, tempv.staySecond)     # stay time
    # all keywords in one cell, one per line (str.join instead of
    # repeated string concatenation)
    sheet.write(row, 4, '\n'.join(tempv.keywords))
    sheet.write(row, 5, tempv.minisiteAction)  # mini-site behaviour
    sheet.write(row, 6, tempv.webAction)       # website behaviour

# Save once, after all rows are written. The original saved inside the
# loop (rewriting the file every iteration) and therefore never created
# the file at all when `visitors` was empty.
workbook.save('e:\\python-workspace\\wang\\wgsb.xls')  # output path

注:
1)读文件用with open和f.read()
2)BeautifulSoup的用法可以查询API
3)写Excel用xlwt库

发布了52 篇原创文章 · 获赞 16 · 访问量 6万+

猜你喜欢

转载自blog.csdn.net/Tuzi294/article/details/89578158
今日推荐