基于BeautifulSoup简单爬虫

Python语法简单、框架多、代码少，用于编写爬虫是一个不错的选择。爬虫是一种模拟HTTP请求获取静态网页，并解析网页的html以得到相应信息的方法。本文针对糗事百科的段子做爬虫练习，并用Pandas作简单统计后写入到csv文件。

请求网页

通过Python标准库urllib模拟HTTP请求，获取网页的html；再分析html标签，针对性地提取对应信息。

import urllib
import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Page number of the "hot" listing to download.
page = 1
fetch_url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Custom User-Agent header sent with the request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
# Build the Request object carrying the custom headers.
req = urllib.request.Request(fetch_url, headers=headers)
# Open the request inside a context manager so the HTTP response is always
# closed, and bound the wait with a timeout so a stalled server cannot hang
# the script indefinitely.
with urlopen(req, timeout=10) as response:
    fetch_res = response.read().decode('utf-8')
soup = BeautifulSoup(fetch_res, features='lxml')
# Pretty-print the parsed page to inspect its structure.
print(soup.prettify())

页面打印如下，基本可以看出网页结构：每个class为article block的div对应一个段子，因此可以先把页面解析成一个div列表，再逐个解析、提取信息。
网页解析

解析html标签

从每个div中解析出作者、内容及评论数，代码片段如下：

def _first_text(tag):
    """Return the stripped first child of *tag* if it is text, else None."""
    if tag is None or not tag.contents:
        return None
    head = tag.contents[0]
    # bs4 text nodes are str subclasses; a nested tag is not usable as text.
    return head.strip() if isinstance(head, str) else None


# Collect every <div> whose class attribute matches "article block".
articles = soup.findAll('div', class_=re.compile('article block'))
contents = []
authors = []
vote_nums = []
for article in articles:
    # The author selector feeds `authors` and the content selector feeds
    # `contents`; the original snippet had the two swapped, which mislabeled
    # the DataFrame columns.
    author = _first_text(article.select_one('div[class="author clearfix"] h2'))
    content = _first_text(article.select_one('div[class="content"] span'))
    try:
        votes = int(_first_text(article.select_one('span[class="stats-vote"] i')))
    except (TypeError, ValueError):
        # Missing or non-numeric counter: treat the entry as malformed.
        votes = None
    # Skip entries lacking any field so the three lists stay aligned.
    if author is None or content is None or votes is None:
        continue
    authors.append(author)
    contents.append(content)
    vote_nums.append(votes)
print("{}, {}, {}".format(len(contents), len(authors), len(vote_nums)))

res_pd = pd.DataFrame({'作者': authors,
                       '内容': contents,
                       '评论数': vote_nums})

# Show the 10 rows with the highest value in the '评论数' column.
res_pd.sort_values('评论数', ascending=False).head(10)

结果

python面向对象整合

根据页码爬取当前页，返回该页段子的DataFrame
将每页段子append写入到文件

import urllib
import re
import os
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

class QsbkText:
    """Interactively crawl the qiushibaike "hot" listing into a CSV file.

    Each page is downloaded, parsed into a DataFrame with the columns
    '作者', '内容' and '评论数', and appended to a CSV file. After every
    page the user is asked whether to continue.
    """

    def __init__(self):
        """Initialise the request headers and the run flag."""
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        # Headers sent with every page request.
        self.headers = {'User-Agent': self.user_agent}
        # True while the crawl loop in start() should keep running.
        self.enable = False

    @staticmethod
    def _first_text(tag):
        """Return the stripped first child of *tag* if it is text, else None."""
        if tag is None or not tag.contents:
            return None
        head = tag.contents[0]
        # bs4 text nodes are str subclasses; a nested tag is not usable as text.
        return head.strip() if isinstance(head, str) else None

    def getPage(self, pageIndex):
        """Download and parse one listing page.

        Args:
            pageIndex: Page number appended to the listing URL.

        Returns:
            A DataFrame with the columns '作者', '内容' and '评论数', or
            None when the page could not be fetched or parsed. Articles
            missing any of the three fields are skipped instead of failing
            the whole page.
        """
        try:
            fetch_url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            req = urllib.request.Request(fetch_url, headers=self.headers)
            # Close the response deterministically and never wait forever on
            # a stalled connection.
            with urlopen(req, timeout=10) as response:
                fetch_html = response.read().decode('utf-8')
            soup = BeautifulSoup(fetch_html, features='lxml')
            articles = soup.findAll('div', class_=re.compile('article block'))
            rows = []
            for article in articles:
                author = self._first_text(
                    article.select_one('div[class="author clearfix"] h2'))
                content = self._first_text(
                    article.select_one('div[class="content"] span'))
                try:
                    votes = int(self._first_text(
                        article.select_one('span[class="stats-vote"] i')))
                except (TypeError, ValueError):
                    # Missing or non-numeric counter: malformed entry.
                    votes = None
                if author is None or content is None or votes is None:
                    continue
                rows.append((author, content, votes))
            return pd.DataFrame(rows, columns=['作者', '内容', '评论数'])
        except Exception as e:
            # Best-effort boundary: report the failure and let the caller
            # decide what to do with the missing page.
            print('连接失败错误原因', e)
            return None

    def readCsv(self, pd_data, fileName):
        """Write *pd_data* to *fileName*, appending if the file exists.

        Despite its name this method writes the CSV. The header row is
        emitted only when the file is created. A None *pd_data* (a failed
        page) is ignored, and the parent directory is created on demand.

        Args:
            pd_data: DataFrame to store, or None.
            fileName: Destination CSV path.
        """
        if pd_data is None:
            return
        directory = os.path.dirname(fileName)
        if directory:
            os.makedirs(directory, exist_ok=True)
        is_new_file = not os.path.isfile(fileName)
        pd_data.to_csv(fileName, mode='w' if is_new_file else 'a',
                       header=is_new_file, index=False, encoding='utf-8')

    def readFromClient(self):
        """Prompt the user; entering "Q" (any case) stops the crawl loop."""
        inputStr = input('please input, continue--"Enter"; exit--"Q" \n')
        if inputStr.strip().upper() == "Q":
            self.enable = False

    def start(self):
        """Crawl pages starting at 1 until the user quits.

        A page that fails to download is not written and is retried on the
        next iteration instead of crashing the loop.
        """
        self.enable = True
        index_of_page = 1
        while self.enable:
            pageData = self.getPage(index_of_page)
            if pageData is not None:
                self.readCsv(pageData, '1.pachong_pract/qsbk_res.csv')
                index_of_page += 1
            self.readFromClient()

# Start the interactive crawl only when executed as a script (or notebook
# cell), so that importing this module has no network or console side effects.
if __name__ == "__main__":
    QsbkText().start()

运行过程，Enter表示继续，Q表示退出，得到的CSV文件如下

csv结果

读取CSV文件，获取评论数最多的20个段子

import pandas as pd
# QsbkText.readCsv writes the file through DataFrame.to_csv without an
# explicit encoding, i.e. pandas' default UTF-8, so it has to be read back as
# UTF-8; decoding it as GBK fails or garbles the Chinese text.
read_data = pd.read_csv('1.pachong_pract/qsbk_res.csv', encoding='utf-8', delimiter=",")
# Show the 20 rows with the highest value in the '评论数' column.
read_data.sort_values('评论数', ascending=False).head(20)

前20名段子

基于BeautifulSoup简单爬虫

请求网页

解析html标签

python面向对象整合

猜你喜欢