基于python3 爬取糗事百科

最近闲来无事,写一份关于爬虫的文章。

抓取糗事百科
目标:
    1、抓取糗事百科热门帖,获取其**发布者**、**评论(及数量)**、**赞踩(及数量)**等
    2、将信息清洗并打印,循环输出
    3、设计程序,使可以选择抓取的页面范围
    4、将每一页的信息保存到文本

废话不多说 代码如下(具体的每一步都有详细的解释):

# -*-coding:utf-8-*-
import re
import requests
import time

"""初始化查询的网址"""
siteURL = "https://www.qiushibaike.com/"


def replace(x):
    """Strip leftover <br>-style tag fragments from *x* and trim whitespace.

    :param x: raw text extracted from the page HTML
    :return: cleaned text with surrounding whitespace removed
    """
    cleaned = re.sub('<br>|</br>|/>|<br', "", x)
    return cleaned.strip()


def getSource(url):
    """Fetch a page and return its HTML body as text.

    :param url: absolute URL to fetch
    :return: response body as a string (decoded by requests)
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    # Bug fix: the header name must be 'User-Agent'. The original key
    # 'User_agent' is not a real HTTP header, so the server still saw the
    # default python-requests user agent.
    headers = {'User-Agent': user_agent}
    # A timeout keeps the crawler from hanging forever on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    return r.text


def getDetaipage(detailURL):
    """Scrape one listing page and extract the per-post fields.

    :param detailURL: URL of the listing page to parse
    :return: list of tuples
        (author, age, content, vote count, comment count, up, down)
    """
    html = getSource(detailURL)
    post_pattern = re.compile(
        '<div class="author.*?<h2>(.*?)</h2>.*?Icon">(.*?)</div>.*?<div class="content">.*?<span>(.*?)</span>.*?<span.*?stats-vote.*?number">(.*?)</i>.*?stats-comments.*?number">(.*?)</i>.*?up.*?number hidden">(.*?)</span>.*?down.*?number hidden">(.*?)</span>',
        re.S)
    return post_pattern.findall(html)


def saveDetailpage(data):
    """Append one formatted record to the output file qiushibaike.txt.

    :param data: pre-formatted text to persist
    """
    with open("qiushibaike.txt", mode="a+", encoding="utf-8") as out:
        out.write(data)


def OnePage(detailURL):
    """Fetch and parse a single listing page.

    :param detailURL: URL of the page to process
    :return: parsed items for that page
    """
    return getDetaipage(detailURL)


def getAllPage(start, end):
    """Crawl listing pages *start* through *end* (inclusive).

    Page 1 lives at the site root; pages >= 2 use the /8hr/page/N/ URL
    (e.g. https://www.qiushibaike.com/8hr/page/10/).

    Fixes over the original: the two nearly-identical branches are merged,
    and the function now ALWAYS returns a list — the original returned
    ``None`` for out-of-range inputs (e.g. start <= 0), which crashed the
    caller's iteration.

    :param start: first page number (1-based)
    :param end: last page number, inclusive
    :return: combined list of parsed items (empty if the range is empty)
    """
    items = []
    for page in range(start, end + 1):
        print(u'正在获取第', page, u'页的数据...')
        if page == 1:
            detailURL = siteURL
        else:
            detailURL = siteURL + '8hr/page/' + str(page) + '/?s=4964625'
        items += OnePage(detailURL)
        # Pause between requests so the crawler does not hammer the server.
        time.sleep(2)
    print(u'加载结束!')
    return items


def main():
    """Entry point: crawl pages 1-13, then print and persist every post.

    Fixes over the original: removed the redundant ``int(1)`` / ``str``
    casts and the hand-rolled counter (``enumerate`` does it), and guarded
    against a ``None`` return from ``getAllPage`` so iteration cannot crash.
    """
    items = getAllPage(start=1, end=13)
    for number, item in enumerate(items or [], 1):
        data = (str(number) + u'楼' + u'\t楼主:' + replace(item[0]) + u'\t'
                + item[1] + u'岁' + u'\n发言:' + replace(item[2])
                + u'\n好笑:' + item[3] + u'\t评论:' + item[4]
                + u'\t赞:' + item[5] + u'\t踩:' + item[6] + '\n')
        print(data)
        saveDetailpage(data)


if __name__ == "__main__":
    main()

猜你喜欢

转载自my.oschina.net/u/2672404/blog/1609095