爬虫实战(二):爬取糗事百科段子

源代码为:

import json
import re
import time
from urllib.request import Request, urlopen

import requests
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A desktop-browser User-Agent header is sent so the site does not
    reject the request as an obvious script.
    """
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    req = Request(url, headers=ua_headers)
    raw = urlopen(req).read()
    return raw.decode('utf-8')
def write_to_file(content):
    """Append *content* to ``duanzi.txt`` as one JSON value per line.

    ``ensure_ascii=False`` keeps Chinese text readable in the file.

    Bug fixes vs. the original:
    - ``json`` was used but never imported (NameError on first call);
      it is now imported at module level.
    - ``encoding='utf=8'`` was a typo for ``'utf-8'`` (it only worked
      by accident of Python's codec-name normalization).
    """
    with open('duanzi.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def getText(pageNum=1):
    """Scrape and print jokes from the first *pageNum* "hot" pages of
    qiushibaike.com.

    For each joke, three fields are printed in order — author, joke
    text, vote count — followed by a dashed separator line.

    Args:
        pageNum: number of pages to fetch, starting from page 1.

    Bug fix vs. the original: the display loop iterated the accumulated
    list of ALL pages *inside* the page loop, so page 1's jokes were
    reprinted for every subsequent page. Each page's items are now
    printed exactly once.
    """
    # Captures (author, joke text, vote count) per item; compiled once,
    # outside the loop, instead of once per page.
    pattern = re.compile(
        '<div class="article block untagged mb15.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<span class="stats-vote"><i class="number">(.*?)</i>',
        re.S)
    for page in range(1, pageNum + 1):
        url = 'https://www.qiushibaike.com/hot/page/' + str(page)
        html = getHtml(url)
        time.sleep(1)  # be polite: pause between page requests
        items = re.findall(pattern, html)
        for item in items:
            for count, field in enumerate(item, 1):
                field = field.strip('\n')  # drop stray leading/trailing newlines
                # <br/> marks paragraph breaks in the HTML; restore them
                # as real newlines for readable console output.
                field = field.replace('<br/>', '\n')
                print(field)
                if count % 3 == 0:  # separator after the 3rd field of each joke
                    print('----' * 20)


if __name__ == '__main__':
    try:
        num = int(input('请输入你想要爬取的页面数量:'))
        getText(num)
    except Exception as e:
        # Top-level boundary: still apologize, but report WHAT failed
        # instead of silently swallowing the error detail as the
        # original did.
        print("对不住,出错了!", e)




效果图:

猜你喜欢

转载自blog.csdn.net/qq_41940950/article/details/81231557