The source code is:
from urllib.request import Request, urlopen
import json
import re
import time


def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    A desktop-browser User-Agent is sent so the site does not reject the
    crawler's default Python UA.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }  # spoofed browser headers
    request = Request(url, headers=headers)
    response = urlopen(request)
    html = response.read().decode('utf-8')
    return html


def write_to_file(content):
    """Append *content* to duanzi.txt as one JSON-encoded line.

    Fixes from the original: encoding was misspelled 'utf=8' (would raise
    LookupError), and `json` was used without being imported.
    """
    with open('duanzi.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def getText(pageNum=1):
    """Scrape *pageNum* pages of jokes from qiushibaike and print them.

    Each matched item is a (author, body, vote-count) triple; a separator
    line is printed after every complete triple.
    """
    text_list = []
    for page in range(1, pageNum + 1):
        url = 'https://www.qiushibaike.com/hot/page/' + str(page)
        html = getHtml(url)
        time.sleep(1)  # throttle requests to be polite to the server
        pattern = re.compile(
            '<div class="article block untagged mb15.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<span class="stats-vote"><i class="number">(.*?)</i>',
            re.S)
        items = re.findall(pattern, html)
        # Collect the matches of every page into text_list.
        text_list.append(items)
    for each_items in text_list:  # iterate over each page's jokes
        for item in each_items:
            count = 0
            for i in item:
                # Clean the text for readability:
                i = i.strip('\n')  # drop stray newlines so they don't stack up
                # <br/> is the HTML line-break tag; restore it as a real
                # newline to keep the original paragraph layout.
                i = i.replace('<br/>', '\n')
                print(i)
                count += 1
                if count % 3 == 0:
                    print('----' * 20)


if __name__ == '__main__':
    try:
        num = int(input('请输入你想要爬取的页面数量:'))
        getText(num)
    except Exception as e:
        # Report the actual error instead of swallowing it silently.
        print("对不住,出错了!", e)