1、基础版本:没有反爬,没有分类,单纯把每一页爬取下来
# Author: li
import re

import requests

# Listing URL prefix; the 1-based page number is appended to it.
_PAGE_URL_PREFIX = 'https://www.qiushibaike.com/text/page/'

# Seconds to wait on the server before giving up on a page.
_REQUEST_TIMEOUT = 10

# Captures the inner HTML of the <span> inside each post's "content" element.
# re.S makes "." match newlines as well, because a post's markup spans
# several lines. Compiled once here instead of on every page.
_JOKE_PATTERN = re.compile(
    r'<div.*?article block untagged mb15 typs.*?".*?>.*?<.*?class="content">.*?<span>(.*?)</span>.*?</div>',
    re.S,
)


def _fetch_page(page):
    """Return the HTML source of listing page number *page* (1-based)."""
    url = _PAGE_URL_PREFIX + str(page)
    # Without a timeout a stalled connection would block the crawl forever.
    return requests.get(url, timeout=_REQUEST_TIMEOUT).text


def _extract_jokes(html):
    """Return the text of every joke matched in *html*, with <br/> removed."""
    return [raw.replace('<br/>', '') for raw in _JOKE_PATTERN.findall(html)]


def start():
    """Prompt for a page count, then fetch and print the jokes on each page.

    Pages 1 through the entered count (inclusive) are crawled in order.
    Every joke is printed preceded by the number of the page it came from
    and prefixed with its 1-based position on that page. A final line
    reports how many pages were crawled.

    Raises:
        ValueError: if the text entered at the prompt is not an integer.
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the request times out.
    """
    n = int(input("需要爬取的页数:"))

    # range(1, n + 1) covers exactly n pages; the page number printed below
    # is the page that was actually fetched.
    for page in range(1, n + 1):
        jokes = _extract_jokes(_fetch_page(page))
        for num, joke in enumerate(jokes, start=1):
            print("当前第%s页" % page)
            print(str(num) + '.' + joke)
            # To persist jokes instead of only printing them, append each
            # entry to a file here, e.g.:
            #     with open('糗事百科.txt', 'a', encoding='UTF-8') as f:
            #         f.write(str(num) + '.' + joke + '\n\n\n')

    # A non-positive count crawls nothing, so report 0 in that case.
    print("已爬取%s" % max(n, 0))


if __name__ == "__main__":
    start()