Python爬虫(一):爬取糗事百科

1、基础版本:没有反爬处理,没有分类,单纯把每一页的内容爬取下来

 1 # Author:li
 2 import requests
 3 import re
 4 def  start():
 5     page = 1
 6     n = int(input("需要爬取的页数:"))
 7     while page < n:#写个
 8 
 9         url = 'https://www.qiushibaike.com/text/page/' + str(page)
10         res = requests.get(url).text #获取到了源代码
11         #正则表达式 拿来匹配项目
12         zhengze = '<div.*?article block untagged mb15 typs.*?".*?>.*?<.*?class="content">.*?<span>(.*?)</span>.*?</div>'
13         #div.*?class="article block untagged mb15".*?>.*?<.*?class="content">.*?<span>(.*?)</span>.*?</div>'
14         duanzi_list= re.findall(zhengze,res,re.S) #分别是正则表达式,源代码,re.S表示会把/n也匹配
15         #对段子进行过滤
16         #print(duanzi_list)
17         page+=1
18         num = 1
19         for duanzi in duanzi_list:
20             filter_duanzi = re.sub('<br/>','',duanzi) #re.sub 1.选取要替换的字符,2.用以替换的字符,3.对象
21             print("当前第%s页"%page)
22             print(str(num)+'.'+filter_duanzi)
23 
24 
25         #保存段子
26             #with open('糗事百科.txt','a',encoding='UTF-8' )as f:
27                 #f.write(str(num)+'.'+filter_duanzi+'\n\n\n')
28             num += 1
29     else:
30         print("已爬取%s"%page)
31 start()

猜你喜欢

转载自www.cnblogs.com/ilovelh/p/10331142.html