用requests和BeautifulSoup做个简单的爬取网贷之家的例子。
只做笔记用。
#!/usr/bin/python3
"""Simple wdzj.com (wangdaizhijia) scraper using requests + BeautifulSoup.

Walks every page of the platform-archive search listing and prints the
brief-description section of each platform's detail page. Notes-only example.
"""

import re

import requests
from bs4 import BeautifulSoup

# NOTE: the original URLs contained "¤tPage" — an HTML-entity mangling of
# "&currentPage" ("&curren;" is the currency-sign entity). Restored here.
_SEARCH_URL = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
_BASE_URL = 'https://www.wdzj.com'


class wdzj_spider:
    """Crawler for the wdzj.com platform archive listing."""

    def request(self, url):
        """GET *url* with a desktop-browser User-Agent; return the Response."""
        headers = {
            'User-Agent': ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                           "AppleWebKit/537.1 (KHTML, like Gecko) "
                           "Chrome/22.0.1207.1 Safari/537.1"),
        }
        return requests.get(url, headers=headers)

    def getLinkFromPage(self, pingtaiLink):
        """Fetch one platform detail page and print its brief description.

        The server's declared encoding is unreliable, so the raw bytes are
        decoded as UTF-8 explicitly before parsing.
        """
        pingtaiHtml = self.request(pingtaiLink)
        txtUTF8 = pingtaiHtml.content.decode('utf-8', errors='replace')
        briefSec = BeautifulSoup(txtUTF8, "lxml").find("div", class_="cen-zk")
        if briefSec is None:
            # Layout changed or the request was blocked — skip gracefully
            # instead of raising AttributeError on None.
            return
        print("简介={0}".format(briefSec.get_text()))

    def getAllPage(self):
        """Discover the total page count, then visit every listing page.

        The page count is scraped from the "1/N" text inside the
        <span class="all"> element of the first listing page.
        """
        mainHtml = self.request(_SEARCH_URL + '1')
        allSpan = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all")
        if allSpan is None:
            return
        searchObj = re.search(r'1/([0-9]+)', allSpan.text, re.M | re.I)
        if searchObj is None:
            return
        pageCount = searchObj.group(1)

        print("pageCount={0}".format(pageCount))
        for i in range(1, int(pageCount) + 1):
            pageHtml = self.request(_SEARCH_URL + str(i))
            # Each archive link is an <a> whose visible text is "查看档案".
            pageStrs = BeautifulSoup(pageHtml.text, "lxml").find_all('a', text='查看档案')
            for a in pageStrs:
                print("查看档案:")
                self.getLinkFromPage(_BASE_URL + a['href'])


if __name__ == '__main__':
    w = wdzj_spider()
    w.getAllPage()