Python crawler!

This script scrapes novels from the Qidian (起点中文网) listing page. It only crawls the books on the first page of the listing; it could walk every page, but that would be far too many novels, so only the first page is fetched here (a sketch of how to loop over additional pages follows the code).

Source code:

"""
created on Web Jan 02 2019
@author: Super Huan
"""
# Python爬取起点小说
import requests
from lxml import etree
import os
class spider():
 def startRequest(self):
 response = requests.get('https://www.qidian.com/all')
 html = etree.HTML(response.content.decode())
 bigTitleList = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
 bigSrcList = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
 for bigTitle, bigSrc in zip(bigTitleList, bigSrcList):
 if os.path.exists(bigTitle) == False:
 os.mkdir(bigTitle)
 self.fileData(bigTitle, bigSrc)
 def fileData(self, bigTitle, bigSrc):
 response = requests.get('http:' + bigSrc)
 html = etree.HTML(response.content.decode())
 litTitleList = html.xpath('//ul[@class="cf"]/li/a/text()')
 litSrcList = html.xpath('//ul[@class="cf"]/li/a/@href')
 for litTitle, litSrc in zip(litTitleList, litSrcList):
 self.finallyFile(litTitle, litSrc, bigTitle)
 def finallyFile(self, title, url, bigTitle):
 response = requests.get('https:' + url)
 html = etree.HTML(response.content.decode())
 text = '
'.join(html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
 fileName = bigTitle + '/' + title + '.txt'
 print('正在抓取文章', fileName)
 if os.path.exists(fileName) == False:
 with open(fileName, 'a', encoding='utf-8') as f:
 f.write(text)
spider = spider()
spider.startRequest()
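
To go beyond the first page, the same listing XPath can simply be applied to each listing page in turn. Below is a minimal pagination sketch; the ?page=N query parameter and the page count are assumptions for illustration, not details confirmed by the original post, so check the site's actual URL scheme before relying on it.

# Minimal pagination sketch (the ?page=N parameter is an assumption, not taken from the original post)
import requests
from lxml import etree

def crawl_listing_pages(max_pages=3):
    for page in range(1, max_pages + 1):
        # Reuse the same listing-page XPath from startRequest, once per page
        response = requests.get('https://www.qidian.com/all', params={'page': page})
        html = etree.HTML(response.content.decode())
        titles = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
        links = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
        for title, link in zip(titles, links):
            print(page, title, 'https:' + link)

crawl_listing_pages()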

Reposted from blog.csdn.net/qq_42156420/article/details/86348869