# 面向对象编程,爬取一部小说
'''
面向对象编程
爬取17K小说网一部小说
'''
# 导入第三方库
import os
import requests
from lxml import etree
from fake_useragent import UserAgent
# Module-level fake_useragent instance; `ua.random` is read to fill the User-Agent request header
ua = UserAgent()
# Crawler class for a single novel hosted on 17k.com
class Novel_17():
    """Download the chapters of one 17k.com novel into local text files.

    The crawler fetches the chapter index page, extracts the chapter links
    from it, then downloads every chapter and writes it to
    ``<save_dir>/<chapter title>.txt`` (UTF-8, one paragraph per line).
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    # Characters that are illegal or unsafe in file names on common file
    # systems; each one is mapped to "_" when a chapter file is named.
    _FILENAME_TABLE = str.maketrans(dict.fromkeys('\\/:*?"<>|\r\n\t', '_'))

    def __init__(self, start_url="https://www.17k.com/list/2433764.html",
                 save_dir='三途河畔彼岸花'):
        """Set up the request headers and the crawl target.

        Args:
            start_url: URL of the novel's chapter index page.
            save_dir: Directory the chapter files are written into.
        """
        self.headers = {'User-Agent': ua.random}
        self.start_url = start_url
        self.save_dir = save_dir

    def run(self):
        """Run the whole crawl: index page -> chapter links -> chapter files."""
        html = self.get_html(self.start_url)
        links = self.paser_html(html)
        self.paser_detail(links)

    def get_html(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8 text.

        Raises:
            requests.RequestException: On network failure, timeout, or an
                HTTP error status.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page as if it were a chapter.
        response.raise_for_status()
        return response.content.decode()

    def paser_html(self, html):
        """Extract the absolute chapter URLs from the index page HTML.

        Args:
            html: HTML text of the chapter index page.

        Returns:
            A list of chapter page URLs, in document order.
        """
        tree = etree.HTML(html)
        hrefs = tree.xpath('//dl[@class="Volume"]/dd/a/@href')
        # The hrefs are prefixed with the site root to make them absolute.
        return ['https://www.17k.com' + href for href in hrefs]

    @classmethod
    def _safe_filename(cls, title):
        """Return ``title`` trimmed and with file-name-unsafe characters replaced by "_"."""
        return title.strip().translate(cls._FILENAME_TABLE)

    def paser_detail(self, detail_link):
        """Download every chapter in ``detail_link`` and save it as a text file.

        Each file starts with the chapter title, followed by one paragraph
        per line. Chapters whose page has no title node are skipped with a
        printed notice instead of aborting the whole crawl.

        Args:
            detail_link: Iterable of chapter page URLs.
        """
        if not detail_link:
            return
        # Created once up front; exist_ok avoids the exists()/mkdir() race.
        os.makedirs(self.save_dir, exist_ok=True)
        for link in detail_link:
            tree = etree.HTML(self.get_html(link))
            titles = tree.xpath('//div[@class="readAreaBox content"]/h1/text()')
            if not titles:
                print('Skipping {}: chapter title not found'.format(link))
                continue
            title = titles[0]
            paragraphs = tree.xpath('//div[@class="p"]/p/text()')
            path = os.path.join(
                self.save_dir, '{}.txt'.format(self._safe_filename(title)))
            with open(path, 'w', encoding="utf-8") as f:
                # '\n' (not a bare '\r') so the file has real line breaks.
                f.write(title)
                f.write('\n')
                f.writelines(paragraph + '\n' for paragraph in paragraphs)
# Script entry point: build the crawler and start the download
if __name__ == '__main__':
    Novel_17().run()
# 效果展示:
# 保存本地展示: