'''
糗事百科首页内容爬取
类的使用
'''
# 导入第三方库
import requests
from lxml import etree
# Spider class for the Qiushibaike index page
class QiuShiSpider:
    """Download the Qiushibaike index page and append its jokes to a text file.

    The pipeline is ``get_html`` -> ``paser_html`` -> ``save_joke``; ``run``
    chains the three steps for ``start_url``.
    """

    def __init__(self, timeout=10):
        """Set the start URL, the request headers and the request timeout.

        Args:
            timeout: Seconds to wait for the server before ``requests``
                gives up. Passed straight to ``requests.get``.
        """
        self.start_url = "http://www.qiushidabaike.com/index_1.html"
        # A browser-like User-Agent is sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
        }
        self.timeout = timeout

    def get_html(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8 text.

        Raises:
            requests.RequestException: on connection failure, timeout, or an
                HTTP error status (4xx/5xx).
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # Without a timeout, requests.get can block indefinitely.
        reply = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        reply.raise_for_status()
        return reply.content.decode()

    # NOTE: "paser" is a misspelling of "parse"; the name is kept because it
    # is part of the class's public interface.
    def paser_html(self, response):
        """Extract the joke texts from an HTML document.

        Args:
            response: HTML source as a string.

        Returns:
            A list with the text of every ``<p>`` that is a direct child of
            a ``<dd class="content">`` element; empty when nothing matches
            or the document is empty.
        """
        if not response or not response.strip():
            return []
        tree = etree.HTML(response)
        if tree is None:
            return []
        return tree.xpath('//dd[@class="content"]/p/text()')

    def save_joke(self, content, path="joke.txt"):
        """Append every string in ``content`` to ``path``, one per line.

        Args:
            content: Iterable of strings to write.
            path: Output file; defaults to ``joke.txt`` in the current
                working directory.
        """
        # newline="" disables newline translation so the explicit "\r\n"
        # is written as-is on every platform (otherwise Windows would turn
        # it into "\r\r\n").
        with open(path, "a", encoding="utf-8", newline="") as f:
            f.writelines(f"{joke}\r\n" for joke in content)

    def run(self):
        """Fetch the start page, extract its jokes and append them to the file."""
        html = self.get_html(self.start_url)
        jokes = self.paser_html(html)
        self.save_joke(jokes)
# Script entry point: build the spider and run one crawl
if __name__ == "__main__":
    spider = QiuShiSpider()
    spider.run()
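# Source article: "Python crawler: extracting the content of the Qiushibaike home page"
# (page residue: "You may also like")
# Reposted from blog.csdn.net/qq_46292926/article/details/104983228
# (page residue: "Recommended today")
# (page residue: "Weekly ranking")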