1.爬取多个网址的数据,用list来处理
# Example: build the URLs of listing pages 1-13 from one template, then
# download and print each page.
# NOTE(review): `requests` and `headers` are not defined in this snippet; it
# assumes the surrounding script imports requests and builds a headers dict.
url_template = "https://www.qiushibaike.com/hot/page/{}/"
url_list = [url_template.format(i) for i in range(1, 14)]
for url in url_list:
    # The template lives in its own name so the loop variable cannot clobber it.
    response = requests.get(url, headers=headers)
    print(response.content.decode())
2.从lxml中导入etree,使用xpath()方法
目前用到的方法有
1.// 从当前节点(写在路径开头时为文档根节点)起,匹配任意层级的后代节点。
2.@ 查找属性。
3. .split(" ")[-1] 这是 Python 的字符串方法(不是 XPath 语法):按空格切分后取最后一段。
3.文本提取
1.提取属性中包含abcd的文本
//div[contains(@class, 'abcd')]/text()
2.提取属性为abcd的文本
//div[@class='abcd']/text()
xpath() 返回的是 list,要与 str 拼接时,需要先取出其中的元素(例如 [0])再相加
3.提取属性值:在路径末尾使用 /@属性名
例如://div/@class
4.提取属性内文本
/text()
4.爬取糗事百科源码
import requests
from lxml import etree
import json
class QiubaiSpider:
    """Scrape the text-post listing pages of qiushibaike.com.

    The spider builds the URLs of listing pages 1-13, downloads each page,
    extracts one dict per post with XPath, and stores the dicts as JSON lines.
    """

    # File that save_content_list() appends to and run() resets.
    OUTPUT_PATH = "糗事百科.text"
    # Seconds to wait for the server before requests gives up on a page.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # URL template; the placeholder receives the page number.
        self.url = "https://www.qiushibaike.com/text/page/{}/"
        # Headers sent with every request made by parse_url().
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

    def get_url_list(self):
        """Return the URLs of listing pages 1 through 13, in page order."""
        return [self.url.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        """Download *url* and return the response body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: on network errors, including
                a timeout after REQUEST_TIMEOUT seconds.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode()

    @staticmethod
    def _first(values, default=None):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def get_content_list(self, html_str):
        """Extract one dict per post from the HTML of a listing page.

        Args:
            html_str: HTML source of a listing page.

        Returns:
            A list of dicts with the keys ``id``, ``content``,
            ``author_image``, ``age``, ``author_gender``, ``stats_vote`` and
            ``user_commons``. ``id`` and ``content`` are lists of strings with
            newlines removed; every other value is ``None`` when the post has
            no matching node.
        """
        html = etree.HTML(html_str)
        content_list = []
        for div in html.xpath("//div[@id='content-left']/div"):
            item = {}
            item["id"] = [
                text.replace("\n", "") for text in div.xpath(".//h2/text()")
            ]
            item["content"] = [
                text.replace("\n", "")
                for text in div.xpath(".//div[@class='content']/span/text()")
            ]

            # The src attribute has no scheme in the page, so "https:" is prepended.
            image_src = self._first(div.xpath(".//div/a/img/@src"))
            item["author_image"] = (
                "https:" + image_src if image_src is not None else None
            )

            # Kept as a list (or None when empty), matching the original shape.
            item["age"] = (
                div.xpath(".//div[contains(@class, 'articleGender')]/text()")
                or None
            )

            # The last class token with "Icon" removed is used as the gender.
            gender_class = self._first(
                div.xpath(".//div[contains(@class, 'articleGender')]/@class")
            )
            item["author_gender"] = (
                gender_class.split(" ")[-1].replace("Icon", "")
                if gender_class is not None
                else None
            )

            item["stats_vote"] = self._first(div.xpath(".//div/span/i/text()"))
            # Guarded by its own result; the original tested stats_vote here,
            # which raised whenever either of the two counters was missing.
            item["user_commons"] = self._first(
                div.xpath(".//span[@class='stats-comments']/a/i/text()")
            )
            content_list.append(item)
        return content_list

    # Save a raw page.
    def save_html(self, html_str, page_num):
        """Write *html_str* to ``糗事百科-第<page_num>页.html`` as UTF-8."""
        file_path = "糗事百科-第{}页.html".format(page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    # Print and store the extracted items.
    def save_content_list(self, content_list):
        """Print every item and append it to OUTPUT_PATH as one JSON line.

        The file is opened in append mode so that the per-page calls made by
        run() accumulate instead of overwriting each other. Non-ASCII text is
        written as-is rather than as ``\\uXXXX`` escapes.
        """
        with open(self.OUTPUT_PATH, "a", encoding="utf-8") as f:
            for item in content_list:
                print(item)
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

    def run1(self):
        """Download every listing page and save its raw HTML to a file."""
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)

    def run(self):
        """Download every listing page, extract its posts, and store them."""
        # Start from an empty file so a rerun does not append to old output.
        with open(self.OUTPUT_PATH, "w", encoding="utf-8"):
            pass
        for url in self.get_url_list():
            html_str = self.parse_url(url)
            # Extract the data.
            content_list = self.get_content_list(html_str)
            # Store it.
            self.save_content_list(content_list)
# Script entry point: create a spider and run the extract-and-store crawl.
if __name__ == '__main__':
    QiubaiSpider().run()