import requests
from lxml import etree
import json
class DouBan:
    """Crawler for the Douban movie chart page.

    Downloads the chart page, extracts one record per movie ``<table>`` with
    XPath, and appends the records of each run to a text file as one JSON line.
    """

    # Record key -> XPath evaluated relative to a single movie <table>.
    # Every expression yields a list, so each record value is a list.
    _FIELD_XPATHS = {
        # movie title
        "name": ".//a[contains(@class, 'nbg')]/@title",
        # alternative titles
        "alias": ".//div[contains(@class, 'pl2')]/a/span/text()",
        # link to the movie page
        "href": ".//div[contains(@class, 'pl2')]/a/@href",
        # poster image
        "src": ".//a[contains(@class, 'nbg')]/img/@src",
        # short abstract
        "abstract": ".//p[contains(@class, 'pl')]/text()",
        # rating score
        "score": ".//span[contains(@class, 'rating_nums')]/text()",
        # number of people who rated
        "counts": ".//span[contains(@class, 'pl')]/text()",
    }

    def __init__(self, timeout=10, output_path='./film.txt'):
        """Configure the start URL, request headers and I/O settings.

        Args:
            timeout: Seconds ``requests`` waits for the server before raising
                instead of blocking indefinitely.
            output_path: File that ``save_data`` appends to.
        """
        self.start_url = "https://movie.douban.com/chart"
        self.headers = {
            "Referer": "https://movie.douban.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
        }
        self.timeout = timeout
        self.output_path = output_path

    def run(self):
        """Run the whole pipeline: fetch the page, extract records, save them."""
        # 1. fetch the chart page and parse it into an element tree
        el = self.parse_get(self.start_url)
        # 2. extract the movie records
        data_list = self.get_item(el)
        # 3. persist them
        self.save_data(data_list)

    def parse_get(self, url):
        """Fetch ``url`` and return the parsed HTML element tree.

        Args:
            url: Address of the page to download.

        Returns:
            The root element produced by ``etree.HTML`` for the response body.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status (via ``raise_for_status``).
        """
        resp = requests.get(url=url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on 4xx/5xx rather than scraping an error page and
        # silently saving an empty result.
        resp.raise_for_status()
        return etree.HTML(resp.content)

    def get_item(self, el):
        """Extract one record per movie table from the parsed page.

        Args:
            el: Element tree of the chart page, or ``None``.

        Returns:
            A list of dicts keyed by ``name``, ``alias``, ``href``, ``src``,
            ``abstract``, ``score`` and ``counts``; each value is the raw list
            returned by the corresponding XPath query. Empty when ``el`` is
            ``None`` or no table matches.
        """
        if el is None:
            return []
        table_list = el.xpath("//div[contains(@class, 'article')]//table")
        return [
            {key: table.xpath(path) for key, path in self._FIELD_XPATHS.items()}
            for table in table_list
        ]

    def save_data(self, data_list):
        """Append ``data_list`` to the output file as a single JSON line.

        Args:
            data_list: JSON-serialisable records, as returned by ``get_item``.
        """
        with open(self.output_path, 'a', encoding='utf8') as f:
            f.write(json.dumps(data_list, ensure_ascii=False))
            # The file is opened in append mode: terminate each dump with a
            # newline so repeated runs stay separable (one JSON document per
            # line) instead of fusing into an unparseable "[...][...]".
            f.write("\n")
if __name__ == '__main__':
    # Script entry point: build the crawler and run one full scrape.
    douban = DouBan()
    douban.run()
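# Douban movie crawler exercise
# Reposted from blog.csdn.net/zy_whynot/article/details/103607197
# (Blog page navigation residue: "You may also like", "Today's picks", "Weekly ranking")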