做一个爬取豆瓣近期热映电影的爬虫来巩固所学:用 lxml 解析页面,用 XPath 规则提取数据。
from lxml import etree
import requests
# Address of the Douban "now playing" page that the scraper downloads.
url = "https://movie.douban.com/cinema/nowplaying/zhumadian/"

# HTTP headers sent with the request: a mobile Chrome User-Agent string and
# a Referer that is the same address as the page being fetched.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"
    ),
    "Referer": url,
}
# 1. Fetch the Douban now-playing page
def get_file(timeout=10):
    """Download the now-playing page and return its HTML source.

    The request goes to the module-level ``url`` and carries the
    module-level ``headers``.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it the call could block
            indefinitely on an unresponsive server.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: The server answered with a 4xx or 5xx status.
        requests.RequestException: Any other request failure, such as a
            connection error or a timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the
    # parser, where it would surface later as a confusing lookup failure.
    response.raise_for_status()
    return response.text
# 2. Parse the fetched page and extract the fields we need
def p_file(text):
    """Extract the movie entries from the page source and print them.

    Only the ``<li>`` children of the first ``<ul class="lists">`` element
    in the document are read.

    Args:
        text: HTML source of the page.

    Returns:
        A list with one dict per ``<li>``. Each dict maps the keys
        "电影", "评分", "产地", "导演" and "演员" to the string value of the
        ``data-title``, ``data-score``, ``data-region``, ``data-director``
        and ``data-actors`` attribute respectively (``""`` when the
        attribute is absent). The list is empty when the page contains no
        ``<ul class="lists">`` element.
    """
    # Output key -> name of the <li> attribute that holds its value.
    fields = (
        ("电影", "data-title"),
        ("评分", "data-score"),
        ("产地", "data-region"),
        ("导演", "data-director"),
        ("演员", "data-actors"),
    )

    html = etree.HTML(text)
    # Defensive: treat a missing root element the same as "no list found".
    lists = html.xpath("//ul[@class='lists']") if html is not None else []
    if not lists:
        # Nothing to extract; indexing [0] here would raise IndexError.
        return []

    # Element.get() yields the attribute as a plain string, whereas
    # xpath("@attr") would wrap each value in a one-element list.
    movies = [
        {key: li.get(attr, "") for key, attr in fields}
        for li in lists[0].xpath("./li")
    ]

    for movie in movies:
        print(movie)
    return movies
if __name__ == '__main__':
    # Script entry point: download the page, then parse it and print the
    # extracted movies.
    p_file(get_file())
下面是运行结果
多分析网页,代码条理还算清晰,一起交流学习,不断优化