目的: python爬取豆瓣电影“华语”分类下的电影基本信息
分析过程: 普通beautifulsoup 抓取不到电影信息,网站将电影信息封装在XHR文件中,查找XHR文件发现里面是JSON数据,利用python的JSON模块提取里面的url即可,其他的按基础知识来抓取即可
本次练习使用类来写代码
代码如下:
import requests
import json
from bs4 import BeautifulSoup
import csv
#定义获取电影的类
class MovieInformation():
    """Scrape basic information about movies in Douban's "华语"
    (Chinese-language) category and export it to ``movie.csv``.

    The listing pages load their content through an XHR endpoint that
    returns JSON, so movie detail-page URLs are extracted from that JSON;
    each detail page is then parsed with BeautifulSoup.

    NOTE(review): constructing an instance runs the whole scrape-and-save
    pipeline (see ``__init__``) — kept that way for backward compatibility
    with the ``MovieInformation()`` entry point.
    """

    # Pretend to be a desktop browser; Douban rejects the default
    # python-requests User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

    def get_page_urls(self):
        """Return the JSON listing URLs covering the first 500 movies.

        Bug fix: the original stepped ``page_start`` by 20 while every
        request asked for ``page_limit=50``, so consecutive pages
        overlapped and most movies were scraped (and written to the CSV)
        two or three times.  The step now matches ``page_limit``, giving
        10 non-overlapping pages.
        """
        page_urls = []
        for start in range(0, 500, 50):
            url = ('https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&page_limit=50&page_start='
                   + str(start))
            page_urls.append(url)
        return page_urls

    def get_movie_urls(self):
        """Fetch every listing page and return all movie detail-page URLs."""
        urls = []
        for page_url in self.get_page_urls():
            html = requests.get(page_url, headers=self.headers).text
            datas = json.loads(html)
            for subject in datas['subjects']:
                urls.append(subject['url'])
        return urls

    def catch_movie_information(self):
        """Scrape each movie's detail page.

        Returns a list of dicts, one per movie, keyed by the Chinese
        column names used in the CSV header.  Movies whose pages lack one
        of the expected elements (e.g. not-yet-rated films) are skipped
        instead of aborting the whole run.
        """
        movie_list = []
        for url in self.get_movie_urls():
            code = requests.get(url, headers=self.headers).text
            soup = BeautifulSoup(code, 'html.parser')
            # Robustness fix: any selector missing on a page (unrated or
            # unusual layouts) used to raise IndexError and kill the run;
            # now that movie is skipped with a warning.
            try:
                movie = soup.select('#content > h1 > span:nth-child(1)')[0].get_text()
                director = soup.select('#info > span:nth-child(1) > span.attrs > a')[0].get_text()
                actor_tag = soup.select('#info > span.actor')[0].get_text()
                actor = actor_tag.split(':')[1].split('/')
                # Keep at most the first three listed actors.
                actor = ''.join(actor[:3]) if len(actor) > 3 else ''.join(actor)
                year = soup.select('#content > h1 > span.year')[0].get_text().lstrip('(').rstrip(')')
                rate = soup.select('#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong')[
                    0].get_text()
                vote = soup.select(
                    '#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span')[
                    0].get_text()
            except IndexError:
                print('skipping (missing fields):', url)
                continue
            # A fresh dict per movie; reusing one dict object would make
            # every list entry alias the same (last-written) mapping.
            movie_list.append({
                '电影名称': movie,
                '导演': director,
                '主演': actor,
                '上映时间': year,
                '评分数': rate,
                '评价人数': vote,
            })
        return movie_list

    def save_csv_file(self):
        """Run the scrape and write the results to ``movie.csv``."""
        # NOTE(review): plain utf-8 without BOM — Excel on Windows may
        # misread the Chinese headers; 'utf-8-sig' would fix that, but the
        # original output encoding is preserved here.
        with open('movie.csv', 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['电影名称', '导演', '主演', '上映时间', '评分数', '评价人数']
            writer = csv.DictWriter(f, fieldnames)
            writer.writeheader()
            writer.writerows(self.catch_movie_information())

    def __init__(self):
        # Side-effectful constructor: instantiation performs the full
        # scrape and CSV export (matches how the script invokes the class).
        self.save_csv_file()
if __name__ == '__main__':
    # Script entry point: constructing the scraper runs the entire
    # scrape-and-export pipeline via MovieInformation.__init__.
    scraper = MovieInformation()