python爬取豆瓣电影“华语”分类下的电影基本信息(提取xhr中的JSON信息)

目的: python爬取豆瓣电影“华语”分类下的电影基本信息
分析过程: 直接用 BeautifulSoup 解析分类页面抓取不到电影信息,因为网站通过 XHR 请求异步加载电影列表。查看 XHR 请求可以发现其响应是 JSON 数据,利用 Python 的 json 模块提取其中每部电影的 url,再按常规方法用 BeautifulSoup 抓取各电影详情页即可。

本次练习使用类来写代码

代码如下:

import requests
import json
from bs4 import BeautifulSoup
import csv

#定义获取电影的类
class MovieInformation():
    """Crawl basic information for Douban movies tagged "华语" and save it as CSV.

    Instantiating the class runs the whole pipeline: the listing endpoint is
    paged through to collect movie detail URLs, every detail page is parsed,
    and the result is written to ``movie.csv`` in the working directory.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

    # JSON listing endpoint; the tag parameter is the URL-encoded form of "华语".
    SEARCH_URL = ('https://movie.douban.com/j/search_subjects?type=movie'
                  '&tag=%E5%8D%8E%E8%AF%AD&page_limit={limit}&page_start={start}')
    # Number of movies requested per listing page.
    PAGE_LIMIT = 50
    # Upper bound (exclusive) for the page_start offsets that are requested.
    MAX_MOVIES = 500
    # Seconds to wait for a response before giving up on a request.
    REQUEST_TIMEOUT = 10
    # At most this many leading cast members are kept per movie.
    MAX_ACTORS = 3

    def __init__(self):
        """Run the crawl immediately and write the CSV file."""
        self.save_csv_file()

    def get_page_urls(self):
        """Return the listing-page URLs to request.

        The offset advances by exactly ``PAGE_LIMIT`` so consecutive pages do
        not overlap (a smaller step than the page size re-fetches the same
        movies on several pages).
        """
        return [
            self.SEARCH_URL.format(limit=self.PAGE_LIMIT, start=start)
            for start in range(0, self.MAX_MOVIES, self.PAGE_LIMIT)
        ]

    def _fetch_text(self, url):
        """Download ``url`` and return the response body as text.

        Raises ``requests.HTTPError`` on a non-2xx status and the usual
        ``requests`` exceptions on connection problems or timeout.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.text

    def get_movie_urls(self):
        """Return the detail-page URL of every listed movie, without duplicates.

        Order of first appearance is preserved.
        """
        urls = []
        seen = set()
        for page_url in self.get_page_urls():
            datas = json.loads(self._fetch_text(page_url))
            for subject in datas.get('subjects', []):
                url = subject['url']
                # The listing can shift between requests; never crawl a movie twice.
                if url not in seen:
                    seen.add(url)
                    urls.append(url)
        return urls

    @staticmethod
    def _first_text(soup, selector):
        """Return the text of the first element matching ``selector``, or ''."""
        matches = soup.select(selector)
        return matches[0].get_text() if matches else ''

    @staticmethod
    def _format_actors(actor_text, limit=3):
        """Turn text such as '主演: A / B / C / D' into 'A/B/C'.

        Everything after the first ':' is treated as a '/'-separated list of
        names; names are stripped, empty entries dropped, and only the first
        ``limit`` names are kept. Text without a ':' yields ''.
        """
        names_part = actor_text.partition(':')[2]
        names = [name.strip() for name in names_part.split('/')]
        return '/'.join([name for name in names if name][:limit])

    def _parse_movie(self, html):
        """Extract one movie's fields from its detail-page HTML.

        Elements missing from the page produce an empty string for that field
        instead of aborting the crawl.
        """
        soup = BeautifulSoup(html, 'html.parser')
        rating_root = '#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix'
        year = self._first_text(soup, '#content > h1 > span.year')
        # A fresh dict is built for every movie so the rows never share state.
        return {
            '电影名称': self._first_text(soup, '#content > h1 > span:nth-child(1)'),
            '导演': self._first_text(soup, '#info > span:nth-child(1) > span.attrs > a'),
            '主演': self._format_actors(
                self._first_text(soup, '#info > span.actor'), self.MAX_ACTORS),
            # The year is rendered in parentheses; drop them.
            '上映时间': year.lstrip('(').rstrip(')'),
            '评分数': self._first_text(soup, rating_root + ' > strong'),
            '评价人数': self._first_text(
                soup, rating_root + ' > div > div.rating_sum > a > span'),
        }

    def catch_movie_information(self):
        """Return a list with one dict of basic information per movie."""
        return [
            self._parse_movie(self._fetch_text(url))
            for url in self.get_movie_urls()
        ]

    def save_csv_file(self):
        """Crawl all movies and write them to ``movie.csv`` (UTF-8, with header)."""
        fieldnames = ['电影名称', '导演', '主演', '上映时间', '评分数', '评价人数']
        # Crawl first so a failed crawl does not leave a truncated file behind.
        rows = self.catch_movie_information()
        with open('movie.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames)
            writer.writeheader()
            writer.writerows(rows)


if __name__ == "__main__":
    # Script entry point: constructing the object is what starts the work,
    # because the class's __init__ calls save_csv_file().
    MovieInformation()
发布了6 篇原创文章 · 获赞 0 · 访问量 86

猜你喜欢

转载自blog.csdn.net/lianglee513/article/details/105058846