028. (7.27) Using Scrapy to crawl basic information about the IMDb Top 250 movies

Main code

items:

import scrapy

class ImdbItem(scrapy.Item):
    """Item holding the scraped fields for one IMDb Top 250 movie."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    rank = scrapy.Field()  # position in the Top 250 chart (1-250)

    movie_name = scrapy.Field()  # title text from the detail page header
    movie_type = scrapy.Field()  # comma-joined genre names
    director = scrapy.Field()  # first credited director
    writer = scrapy.Field()  # first credited writer
    stars = scrapy.Field()  # comma-joined leading actors
    score = scrapy.Field()  # IMDb user rating (e.g. "9.2")

    country = scrapy.Field()  # country parsed from the release-dates link
    metascore = scrapy.Field()  # Metacritic score; '' when not listed
    movie_length = scrapy.Field()  # runtime in minutes (from datetime attr)
    year = scrapy.Field()  # release year
    comment_num = scrapy.Field()  # number of user reviews, digits only
    critic_num = scrapy.Field()  # number of critic reviews, digits only
    CWG = scrapy.Field()  # Cumulative Worldwide Gross; 'unknown' if absent
    # budget = scrapy.Field()
    # budget_type = scrapy.Field()

spiders:

# -*- coding: utf-8 -*-
import scrapy
from imdb.items import ImdbItem
import re
import time
import copy

# scrapy crawl rank -o rank.csv

class RankSpider(scrapy.Spider):
    """Crawl the IMDb Top 250 chart, then each movie's detail page.

    Run with: scrapy crawl rank -o rank.csv
    """

    name = 'rank'
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250']

    def parse(self, response):
        """Parse the chart page and schedule one detail request per movie.

        Fixes an off-by-one in the original loop (`movie_index < 250` was
        checked *after* incrementing, so the 250th movie was skipped).
        """
        ranks = response.xpath('//td[@class="titleColumn"]/text()').re(r'\d+')
        # Extract all detail URLs once instead of re-running the XPath query
        # (and materialising the whole list) on every loop iteration.
        urls = response.xpath('//td[@class="titleColumn"]/a[1]/@href').extract()

        # zip() naturally bounds the loop to the shorter list, so no manual
        # index bookkeeping or break is needed.
        for rank, url in zip(ranks, urls):
            item = ImdbItem()
            item['rank'] = rank
            # response.follow is not used because the href is relative to the
            # site root, not to the chart URL.  meta carries a deep copy so
            # that concurrently running callbacks never share a mutable item.
            yield scrapy.Request('https://www.imdb.com' + url,
                                 callback=self.parse_detail,
                                 meta={'key': copy.deepcopy(item)})

    def parse_detail(self, response):
        """Fill the ImdbItem carried in meta from one movie detail page."""
        item = response.meta['key']

        # The title text ends with a non-breaking space before the year span.
        item['movie_name'] = response.xpath(
            '//div[@class="title_wrapper"]/h1/text()').re(r'(.+)\xa0')[0]

        # Genre links live in the subtext bar; the last link there is the
        # release date, so drop it (unless it is the only entry).
        genres = response.xpath(
            '//*[@id="title-overview-widget"]/div[1]/div[2]/div'
            '/div[2]/div[2]/div/a/text()').extract()
        item['movie_type'] = ','.join(genres[:-1] or genres)

        item['director'] = response.xpath(
            '//div[@class="credit_summary_item"][1]/a/text()').extract_first()
        item['writer'] = response.xpath(
            '//div[@class="credit_summary_item"][2]/a/text()').extract_first()

        # Star links include a trailing "See full cast & crew" link.  Filter
        # it out explicitly: the original str.strip(',See full cast & crew')
        # treats its argument as a *character set* and can eat letters that
        # belong to real actor names.
        cast = response.xpath(
            '//div[@class="credit_summary_item"][3]/a/text()').extract()
        item['stars'] = ','.join(
            name for name in cast if not name.startswith('See full cast'))

        item['score'] = response.xpath(
            '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]'
            '/div[1]/div[1]/strong/span/text()').extract_first()

        item['country'] = response.xpath(
            '//a[@title="See more release dates"]/text()').re(r'[(](.*)[)]')[0]

        # Metascore exists only for some movies; run the search once and
        # default to '' instead of calling re.findall twice.
        metascore = re.findall(
            '<div class="metacriticScore score_favorable '
            'titleReviewBarSubItem">\n<span>(.*)</span>', response.text)
        item['metascore'] = metascore[0] if metascore else ''

        item['movie_length'] = response.xpath(
            '//div[@class="title_wrapper"]/div/time/@datetime').re(r'\d+')[0]
        item['year'] = response.xpath(
            '//div[@class="title_wrapper"]/h1/span/a[1]/text()').extract_first()

        # Review counts look like "1,234 user" / "567 critic"; keep digits
        # only.  (The original strip(' user') was another character-set strip
        # and joined the split pieces by hand.)
        counts = response.xpath(
            '//span[@itemprop="reviewCount"]/text()').extract()
        item['comment_num'] = re.sub(r'\D', '', counts[0]) if counts else ''
        item['critic_num'] = (re.sub(r'\D', '', counts[1])
                              if len(counts) > 1 else '')

        # Cumulative Worldwide Gross, when listed in the details box.
        cwg = re.findall(r'Cumulative Worldwide Gross:</h4> .(.*\d) ',
                         response.text)
        item['CWG'] = cwg[0] if cwg else 'unknown'
        yield item
        

Precautions

Analyze web pages in advance

Using the rendered page, the browser's element inspector, and the page source, analyze the target web pages in advance to discover the patterns and variations of the information you want to extract. Never draw conclusions from a single glance.

Regular expression to extract the string in parentheses

import re

# Non-greedy pattern: extract the shortest substring inside parentheses.
p1 = re.compile(r'[(](.*?)[)]', re.S)  # minimal (non-greedy) match
freezer_kind = re.findall(p1, file_name)  # NOTE: file_name must be defined by the caller

error: ‘FeedExporter’ object has no attribute ‘slot’

This happens when the target CSV file is already open in another program, so Scrapy cannot write to it. Close the file and run scrapy again to fix the problem.

copy.deepcopy(item)

scrapy.Request passes data between callbacks through its meta argument; a deepcopy of the item is needed so that concurrently scheduled requests do not share one mutable object.

import copy

# Original object.
a = [1, 2, 3, 4, ['a', 'b']]
# Plain assignment: b is just another name for the same object as a.
b = a
# Shallow copy: new outer list, but nested objects are still shared with a.
c = copy.copy(a)
# Deep copy: fully independent copy; nested objects are duplicated too.
d = copy.deepcopy(a)

# Mutate the outer list a ...
a.append(5)
# ... and mutate the nested ['a', 'b'] list inside a.
a[4].append('c')

# Python 3 print() calls (the original used Python 2 print statements,
# which are a syntax error under Python 3).
print('a = ', a)
print('b = ', b)
print('c = ', c)
print('d = ', d)

# Expected output:
# a =  [1, 2, 3, 4, ['a', 'b', 'c'], 5]
# b =  [1, 2, 3, 4, ['a', 'b', 'c'], 5]   # alias: sees every change to a
# c =  [1, 2, 3, 4, ['a', 'b', 'c']]      # shallow: sees only the nested change
# d =  [1, 2, 3, 4, ['a', 'b']]           # deep: sees no change at all

Guess you like

Origin blog.csdn.net/u013598957/article/details/107623988