Crawl the basic information of IMDb TOP250 movies
Main code
items:
import scrapy


class ImdbItem(scrapy.Item):
    """Item holding the fields scraped for one IMDb Top 250 movie."""

    rank = scrapy.Field()          # position on the Top 250 chart
    movie_name = scrapy.Field()
    movie_type = scrapy.Field()    # comma-separated genres
    director = scrapy.Field()
    writer = scrapy.Field()
    stars = scrapy.Field()         # comma-separated leading actors
    score = scrapy.Field()         # IMDb user rating
    country = scrapy.Field()
    metascore = scrapy.Field()     # Metacritic score, '' when absent
    movie_length = scrapy.Field()  # runtime in minutes
    year = scrapy.Field()
    comment_num = scrapy.Field()   # number of user reviews
    critic_num = scrapy.Field()    # number of critic reviews
    CWG = scrapy.Field()           # Cumulative Worldwide Gross, 'unknown' when absent
    # budget = scrapy.Field()
    # budget_type = scrapy.Field()
spiders:
# -*- coding: utf-8 -*-
import scrapy
from imdb.items import ImdbItem
import re
import time
import copy
# scrapy crawl rank -o rank.csv
# Run with: scrapy crawl rank -o rank.csv
class RankSpider(scrapy.Spider):
    """Crawl the IMDb Top 250 chart, then each movie's detail page."""

    name = 'rank'
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250']

    def parse(self, response):
        """Parse the chart page and schedule one detail request per movie."""
        ranks = response.xpath('//td[@class="titleColumn"]/text()').re(r'\d+')
        # Hoisted out of the loop: the original re-extracted this list on
        # every iteration.
        urls = response.xpath('//td[@class="titleColumn"]/a[1]/@href').extract()
        # zip() pairs each rank with its URL and stops at the shorter list,
        # fixing the original off-by-one that silently dropped movie #250.
        for rank, detail_url in zip(ranks, urls):
            # A fresh item per request removes the need for copy.deepcopy:
            # no two concurrent requests ever share the same object.
            item = ImdbItem()
            item['rank'] = rank
            # response.follow is not used because the href is relative to the
            # site root, not to the chart URL.
            yield scrapy.Request('https://www.imdb.com' + detail_url,
                                 callback=self.parse_detail,
                                 meta={'key': item})

    def parse_detail(self, response):
        """Extract all detail fields for one movie and yield the filled item."""
        item = response.meta['key']
        item['movie_name'] = response.xpath(
            '//div[@class="title_wrapper"]/h1/text()').re(r'(.+)\xa0')[0]
        # item['year'] = response.xpath('//*[@id="titleYear"]/a/text()').get(),
        # NOTE(review): these fields are taken from the overview widget only;
        # the full-details sub-pages are not crawled.
        genre_links = response.xpath(
            '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[2]/div[2]/div/a/text()'
        ).extract()
        # The last anchor in this group is the release-date link, not a genre.
        item['movie_type'] = ','.join(genre_links[:-1])
        item['director'] = response.xpath(
            '//div[@class="credit_summary_item"][1]/a/text()').extract_first()
        item['writer'] = response.xpath(
            '//div[@class="credit_summary_item"][2]/a/text()').extract_first()
        star_links = response.xpath(
            '//div[@class="credit_summary_item"][3]/a/text()').extract()
        # Drop the trailing "See full cast & crew" link explicitly; the
        # original str.strip(',See full cast & crew') stripped a *character
        # set* and could eat leading/trailing letters of real names.
        item['stars'] = ','.join(x for x in star_links if 'full cast' not in x)
        item['score'] = response.xpath(
            '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/div[1]/strong/span/text()'
        ).extract_first()
        item['country'] = response.xpath(
            '//a[@title="See more release dates"]/text()').re(r'[(](.*)[)]')[0]
        # score_\w+ generalizes the original score_favorable-only pattern so
        # mixed/unfavorable metascores are captured too.
        metascore = re.findall(
            r'<div class="metacriticScore score_\w+ titleReviewBarSubItem">\n<span>(.*)</span>',
            response.text)
        item['metascore'] = metascore[0] if metascore else ''
        item['movie_length'] = response.xpath(
            '//div[@class="title_wrapper"]/div/time/@datetime').re(r'\d+')[0]
        item['year'] = response.xpath(
            '//div[@class="title_wrapper"]/h1/span/a[1]/text()').extract_first()
        # Review counts look like "1,234 user" / "567 critic"; remove the
        # label and the thousands separators to leave bare digits.
        review_counts = response.xpath(
            '//span[@itemprop="reviewCount"]/text()').extract()
        item['comment_num'] = review_counts[0].replace(' user', '').replace(',', '')
        item['critic_num'] = review_counts[1].replace(' critic', '').replace(',', '')
        cwg = re.findall(r'Cumulative Worldwide Gross:</h4> .(.*\d) ', response.text)
        item['CWG'] = cwg[0] if cwg else 'unknown'
        yield item
Precautions
Analyze web pages in advance
Before writing the spider, study the target pages — using the browser's element inspector and the raw page source — to work out where the information you want lives and how its markup varies from page to page. Never draw conclusions from a single glance at one page.
Regular expression to extract the string in parentheses
import re
# Extract the text inside parentheses from a file name.
p1 = re.compile(r'[(](.*?)[)]', re.S)  # non-greedy: matches the shortest span inside the parentheses
freezer_kind = re.findall(p1, file_name)
error: ‘FeedExporter’ object has no attribute ‘slot’
This happens when the CSV output file is still open in another program, so Scrapy cannot write to it. Close the file and run scrapy again to resolve the error.
copy.deepcopy(item)
When scrapy.Request passes an item between callbacks through meta, use copy.deepcopy so that each concurrent request carries its own independent copy; otherwise all requests share one item object and overwrite each other's fields.
import copy

# Demonstrates the difference between assignment, shallow copy and deep copy.
a = [1, 2, 3, 4, ['a', 'b']]  # original object
b = a                         # assignment: copies only the reference
c = copy.copy(a)              # shallow copy: new outer list, shared inner objects
d = copy.deepcopy(a)          # deep copy: fully independent clone
a.append(5)                   # mutate the outer list of a
a[4].append('c')              # mutate the nested ['a', 'b'] list inside a
# print statements converted from Python 2 syntax so the demo runs on Python 3.
print('a = ', a)
print('b = ', b)
print('c = ', c)
print('d = ', d)
# Expected output:
# a = [1, 2, 3, 4, ['a', 'b', 'c'], 5]
# b = [1, 2, 3, 4, ['a', 'b', 'c'], 5]   (b is the same object as a)
# c = [1, 2, 3, 4, ['a', 'b', 'c']]      (shallow copy shares the inner list)
# d = [1, 2, 3, 4, ['a', 'b']]           (deep copy is unaffected)