基于Scrapy框架爬取电影网数据的案例

需求

爬取电影网(55xia.com)的电影信息

GIT源码

创建好项目后开始进行步骤

配置setting

# Only print ERROR-level log messages to the terminal
LOG_LEVEL = 'ERROR'

# (around line 19 of settings.py) User-Agent header so requests look like a real browser
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# (around line 22 of settings.py) Do not obey the site's robots.txt rules
ROBOTSTXT_OBEY = False

# Enable the item pipeline (lower number = higher priority)
ITEM_PIPELINES = {
   'moviePro.pipelines.MovieproPipeline': 300,
}

items添加字段

将需要储存的字段录入

class MovieproItem(scrapy.Item):
    """Container for the movie fields persisted by the pipeline."""
    # Movie title, scraped from the list page
    name = scrapy.Field()
    # Genre/category text, joined from multiple text nodes on the list page
    kind = scrapy.Field()
    # Lead actor, scraped from the detail page
    actor = scrapy.Field()
    # Language, scraped from the detail page
    language = scrapy.Field()
    # Running time, scraped from the detail page
    longTime = scrapy.Field()

爬虫文件

  • 如果获取到的数据是列表,用 ''.join(kind) 将其拼接为 str 类型
  • 获取列表数据用 .extract(),获取单个数据用 .extract_first()
  • 【请求传参】meta={'item': item}
import scrapy
from moviePro.items import MovieproItem

# 请求传参: 处理多个页面爬虫, 比如电影网爬取电影详细内容需要点击进电影详情页面
# Request passing ("请求传参"): used when one item spans multiple pages — the
# list page has name/kind, while actor/language/runtime live on a detail page.
class MovieSpider(scrapy.Spider):
    """Crawl the 55xia.com movie list and follow each detail page.

    parse() fills name/kind from the list page, then forwards the
    half-built item to parseByMove() via Request.meta to add the
    detail-page fields before yielding it to the pipeline.
    """
    name = 'movie'
    # allowed_domains = ['www.55xia.com']
    start_urls = ['http://www.55xia.com/movie']

    def parseByMove(self, response):
        """Parse one movie detail page and complete the item started in parse()."""
        # extract_first() returns a single string, or None when the node is missing
        actor = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
        language = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[6]/td[2]/text()').extract_first()
        longTime = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()').extract_first()

        # Recover the partially-filled item passed along in Request.meta
        item = response.meta.get('item')
        item['actor'] = actor
        item['language'] = language
        item['longTime'] = longTime

        # Hand the finished item to the enabled pipeline
        yield item

    def parse(self, response):
        """Parse the list page: name, kind (genre), director, language, runtime."""
        div_list = response.xpath('/html/body/div[1]/div[1]/div[2]/div')
        for div in div_list:
            name = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
            # .extract() returns a LIST of text nodes; join them into one string
            kind = ''.join(div.xpath('.//div[@class="otherinfo"]//text()').extract())

            # BUG FIX: extract_first() may return None when the <a>/@href is
            # missing, and "http:" + None raises TypeError. Skip such entries.
            href = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
            if not href:
                continue
            url = "http:" + href

            # Start the item with the list-page fields; the detail-page
            # callback fills in the rest (passed through meta as a dict).
            item = MovieproItem()
            item['name'] = name
            item['kind'] = kind
            yield scrapy.Request(url=url, callback=self.parseByMove, meta={'item': item})

管道储存

class MovieproPipeline(object):
    """Append every scraped movie item to movieList.txt as one colon-joined record."""

    # Output file handle; opened once per crawl in open_spider()
    fp = None

    def open_spider(self, spider):
        """Called once when the spider starts: open the output file."""
        print('管道开启')
        self.fp = open('movieList.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item, write it to the file, and return it unchanged.

        BUG FIX: the detail-page fields (actor/language/longTime) come from
        extract_first() and may be None; concatenating None with '+' raised
        TypeError and dropped the item. Missing values are coerced to ''.
        """
        fields = (item['name'], item['kind'], item['actor'],
                  item['language'], item['longTime'])
        detail = ':'.join('' if f is None else f for f in fields) + '\n\n\n'
        print(detail)
        self.fp.write(detail)
        return item

    def close_spider(self, spider):
        """Called once when the spider finishes: close the output file."""
        print('管道关闭')
        self.fp.close()

猜你喜欢

转载自blog.csdn.net/weixin_42329277/article/details/84145357