需求
爬取电影网(http://www.55xia.com/movie)的电影信息
创建好项目后开始进行步骤
配置 settings.py
# Log level for terminal output: only ERROR is configured here.
LOG_LEVEL = 'ERROR'

# settings.py line 19: User-Agent string used to disguise the request identity.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# settings.py line 22: whether to obey the robots.txt protocol; disabled here.
ROBOTSTXT_OBEY = False

# Enable the item pipeline that stores the scraped movies.
ITEM_PIPELINES = {'moviePro.pipelines.MovieproPipeline': 300}
items添加字段
将需要储存的字段录入
class MovieproItem(scrapy.Item):
    """Fields stored for one scraped movie."""

    # Set by the spider from the movie listing page.
    name = scrapy.Field()
    kind = scrapy.Field()

    # Set by the spider from the movie detail page.
    actor = scrapy.Field()
    language = scrapy.Field()
    longTime = scrapy.Field()
爬虫文件
- 如果获取到的数据是列表,用 `''.join(kind)` 转为 str 类型
- 获取列表数据用 `.extract()`
- 【请求传参】通过 `meta={'item': item}` 把 item 传给回调函数
import scrapy
from moviePro.items import MovieproItem
# 请求传参: 处理多个页面爬虫, 比如电影网爬取电影详细内容需要点击进电影详情页面
class MovieSpider(scrapy.Spider):
    """Scrape movies from the listing page and from each movie's detail page.

    ``parse`` reads the name and category of every movie on the listing
    page, stores them in a ``MovieproItem`` and requests the detail page.
    The partially filled item travels with that request in ``meta`` so that
    ``parseByMove`` can add the detail-page fields to the same item before
    yielding it to the pipelines.
    """

    name = 'movie'
    # allowed_domains = ['www.55xia.com']
    start_urls = ['http://www.55xia.com/movie']

    # Common XPath prefix of the info table on a movie detail page.
    # NOTE(review): the path contains ``tbody``; browsers add that element
    # when rendering, so confirm it exists in the raw HTML Scrapy receives.
    _INFO_TABLE = '/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody'

    def parseByMove(self, response):
        """Parse a movie detail page and complete the item passed via meta.

        Reads the actor (row 1), language (row 6) and running time (row 8)
        from the info table. Each value is ``None`` when its XPath matches
        nothing.

        Yields:
            The item received in ``response.meta['item']`` with ``actor``,
            ``language`` and ``longTime`` filled in.
        """
        table = self._INFO_TABLE
        actor = response.xpath(table + '/tr[1]/td[2]/a/text()').extract_first()
        language = response.xpath(table + '/tr[6]/td[2]/text()').extract_first()
        longTime = response.xpath(table + '/tr[8]/td[2]/text()').extract_first()

        # Retrieve the item that parse() attached to the request.
        item = response.meta.get('item')
        item['actor'] = actor
        item['language'] = language
        item['longTime'] = longTime

        # Hand the completed item to the pipelines.
        yield item

    def parse(self, response):
        """Parse the listing page and request every movie's detail page.

        Yields:
            One ``scrapy.Request`` per movie, carrying the partially filled
            item in ``meta`` and calling back into ``parseByMove``.
        """
        for div in response.xpath('/html/body/div[1]/div[1]/div[2]/div'):
            href = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
            if not href:
                # Without a link there is no detail page to request;
                # concatenating "http:" with None would raise TypeError and
                # abort the remaining entries on this page.
                self.logger.warning('Skipping listing entry without a detail link')
                continue

            item = MovieproItem()
            item['name'] = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
            # The category is spread over several text nodes, so extract()
            # returns a list that has to be joined into one string.
            item['kind'] = ''.join(div.xpath('.//div[@class="otherinfo"]//text()').extract())

            # urljoin resolves protocol-relative ("//host/path"), relative
            # and absolute hrefs against the page URL.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parseByMove,
                meta={'item': item},
            )
管道储存
class MovieproPipeline(object):
    """Item pipeline that writes every scraped movie to ``movieList.txt``."""

    # Item keys written to the file, in output order.
    _FIELDS = ('name', 'kind', 'actor', 'language', 'longTime')

    # Output file handle; set in open_spider, released in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open ``movieList.txt`` for writing in UTF-8 when the spider starts."""
        print('管道开启')
        self.fp = open('movieList.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Print the item's fields and write them as one colon-separated record.

        A field that is missing or ``None`` (its XPath matched nothing) is
        written as an empty string instead of raising ``TypeError`` during
        string concatenation.

        Returns:
            The unchanged item, so later pipelines can keep processing it.
        """
        values = [item.get(key) for key in self._FIELDS]
        for value in values:
            print(value)

        detail = ':'.join('' if value is None else str(value) for value in values) + '\n\n\n'
        print(detail)
        self.fp.write(detail)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        print('管道关闭')
        # fp is still None when open_spider never ran or failed to open the file.
        if self.fp is not None:
            self.fp.close()
            self.fp = None