scrapy startproject todayMoive # 创建一个名为todayMoive的新项目 cd todayMoive # 切换到项目目录 scrapy genspider MoiveSpider jycinema.com # 生成爬虫脚本,搜索域为jycinema.com
这些文件分别是:
- scrapy.cfg:项目的配置文件
- todayMoive:该项目的python模块。之后您将在此加入代码。
- todayMoive/items.py:项目中的item文件。
- todayMoive/pipelines.py:项目中的pipelines文件。
- todayMoive/spiders/:放置spider代码的目录。
- todayMoive/middlewares.py:项目中的自定义中间件文件。
scrapy.cfg整个项目的配置文件
# Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] # 定义默认设置文件的位置为todayMoive模块下的settings文件 default = todayMoive.settings [deploy] # 定义项目名称为todayMoive #url = http://localhost:6800/ project = todayMoive
todayMoive/settings.py是上层目录中scrapy.cfg定义的设置文件
# -*- coding: utf-8 -*-
"""Scrapy settings for the todayMoive project (generated defaults).

Only commonly used settings are listed; for the full reference see:
    https://doc.scrapy.org/en/latest/topics/settings.html
    https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    https://doc.scrapy.org/en/latest/topics/spider-middleware.html
"""

# Name of the bot implemented by this project (also the project name).
# Used to build the default User-Agent and for logging.
BOT_NAME = 'todayMoive'

# Where Scrapy discovers spiders, and where `genspider` puts new ones.
SPIDER_MODULES = ['todayMoive.spiders']
NEWSPIDER_MODULE = 'todayMoive.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'todayMoive (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# ---- Everything below keeps Scrapy's built-in defaults (commented out) ----

# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Delay for requests to the same website (default: 0); see also autothrottle.
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
#SPIDER_MIDDLEWARES = {
#    'todayMoive.middlewares.TodaymoiveSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
#DOWNLOADER_MIDDLEWARES = {
#    'todayMoive.middlewares.TodaymoiveDownloaderMiddleware': 543,
#}

# Enable or disable extensions
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
#ITEM_PIPELINES = {
#    'todayMoive.pipelines.TodaymoivePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
todayMoive/items.py文件的作用是定义爬虫最终需要哪些项
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class TodaymoiveItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass
todayMoive/pipelines.py文件的作用是爬虫爬取网页中的内容后,这些内容怎么处理
# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html class TodaymoivePipeline(object): def process_item(self, item, spider): return item
创建Scrapy项目抓取静态网页内容(本来想抓取活动列表,但发现网页内的内容基本都是动态渲染的)。
todayMoive/items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class TodaymoiveItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() activityName = scrapy.Field()
todayMoive/pipelines.py
# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html import time class TodaymoivePipeline(object): def process_item(self, item, spider): now = time.strftime('%Y-%m-%d', time.localtime()) fileName = now + '.txt' with open(fileName, 'a') as fp: fp.write(item['activityName'][0] + '\n\n') return item
todayMoive/settings.py
# -*- coding: utf-8 -*- # Scrapy settings for todayMoive project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'todayMoive' SPIDER_MODULES = ['todayMoive.spiders'] NEWSPIDER_MODULE = 'todayMoive.spiders' ITEM_PIPELINES = {'todayMoive.pipelines.TodaymoivePipeline':300} # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'todayMoive (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False
todayMoive/spider/MoiveSpider.py
# -*- coding: utf-8 -*- import scrapy from todayMoive.items import TodaymoiveItem class MoivespiderSpider(scrapy.Spider): name = 'MoiveSpider' allowed_domains = ['jycinema.com'] start_urls = ['http://www.jycinema.com/html/default/schedule.html?cinemaId=451'] def parse(self, response): subSelector = response.xpath('//span[@class="f16"]') items = [] for sub in subSelector: print(sub) item = TodaymoiveItem() item['activityName'] = sub.xpath('text()').extract() print(item) items.append(item) return items
抓取结果如下图:
BUG1:UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 185: illegal multibyte sequence
错误是因为配置文件 scrapy.cfg 中存在中文注释,而框架内部读取该配置文件时采用 cp936(GBK)编码;把中文注释删除或改为英文注释即可解决。
BUG2:ModuleNotFoundError: No module named 'win32api'
Python 没有自带访问 Windows 系统 API 的库,需要额外安装:pip install pypiwin32
todayMoive/settings.py