The Scrapy Framework: Crawling a Blog

In the terminal, run:

scrapy startproject cnblogSpider

This creates a Scrapy project with the following structure:

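The generated project follows Scrapy's standard layout (the exact files vary slightly by Scrapy version):

cnblogSpider/
    scrapy.cfg            # deploy configuration
    cnblogSpider/         # the project's Python package
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider and downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py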

Enter the project directory and generate a spider:

cd cnblogSpider
scrapy genspider cnblogs_spider cnblogs.com
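
genspider drops a skeleton into cnblogSpider/spiders/cnblogs_spider.py; it looks roughly like this (the exact template and generated class name vary across Scrapy versions):

# -*- coding: utf-8 -*-
import scrapy

class CnblogspiderSpider(scrapy.Spider):
    name = 'cnblogs_spider'
    allowed_domains = ['cnblogs.com']
    start_urls = ['http://cnblogs.com/']

    def parse(self, response):
        pass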

Open and edit the following files.

cnblogs_spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Selector
from scrapy.crawler import CrawlerProcess
from cnblogSpider.items import CnblogspiderItem


class CnblogspiderSpider(scrapy.Spider):
    name = 'cnblogs_spider'
    allowed_domains = ['cnblogs.com']
    start_urls = ['http://www.cnblogs.com/qiyeboy/default.html?page=1']

    def parse(self, response):
        # Each post on the listing page sits in a div with class "day".
        papers = response.xpath(".//*[@class='day']")
        for paper in papers:
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
            title = paper.xpath(".//*[@class='postTitle']/a/text()").extract()[0]
            # Use relative XPath (leading ".") so each field comes from the
            # current post; an absolute "//" expression searches the whole
            # page and would return the first post's value on every iteration.
            time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
            # The post summary sits in div.postCon on cnblogs' classic theme.
            content = paper.xpath(".//*[@class='postCon']/div/text()").extract()[0]
            item = CnblogspiderItem(url=url, title=title, time=time, content=content)
            # Follow the post link and hand the partially filled item
            # to the next callback through request.meta.
            request = scrapy.Request(url=url, callback=self.parse_body)
            request.meta['item'] = item
            yield request
        # "下一页" is the "next page" link; follow it until pagination ends.
        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            # urljoin handles relative hrefs.
            yield scrapy.Request(url=response.urljoin(next_page[0]), callback=self.parse)

    def parse_body(self, response):
        item = response.meta['item']
        body = response.xpath('//*[@class="postBody"]')
        item['image_urls'] = body.xpath('.//img/@src').extract()
        yield item


# Allows running this file directly as an alternative to `scrapy crawl`.
if __name__ == '__main__':
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'
    })
    process.crawl(CnblogspiderSpider)
    process.start()
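
Before trusting the XPath expressions, it helps to try them in scrapy shell, which fetches a page and opens an interactive session with response already bound:

scrapy shell "http://www.cnblogs.com/qiyeboy/default.html?page=1"
>>> papers = response.xpath(".//*[@class='day']")
>>> papers[0].xpath(".//*[@class='postTitle']/a/text()").extract()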

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class CnblogspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
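
A scrapy.Item behaves like a dict restricted to its declared fields, which is why the pipeline below can serialize items with dict(item), and why writing to an undeclared field fails. A minimal sketch using the class above:

from cnblogSpider.items import CnblogspiderItem

item = CnblogspiderItem(url='http://example.com/post', title='demo')
item['time'] = '2018/08'           # declared field: OK
print(dict(item))                  # plain dict with the three assigned fields
# item['image_paths'] = []         # KeyError: field not declared above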

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class CnblogspiderPipeline(object):
    def __init__(self):
        self.file = open('parse.json', 'wb')

    def close_spider(self, spider):
        # Called when the spider finishes; flush and release the file handle.
        self.file.close()

    def process_item(self, item, spider):
        if item["title"]:
            line = json.dumps(dict(item)) + "\n"
            self.file.write(line.encode('utf-8'))
            return item
        else:
            raise DropItem("Missing title in %s" % item)


class MyImagesPipeline(ImagesPipeline):
    # Renamed so the class no longer shadows the ImagesPipeline base class,
    # which made it impossible to reference cleanly from ITEM_PIPELINES.
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # Store the paths in the declared "images" field; "image_paths"
        # is not declared on CnblogspiderItem and would raise a KeyError.
        item['images'] = image_paths
        return item
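
Note that ITEM_PIPELINES in settings.py below registers Scrapy's stock scrapy.pipelines.images.ImagesPipeline, so the subclass above never runs. To use the customized version instead, point the entry at this module (the lower number means it runs before the JSON pipeline):

ITEM_PIPELINES = {
    'cnblogSpider.pipelines.CnblogspiderPipeline': 300,
    'cnblogSpider.pipelines.MyImagesPipeline': 1,
}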

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for cnblogSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'cnblogSpider'

SPIDER_MODULES = ['cnblogSpider.spiders']
NEWSPIDER_MODULE = 'cnblogSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cnblogSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'cnblogSpider.middlewares.CnblogspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'cnblogSpider.middlewares.CnblogspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'cnblogSpider.pipelines.CnblogspiderPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Where downloaded images are saved; this is the author's local path,
# change it to a directory on your own machine.
IMAGES_STORE = '/home/as/文档/爬虫/开发与项目实战/中级篇/Scrapy爬虫框架/cnblogSpider/PICTURE'

# Item fields holding the image URLs and the download results
IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'

# Skip re-downloading images fetched within the last 30 days
IMAGES_EXPIRES = 30

# Generate two thumbnail sizes for every image
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}
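
With these settings the images pipeline stores every downloaded image under IMAGES_STORE, named by the SHA1 hash of its URL, plus one thumbnail per configured size:

PICTURE/
    full/<sha1>.jpg
    thumbs/small/<sha1>.jpg
    thumbs/big/<sha1>.jpg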

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Create a PICTURE directory to hold the downloaded images; it must match the IMAGES_STORE path configured above.

Finally, start the crawl from the terminal. Note that scrapy crawl takes the spider's name attribute ('cnblogs_spider'), not the project name:

scrapy crawl cnblogs_spider
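
The parse.json file is written by CnblogspiderPipeline; alternatively, Scrapy's built-in feed export can dump the items directly, without any pipeline:

scrapy crawl cnblogs_spider -o items.json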

Reposted from blog.csdn.net/weixin_39777626/article/details/81586193