Learning the Scrapy Crawler Library

1. Installing the libraries

Related background: crawler performance.

pip install wheel
pip install scrapy
pip install pywin32

'''
    On Windows you may need Microsoft Visual C++ Build Tools (Visual C++ 14.0);
    if your C++ toolchain is older than that, install it before installing Scrapy.
'''
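
After installing, a quick sanity check is to print the installed version (a standard Scrapy command):

# verify the installation
scrapy version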

2. Workflow

1. Specify the initial URLs

2. Parse the response content
    - hand newly discovered requests to the scheduler (Scheduler) so crawling continues
    - hand extracted data to the pipeline; an Item is used to structure (format) the data for storage
    - Twisted is an event-loop based, asynchronous, non-blocking framework: a single thread can have HTTP requests to many targets in flight at the same time (a minimal sketch of this flow follows)
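
A minimal sketch of the request/item flow described above (the spider name and URL are placeholders, not part of the original project):

import scrapy

class MinimalSpider(scrapy.Spider):
    name = 'minimal'
    start_urls = ['http://example.com/']  # 1. initial URLs

    def parse(self, response):
        # 2. parse the response: new requests go back to the scheduler
        for href in response.xpath('//a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
        # extracted data goes to the item pipeline
        yield {'url': response.url}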

3. Creating a project

# create a project
scrapy startproject <project name>
# change into the project folder
cd <project name>
# create a spider
scrapy genspider <spider name> <start domain/url>
# run the spider
scrapy crawl <spider name>
# run without log output
scrapy crawl <spider name> --nolog
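
For example, the chouti project used throughout these notes would be created roughly like this (spider name and domain taken from the example code below):

scrapy startproject chouti
cd chouti
scrapy genspider choutis dig.chouti.com
scrapy crawl choutis --nolog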

3.1. Directory structure
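
A freshly generated project looks roughly like this (shown for the chouti project; the layout can differ slightly between Scrapy versions):

chouti/
    scrapy.cfg            # deployment/run configuration
    chouti/
        __init__.py
        items.py          # Item definitions (data structure)
        middlewares.py    # spider and downloader middleware
        pipelines.py      # item pipelines (storage, formatting)
        settings.py       # project settings
        spiders/
            __init__.py
            choutis.py    # the spider created by genspider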

4. Working with Scrapy

# On Windows, set the stdout encoding; not needed on Linux/macOS
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')

# parse the returned data
from scrapy.selector import Selector,HtmlXPathSelector

'''
    Selector(response=response).xpath('<expression>') finds every matching node and returns a list of selector objects, which can be iterated with a for loop;
    .extract() converts the selector objects to strings (iterate with a for loop to read them);
    .extract_first() returns only the first element of the list;
    "//" searches the whole document, ".//" searches below the current node, "/" searches only the direct children of the current node;
    '/div[@id="i1"]' selects child div tags whose id is "i1";
    '/text()' gets the text inside a tag;
    'div/@id' gets the value of the id attribute;
    '//a[starts-with(@href,"/all/hot/recent/")]/@href' : starts-with() matches an attribute value by prefix;
    '//a[re:test(@href,"/all/hot/recent/\d+")]' : re:test() matches an attribute value with a regular expression;
    '//a[contains(@href,"link")]' selects the a tags whose href contains "link"
'''
# syntax
hxs = Selector(response=response).xpath('//tag[@id="attr value"]/div[@class="attr value"]/text()').extract()
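
A self-contained example of these selector rules (the HTML snippet is made up purely for illustration):

from scrapy.selector import Selector

html = '''
<div id="content-list">
    <div class="item"><a class="show-content" href="/link/1">first</a></div>
    <div class="item"><a class="show-content" href="/link/2">second</a></div>
</div>
'''

sel = Selector(text=html)
for item in sel.xpath('//div[@id="content-list"]/div[@class="item"]'):
    title = item.xpath('.//a[@class="show-content"]/text()').extract_first()
    href = item.xpath('.//a[@class="show-content"]/@href').extract_first()
    print(title, href)

# attribute matching helpers
print(sel.xpath('//a[starts-with(@href, "/link/")]/@href').extract())
print(sel.xpath(r'//a[re:test(@href, "/link/\d+")]/@href').extract())
print(sel.xpath('//a[contains(@href, "link")]/@href').extract())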

# Collecting every pagination link
 - use a set() to drop duplicate page URLs;
 - for every new page found, hand a new request to the scheduler:
    yield Request(url=url, callback=self.parse)  # the newly found page goes back to the scheduler
 - limit how deep the crawl goes:
    DEPTH_LIMIT = 1 in settings.py

# Store an MD5 digest of the URL; fixed-length keys make database lookups more efficient
import hashlib
obj = hashlib.md5()
obj.update(bytes(url, encoding="utf-8"))  # hash the URL
obj.hexdigest()  # the hex digest to store



# the response object available inside the spider
response.url      # the requested URL
response.text     # the response body decoded to a string
response.body     # the raw response body (bytes)
response.request  # the Request that produced this response
response.meta     # request metadata, e.g. {"depth": current crawl depth}
content = str(response.body, encoding="utf-8")  # decode manually on Windows; not needed on Linux/macOS

4.1. A small chouti.com example (collecting pagination links)

# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from ..items import ChoutiItem
# from scrapy import Request,Selector
from scrapy.http import Request
# parse the returned data
from scrapy.selector import Selector,HtmlXPathSelector

sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')

class ChoutisSpider(scrapy.Spider):
    name = 'choutis'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    visited_url = set()  # de-duplicate pagination URLs

    '''
        By overriding start_requests(), you can rename Scrapy's default parse() callback,
        because you control which callback each start request points to.

        def start_requests(self):
            for url in self.start_urls:
                yield Request(url, callback=self.parse)
    '''


    # Scrapy's default callback: parse
    def parse(self, response):
        print(response.meta)
        # get the title and url of each item
        hxs1 = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs1:
            title = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first().strip() # strip() removes surrounding whitespace
            href = obj.xpath('.//a[@class="show-content color-chag"]/@href').extract_first().strip()
            '''
                The ChoutiItem class is needed here to structure the data:
                from ..items import ChoutiItem
            '''
            # pass the extracted values to the Item for formatting
            item_obj = ChoutiItem(title=title, href=href)
            # the item must be yielded for it to reach the pipeline
            yield item_obj

        # collect the pagination urls (regex match)
        hxs = Selector(response=response).xpath('//a[re:test(@href,"/all/hot/recent/\d+")]/@href').extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_url:
                print('already seen')
            else:
                self.visited_url.add(md5_url)  # remember the page in the set() so it is not crawled twice
                url_page = "https://dig.chouti.com%s" % url
                print(url_page)
                yield Request(url=url_page, callback=self.parse)  # hand the new url to the scheduler


    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding="utf-8"))  # hash the URL
        return obj.hexdigest()  # return the hex digest

4.2. The items.py file

import scrapy

class ChoutiItem(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()
    

4.3. The pipelines.py file

# When several pipelines are configured and an item should not be passed on, drop it by raising a DropItem exception
from scrapy.exceptions import DropItem

# pipeline hook methods
class ChoutiPipeline(object):
    # initialise state
    def __init__(self, conn_str):
        self.conn_str = conn_str
        self.conn = None

    # called for every item yielded by the spider
    def process_item(self, item, spider):
        print(item)
        f = open('item.json', 'a+', encoding='utf-8')
        tpl = '%s \n\n %s \r\n' % (item['title'], item['href'])
        f.write(tpl)
        f.close()
        # return item  # with several pipelines, return the item so the next pipeline also runs
        raise DropItem()  # discard this item; no later pipeline will process it

    def open_spider(self, spider):
        # called when the spider is opened
        pass

    def close_spider(self, spider):
        # called when the spider is closed
        pass

    @classmethod
    def from_crawler(cls, crawler):
        # read the conn_str key from settings.py (setting names must be uppercase);
        # self.conn_str can later be used to open a database connection and write the scraped data to a database or file
        val = crawler.settings.get('DB')
        return cls(val)  # create the pipeline object: cls() is ChoutiPipeline()
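
settings.py below also registers a ChoutiPipeline2; a minimal sketch of what that second pipeline might look like (assuming it only needs to see items that the first pipeline returns rather than drops):

class ChoutiPipeline2(object):
    def open_spider(self, spider):
        # open the output file once, when the spider starts
        self.f = open('item2.json', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write('%s %s\n' % (item['title'], item['href']))
        return item  # pass the item on to any later pipeline

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.f.close()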

5. Custom de-duplication of URLs (dupefilter)

5.1. settings.py configuration

# register the pipeline classes from pipelines.py
ITEM_PIPELINES = {
    'chouti.pipelines.ChoutiPipeline' : 300, # 300 is the priority/order value
    'chouti.pipelines.ChoutiPipeline2' : 300,
}

# crawl depth limit
DEPTH_LIMIT = 1
# load the custom dupefilter class
DUPEFILTER_CLASS = "chouti.duplication.RepeatFilter"

5.2. The custom dupefilter

# Create a new Python file duplication.py in the project; whether this filter is consulted for a request is controlled by the Request(dont_filter=False) parameter
# de-duplication hook methods

class RepeatFilter(object):
    def __init__(self):
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):  # has this request been seen before?
        print('....')
        if request.url in self.visited_set:
            return True
        self.visited_set.add(request.url)
        return False

    def open(self):
        print('open')


    def close(self, reason):
        print('close')

    def log(self, request, spider):
        print('log....')

5.3. The choutis.py spider

# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from ..items import ChoutiItem
from scrapy import Request,Selector
# from scrapy.dupefilters import RFPDupeFilter  # the built-in URL dupefilter; a custom one replaces it

sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')

class ChoutisSpider(scrapy.Spider):
    name = 'choutis'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # Scrapy's default callback: parse
    def parse(self, response):
        print(response.url)
        # get the title and url of each item
        hxs1 = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs1:
            title = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first().strip() # strip() removes surrounding whitespace
            href = obj.xpath('.//a[@class="show-content color-chag"]/@href').extract_first().strip()
            '''
                The ChoutiItem class is needed here to structure the data:
                from ..items import ChoutiItem
            '''
            # pass the extracted values to the Item for formatting
            item_obj = ChoutiItem(title=title, href=href)
            # the item must be yielded for it to reach the pipeline
            yield item_obj

        # collect the pagination urls (regex match)
        hxs = Selector(response=response).xpath('//a[re:test(@href,"/all/hot/recent/\d+")]/@href').extract()
        for url in hxs:
            url_page = "https://dig.chouti.com%s" % url
            # hand the new url to the scheduler; dont_filter=False means the dupefilter is applied
            yield Request(url=url_page, callback=self.parse, dont_filter=False)

6. Working with cookies

# working with cookies
# import the cookie-handling module
from scrapy.http.cookies import CookieJar

# create a CookieJar
cookie = CookieJar()
# response.request is the request that produced this response
cookie.extract_cookies(response, response.request)
# all collected cookies
cookie_dict = cookie._cookies
print(cookie_dict)

6.1. Logging in to chouti.com and upvoting posts

# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from ..items import ChoutiItem
from scrapy import Request,Selector
# cookie-handling module
from scrapy.http.cookies import CookieJar
# from scrapy.dupefilters import RFPDupeFilter
# parse the returned data
# from scrapy.selector import Selector,HtmlXPathSelector

sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')

class ChoutisSpider(scrapy.Spider):
    name = 'choutis'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    
    cookie_dict = None

    def parse(self, response):
        print(dir(response))
        cookie = CookieJar()
        cookie.extract_cookies(response, response.request)
        self.cookie_dict = cookie._cookies
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            cookies=self.cookie_dict,
            body='phone=&password=&oneMone=1',  # request body (fill in phone and password)
            headers={'Content-Type': ''},
            callback=self.check_login,
        )

    def check_login(self, response):
        print(response.text)
        yield Request(
            url='http://dig.chouti.com',
            method='GET',
            callback=self.good
        )

    def good(self, response):
        # find the ids of the posts to upvote
        id_list = Selector(response=response).xpath('//div[@share-linkid]/@share-linkid').extract()
        for nid in id_list:
            url = 'http://dig.chouti.com/link/vote?linksId=%s' % nid
            yield Request(
                url=url,
                method='POST',
                cookies=self.cookie_dict,
                callback=self.show_good
            )
        # find the pagination links
        page_urls = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_urls:
            url = 'http://dig.chouti.com%s' % page
            yield Request(
                url=url,
                callback=self.good
            )

    def show_good(self, response):
        print(response.text)

7. Scrapy extensions

# settings.py

EXTENSIONS = {
    '<project>.<module>.<ClassName>': 300,
}

7.1. Custom extension skeleton (lifecycle signals)

# extensions.py

'''
    Signals fired at different stages of a Scrapy run:
    engine_started = object()       # engine started
    engine_stopped = object()       # engine stopped
    spider_opened = object()        # spider opened
    spider_idle = object()          # spider has gone idle
    spider_closed = object()        # spider closed
    spider_error = object()         # spider raised an error
    request_scheduled = object()    # a request was scheduled
    request_dropped = object()      # a request was dropped
    response_received = object()    # a response was received
    response_downloaded = object()  # a response was downloaded
    item_scraped = object()         # an item was scraped
    item_dropped = object()         # an item was dropped
'''

from scrapy import signals

class MyExtend:

    def __init__(self, crawler):
        self.crawler = crawler
        # connect handlers to the corresponding lifecycle signals
        crawler.signals.connect(self.start, signals.engine_started)
        crawler.signals.connect(self.close, signals.engine_stopped)

    # registration hook used by Scrapy to create the extension
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start(self):
        print('signals.engine_started -> start')

    def close(self):
        print('signals.engine_stopped -> close')

8. The settings.py configuration file

BOT_NAME = ""  # crawler (bot) name
USER_AGENT = ""  # User-Agent request header (browser / OS string)
ROBOTSTXT_OBEY = False  # whether to obey robots.txt
CONCURRENT_REQUESTS = 32  # maximum concurrent requests
DOWNLOAD_DELAY = 3  # download delay (seconds)
CONCURRENT_REQUESTS_PER_DOMAIN = 16  # concurrent requests per domain
CONCURRENT_REQUESTS_PER_IP = 16  # concurrent requests per IP
COOKIES_ENABLED = False  # disable the cookie middleware
COOKIES_ENABLED = True  # enable the cookie middleware
COOKIES_DEBUG = True  # enable cookie debugging output
DEPTH_PRIORITY = 0  # 0 = depth-first crawling (LIFO queues), 1 = breadth-first crawling (FIFO queues)
 - SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
 - SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
DEPTH_PRIORITY = 1
 - SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
 - SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'

DUPEFILTER_CLASS = "chouti.duplication.RepeatFilter"  # custom dupefilter
DEPTH_LIMIT = 2  # how many levels deep the crawl may go
SCHEDULER = 'scrapy.core.scheduler.Scheduler'  # the scheduler class


# monitoring the crawler over telnet
TELNETCONSOLE_ENABLED = False  # enable/disable the telnet console
TELNETCONSOLE_PORT = 6023  # port
TELNETCONSOLE_HOST = '127.0.0.1'  # host
Command: telnet 127.0.0.1 6023
est()  # print the engine/crawler status report


# default request headers
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
   'Accept-Language': 'en',
}

# spider middleware
SPIDER_MIDDLEWARES = {
   'chouti.middlewares.ChoutiSpiderMiddleware': 543,
}

# extensions
EXTENSIONS = {
   'scrapy.extensions.telnet.TelnetConsole': None,
}

# item pipelines (custom storage classes)
ITEM_PIPELINES = {
   'chouti.pipelines.ChoutiPipeline': 300,
}

# downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'chouti.middlewares.ChoutiDownloaderMiddleware': 543,
}

# AutoThrottle: adjust request delays automatically based on load
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5  # initial download delay (seconds)
AUTOTHROTTLE_MAX_DELAY = 60  # maximum download delay (seconds)
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average number of requests to send in parallel to each remote server
AUTOTHROTTLE_DEBUG = False

# HTTP cache
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'  # cache directory
HTTPCACHE_IGNORE_HTTP_CODES = []  # response status codes not to cache
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

9. Middleware (these can all live in the same file)

9.1. settings.py configuration

# settings.py
DOWNLOADER_MIDDLEWARES = {
    '<project>.<module>.<ClassName>': 500,
}

9.2. A custom proxy middleware

import base64
import random

from scrapy.utils.python import to_bytes


class ProxyMiddleWare(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': 'username:password'},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass'] is not None:
            request.meta['proxy'] = to_bytes('http://%s' % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
        else:
            request.meta['proxy'] = to_bytes('http://%s' % proxy['ip_port'])

9.3. Downloader middleware

'''
    Execution order when several downloader middlewares are installed:
    DownMiddleWare1.process_request -> DownMiddleWare2.process_request ->
    DownMiddleWare2.process_response -> DownMiddleWare1.process_response -> spider
    Notes:
        1. if a process_request returns a Response object, the remaining middlewares' process_request calls are skipped
        2. process_response must return a response (return response)
'''
class DownMiddleWare1(object):
    def process_request(self,request,spider):
        pass

    def process_response(self,request,response,spider):
        '''
            return value:
                Response object: passed on to the other middlewares' process_response
                Request object: the middleware chain stops and the request is re-scheduled for download
                raise IgnoreRequest: Request.errback is called
        '''
        return response

    def process_exception(self, request, exception, spider):
        '''
            called when process_request (or the download handler) raises an exception
            return value:
                None: the exception is passed to the next middleware's process_exception
                Response object: stops any further process_exception calls
                Request object: the middleware chain stops and the request is re-scheduled for download
        '''
        return None

class DownMiddleWare2(object):
    def process_request(self,request,spider):
        pass
    def process_response(self,request,response,spider):
        return response

9.4. Spider middleware

class SpiderMiddleWare(object):
    def process_start_requests(self, start_requests, spider):  # called with the spider's start requests when it begins working
        return start_requests

    def process_spider_input(self, response, spider):  # called before the response is handed to the spider callback (parse)
        pass

    def process_spider_output(self, response, result, spider):
        # called after the callback runs; result is what the callback yielded
        return result  # must return an iterable of Request and/or item objects

    def process_spider_exception(self, response, exception, spider):
        return None
        # return None to let later middlewares handle the exception,
        # or an iterable of Request/item objects to hand to the scheduler or pipeline

10. Custom Scrapy commands

Create a directory (any name) at the same level as spiders/ and add a .py file to it, as sketched below.
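
For example (the directory name commands and the file name crawlall.py are illustrative choices, not required names; the command name comes from the file name):

chouti/
    chouti/
        spiders/
        commands/
            __init__.py
            crawlall.py   # contains the Command class below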

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return ''

    def short_desc(self):
        return 'Runs all of the spiders'  # shown in --help

    def run(self, args, opts):
        # names of all spiders in the project
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()  # run all of the crawlers
        

settings.py configuration

COMMANDS_MODULE = '<project name>.<directory name>'

Run the command

scrapy crawlall


Reposted from blog.csdn.net/u011146423/article/details/83060024