Scrapy source analysis: a stripped-down custom version of the framework

isinstance(result, types.GeneratorType) determines whether the callback's return value is a generator.
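A quick standalone check of that behavior (a minimal sketch, nothing framework-specific):

import types

def gen():
    yield 1

print(isinstance(gen(), types.GeneratorType))   # True: calling a generator function returns a generator
print(isinstance([1], types.GeneratorType))     # False: a plain list is not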

Q.get() would block waiting whenever the queue is empty; with block=False it raises queue.Empty instead.
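For example, with the standard library queue module:

import queue

q = queue.Queue()
try:
    q.get(block=False)          # non-blocking: raises queue.Empty instead of waiting
except queue.Empty:
    print('queue is empty')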

reactor.callLater(0, self._next_request) schedules the next scheduler pass on the event loop without blocking it.
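A minimal sketch of that scheduling call in isolation (tick is a hypothetical stand-in for self._next_request):

from twisted.internet import reactor

def tick():
    print('fired on the next loop iteration')
    reactor.stop()

reactor.callLater(0, tick)      # delay 0: runs as soon as the reactor gets control
reactor.run()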

from twisted.internet import reactor    # event loop (terminates once every socket has been removed)
from twisted.web.client import getPage  # socket object (removed from the loop automatically once the download completes)
from twisted.internet import defer      # defer.Deferred: a special socket object (sends no request; must be removed manually)

class Request(object):

    def __init__(self, url, callback):
        self.url = url
        self.callback = callback


class HttpResponse(object):

    def __init__(self, content, request):
        self.content = content
        self.request = request
        self.url = request.url
        self.text = str(content, encoding='utf-8')


class ChoutiSpider(object):
    name = 'chouti'

    def start_requests(self):
        start_url = ['http://www.baidu.com','http://www.bing.com',]
        for url in start_url:
            yield Request(url,self.parse)

    def parse(self,response):
        print(response)  # response is the downloaded page
        yield Request('http://www.cnblogs.com',callback=self.parse)

import queue
Q = queue.Queue()   # the scheduler: a FIFO queue of pending Request objects

class Engine(object):

    def __init__(self):
        self._close = None      # Deferred fired when the crawl finishes
        self.max = 5            # maximum number of concurrent downloads
        self.crawlling = []     # requests currently in flight

    def get_response_callback(self, content, request):
        # the download finished: remove the request from the in-flight list
        self.crawlling.remove(request)
        rep = HttpResponse(content, request)
        result = request.callback(rep)
        import types
        # if the spider callback yielded new Request objects, hand them to the scheduler
        if isinstance(result, types.GeneratorType):
            for req in result:
                Q.put(req)



    def _next_request(self):
        """
        Take Request objects from the scheduler and send the requests,
        enforcing the maximum-concurrency limit.
        :return:
        """
        print(self.crawlling, Q.qsize())
        # nothing queued and nothing in flight: the crawl is done, fire _close
        if Q.qsize() == 0 and len(self.crawlling) == 0:
            self._close.callback(None)
            return

        if len(self.crawlling) >= self.max:
            return
        while len(self.crawlling) < self.max:
            try:
                req = Q.get(block=False)
                self.crawlling.append(req)
                d = getPage(req.url.encode('utf-8'))
                # once the page finishes downloading, get_response_callback runs the
                # spider's user-defined parse method and adds new requests to the scheduler
                d.addCallback(self.get_response_callback, req)
                # below the maximum concurrency again, so fetch more requests
                # from the scheduler
                d.addCallback(lambda _: reactor.callLater(0, self._next_request))
            except Exception as e:
                print(e)
                return

    @defer.inlineCallbacks
    def crawl(self, spider):
        # add the initial Request objects to the scheduler
        start_requests = iter(spider.start_requests())
        while True:
            try:
                request = next(start_requests)
                Q.put(request)
            except StopIteration as e:
                break

        # take requests out of the scheduler and send them
        # self._next_request()
        reactor.callLater(0, self._next_request)

        # suspend here until _next_request fires _close, i.e. until the
        # queue is empty and nothing is in flight
        self._close = defer.Deferred()
        yield self._close

spider = ChoutiSpider()


_active = set()
engine = Engine()
d = engine.crawl(spider)
_active.add(d)

# stop the reactor once every crawl Deferred in _active has fired
dd = defer.DeferredList(_active)
dd.addBoth(lambda a: reactor.stop())

reactor.run()
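
The termination pattern in crawl() is worth isolating: @defer.inlineCallbacks turns the method into a coroutine-style generator that suspends at yield self._close until _next_request fires the Deferred. A minimal sketch of the same pattern (waiter and close are illustrative names, not part of the framework):

from twisted.internet import defer, reactor

@defer.inlineCallbacks
def waiter(close):
    yield close                  # suspends here, like `yield self._close` in crawl()
    print('crawl finished')
    reactor.stop()

close = defer.Deferred()
waiter(close)
reactor.callLater(1, close.callback, None)   # plays the role of self._close.callback(None)
reactor.run()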
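
The driver code at the bottom uses defer.DeferredList so that one callback runs only after every spider's crawl Deferred has fired; with several engines in _active, the reactor stops only once all of them finish. A small sketch of that behavior on its own:

from twisted.internet import defer

d1, d2 = defer.Deferred(), defer.Deferred()
dl = defer.DeferredList([d1, d2])
dl.addBoth(lambda results: print('all done:', results))
d1.callback('first')
d2.callback('second')            # the DeferredList fires only after both have fired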

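One caveat for running this today: twisted.web.client.getPage has been deprecated in newer Twisted releases in favor of Agent. A hedged sketch of an equivalent download step using Agent and readBody (get_page is an illustrative name, and the swap is untested against the code above):

from twisted.internet import defer, reactor
from twisted.web.client import Agent, readBody

@defer.inlineCallbacks
def get_page(url):
    # returns a Deferred that fires with the body bytes, roughly like getPage(url)
    agent = Agent(reactor)
    response = yield agent.request(b'GET', url)   # url must be bytes
    body = yield readBody(response)
    defer.returnValue(body)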