Reposted from: http://www.cnblogs.com/wangshuyang/p/7717263.html
Contents:
- Crawler performance principles
- Scrapy framework analysis
I. Crawler Performance Principles
When writing a crawler, most of the time is spent on IO requests: in single-process, single-threaded mode, each URL request inevitably blocks waiting for the response, slowing the whole run down.
1. Synchronous execution
import requests

def fetch_async(url):
    response = requests.get(url)
    return response


url_list = ['http://www.github.com', 'http://www.bing.com']

for url in url_list:
    fetch_async(url)
2. Multi-threaded execution
from concurrent.futures import ThreadPoolExecutor  # thread pool
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
Multi-threading + callback
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result())


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(callback)
pool.shutdown(wait=True)
3. Multi-process execution
from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response


if __name__ == '__main__':  # required on platforms that spawn worker processes
    url_list = ['http://www.github.com', 'http://www.bing.com']
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        pool.submit(fetch_async, url)
    pool.shutdown(wait=True)
Multi-processing + callback
from concurrent.futures import ProcessPoolExecutor
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result())


if __name__ == '__main__':  # required on platforms that spawn worker processes
    url_list = ['http://www.github.com', 'http://www.bing.com']
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
    pool.shutdown(wait=True)
All of the approaches above improve request throughput. The drawback of multi-threading and multi-processing is that threads and processes sit idle whenever IO blocks, so asynchronous IO is the preferred solution:
1. asyncio example
import asyncio


# Python 3.5+ async/await syntax; the original used the legacy
# @asyncio.coroutine / yield from style, which was removed in Python 3.11
async def func1():
    print('before...func1......')
    await asyncio.sleep(5)
    print('end...func1......')


tasks = [func1(), func1()]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
2. asyncio + aiohttp example
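(The code for this example was missing from the repost. Below is a minimal sketch using aiohttp's ClientSession API; the URLs are placeholders.)

import asyncio
import aiohttp


async def fetch_async(session, url):
    # issue the GET and read the body without blocking the event loop
    async with session.get(url) as response:
        text = await response.text()
        print(url, response.status, len(text))


async def main():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            fetch_async(session, 'http://www.github.com'),
            fetch_async(session, 'http://www.bing.com'),
        )


asyncio.run(main())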
3. asyncio + requests example
import asyncio
import requests


# run the blocking requests call in the event loop's default thread pool
async def fetch_async(func, *args):
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, func, *args)
    response = await future
    print(response.url, response.content)


tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
4. gevent + requests example
from gevent import monkey

monkey.patch_all()  # patch the standard library first, before importing requests

import gevent
import requests


def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# ##### send the requests #####
gevent.joinall([
    gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
])

# ##### send the requests (a coroutine pool caps the number of greenlets) #####
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
5. grequests example
import grequests


request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]


# ##### execute and collect the responses #####
# response_list = grequests.map(request_list)
# print(response_list)


# ##### execute and collect the responses (with exception handling) #####
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")

# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)
6. Twisted example
from twisted.web.client import getPage, defer
from twisted.internet import reactor


def all_done(arg):
    reactor.stop()


def callback(contents):
    print(contents)


deferred_list = []

url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)

reactor.run()
7. Tornado example
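(A minimal sketch, assuming Tornado >= 5, which runs on the asyncio event loop; the URLs are placeholders.)

import asyncio
from tornado.httpclient import AsyncHTTPClient


async def main():
    client = AsyncHTTPClient()
    # fetch both pages concurrently on the asyncio event loop
    responses = await asyncio.gather(
        client.fetch('http://www.baidu.com'),
        client.fetch('http://www.bing.com'),
    )
    for resp in responses:
        print(resp.effective_url, len(resp.body))


asyncio.run(main())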
A related Twisted example, sending a POST request:

from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse


def one_done(arg):
    print(arg)
    reactor.stop()

post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)

reactor.run()
All of the modules above, built-in or third-party, provide asynchronous IO requests; they are simple to use and greatly improve efficiency. Under the hood, asynchronous IO boils down to [non-blocking sockets] + [IO multiplexing]:
A hand-rolled asynchronous IO framework
import select
import socket
import time


class AsyncTimeoutException(TimeoutError):
    """
    Raised when a request times out.
    """

    def __init__(self, msg):
        self.msg = msg
        super(AsyncTimeoutException, self).__init__(msg)


class HttpContext(object):
    """Encapsulates the basic data of a request and its response."""

    def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
        """
        sock: client socket for the request
        host: host to request
        port: port to request
        method: HTTP method
        url: URL to request
        data: request body
        callback: function invoked when the request completes
        timeout: request timeout in seconds
        """
        self.sock = sock
        self.callback = callback
        self.host = host
        self.port = port
        self.method = method
        self.url = url
        self.data = data

        self.timeout = timeout

        self.__start_time = time.time()
        self.__buffer = []

    def is_timeout(self):
        """Whether the current request has timed out."""
        current_time = time.time()
        if (self.__start_time + self.timeout) < current_time:
            return True

    def fileno(self):
        """File descriptor of the request socket, used by select."""
        return self.sock.fileno()

    def write(self, data):
        """Append a chunk of response data to the buffer."""
        self.__buffer.append(data)

    def finish(self, exc=None):
        """Response fully buffered; invoke the request's callback."""
        if not exc:
            response = b''.join(self.__buffer)
            self.callback(self, response, exc)
        else:
            self.callback(self, None, exc)

    def send_request_data(self):
        content = """%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s""" % (
            self.method.upper(), self.url, self.host, self.data,)

        return content.encode(encoding='utf8')


class AsyncRequest(object):
    def __init__(self):
        self.fds = []
        self.connections = []

    def add_request(self, host, port, method, url, data, callback, timeout):
        """Create a new request."""
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect((host, port))
        except BlockingIOError as e:
            pass
            # print('connection request sent to the remote host')
        req = HttpContext(client, host, port, method, url, data, callback, timeout)
        self.connections.append(req)
        self.fds.append(req)

    def check_conn_timeout(self):
        """Check all requests and terminate any that have timed out."""
        timeout_list = []
        for context in self.connections:
            if context.is_timeout():
                timeout_list.append(context)
        for context in timeout_list:
            context.finish(AsyncTimeoutException('request timed out'))
            self.fds.remove(context)
            self.connections.remove(context)

    def running(self):
        """Event loop: detect which request sockets are ready and act on them."""
        while True:
            r, w, e = select.select(self.fds, self.connections, self.fds, 0.05)

            if not self.fds:
                return

            for context in r:
                sock = context.sock
                while True:
                    try:
                        data = sock.recv(8096)
                        if not data:
                            self.fds.remove(context)
                            context.finish()
                            break
                        else:
                            context.write(data)
                    except BlockingIOError as e:
                        break
                    except TimeoutError as e:
                        self.fds.remove(context)
                        self.connections.remove(context)
                        context.finish(e)
                        break

            for context in w:
                # connected to the remote server; send the request data
                if context in self.fds:
                    data = context.send_request_data()
                    context.sock.sendall(data)
                    self.connections.remove(context)

            self.check_conn_timeout()


if __name__ == '__main__':
    def callback_func(context, response, ex):
        """
        :param context: HttpContext object wrapping the request details
        :param response: response content
        :param ex: exception object if one occurred, otherwise None
        :return:
        """
        print(context, response, ex)

    obj = AsyncRequest()
    url_list = [
        {'host': 'www.google.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func},
        {'host': 'www.baidu.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func},
        {'host': 'www.bing.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func},
    ]
    for item in url_list:
        print(item)
        obj.add_request(**item)

    obj.running()
Basic principles:
IO multiplexing: select, used to detect state changes on socket objects (has the connection succeeded? has data arrived?)
Socket: a non-blocking socket client
import socket
import select

class Request(object):
    def __init__(self, sock, func, url):
        self.sock = sock
        self.func = func
        self.url = url

    def fileno(self):
        return self.sock.fileno()

def async_request(url_list):

    input_list = []
    conn_list = []

    for url in url_list:
        client = socket.socket()
        client.setblocking(False)
        # initiate the connection without blocking
        try:
            client.connect((url[0], 80,))  # connect to the host on port 80
        except BlockingIOError as e:
            pass

        obj = Request(client, url[1], url[0])

        input_list.append(obj)
        conn_list.append(obj)

    while True:
        # watch the sockets for state changes [request_obj, request_obj, ..., request_obj]
        # connections that succeeded show up in wlist
        # sockets with response data show up in rlist
        rlist, wlist, elist = select.select(input_list, conn_list, [], 0.05)
        for request_obj in wlist:
            # connected successfully; send the HTTP request
            request_obj.sock.sendall("GET / HTTP/1.0\r\nhost:{0}\r\n\r\n".format(request_obj.url).encode('utf-8'))
            conn_list.remove(request_obj)

        for request_obj in rlist:
            data = request_obj.sock.recv(8096)
            request_obj.func(data)
            request_obj.sock.close()
            input_list.remove(request_obj)

        if not input_list:
            break
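A quick sketch of how async_request might be invoked (the callback name and hosts are illustrative): each entry in url_list pairs a hostname with the callback that receives the raw response bytes.

def show(data):
    # data holds at most 8096 bytes read from the socket (status line, headers, start of body)
    print(data[:100])

async_request([
    ('www.baidu.com', show),
    ('www.bing.com', show),
])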
How does a single thread achieve concurrency?
When the first task arrives, send its connection request. That triggers an IO wait, but instead of waiting we immediately send the second task's connection request, and so on...

IO multiplexing then watches the sockets for changes:
    a connection succeeds:
        send the request: GET / HTTP/1.0\r\nHost: ...
    hit an IO wait? don't wait; keep checking whether another connection has succeeded:
        send the request: GET / HTTP/1.0\r\nHost: ...
    hit an IO wait? don't wait; keep checking whether another connection has succeeded:
        send the request: GET / HTTP/1.0\r\nHost: ...

    results come back:
        read the response, run the callback
        read the response, run the callback
        read the response, run the callback
        ...


Question: what is a coroutine?
    Simply executing one piece of code, jumping to another piece, then jumping again...

Asynchronous IO:
    - [coroutine-based] coroutines + non-blocking sockets + select, e.g. gevent
    - [event-loop-based] plain sockets + select, e.g. Twisted

1. How do you increase a crawler's concurrency?
    Use an asynchronous IO module such as asyncio, twisted, or gevent.
    The essence:
    - [coroutine-based] coroutines + non-blocking sockets + select, e.g. gevent
    - [event-loop-based] plain sockets + select, e.g. Twisted, tornado

2. Asynchronous, non-blocking
    asynchronous: callbacks (select)
    non-blocking: no waiting (setblocking(False))

3. What is a coroutine?
    pip3 install gevent

    from greenlet import greenlet

    def test1():
        print(12)
        gr2.switch()
        print(34)
        gr2.switch()


    def test2():
        print(56)
        gr1.switch()
        print(78)

    gr1 = greenlet(test1)
    gr2 = greenlet(test2)
    gr1.switch()
II. Scrapy Framework Analysis
Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a range of programs, from data mining to information processing to archiving historical data.
It was originally designed for page scraping (more precisely, web scraping), but it can also be used to fetch data returned by APIs (such as Amazon Associates Web Services) or as a general-purpose web crawler. Scrapy is versatile: it is used for data mining, monitoring, and automated testing.
Scrapy uses the Twisted asynchronous networking library to handle network communication. The overall architecture is roughly as follows.
Scrapy consists of the following components:
- Engine (Scrapy): handles the data flow of the whole system and triggers events (the framework core)
- Scheduler: accepts requests from the engine and pushes them onto a queue, returning them when the engine asks again. Think of it as a priority queue of URLs (the addresses of the pages to crawl); it decides which URL to fetch next and removes duplicate URLs
- Downloader: downloads page content and hands it back to the spiders (the downloader is built on Twisted's efficient asynchronous model)
- Spiders: do the main work, extracting the needed information, i.e. the items, from specific pages. Users can also extract links from them for Scrapy to crawl next
- Item Pipeline: processes the items the spiders extract from pages; its main jobs are persisting items, validating them, and dropping unneeded data. After a page is parsed by a spider, its items are sent to the pipeline and processed through several stages in order
- Downloader Middlewares: sit between the Scrapy engine and the downloader and process the requests and responses passing between them
- Spider Middlewares: sit between the Scrapy engine and the spiders and process the spiders' response input and request output
In short:
The five modules:
- (1) The most important module is the Engine: the conductor of the data flow, responsible for controlling it (that is, for coordinating communication between the other modules).
- (2) Scheduler: queues the URLs submitted by the Engine.
- (3) Spider: where the user's own code lives; chiefly responsible for parsing the HTTP response and extracting the key data from the returned HTML.
- (4) Downloader: talks to the server behind each URL and fetches the returned content.
- (5) Item Pipeline: processes the information the spiders extract, typically for database-related operations.
The two middlewares:
A middleware is a special hook sitting between two modules; its purpose is to provide a simple mechanism for extending functionality by plugging in user-written code, as sketched below.
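Concretely, middleware is plugged in purely through settings.py; a minimal sketch (the module paths are hypothetical):

# settings.py: hook user-written classes into the engine<->spider and
# engine<->downloader boundaries; lower numbers run closer to the engine
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.MySpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyDownloaderMiddleware': 543,
}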
The typical data flow:
- (1) The Engine starts and reads the first URL to crawl from the spider
- (2) The Engine hands that first URL to the scheduler
- (3) The Engine asks the scheduler for the next URL to crawl
- (4) The scheduler pops a URL off its queue and gives it to the Engine, which passes it to the downloader
- (5) The downloader GETs the URL and wraps the HTTP response in a Response object, which it returns to the Engine
- (6) The Engine passes the Response object to the spider
- (7) The spider processes the Response, extracting information into items and possibly generating new requests, and sends both back to the Engine
- (8) The Engine forwards the requests to the scheduler and the items to the item pipeline
- (9) Repeat from step (3) until no URLs remain to be processed
1. Installation:

Linux
    pip3 install scrapy

Windows
    a. pip3 install wheel
    b. download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    c. cd into the download directory and run: pip3 install Twisted‑17.1.0‑cp35‑cp35m‑win_amd64.whl
    d. pip3 install scrapy
    e. download and install pywin32: https://sourceforge.net/projects/pywin32/files/
2. Basic commands:

1. scrapy startproject <project_name>
   - create a project in the current directory (similar to Django)

2. scrapy genspider [-t template] <name> <domain>
   - create a spider, e.g.:
     scrapy genspider -t basic oldboy oldboy.com
     scrapy genspider -t xmlfeed autohome autohome.com.cn
   PS:
     list available templates: scrapy genspider -l
     show a template:          scrapy genspider -d <template_name>

3. scrapy list
   - list the spiders in the project

4. scrapy crawl <spider_name>
   - run a single spider
3. Project structure and spider overview

project_name/
    scrapy.cfg
    project_name/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider1.py
            spider2.py
            spider3.py
- scrapy.cfg: project configuration file
- project_name/: the project's Python module; the code is imported from here
- project_name/items.py: item definitions
- project_name/pipelines.py: pipeline definitions
- project_name/settings.py: project settings
- project_name/spiders/: directory holding the spiders
- project_name/middlewares.py: middleware definitions
Note: spider files are conventionally named after the site's domain
import scrapy

class XiaoHuarSpider(scrapy.spiders.Spider):
    name = "xiaohuar"                        # spider name *****
    allowed_domains = ["xiaohuar.com"]       # allowed domains
    start_urls = [
        "http://www.xiaohuar.com/hua/",      # start URL
    ]

    def parse(self, response):
        # callback invoked once the start URL has been fetched
        pass
Windows console encoding issue:

import sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
4. Writing a spider
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request


class DigSpider(scrapy.Spider):
    # spider name, used to launch the crawl command
    name = "dig"

    # allowed domains
    allowed_domains = ["chouti.com"]

    # start URLs
    start_urls = [
        'http://dig.chouti.com/',
    ]

    has_request_set = {}

    def parse(self, response):
        print(response.url)

        hxs = HtmlXPathSelector(response)
        page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = self.md5(page_url)
            if key in self.has_request_set:
                pass
            else:
                self.has_request_set[key] = page_url
                obj = Request(url=page_url, method='GET', callback=self.parse)
                yield obj

    @staticmethod
    def md5(val):
        import hashlib
        ha = hashlib.md5()
        ha.update(bytes(val, encoding='utf-8'))
        key = ha.hexdigest()
        return key
Run:

scrapy crawl dig --nolog

(the --nolog flag suppresses Scrapy's log output)
The important points in the code above:
- Request is a class that wraps a user request; yielding one from a callback tells Scrapy to keep crawling
- HtmlXPathSelector structures the HTML and provides selector functionality
5. Selectors:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse
html = """<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <ul>
        <li class="item-"><a id='i1' href="link.html">first item</a></li>
        <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
        <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
    </ul>
    <div><a href="llink2.html">second item</a></div>
</body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
Example:
Log in to Chouti and upvote posts
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class ChouTiSpider(scrapy.Spider):
    # spider name, used to launch the crawl command
    name = "chouti"
    # allowed domains
    allowed_domains = ["chouti.com"]

    cookie_dict = {}
    has_request_set = {}

    def start_requests(self):
        url = 'http://dig.chouti.com/'
        # return [Request(url=url, callback=self.login)]
        yield Request(url=url, callback=self.login)

    def login(self, response):
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login
        )
        yield req

    def check_login(self, response):
        req = Request(
            url='http://dig.chouti.com/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            dont_filter=True
        )
        yield req

    def show(self, response):
        # print(response)
        hxs = HtmlXPathSelector(response)
        news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
            link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor
            )

        page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
        for page in page_list:

            page_url = 'http://dig.chouti.com%s' % page
            import hashlib
            hash = hashlib.md5()
            hash.update(bytes(page_url, encoding='utf-8'))
            key = hash.hexdigest()
            if key in self.has_request_set:
                pass
            else:
                self.has_request_set[key] = page_url
                yield Request(
                    url=page_url,
                    method='GET',
                    callback=self.show
                )

    def do_favor(self, response):
        print(response.text)
Note: set DEPTH_LIMIT = 1 in settings.py to limit how deep this "recursion" goes.
6. Formatting items
The example above does only simple processing, all inside the parse method. To do more with the extracted data, use Scrapy's items to structure it and hand it over to the pipelines for uniform processing.
spiders/xiaohuar.py
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class XiaoHuarSpider(scrapy.Spider):
    # spider name, used to launch the crawl command
    name = "xiaohuar"
    # allowed domains
    allowed_domains = ["xiaohuar.com"]

    start_urls = [
        "http://www.xiaohuar.com/list-1-1.html",
    ]
    # custom_settings = {
    #     'ITEM_PIPELINES':{
    #         'spider1.pipelines.JsonPipeline': 100
    #     }
    # }
    has_request_set = {}

    def parse(self, response):
        # analyze the page
        # save the content on the page that matches the rules (the photos)
        # find all the <a> tags, visit them, and keep going level by level

        hxs = HtmlXPathSelector(response)

        items = hxs.select('//div[@class="item_list infinite_scroll"]/div')
        for item in items:
            src = item.select('.//div[@class="img"]/a/img/@src').extract_first()
            name = item.select('.//div[@class="img"]/span/text()').extract_first()
            school = item.select('.//div[@class="img"]/div[@class="btns"]/a/text()').extract_first()
            url = "http://www.xiaohuar.com%s" % src
            from ..items import XiaoHuarItem
            obj = XiaoHuarItem(name=name, school=school, url=url)
            yield obj

        urls = hxs.select('//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+.html")]/@href')
        for url in urls:
            key = self.md5(url)
            if key in self.has_request_set:
                pass
            else:
                self.has_request_set[key] = url
                req = Request(url=url, method='GET', callback=self.parse)
                yield req

    @staticmethod
    def md5(val):
        import hashlib
        ha = hashlib.md5()
        ha.update(bytes(val, encoding='utf-8'))
        key = ha.hexdigest()
        return key
item:
import scrapy


class XiaoHuarItem(scrapy.Item):
    name = scrapy.Field()
    school = scrapy.Field()
    url = scrapy.Field()
pipelines:
import json
import os
import requests


class JsonPipeline(object):
    def __init__(self):
        self.file = open('xiaohua.txt', 'w')

    def process_item(self, item, spider):
        v = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(v)
        self.file.write('\n')
        self.file.flush()
        return item


class FilePipeline(object):
    def __init__(self):
        if not os.path.exists('imgs'):
            os.makedirs('imgs')

    def process_item(self, item, spider):
        response = requests.get(item['url'], stream=True)
        file_name = '%s_%s.jpg' % (item['name'], item['school'])
        with open(os.path.join('imgs', file_name), mode='wb') as f:
            f.write(response.content)
        return item
settings:
ITEM_PIPELINES = {
   'spider1.pipelines.JsonPipeline': 100,
   'spider1.pipelines.FilePipeline': 300,
}
# The integer assigned to each pipeline determines the order in which they run:
# items pass through the pipelines from the lowest number to the highest.
# By convention these numbers are defined in the 0-1000 range.
Pipelines can do considerably more, for example:
from scrapy.exceptions import DropItem

class CustomPipeline(object):
    def __init__(self, v):
        self.value = v

    def process_item(self, item, spider):
        # operate on the item and persist it

        # returning the item lets subsequent pipelines keep processing it
        return item

        # to discard the item so no later pipeline sees it:
        # raise DropItem()

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at initialization time to create the pipeline object
        :param crawler:
        :return:
        """
        val = crawler.settings.getint('MMMM')
        return cls(val)

    def open_spider(self, spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        print('000000')

    def close_spider(self, spider):
        """
        Called when the spider closes
        :param spider:
        :return:
        """
        print('111111')
7. Middleware
Spider middleware:
class SpiderMiddleware(object):

    def process_spider_input(self, response, spider):
        """
        Called after download completes, before the response is handed to parse
        :param response:
        :param spider:
        :return:
        """
        pass

    def process_spider_output(self, response, result, spider):
        """
        Called with what the spider returns
        :param response:
        :param result:
        :param spider:
        :return: must return an iterable of Request and/or Item objects
        """
        return result

    def process_spider_exception(self, response, exception, spider):
        """
        Called on exception
        :param response:
        :param exception:
        :param spider:
        :return: None to hand the exception on to later middleware; or an iterable
                 of Response/Item objects, handed to the scheduler or pipelines
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """
        Called when the spider starts
        :param start_requests:
        :param spider:
        :return: an iterable of Request objects
        """
        return start_requests
Downloader middleware:
class DownMiddleware1(object):
    def process_request(self, request, spider):
        """
        Called (through every downloader middleware's process_request) when a request needs downloading
        :param request:
        :param spider:
        :return:
            None: continue to later middleware and download
            Response object: stop the process_request chain, start process_response
            Request object: stop the middleware chain and reschedule the Request
            raise IgnoreRequest: stop the process_request chain, start process_exception
        """
        pass

    def process_response(self, request, response, spider):
        """
        Called after the download finishes, on the way back to the spider
        :param request:
        :param response:
        :param spider:
        :return:
            Response object: handed to the next middleware's process_response
            Request object: stop the middleware chain; the request is rescheduled for download
            raise IgnoreRequest: invoke Request.errback
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or process_request (a downloader middleware) raises an exception
        :param request:
        :param exception:
        :param spider:
        :return:
            None: hand the exception on to later middleware
            Response object: stop the process_exception chain
            Request object: stop the middleware chain; the request is rescheduled for download
        """
        return None
8. Custom commands
- Create a directory at the same level as spiders, e.g. commands
- Inside it, create a crawlall.py file (the file name becomes the command name)
crawlall.py:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
- In settings.py, add the setting COMMANDS_MODULE = '<project name>.<directory name>' (see the example below)
- From the project directory, run: scrapy crawlall
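For instance, assuming the project is named step8_king (as in the settings section below) and the directory is called commands:

# settings.py
COMMANDS_MODULE = 'step8_king.commands'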
9. Custom extensions
Custom extensions use signals to register specified operations at designated points.
from scrapy import signals


class MyExtension(object):
    def __init__(self, value):
        self.value = value

    @classmethod
    def from_crawler(cls, crawler):
        val = crawler.settings.getint('MMMM')
        ext = cls(val)

        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)

        return ext

    def spider_opened(self, spider):
        print('open')

    def spider_closed(self, spider):
        print('close')
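The extension takes effect only once it is registered in settings.py, for example (matching entry 12 of the settings section below):

EXTENSIONS = {
    'step8_king.extensions.MyExtension': 500,
}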
10. Avoiding duplicate visits
By default Scrapy deduplicates requests with scrapy.dupefilter.RFPDupeFilter. The related settings:

DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "directory for the visited-request log, e.g.: /root/"  # final path: /root/requests.seen
Custom URL dedup filter:
class RepeatUrl:
    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Called at initialization time
        :param settings:
        :return:
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has been visited before
        :param request:
        :return: True if already visited; False otherwise
        """
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """
        Called when crawling starts
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when crawling ends
        :param reason:
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log a duplicate request
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)
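To make Scrapy use this filter instead of the default, point DUPEFILTER_CLASS at it in settings.py (assuming RepeatUrl lives in a module named duplication, as in entry 16 of the settings section below):

DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'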
11. settings explained
# -*- coding: utf-8 -*-

# Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# 1. bot name
BOT_NAME = 'step8_king'

# 2. spider module paths
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# 3. client User-Agent request header
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)'

# Obey robots.txt rules
# 4. whether to obey the site's robots.txt
# ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# 5. number of concurrent requests
# CONCURRENT_REQUESTS = 4

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 6. download delay, in seconds
# DOWNLOAD_DELAY = 2


# The download delay setting will honor only one of:
# 7. concurrent requests per domain; the download delay is also applied per domain
# CONCURRENT_REQUESTS_PER_DOMAIN = 2
# concurrent requests per IP; if set, CONCURRENT_REQUESTS_PER_DOMAIN is ignored
# and the download delay is applied per IP instead
# CONCURRENT_REQUESTS_PER_IP = 3

# Disable cookies (enabled by default)
# 8. whether cookies are enabled (handled internally via a cookiejar)
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# 9. the Telnet console lets you inspect and control the running crawler:
#    connect with `telnet <ip> <port>` and issue commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]


# 10. default request headers
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }


# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# 11. pipelines that process the extracted items
# ITEM_PIPELINES = {
#    'step8_king.pipelines.JsonPipeline': 700,
#    'step8_king.pipelines.FilePipeline': 500,
# }



# 12. custom extensions, invoked via signals
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     # 'step8_king.extensions.MyExtension': 500,
# }


# 13. maximum crawl depth; the current depth can be read from meta; 0 means unlimited
# DEPTH_LIMIT = 3

# 14. crawl order: 0 means depth-first, LIFO (the default); 1 means breadth-first, FIFO

# last in, first out: depth-first
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
# first in, first out: breadth-first

# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'

# 15. scheduler queue
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler


# 16. URL deduplication
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'


# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

"""
17. the AutoThrottle algorithm
from scrapy.contrib.throttle import AutoThrottle
How the automatic throttling works:
    1. take the minimum delay: DOWNLOAD_DELAY
    2. take the maximum delay: AUTOTHROTTLE_MAX_DELAY
    3. set the initial download delay: AUTOTHROTTLE_START_DELAY
    4. when a request finishes downloading, take its "connection" latency,
       i.e. the time from initiating the request to receiving the response headers
    5. combine it with AUTOTHROTTLE_TARGET_CONCURRENCY:
        target_delay = latency / self.target_concurrency
        new_delay = (slot.delay + target_delay) / 2.0  # slot.delay is the previous delay
        new_delay = max(target_delay, new_delay)
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
        slot.delay = new_delay
"""

# enable automatic throttling
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 10
# The average number of requests Scrapy should be sending in parallel to each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings


"""
18. HTTP caching
Caches the requests/responses that have already been sent, so they can be reused later.

from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# whether to enable caching
# HTTPCACHE_ENABLED = True

# cache policy: cache every request; subsequent identical requests hit the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# cache policy: cache according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"

# cache expiration time
# HTTPCACHE_EXPIRATION_SECS = 0

# cache directory
# HTTPCACHE_DIR = 'httpcache'

# HTTP status codes excluded from caching
# HTTPCACHE_IGNORE_HTTP_CODES = []

# cache storage backend
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


"""
19. proxies; by default configured through environment variables
from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware

Option 1: use the default middleware
    os.environ
    {
        http_proxy:http://root:[email protected]:9999/
        https_proxy:http://192.168.11.11:9999/
    }
Option 2: use a custom downloader middleware

    import base64
    import random
    import six

    def to_bytes(text, encoding=None, errors='strict'):
        if isinstance(text, bytes):
            return text
        if not isinstance(text, six.string_types):
            raise TypeError('to_bytes must receive a unicode, str or bytes '
                            'object, got %s' % type(text).__name__)
        if encoding is None:
            encoding = 'utf-8'
        return text.encode(encoding, errors)

    class ProxyMiddleware(object):
        def process_request(self, request, spider):
            PROXIES = [
                {'ip_port': '111.11.228.75:80', 'user_pass': ''},
                {'ip_port': '120.198.243.22:80', 'user_pass': ''},
                {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
                {'ip_port': '101.71.27.120:80', 'user_pass': ''},
                {'ip_port': '122.96.59.104:80', 'user_pass': ''},
                {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
            ]
            proxy = random.choice(PROXIES)
            if proxy['user_pass'] is not None:
                request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
                encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass'])).decode('ascii')
                request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
                print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
            else:
                print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
                request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])

    DOWNLOADER_MIDDLEWARES = {
       'step8_king.middlewares.ProxyMiddleware': 500,
    }

"""

"""
20. HTTPS access
There are two cases when crawling HTTPS sites:
    1. the site uses a trusted certificate (supported by default)
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"

    2. the site uses a custom certificate
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"

        # https.py
        from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
        from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

        class MySSLFactory(ScrapyClientContextFactory):
            def getCertificateOptions(self):
                from OpenSSL import crypto
                v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
                return CertificateOptions(
                    privateKey=v1,  # pKey object
                    certificate=v2,  # X509 object
                    verify=False,
                    method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                )
Other:
    related classes
        scrapy.core.downloader.handlers.http.HttpDownloadHandler
        scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
        scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
    related settings
        DOWNLOADER_HTTPCLIENTFACTORY
        DOWNLOADER_CLIENTCONTEXTFACTORY

"""



"""
21. spider middleware
    class SpiderMiddleware(object):

        def process_spider_input(self, response, spider):
            '''
            Called after download completes, before the response is handed to parse
            :param response:
            :param spider:
            :return:
            '''
            pass

        def process_spider_output(self, response, result, spider):
            '''
            Called with what the spider returns
            :param response:
            :param result:
            :param spider:
            :return: must return an iterable of Request and/or Item objects
            '''
            return result

        def process_spider_exception(self, response, exception, spider):
            '''
            Called on exception
            :param response:
            :param exception:
            :param spider:
            :return: None to hand the exception on to later middleware; or an iterable
                     of Response/Item objects, handed to the scheduler or pipelines
            '''
            return None

        def process_start_requests(self, start_requests, spider):
            '''
            Called when the spider starts
            :param start_requests:
            :param spider:
            :return: an iterable of Request objects
            '''
            return start_requests

    Built-in spider middleware:
        'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
        'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
        'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
        'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,

"""
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   # 'step8_king.middlewares.SpiderMiddleware': 543,
}


"""
22. downloader middleware
    class DownMiddleware1(object):
        def process_request(self, request, spider):
            '''
            Called (through every downloader middleware's process_request) when a request needs downloading
            :param request:
            :param spider:
            :return:
                None: continue to later middleware and download
                Response object: stop the process_request chain, start process_response
                Request object: stop the middleware chain and reschedule the Request
                raise IgnoreRequest: stop the process_request chain, start process_exception
            '''
            pass

        def process_response(self, request, response, spider):
            '''
            Called after the download finishes, on the way back to the spider
            :param request:
            :param response:
            :param spider:
            :return:
                Response object: handed to the next middleware's process_response
                Request object: stop the middleware chain; the request is rescheduled for download
                raise IgnoreRequest: invoke Request.errback
            '''
            print('response1')
            return response

        def process_exception(self, request, exception, spider):
            '''
            Called when the download handler or process_request (a downloader middleware) raises an exception
            :param request:
            :param exception:
            :param spider:
            :return:
                None: hand the exception on to later middleware
                Response object: stop the process_exception chain
                Request object: stop the middleware chain; the request is rescheduled for download
            '''
            return None


    Default downloader middleware:
    {
        'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
        'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
        'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
        'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
        'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
        'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
        'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
        'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
        'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
        'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
        'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
        'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
        'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    }

"""
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'step8_king.middlewares.DownMiddleware1': 100,
#    'step8_king.middlewares.DownMiddleware2': 500,
# }
12. TinyScrapy
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import types
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor



class Request(object):
    def __init__(self, url, callback):
        self.url = url
        self.callback = callback
        self.priority = 0


class HttpResponse(object):
    def __init__(self, content, request):
        self.content = content
        self.request = request


class ChouTiSpider(object):

    def start_requests(self):
        url_list = ['http://www.cnblogs.com/', 'http://www.bing.com']
        for url in url_list:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response.request.url)
        # yield Request(url="http://www.baidu.com", callback=self.parse)




from queue import Queue
Q = Queue()


class CallLaterOnce(object):
    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None
        return self._func(*self._a, **self._kw)


class Engine(object):
    def __init__(self):
        self.nextcall = None
        self.crawlling = []
        self.max = 5
        self._closewait = None

    def get_response(self, content, request):
        response = HttpResponse(content, request)
        gen = request.callback(response)
        if isinstance(gen, types.GeneratorType):
            for req in gen:
                req.priority = request.priority + 1
                Q.put(req)


    def rm_crawlling(self, response, d):
        self.crawlling.remove(d)

    def _next_request(self, spider):
        if Q.qsize() == 0 and len(self.crawlling) == 0:
            self._closewait.callback(None)

        if len(self.crawlling) >= 5:
            return
        while len(self.crawlling) < 5:
            try:
                req = Q.get(block=False)
            except Exception as e:
                req = None
            if not req:
                return
            d = getPage(req.url.encode('utf-8'))
            self.crawlling.append(d)
            d.addCallback(self.get_response, req)
            d.addCallback(self.rm_crawlling, d)
            d.addCallback(lambda _: self.nextcall.schedule())


    @defer.inlineCallbacks
    def crawl(self):
        spider = ChouTiSpider()
        start_requests = iter(spider.start_requests())
        flag = True
        while flag:
            try:
                req = next(start_requests)
                Q.put(req)
            except StopIteration as e:
                flag = False

        self.nextcall = CallLaterOnce(self._next_request, spider)
        self.nextcall.schedule()

        self._closewait = defer.Deferred()
        yield self._closewait

    @defer.inlineCallbacks
    def pp(self):
        yield self.crawl()

_active = set()
obj = Engine()
d = obj.crawl()
_active.add(d)

li = defer.DeferredList(_active)
li.addBoth(lambda _, *a, **kw: reactor.stop())

reactor.run()