1. Create the Scrapy project
scrapy startproject youboy
2. Project layout
│ main.py          # entry point: cmdline.execute("scrapy crawl youboySpider".split()) -- see the sketch after this tree
│ scrapy.cfg
└─spider_youboy
  │ items.py       # defines the fields to store; the spider returns its data as dicts that map onto these fields
  │ middlewares.py
  │ pipelines.py   # pipeline: takes the fields from the item and writes them to storage (MySQL, MongoDB, JSON, CSV, ...); a MySQL sketch follows section 4-3
  │ settings.py    # configuration: database info, pipeline registration and other parameters
  │ __init__.py
  │
  ├─spiders
  │ │ ddl.py
  │ │ mysqldb.py
  │ │ youboySpider.py  # core of the crawler: parses the pages and hands the data to the item and the pipeline
  │ │ __init__.py
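The tree above only names main.py; based on the comment next to it, a minimal sketch of such an entry script (run from the project root, next to scrapy.cfg) could look like this:

# main.py -- run the spider from an IDE instead of the command line
from scrapy import cmdline

cmdline.execute("scrapy crawl youboySpider".split())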
3. Install the MongoDB and Redis modules
pip install pymongo
pip install scrapy-redis
4. Crawler source code
MySQL table structure. If you use MongoDB instead, no table structure is needed; configuring items.py and settings.py is enough.
#coding=utf-8
#Version:python3.5.2
#Tools:Pycharm
#Date:
__author__ = "Colby"
'''
drop table youboy_diqu;
drop table youboy_enterprise;
CREATE TABLE youboy_diqu
(
    provinceName VARCHAR(50) NOT NULL COMMENT '省份',
    cityName VARCHAR(50) NOT NULL COMMENT '市区',
    url VARCHAR(255) COMMENT 'url地址',
    flag VARCHAR(1),
    PRIMARY KEY (provinceName, cityName)
)
ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE TABLE youboy_enterprise
(
    provinceName VARCHAR(50) comment '省份、直辖市、自治区',
    cityName VARCHAR(50) comment '市、自治州',
    catagory_1_Name VARCHAR(50) comment '一级类目名称',
    catagory_1_Url VARCHAR(50) comment '一级类目url',
    catagory_2_Name VARCHAR(50) comment '二级类目名称',
    catagory_2_Url VARCHAR(50) comment '二级类目url',
    catagory_3_Name VARCHAR(50) comment '三级类目名称',
    catagory_3_Url VARCHAR(50) comment '三级类目url',
    enterpriseName VARCHAR(125) comment '企业名称',
    contactPerson VARCHAR(50) comment '企业联系人',
    enterpriseFax VARCHAR(50) comment '企业传真',
    enterprisePhone VARCHAR(50) comment '企业电话',
    enterpriseMobile VARCHAR(50) comment '企业手机',
    enterpriseAddr VARCHAR(255) comment '企业联系地址'
)
ENGINE=InnoDB DEFAULT CHARSET=utf8;
'''
4-1. spider.py
#coding=utf-8
'''
Tools:PyCharm 2017.1
Version:Python3.5
Author:colby_chen
Date:2017-09-26
'''
import copy
from scrapy import Request
from scrapy.selector import Selector
#from scrapy.spiders import CrawlSpider
from scrapy_redis.spiders import RedisSpider
from .mysqldb import connClose, connDB, exeBath, exeQuery, exeUpdate
import urllib.request
from lxml import etree
from ..items import SpiderYouboyItem

def gethtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')
    return html

def getPage(url):
    '''
    Given a listing url, collect the urls of all of its pages and return them as a list.
    :param url:
    :return:
    '''
    urlList = []
    startUrl = url
    html = gethtml(startUrl)
    selector = etree.HTML(html)
    nextPageFlag = selector.xpath('//dl[@class="sheng_weizhi_next01"]/a[last()]/text()')
    print('nextPageFlag', nextPageFlag)
    maxPage = None
    if len(nextPageFlag) > 0:
        # jump far past the end; the pager of that page shows the real last page number in <strong>
        endurl = url + '10000'
        endhtml = gethtml(endurl)
        endselector = etree.HTML(endhtml)
        maxPage = endselector.xpath('//dl[@class="sheng_weizhi_next01"]/strong/text()')[0]
        print('maxPage', maxPage)
        for i in range(1, int(maxPage) + 1):
            currentUrl = url + str(i)
            print('currentUrl', currentUrl)
            urlList.append(currentUrl)
    else:
        urlList.append(startUrl)
    print('urlList...............................................', urlList)
    return urlList

def enterpriseContentDetail(enterpriseUrl, *args, **kwargs):
    '''Fetch one enterprise detail page and append its fields to the base info passed in *args.'''
    page = urllib.request.urlopen(enterpriseUrl)
    html = page.read().decode('utf-8')
    selector = etree.HTML(html)
    # enterpriseContent = selector.xpath('//div[@class="txl_content_con"]/ul[1]/')
    # print('enterpriseContent', enterpriseContent)
    enterpriseName = selector.xpath('//div[@class="txl_content_con"]/ul[1]/h1/text()')[0].replace('\t', '').replace('\r\n', '')
    contactPerson = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[2]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterpriseFax = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[3]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterprisePhone = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[4]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterpriseMobile = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[5]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterpriseAddr = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[6]/text()')[0].replace('\t', '').replace('\r\n', '')
    base = list(*args)  # copy of the base info list (province, city, three category levels)
    enterpriseDetail = [enterpriseName, contactPerson, enterpriseFax, enterprisePhone, enterpriseMobile, enterpriseAddr, enterpriseUrl]
    base.extend(enterpriseDetail)
    return base

class youboySpider(RedisSpider):
    name = "youboySpider"
    redis_key = "youboySpider:start_urls"
    start_urls = ['http://book.youboy.com/diqu.html']

    def enterpriseContent(self, response):
        '''Parse one page of an enterprise list.'''
        select_enterpriseList = Selector(response)
        items_enterpriseList = response.meta['baseInfo2']
        enterpriseList = select_enterpriseList.xpath('//*[@id="content"]/ul/div/strong/a')
        provinceName = items_enterpriseList['provinceName']
        cityName = items_enterpriseList['cityName']
        catagory_1_Name = items_enterpriseList['catagory_1_Name']
        catagory_1_Url = items_enterpriseList['catagory_1_Url']
        catagory_2_Name = items_enterpriseList['catagory_2_Name']
        catagory_2_Url = items_enterpriseList['catagory_2_Url']
        catagory_3_Name = items_enterpriseList['catagory_3_Name']
        catagory_3_Url = items_enterpriseList['catagory_3_Url']
        baseInfo = [provinceName, cityName, catagory_1_Name, catagory_1_Url, catagory_2_Name, catagory_2_Url,
                    catagory_3_Name, catagory_3_Url]
        if len(enterpriseList) == 0:
            items_enterpriseList['enterpriseName'] = ''
            items_enterpriseList['contactPerson'] = ''
            items_enterpriseList['enterpriseFax'] = ''
            items_enterpriseList['enterprisePhone'] = ''
            items_enterpriseList['enterpriseMobile'] = ''
            items_enterpriseList['enterpriseAddr'] = ''
            items_enterpriseList['enterpriseUrl'] = ''
        for enterpriseInfo in enterpriseList:
            enterpriseUrl = enterpriseInfo.xpath('@href').extract()[0]
            # baseInfo has 8 elements, so the detail fields start at index 8
            enterpriseContent = enterpriseContentDetail(enterpriseUrl, baseInfo)
            items_enterpriseList['enterpriseName'] = enterpriseContent[8]
            items_enterpriseList['contactPerson'] = enterpriseContent[9]
            items_enterpriseList['enterpriseFax'] = enterpriseContent[10]
            items_enterpriseList['enterprisePhone'] = enterpriseContent[11]
            items_enterpriseList['enterpriseMobile'] = enterpriseContent[12]
            items_enterpriseList['enterpriseAddr'] = enterpriseContent[13]
            items_enterpriseList['enterpriseUrl'] = enterpriseContent[14]
            yield items_enterpriseList

        # sql = "replace into youboy_enterprise(provinceName,cityName,catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,catagory_3_Name,catagory_3_Url" \
        #       ",enterpriseName,contactPerson,enterpriseFax,enterprisePhone,enterpriseMobile,enterpriseAddr,enterpriseUrl) " \
        #       "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # connMysql = connDB()
        # result = exeBath(connMysql[0], connMysql[1], sql, enterpriseContentList)
        # connClose(connMysql[0], connMysql[1])

    def parse_enterpriseFirstPage(self, response):
        '''Expand an enterprise list into all of its pages.'''
        baseInfo2 = response.meta['items_catagory_3']
        firstPage = baseInfo2['catagory_3_Url']
        pageList = getPage(firstPage)
        for pageurl in pageList:
            # dont_filter=True: without it the dupefilter silently drops these deeper requests
            yield Request(pageurl, meta={'baseInfo2': copy.deepcopy(baseInfo2)}, callback=self.enterpriseContent, dont_filter=True)

    def parse_catagory_3(self, response):
        '''Parse the third-level (leaf) category pages.'''
        selector_catagory_3 = Selector(response)
        items_catagory_3 = response.meta['items_catagory_2']
        print('二级类目', items_catagory_3['catagory_2_Name'])
        catagory_3_List = selector_catagory_3.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        for catagory_3 in catagory_3_List:
            catagory_3_Name = catagory_3.xpath('text()').extract()[0]
            catagory_3_Url = catagory_3.xpath('@href').extract()[0]
            items_catagory_3['catagory_3_Name'] = catagory_3_Name
            items_catagory_3['catagory_3_Url'] = items_catagory_3['url'] + catagory_3_Url
            yield Request(items_catagory_3['catagory_3_Url'], meta={'items_catagory_3': copy.deepcopy(items_catagory_3)}
                          , callback=self.parse_enterpriseFirstPage)

    def parse_catagory_2(self, response):
        '''Parse the second-level category pages.'''
        selector_catagory_2 = Selector(response)
        items_catagory_2 = response.meta['items_catagory_1']
        print('一级类目', items_catagory_2['catagory_1_Name'])
        catagory_2_List = selector_catagory_2.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        for catagory_2 in catagory_2_List:
            catagory_2_Name = catagory_2.xpath('text()').extract()[0]
            catagory_2_Url = catagory_2.xpath('@href').extract()[0]
            items_catagory_2['catagory_2_Name'] = catagory_2_Name
            items_catagory_2['catagory_2_Url'] = items_catagory_2['url'] + catagory_2_Url
            print(items_catagory_2['provinceName']
                  , items_catagory_2['cityName']
                  , items_catagory_2['catagory_1_Name']
                  , items_catagory_2['catagory_1_Url']
                  , items_catagory_2['catagory_2_Name']
                  , items_catagory_2['catagory_2_Url'])
            yield Request(items_catagory_2['catagory_2_Url'], meta={'items_catagory_2': copy.deepcopy(items_catagory_2)}, callback=self.parse_catagory_3)

    def parse_catagory_1(self, response):
        '''Parse the first-level category pages.'''
        selector_catagory_1 = Selector(response)
        items_catagory_1 = response.meta['items']
        print('当前地区', items_catagory_1['provinceName'], items_catagory_1['cityName'])
        catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        if len(catagory_1_List) == 0:
            catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul/li/a')
        for catagory_1 in catagory_1_List:
            items_catagory_1['catagory_1_Name'] = catagory_1.xpath('text()').extract()[0]
            items_catagory_1['catagory_1_Url'] = items_catagory_1['url'] + catagory_1.xpath('@href').extract()[0]
            yield Request(items_catagory_1['catagory_1_Url'], meta={'items_catagory_1': copy.deepcopy(items_catagory_1)}, callback=self.parse_catagory_2)

    def parse(self, response):
        selector = Selector(response)
        url = 'http://book.youboy.com'
        # collect every <a> under class="ybs-bcTitle" (municipalities) and class="ybs-bcBody" (province/city lists)
        diquUrl = []
        diqu1 = selector.xpath('//div[@class="ybs-bcTitle"]/a')
        for bg in diqu1:
            cityUrl = bg.xpath('@href').extract()[0]
            cityUrl = url + cityUrl
            cityName = bg.xpath('text()').extract()[0]
            diquUrl.append((cityName, cityName, cityUrl, 'Y'))
        diqu2 = selector.xpath('//div[@class="ybs-bcBody"]/ul/li')
        for bg in diqu2:
            provinceName = bg.xpath('h3/a/text()').extract()[0]
            cityList = bg.xpath('span/a')
            for city in cityList:
                cityName = city.xpath('text()').extract()[0]
                cityUrl = city.xpath('@href').extract()[0]
                cityUrl = url + cityUrl
                diquUrl.append((provinceName, cityName, cityUrl, 'Y'))
        # bulk-load the region list into MySQL
        sql = "replace into youboy_diqu(provinceName,cityName,url,flag) " \
              "values(%s,%s,%s,%s)"
        connMysql = connDB()
        result = exeBath(connMysql[0], connMysql[1], sql, diquUrl)
        #print('加载记录数:', result)
        connClose(connMysql[0], connMysql[1])
        #############################################################################################################
        # read the urls back from youboy_diqu and crawl them province/city by province/city
        selectsql = "select provinceName,cityName,url from youboy_diqu where provinceName='上海' and cityName='上海' and flag='Y'"
        connMysql = connDB()
        results = exeQuery(connMysql[1], selectsql)
        # updatesql = "update youboy_diqu set flag='N' where provinceName='%s' and cityName='%s'" % (result[0], result[1])
        # updateresult = exeUpdate(connMysql[0], connMysql[1], updatesql)
        connClose(connMysql[0], connMysql[1])
        for result in results:
            print('当前地区%s-%s' % (result[0], result[1]))
            items = {}
            items['provinceName'] = result[0]
            items['cityName'] = result[1]
            items['cityUrl'] = result[2]
            items['url'] = url
            yield Request(items['cityUrl'], meta={'items': copy.deepcopy(items)}, callback=self.parse_catagory_1)
4-2. items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class SpiderYouboyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    provinceName = scrapy.Field()
    cityName = scrapy.Field()
    catagory_1_Name = scrapy.Field()
    catagory_1_Url = scrapy.Field()
    catagory_2_Name = scrapy.Field()
    catagory_2_Url = scrapy.Field()
    catagory_3_Name = scrapy.Field()
    catagory_3_Url = scrapy.Field()
    enterpriseName = scrapy.Field()
    contactPerson = scrapy.Field()
    enterpriseFax = scrapy.Field()
    enterprisePhone = scrapy.Field()
    enterpriseMobile = scrapy.Field()
    enterpriseAddr = scrapy.Field()
    enterpriseUrl = scrapy.Field()
4-3. pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings

class SpiderYouboyPipeline(object):
    def __init__(self):
        # connect to MongoDB
        self.client = pymongo.MongoClient(
            host=settings['MONGODB_HOST']
            , port=settings['MONGODB_PORT'])
        # if the database requires authentication:
        # self.client.admin.authenticate(settings['MONGO_USER']
        #                                , settings['MONGO_PSW'])
        # handle to the database
        self.db = self.client[settings['MONGODB_DB']]
        # handle to the collection
        self.postItem = self.db[settings['MONGODB_COLL']]

    def process_item(self, item, spider):
        # convert the item to a plain dict and insert one document
        postItem = dict(item)
        print('postItem', postItem)
        self.postItem.insert_one(postItem)
        # returning the item keeps it visible in the console output; optional
        return item
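The directory overview notes that a pipeline can just as well write to MySQL, JSON or CSV instead of MongoDB. The following is only a sketch, not part of the original project: a hypothetical SpiderYouboyMysqlPipeline that reuses the connDB/connClose helpers from mysqldb.py (section 4-5) and the youboy_enterprise columns from the DDL above. It assumes the connection succeeds and it would have to be registered in ITEM_PIPELINES to take effect.

# -*- coding: utf-8 -*-
# Hypothetical MySQL pipeline sketch (names made up for illustration).
from .spiders.mysqldb import connDB, connClose

class SpiderYouboyMysqlPipeline(object):
    def open_spider(self, spider):
        # one connection per spider run; connDB() is assumed to succeed here
        self.conn, self.cur = connDB()

    def process_item(self, item, spider):
        data = dict(item)
        sql = ("replace into youboy_enterprise(provinceName,cityName,"
               "catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,"
               "catagory_3_Name,catagory_3_Url,enterpriseName,contactPerson,"
               "enterpriseFax,enterprisePhone,enterpriseMobile,enterpriseAddr) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        self.cur.execute(sql, (
            data.get('provinceName', ''), data.get('cityName', ''),
            data.get('catagory_1_Name', ''), data.get('catagory_1_Url', ''),
            data.get('catagory_2_Name', ''), data.get('catagory_2_Url', ''),
            data.get('catagory_3_Name', ''), data.get('catagory_3_Url', ''),
            data.get('enterpriseName', ''), data.get('contactPerson', ''),
            data.get('enterpriseFax', ''), data.get('enterprisePhone', ''),
            data.get('enterpriseMobile', ''), data.get('enterpriseAddr', '')))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        connClose(self.conn, self.cur)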
4-4. settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for spider_youboy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spider_youboy'

SPIDER_MODULES = ['spider_youboy.spiders']
NEWSPIDER_MODULE = 'spider_youboy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spider_youboy (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
# MongoDB
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'

# scrapy-redis scheduler and dupefilter
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'spider_youboy.middlewares.SpiderYouboySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'spider_youboy.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4-5. mysqldb.py
#!/usr/bin/python
#coding=utf-8
'''
Author:chenlun
Date:2017-04-10
'''
import pymysql

def connDB():
    '''Open a connection to the youboy database and return (conn, cursor).'''
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd='root', db='youboy', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)
    except Exception as e:
        return "connect Error!"

def exeUpdate(conn, cur, sql):
    '''Execute a single UPDATE or INSERT statement and commit.'''
    sta = cur.execute(sql)
    conn.commit()
    return sta

def exeBath(conn, cur, sql, data):
    '''Batch insert: executemany over a sequence of parameter tuples.'''
    sta = cur.executemany(sql, data)
    conn.commit()
    return sta

def exeQuery(cur, sql):
    '''Run a SELECT statement and return all rows.'''
    cur.execute(sql)
    result = cur.fetchall()
    return result

def connClose(conn, cur):
    '''Close the cursor and the connection.'''
    cur.close()
    conn.close()
5. Spider notes
5-1. If passing parameters through meta across several request levels seems to stop working, add dont_filter=True to the Request; otherwise the duplicate filter drops the deeper, repeated requests.
5-2. yield does not return immediately; the request is executed asynchronously, and when the response arrives the designated callback is invoked with the url and the meta dict.
5-3. Pay attention to XPath usage; the common expressions used above should be second nature.
5-4. The final persistence step is yielding the items dict, which hands it to items.py and the pipeline; each items['key'] corresponds to a key = scrapy.Field() declaration.
5-5. Mind the difference between deep and shallow copies; in general pass copy.deepcopy(info) through meta (see the sketch after this list).
5-6. For non-distributed use, inherit from CrawlSpider:
class youboySpider(CrawlSpider):
For distributed crawling with Redis, inherit from RedisSpider instead:
class youboySpider(RedisSpider):
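To see why note 5-5 matters: the spider mutates one dict per loop iteration and passes it through meta, so without deepcopy every pending Request would end up pointing at the same, last-written values. A minimal standalone illustration (plain Python, no Scrapy required):

import copy

base = {'provinceName': '上海', 'catagory_1_Name': ''}
shallow, deep = [], []
for name in ['机械', '电子']:
    base['catagory_1_Name'] = name
    shallow.append(base)               # every entry is the same dict object
    deep.append(copy.deepcopy(base))   # independent snapshot per iteration

print([d['catagory_1_Name'] for d in shallow])  # ['电子', '电子'] -- earlier value overwritten
print([d['catagory_1_Name'] for d in deep])     # ['机械', '电子'] -- each keeps its own value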
6. Database configuration
MongoDB settings:
ITEM_PIPELINES = {
'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'
Redis settings:
SCHEDULER="scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST=True
SCHEDULER_QUEUE_CLASS="scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST='127.0.0.1'
REDIS_PORT=6379
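As an alternative to REDIS_HOST/REDIS_PORT, scrapy-redis also reads a REDIS_URL connection string and uses it when it is set, for example:
REDIS_URL = 'redis://127.0.0.1:6379/0'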
7. Launching the crawler via main.py
If you run in distributed mode, you also need to seed the start url from the redis-cli client; until the key exists Redis is empty and the spider just waits:
lpush youboySpider:start_urls http://book.youboy.com/diqu.html
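The same seeding can be done from Python with the redis client library (assuming pip install redis); the key must match redis_key in the spider:

import redis

r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('youboySpider:start_urls', 'http://book.youboy.com/diqu.html')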