Python: Building a Distributed Crawler with the Scrapy Framework and Redis

1. Create the Scrapy project
scrapy startproject youboy
2. Scrapy project layout
│  main.py    # crawler entry point: cmdline.execute("scrapy crawl youboySpider".split())
│  scrapy.cfg
└─spider_youboy
    │  items.py       # defines the fields to store; items are returned by the spider and receive its dict data
    │  middlewares.py
    │  pipelines.py   # pipeline: takes the fields from items and writes them to storage (MySQL, MongoDB, JSON, CSV, ...)
    │  settings.py    # configuration: database info, parameters, pipeline registration, and so on
    │  __init__.py
    │
    ├─spiders
    │  │  ddl.py
    │  │  mysqldb.py
    │  │  youboySpider.py  # the core of the crawler: parses the pages and hands the data to items and the pipeline
    │  │  __init__.py
3. Install the MongoDB and Redis modules
pip install pymongo
pip install scrapy-redis
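The helper module mysqldb.py (section 4-5 below) also relies on the pymysql driver, so install it as well:
pip install pymysql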
4. Crawler source code
MySQL table structure: only needed if you store into MySQL. If you use MongoDB there is no table structure to create; configuring items.py and settings.py is enough.
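The original post does not show the contents of ddl.py, but the column lists can be read off the replace into statements in spider.py. The sketch below is only an assumption of what the two tables might look like; the column types and key choices are mine, not the author's:

# ddl.py -- hypothetical sketch: creates the two tables used by the spider.
# Column names come from the replace into statements in spider.py; types and
# primary keys are assumptions (replace into needs a unique key to be useful).
from mysqldb import connDB, connClose, exeUpdate

CREATE_DIQU = """
create table if not exists youboy_diqu(
    provinceName varchar(64),
    cityName     varchar(64),
    url          varchar(255),
    flag         char(1),
    primary key (provinceName, cityName)
)"""

CREATE_ENTERPRISE = """
create table if not exists youboy_enterprise(
    provinceName     varchar(64),
    cityName         varchar(64),
    catagory_1_Name  varchar(128), catagory_1_Url varchar(255),
    catagory_2_Name  varchar(128), catagory_2_Url varchar(255),
    catagory_3_Name  varchar(128), catagory_3_Url varchar(255),
    enterpriseName   varchar(255),
    contactPerson    varchar(64),
    enterpriseFax    varchar(64),
    enterprisePhone  varchar(64),
    enterpriseMobile varchar(64),
    enterpriseAddr   varchar(255),
    enterpriseUrl    varchar(255),
    primary key (enterpriseUrl)
)"""

if __name__ == '__main__':
    # assumes the script is run from the spiders directory next to mysqldb.py
    conn, cur = connDB()
    exeUpdate(conn, cur, CREATE_DIQU)
    exeUpdate(conn, cur, CREATE_ENTERPRISE)
    connClose(conn, cur)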
4-1. spider.py
#coding=utf-8
'''
Tools:PyCharm 2017.1
Version:Python3.5
Author:colby_chen
Date:2017-09-26
'''
import copy
from scrapy import Request
from scrapy.selector import Selector
#from scrapy.spiders import CrawlSpider
from scrapy_redis.spiders import RedisSpider
from .mysqldb import connClose, connDB, exeBath, exeQuery, exeUpdate
import urllib.request
from lxml import etree
from ..items import SpiderYouboyItem

def gethtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')
    return html

def getPage(url):
    '''
    Given a listing url, collect all of its pagination urls and return them as a list.
    :param url:
    :return:
    '''
    urlList = []
    startUrl = url
    html = gethtml(startUrl)
    selector = etree.HTML(html)
    nextPageFlag = selector.xpath('//dl[@class="sheng_weizhi_next01"]/a[last()]/text()')
    print('nextPageFlag', nextPageFlag)
    maxPage = None
    if nextPageFlag.__len__() > 0:
        # Request a page number far past the end; the site clamps it to the last
        # page, whose <strong> element carries the maximum page number.
        endurl = url + '10000'
        endhtml = gethtml(endurl)
        endselector = etree.HTML(endhtml)
        maxPage = endselector.xpath('//dl[@class="sheng_weizhi_next01"]/strong/text()')[0]
        print('maxPage', maxPage)
        for i in range(1, int(maxPage) + 1):
            currentUrl = url + str(i)
            print('currentUrl', currentUrl)
            urlList.append(currentUrl)
    else:
        urlList.append(startUrl)
    print('urlList', urlList)
    return urlList

def enterpriseContentDetail(enterpriseUrl, *args, **kwargs):
    '''Scrape one enterprise detail page and append its fields to the base info list.'''
    page = urllib.request.urlopen(enterpriseUrl)
    html = page.read().decode('utf-8')
    selector = etree.HTML(html)
    enterpriseName = selector.xpath('//div[@class="txl_content_con"]/ul[1]/h1/text()')[0].replace('\t', '').replace('\r\n', '')
    contactPerson = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[2]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterpriseFax = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[3]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterprisePhone = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[4]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterpriseMobile = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[5]/text()')[0].replace('\t', '').replace('\r\n', '')
    enterpriseAddr = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[6]/text()')[0].replace('\t', '').replace('\r\n', '')
    base = list(*args)  # copy of the baseInfo list passed in by the caller
    enterpriseDetail = [enterpriseName, contactPerson, enterpriseFax, enterprisePhone, enterpriseMobile, enterpriseAddr, enterpriseUrl]
    base.extend(enterpriseDetail)
    return base

class youboySpider(RedisSpider):
    name = "youboySpider"
    redis_key = "youboySpider:start_urls"
    start_urls = ['http://book.youboy.com/diqu.html']

    def enterpriseContent(self, response):
        '''Process one page of the enterprise list.'''
        select_enterpriseList = Selector(response)
        items_enterpriseList = response.meta['baseInfo2']
        enterpriseList = select_enterpriseList.xpath('//*[@id="content"]/ul/div/strong/a')
        provinceName = items_enterpriseList['provinceName']
        cityName = items_enterpriseList['cityName']
        catagory_1_Name = items_enterpriseList['catagory_1_Name']
        catagory_1_Url = items_enterpriseList['catagory_1_Url']
        catagory_2_Name = items_enterpriseList['catagory_2_Name']
        catagory_2_Url = items_enterpriseList['catagory_2_Url']
        catagory_3_Name = items_enterpriseList['catagory_3_Name']
        catagory_3_Url = items_enterpriseList['catagory_3_Url']
        baseInfo = [provinceName, cityName, catagory_1_Name, catagory_1_Url, catagory_2_Name, catagory_2_Url,
                    catagory_3_Name, catagory_3_Url]
        enterpriseContentList = []
        if enterpriseList.__len__() == 0:
            items_enterpriseList['enterpriseName'] = ''
            items_enterpriseList['contactPerson'] = ''
            items_enterpriseList['enterpriseFax'] = ''
            items_enterpriseList['enterprisePhone'] = ''
            items_enterpriseList['enterpriseMobile'] = ''
            items_enterpriseList['enterpriseAddr'] = ''
            items_enterpriseList['enterpriseUrl'] = ''
        for enterpriseInfo in enterpriseList:
            enterpriseUrl = enterpriseInfo.xpath('@href').extract()[0]
            # baseInfo holds 8 elements, so the detail fields start at index 8
            enterpriseContent = enterpriseContentDetail(enterpriseUrl, baseInfo)
            items_enterpriseList['enterpriseName'] = enterpriseContent[8]
            items_enterpriseList['contactPerson'] = enterpriseContent[9]
            items_enterpriseList['enterpriseFax'] = enterpriseContent[10]
            items_enterpriseList['enterprisePhone'] = enterpriseContent[11]
            items_enterpriseList['enterpriseMobile'] = enterpriseContent[12]
            items_enterpriseList['enterpriseAddr'] = enterpriseContent[13]
            items_enterpriseList['enterpriseUrl'] = enterpriseContent[14]
            yield items_enterpriseList

        # sql = "replace into youboy_enterprise(provinceName,cityName,catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,catagory_3_Name,catagory_3_Url" \
        #       ",enterpriseName,contactPerson,enterpriseFax,enterprisePhone,enterpriseMobile,enterpriseAddr,enterpriseUrl) " \
        #       "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # connMysql = connDB()
        # result = exeBath(connMysql[0], connMysql[1], sql, enterpriseContentList)
        # connClose(connMysql[0], connMysql[1])

    def parse_enterpriseFirstPage(self, response):
        '''From the first listing page, generate one request per pagination page.'''
        baseInfo2 = response.meta['items_catagory_3']
        firstPage = baseInfo2['catagory_3_Url']
        pageList = getPage(firstPage)
        for pageurl in pageList:
            # dont_filter=True: without it the nested follow-up requests get dropped
            yield Request(pageurl, meta={'baseInfo2': copy.deepcopy(baseInfo2)},
                          callback=self.enterpriseContent, dont_filter=True)

    def parse_catagory_3(self, response):
        '''Process the third-level industry categories.'''
        selector_catagory_3 = Selector(response)
        items_catagory_3 = response.meta['items_catagory_2']
        print('Level-2 category', items_catagory_3['catagory_2_Name'])
        catagory_3_List = selector_catagory_3.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        for catagory_3 in catagory_3_List:
            catagory_3_Name = catagory_3.xpath('text()').extract()[0]
            catagory_3_Url = catagory_3.xpath('@href').extract()[0]
            items_catagory_3['catagory_3_Name'] = catagory_3_Name
            items_catagory_3['catagory_3_Url'] = items_catagory_3['url'] + catagory_3_Url
            yield Request(items_catagory_3['catagory_3_Url'],
                          meta={'items_catagory_3': copy.deepcopy(items_catagory_3)},
                          callback=self.parse_enterpriseFirstPage)

    def parse_catagory_2(self, response):
        '''Process the second-level industry categories.'''
        selector_catagory_2 = Selector(response)
        items_catagory_2 = response.meta['items_catagory_1']
        print('Level-1 category', items_catagory_2['catagory_1_Name'])
        catagory_2_List = selector_catagory_2.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        for catagory_2 in catagory_2_List:
            catagory_2_Name = catagory_2.xpath('text()').extract()[0]
            catagory_2_Url = catagory_2.xpath('@href').extract()[0]
            items_catagory_2['catagory_2_Name'] = catagory_2_Name
            items_catagory_2['catagory_2_Url'] = items_catagory_2['url'] + catagory_2_Url
            print(items_catagory_2['provinceName']
                  , items_catagory_2['cityName']
                  , items_catagory_2['catagory_1_Name']
                  , items_catagory_2['catagory_1_Url']
                  , items_catagory_2['catagory_2_Name']
                  , items_catagory_2['catagory_2_Url'])
            yield Request(items_catagory_2['catagory_2_Url'],
                          meta={'items_catagory_2': copy.deepcopy(items_catagory_2)},
                          callback=self.parse_catagory_3)

    def parse_catagory_1(self, response):
        '''Process the first-level industry categories.'''
        selector_catagory_1 = Selector(response)
        items_catagory_1 = response.meta['items']
        print('Current region', items_catagory_1['provinceName'], items_catagory_1['cityName'])
        catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        if catagory_1_List.__len__() == 0:
            catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul/li/a')
        for catagory_1 in catagory_1_List:
            items_catagory_1['catagory_1_Name'] = catagory_1.xpath('text()').extract()[0]
            items_catagory_1['catagory_1_Url'] = items_catagory_1['url'] + catagory_1.xpath('@href').extract()[0]
            yield Request(items_catagory_1['catagory_1_Url'],
                          meta={'items_catagory_1': copy.deepcopy(items_catagory_1)},
                          callback=self.parse_catagory_2)

    def parse(self, response):
        selector = Selector(response)
        url = 'http://book.youboy.com'
        # Collect every <a> tag under class="ybs-bcTitle"
        diquUrl = []
        diqu1 = selector.xpath('//div[@class="ybs-bcTitle"]/a')
        for bg in diqu1:
            cityUrl = bg.xpath('@href').extract()[0]
            cityUrl = url + cityUrl
            cityName = bg.xpath('text()').extract()[0]
            diquUrl.append((cityName, cityName, cityUrl, 'Y'))
        diqu2 = selector.xpath('//div[@class="ybs-bcBody"]/ul/li')
        for bg in diqu2:
            provinceName = bg.xpath('h3/a/text()').extract()[0]
            cityList = bg.xpath('span/a')
            for city in cityList:
                cityName = city.xpath('text()').extract()[0]
                cityUrl = city.xpath('@href').extract()[0]
                cityUrl = url + cityUrl
                diquUrl.append((provinceName, cityName, cityUrl, 'Y'))
        # Bulk-load the region list into MySQL
        sql = "replace into youboy_diqu(provinceName,cityName,url,flag) " \
              "values(%s,%s,%s,%s)"
        connMysql = connDB()
        result = exeBath(connMysql[0], connMysql[1], sql, diquUrl)
        #print('rows loaded:', result)
        connClose(connMysql[0], connMysql[1])
        # Read the urls back and process them region by region
        selectsql = "select provinceName,cityName,url from youboy_diqu where provinceName='上海' and cityName='上海' and flag='Y'"
        connMysql = connDB()
        results = exeQuery(connMysql[1], selectsql)
        # updatesql = "update youboy_diqu set flag='N' where provinceName='%s' and cityName='%s'" % (result[0], result[1])
        # updateresult = exeUpdate(connMysql[0], connMysql[1], updatesql)
        connClose(connMysql[0], connMysql[1])
        for result in results:
            print('Current region %s-%s' % (result[0], result[1]))
            items = {}
            items['provinceName'] = result[0]
            items['cityName'] = result[1]
            items['cityUrl'] = result[2]
            items['url'] = url
            yield Request(items['cityUrl'], meta={'items': copy.deepcopy(items)},
                          callback=self.parse_catagory_1)
4-2. items.py
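The original post leaves this section blank. Judging from the SpiderYouboyItem import in spider.py and the dict keys the spider actually fills in, a minimal sketch could look like the following; the field list is inferred, not copied from the author's file:

# -*- coding: utf-8 -*-
# items.py -- sketch inferred from the dict keys used in youboySpider.py
import scrapy

class SpiderYouboyItem(scrapy.Item):
    provinceName = scrapy.Field()
    cityName = scrapy.Field()
    cityUrl = scrapy.Field()
    url = scrapy.Field()
    catagory_1_Name = scrapy.Field()
    catagory_1_Url = scrapy.Field()
    catagory_2_Name = scrapy.Field()
    catagory_2_Url = scrapy.Field()
    catagory_3_Name = scrapy.Field()
    catagory_3_Url = scrapy.Field()
    enterpriseName = scrapy.Field()
    contactPerson = scrapy.Field()
    enterpriseFax = scrapy.Field()
    enterprisePhone = scrapy.Field()
    enterpriseMobile = scrapy.Field()
    enterpriseAddr = scrapy.Field()
    enterpriseUrl = scrapy.Field()

Note that the spider above actually yields plain dicts, which the MongoDB pipeline accepts just as well; the items['key'] / key = scrapy.Field() correspondence is what note 5-4 below refers to.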
4-3. pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings   # note: scrapy.conf is deprecated in newer Scrapy versions

class SpiderYouboyPipeline(object):
    def __init__(self):
        # Connect to MongoDB
        self.client = pymongo.MongoClient(
            host=settings['MONGODB_HOST']
            , port=settings['MONGODB_PORT'])
        # If the database requires authentication:
        # self.client.admin.authenticate(settings['MONGO_USER']
        # , settings['MONGO_PSW'])
        self.db = self.client[settings['MONGODB_DB']]
        # handle to the database
        self.postItem = self.db[settings['MONGODB_COLL']]
        # handle to the collection

    def process_item(self, item, spider):
        postItem = dict(item)
        # convert the item into a plain dict
        print('postItem', postItem)
        self.postItem.insert_one(postItem)
        # insert one document into the collection
        # returning the item echoes it on the console; optional
        return item
4-4. settings.py (left blank in the original post; the MongoDB and Redis values that belong here are listed in section 6 below)
4-5. mysqldb.py
#!/usr/bin/python
#coding=utf-8
'''
Author:chenlun
Date:2017-04-10
'''
import pymysql

def connDB():
    # Open the database connection
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd='root', db='youboy', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)
    except Exception as e:
        return "connect Error!"

def exeUpdate(conn, cur, sql):
    '''Execute an Update or Insert statement'''
    sta = cur.execute(sql)
    conn.commit()
    return sta

def exeBath(conn, cur, sql, data):
    '''Batch-insert data'''
    sta = cur.executemany(sql, data)
    conn.commit()
    return sta

def exeQuery(cur, sql):
    # Run a query and return all rows
    cur.execute(sql)
    result = cur.fetchall()
    return result

def connClose(conn, cur):
    # Close the cursor and the connection
    cur.close()
    conn.close()
5. Spider notes
5-1. If meta parameters passed down through several levels of yield stop arriving:
    add the argument dont_filter=True to the Request ("add this parameter when the nested loops stop firing").
5-2. yield does not return immediately; the request runs asynchronously, and once it completes the callback is invoked with the response and the meta dict for the target function to process.
5-3. Be careful with XPath; you must be familiar with the common patterns.
5-4. The final step is returning the items; this hands the items dict over to items.py,
    where each items['key'] corresponds to a key = scrapy.Field() declaration.
5-5. Mind the difference between deep and shallow copies; in general use
    copy.deepcopy(Info)
    (points 5-1 and 5-5 together are illustrated in the short sketch after this list).
5-6. Non-distributed use:
    class youboySpider(CrawlSpider):
    For the distributed Redis version, youboySpider inherits from RedisSpider instead:
    class youboySpider(RedisSpider):
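A condensed illustration of points 5-1 and 5-5 as they are used throughout the spider above; the callback and key names here are placeholders, not functions from the original code:

import copy
from scrapy import Request

def parse_level(self, response):
    info = response.meta['info']          # dict accumulated by the parent callback
    for href in response.xpath('//a/@href').extract():
        info['url'] = response.urljoin(href)
        # copy.deepcopy: every request carries its own snapshot of info (5-5);
        # dont_filter=True: keeps the dupefilter from dropping the nested requests (5-1)
        yield Request(info['url'],
                      meta={'info': copy.deepcopy(info)},
                      callback=self.parse_next_level,
                      dont_filter=True)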

6. Database configuration
    MongoDB settings (in settings.py):
ITEM_PIPELINES = {
   'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'
    
    Redis settings (in settings.py):
SCHEDULER="scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST=True
SCHEDULER_QUEUE_CLASS="scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST='127.0.0.1'
REDIS_PORT=6379
7. Start the crawler through main.py
    If you run distributed, you also need a redis-cli client to push the start URL; otherwise the Redis queue stays empty and the program just sits there waiting:
    lpush youboySpider:start_urls http://book.youboy.com/diqu.html
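For completeness, the main.py mentioned in the project layout is just the one-liner from section 2 wrapped in a script, so the crawler can be launched from an IDE:

# main.py -- entry point for running the spider outside the scrapy CLI
from scrapy import cmdline

cmdline.execute("scrapy crawl youboySpider".split())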