Scrapy distributed crawling with synchronous database writes

Spider file

Inherit from RedisCrawlSpider to crawl the whole site.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from houseBjPro.items import HousebjproItem
from scrapy_redis.spiders import RedisCrawlSpider
import re

""""
爬取58同城北京二手房数据
"""
class HousebjSpider(RedisCrawlSpider):
    name = 'houseBj'
    # allowed_domains = ['bj.58.com']
    # start_urls = ['https://bj.58.com/ershoufang/']
    # name of the scheduler queue in Redis; start URLs are pushed to this key
    redis_key = 'houseBjurl'

    rules = (
        Rule(LinkExtractor(allow=r'ershoufang/pn\d+/'), callback='parse_item', follow=True),  # follow pagination links
    )

    def parse_item(self, response):
        house_list = response.xpath('//ul[@class="house-list-wrap"]/li')  # listing entries on the page
        for h_item in house_list:
            sum_price = h_item.xpath('./div[@class="price"]/p[1]/b/text()').extract_first()
            unit_price = h_item.xpath('./div[@class="price"]/p[2]/text()').extract_first()
            title = h_item.xpath('./div[@class="list-info"]/h2/a/text()').extract_first()
            house_info = h_item.xpath('./div[@class="list-info"]/p[1]//text()').extract()
            house_loc = h_item.xpath('./div[@class="list-info"]/p[2]/span//text()').extract()
            # collapse whitespace and separators into comma-delimited strings
            house_info = re.sub(r'\s+', ',', ''.join(house_info).replace('-', '').strip())
            house_loc = re.sub(r'\s+', ',', ''.join(house_loc).replace('-', '').strip())
            item = HousebjproItem()
            print(sum_price)
            print(unit_price)
            print(house_loc)
            print(title)
            print(house_info)

            item['sum_price'] = sum_price  # total price
            item['unit_price'] = unit_price  # price per square meter
            item['title'] = title  # listing title
            item['house_info'] = house_info  # layout / area info
            item['house_loc'] = house_loc  # location info
            yield item
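
The spider imports HousebjproItem from houseBjPro.items, but the items file is not shown in the original post. A minimal items.py sketch matching the fields assigned above (only the field names come from the source, the rest is an assumption):

import scrapy


class HousebjproItem(scrapy.Item):
    # Fields correspond to what parse_item fills in above
    title = scrapy.Field()       # listing title
    sum_price = scrapy.Field()   # total price
    unit_price = scrapy.Field()  # price per square meter
    house_info = scrapy.Field()  # layout / area info
    house_loc = scrapy.Field()   # location info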

Pipeline file

Write to MySQL synchronously

import pymysql


class HousebjproPipeline(object):
    """
    同步写入数据库
    """

    def __init__(self, conn):
        self.conn = conn
        self.cursor = self.conn.cursor()

    @classmethod
    def from_settings(cls, settings):

        # Pull the MySQL connection parameters from settings and build a kwargs dict
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            port=3306,
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8mb4',
            # return rows as dictionaries
            cursorclass=pymysql.cursors.DictCursor,
            # treat string values as unicode
            use_unicode=True
        )

        conn = pymysql.connect(**dbparms)

        return cls(conn)

    def process_item(self, item, spider):
        insert_sql = """
            INSERT IGNORE INTO tongcheng_content
                (title, sum_price, unit_price, house_info, house_loc)
            VALUES (%s, %s, %s, %s, %s)
        """
        try:
            self.cursor.execute(insert_sql, (item["title"], item["sum_price"], item["unit_price"],
                                             item["house_info"], item["house_loc"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
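
INSERT IGNORE only skips rows that collide with an existing unique key, so the tongcheng_content table needs one for the statement above to deduplicate anything. The schema isn't shown in the original post; a rough one-off script that would create a compatible table (column types, sizes, and the choice of unique key are assumptions) could be:

import pymysql

# One-off helper: create a table the pipeline above can insert into.
# Connection values mirror the placeholders in settings.py.
conn = pymysql.connect(host='49.233.xx.xx', user='xx', password='xx',
                       db='spider_db', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS tongcheng_content (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            sum_price VARCHAR(32),
            unit_price VARCHAR(64),
            house_info VARCHAR(512),
            house_loc VARCHAR(255),
            UNIQUE KEY uk_title (title)  -- lets INSERT IGNORE skip duplicate listings
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()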

Middleware file

Use a User-Agent pool

import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


# UA pool implemented as a standalone downloader middleware:
# every intercepted request gets a User-Agent picked at random from the pool below.
class RandomUserAgent(UserAgentMiddleware):

    def process_request(self, request, spider):
        # pick a random UA from the pool
        ua = random.choice(user_agent_list)
        # write it onto the intercepted request; assign directly because the built-in
        # UserAgentMiddleware has already set the header, so setdefault would be a no-op
        request.headers['User-Agent'] = ua

# Rotate proxy IPs for intercepted requests in bulk (kept disabled here)
# class Proxy(object):
#     def process_request(self, request, spider):
#         # check whether the request URL uses http or https
#         # request.url looks like: http://www.xxx.com
#         h = request.url.split(':')[0]  # protocol of the request
#         if h == 'https':
#             ip = random.choice(PROXY_https)
#             request.meta['proxy'] = 'https://' + ip



# PROXY_https = [
#     '120.83.49.90:9000',
#     '95.189.112.214:35508',
# ]

user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

Settings file

BOT_NAME = 'houseBjPro'

SPIDER_MODULES = ['houseBjPro.spiders']
NEWSPIDER_MODULE = 'houseBjPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'  # default User-Agent for requests
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False  # ignore robots.txt
# only show ERROR-level log messages
LOG_LEVEL = 'ERROR'

# enable the AutoThrottle extension to limit request rate
AUTOTHROTTLE_ENABLED = True
# initial download delay
AUTOTHROTTLE_START_DELAY = 5
# maximum download delay under high latency
AUTOTHROTTLE_MAX_DELAY = 60
# average number of requests Scrapy should send in parallel to each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# delay between consecutive requests to the same website
DOWNLOAD_DELAY = 3

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'houseBjPro.middlewares.HousebjproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

DOWNLOADER_MIDDLEWARES = {
    'houseBjPro.middlewares.RandomUserAgent': 542,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
ITEM_PIPELINES = {
    'houseBjPro.pipelines.HousebjproPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

REDIS_HOST = '127.0.0.1'  # address of the shared Redis server; on worker machines set this to the master's IP
REDIS_PORT = 6379
REDIS_ENCODING = 'utf-8'

MYSQL_HOST = '49.233.xx.xx'
MYSQL_DBNAME = 'spider_db'
MYSQL_USER = 'xx'
MYSQL_PASSWORD = 'xx'



# Use scrapy-redis' dupefilter: request fingerprints are stored in a Redis set,
# so deduplication is shared across workers and can persist between runs
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy-redis' scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler state persists: True keeps the Redis request queue and
# fingerprint set when the crawl finishes; False clears them
SCHEDULER_PERSIST = True
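
With the scrapy-redis scheduler in place, the crawl only starts once a start URL has been pushed to the key defined in the spider (redis_key = 'houseBjurl'). A minimal way to seed it from Python, assuming the redis package is installed and reusing the 58.com URL from the spider's commented-out start_urls (redis-cli lpush works just as well):

import redis

# Push a start URL into the queue the RedisCrawlSpider listens on
r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('houseBjurl', 'https://bj.58.com/ershoufang/')

Each worker then runs scrapy crawl houseBj and pulls its requests from the shared queue.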

Reposted from www.cnblogs.com/xiao-apple36/p/12764936.html