Building a proxy pool with Scrapy

I started by crawling xicidaili, but the request rate was too high and my IP got banned, so I had to finish the crawl from home. Next step: persist the crawled IPs to Redis, and automatically test and grade each proxy after it is fetched.
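
As a rough sketch of that planned test-and-grade step (my own assumption, not part of the project yet), something like the function below could time one request through a proxy and assign a coarse grade; the test URL, timeout, and thresholds are arbitrary choices.

# check_proxy.py -- hypothetical helper, not part of the original project
import requests

TEST_URL = "https://httpbin.org/ip"  # assumed test endpoint


def grade_proxy(proxy, timeout=5):
    """Grade an "ip:port" proxy string: 2 = fast, 1 = slow, 0 = unusable."""
    proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
    try:
        resp = requests.get(TEST_URL, proxies=proxies, timeout=timeout)
        if resp.status_code == 200:
            return 2 if resp.elapsed.total_seconds() < 2 else 1
    except requests.RequestException:
        pass
    return 0

A grade of 0 would mark the proxy for removal; 1 and 2 could map to a score once Redis persistence is in place.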

proxypool.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request,Spider
from pyquery import PyQuery
from ..items import ProxyItem

class ProxySpider(Spider):
    name = 'proxy'
    # allowed_domains must cover the target sites, otherwise the default offsite
    # middleware filters the pagination requests yielded from the callbacks
    allowed_domains = ['kuaidaili.com', '66ip.cn', 'ip3366.net', 'xicidaili.com']
    page = 1
    xicidail_url = "http://www.xicidaili.com/nn/{page}"
    kuaidaili_url = "https://www.kuaidaili.com/free/inha/{page}/"
    _66daili_url = "http://www.66ip.cn/areaindex_{page}/1.html"
    ip3366_url = "http://www.ip3366.net/?stype=1&page={page}"

    def start_requests(self):
        yield Request(url=self.kuaidaili_url.format(page=self.page), callback=self.kuaidaili_parse)
        yield Request(url=self._66daili_url.format(page=self.page), callback=self._66_daili_parse)
        yield Request(url=self.ip3366_url.format(page=self.page), callback=self.ip3366_parse)
        # yield Request(url=self.xicidail_url.format(page=1), callback=self.xicidaili_parse)

    def kuaidaili_parse(self, response):
        pq = PyQuery(response.text)
        proxies = pq.find("#list .table-bordered tbody").find("tr")
        for proxy in proxies.items():
            # build a fresh item for every row instead of reusing one mutable instance
            item = ProxyItem()
            ip = proxy.find("td").eq(0).text()
            port = proxy.find("td").eq(1).text()
            item["proxy"] = ip + ":" + port
            print("Got proxy from %s: IP:%s PORT:%s" % ("www.kuaidaili.com", ip, port))
            yield item
        now_page = int(response.url.split("/")[-2])
        next_page = now_page + 1
        if next_page <= 10:
            yield Request(url=self.kuaidaili_url.format(page=next_page), callback=self.kuaidaili_parse, dont_filter=True)

    def _66_daili_parse(self, response):
        pq = PyQuery(response.text)
        proxies = pq.find("#footer table tr:gt(0)")
        for proxy in proxies.items():
            item = ProxyItem()
            ip = proxy.find("td").eq(0).text()
            port = proxy.find("td").eq(1).text()
            item["proxy"] = ip + ":" + port
            print("Got proxy from %s: IP:%s PORT:%s" % ("http://www.66ip.cn", ip, port))
            yield item
        # URL looks like .../areaindex_<page>/1.html
        now_page = int(response.url.split("/")[-2].split("_")[1])
        next_page = now_page + 1
        if next_page <= 34:
            yield Request(url=self._66daili_url.format(page=next_page), callback=self._66_daili_parse, dont_filter=True)

    def ip3366_parse(self, response):
        pq = PyQuery(response.text)
        proxies = pq.find("#list table tbody tr:gt(0)")
        for proxy in proxies.items():
            item = ProxyItem()
            ip = proxy.find("td").eq(0).text()
            port = proxy.find("td").eq(1).text()
            item["proxy"] = ip + ":" + port
            print("Got proxy from %s: IP:%s PORT:%s" % ("www.ip3366.net", ip, port))
            yield item
        # URL looks like ...?stype=1&page=<page>
        now_page = int(response.url.split("=")[2])
        next_page = now_page + 1
        if next_page <= 10:
            yield Request(url=self.ip3366_url.format(page=next_page), callback=self.ip3366_parse, dont_filter=True)


    # def xicidaili_parse(self, response):
    #     print(response)
    #     page = int(response.url.split("/")[-1])
    #     pq = PyQuery(response.text)
    #     item = XicidailiItem()
    #     proxies = pq.find("#ip_list").find("tr")
    #     total_page = int(pq.find("#body .pagination a").eq(-2).text())
    #     self.page += 1
    #     for proxy in proxies.items():
    #         ip = proxy.find("td").eq(1).text()
    #         port = proxy.find("td").eq(2).text()
    #         item["proxy_info"] = ip + ":"+ port
    #         # print(item["proxy_info"])
    #     yield item
    #     page += 1
    #     if page <= 10:
    #         yield Request(url=self.xicidail_url.format(page=str(page)), callback=self.xicidaili_parse, dont_filter=True)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field

class ProxyItem(Item):
    # the spider stores the whole "ip:port" string in a single field
    proxy = Field()

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for ProxyPool project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ProxyPool'

SPIDER_MODULES = ['ProxyPool.spiders']
NEWSPIDER_MODULE = 'ProxyPool.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ProxyPool (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 4

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ProxyPool.middlewares.ProxypoolSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ProxyPool.middlewares.ProxypoolDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'ProxyPool.pipelines.ProxyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 30
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
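
The Redis persistence mentioned at the top still has to go into an item pipeline; the ITEM_PIPELINES block above is commented out for now. A minimal sketch, assuming the redis-py package and a local Redis server (host, port, and key name are my own assumptions):

pipelines.py (sketch)

# -*- coding: utf-8 -*-
import redis


class RedisProxyPipeline(object):
    """Store every crawled proxy in a Redis set for later testing and grading."""

    def open_spider(self, spider):
        # connection parameters are assumptions; adjust to the actual Redis setup
        self.client = redis.StrictRedis(host="localhost", port=6379, db=0)

    def process_item(self, item, spider):
        # a set deduplicates identical "ip:port" strings automatically
        self.client.sadd("proxies", item["proxy"])
        return item

Enabling it would mean uncommenting ITEM_PIPELINES in settings.py and pointing it at this class, e.g. 'ProxyPool.pipelines.RedisProxyPipeline': 300.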

Reposted from blog.csdn.net/u014248032/article/details/83344836