第2.2章 scrapy之多进程检测代理ip的有效性

1 multiprocessing
Python多进程multiprocessing使用示例
multiprocessing的作用是能够像管理线程一样管理进程,在多核CPU上的利用率比threading要好得多。
2 从数据库中读取爬到的代理进行验证
下面的代码参考了qiyeboy/IPProxyPool

# -*- coding: utf-8 -*-
'''
Created on 2017-06-14
Check whether stored proxy IPs are still usable.
@author: dzm
'''
# Python 2 only: reload(sys) makes setdefaultencoding available again so the
# default str/unicode conversion encoding can be switched to UTF-8.
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from eie.service.EieIpService import EieIpService
import multiprocessing
from multiprocessing import Process
from eie.middlewares import udf_config
from gevent import monkey
import gevent
# Patch the standard library so blocking calls cooperate with gevent greenlets.
# NOTE(review): gevent recommends patching as early as possible; here it runs
# after several other imports -- confirm this ordering is intentional.
monkey.patch_all()
import os
from eie.middlewares.random_user_agent import RandomUserAgent
import json
import time
import requests

# Shared logger taken from the project's udf_config module.
logger = udf_config.logger

# Module-level service instance; CheckIpProxyService uses it to select the
# stored proxies and to delete the ones that fail the check.
eieIpService = EieIpService()

class CheckIpProxyService(object):
    """Validate stored proxies against httpbin and delete the dead ones.

    ``run`` loads the proxy records from ``eieIpService``, probes them
    concurrently with gevent greenlets and removes every proxy whose probe
    fails.
    """

    def __init__(self):
        """Set up the httpbin endpoints and the concurrency settings."""
        # Timeout, in seconds, applied to every probe request.
        self.http_timeout = 5
        self.target_url = 'http://httpbin.org/get'
        self.target_url_https = 'https://httpbin.org/get'
        self.target_url_ip = 'http://httpbin.org/ip'
        # Filled in by get_my_ip().
        self.my_ip = None
        # Maximum number of checker processes (not referenced by the other
        # methods of this class).
        self.max_check_process = multiprocessing.cpu_count()
        # Maximum number of concurrent greenlets per process.
        self.max_check_construct_per_process = 30
        # Task queue size (not referenced by the other methods of this class).
        self.task_queue_size = 50
        # Seconds to wait when the process limit is reached (not referenced
        # by the other methods of this class).
        self.check_wati_time = 1

    def detect_proxy(self, proxy):
        """Probe one proxy record and delete it from storage when it is dead.

        :param proxy: mapping providing the keys ``ip``, ``port`` and
            ``types``.
        """
        ip = proxy['ip']
        port = proxy['port']
        # Both URL schemes are routed through the same HTTP proxy endpoint.
        proxy_url = "http://%s:%s" % (ip, port)
        proxies = {"http": proxy_url, "https": proxy_url}
        is_alive, _types, _speed = self.check_proxy(proxies, proxy['types'])
        if not is_alive:
            eieIpService.delete(ip, port)

    def check_proxy(self, proxies, types):
        """Probe ``proxies`` over HTTP or HTTPS depending on ``types``.

        :param proxies: ``requests``-style proxies mapping.
        :param types: ``'HTTP'`` selects the plain-HTTP test URL; any other
            value selects the HTTPS test URL.
        :return: tuple ``(ok, types, speed)`` as produced by
            ``_checkHttpProxy``.
        """
        return self._checkHttpProxy(proxies, types == 'HTTP')

    def _checkHttpProxy(self, proxies, is_http=True):
        """Request httpbin through the proxy and classify the response.

        :param proxies: ``requests``-style proxies mapping.
        :param is_http: use the HTTP test URL when true, the HTTPS one
            otherwise.
        :return: tuple ``(ok, types, speed)``. ``speed`` is the elapsed time
            in seconds rounded to two decimals, ``-1`` if never measured.
            ``types`` is ``2`` when the reported origin contains a comma,
            ``1`` when a ``Proxy-Connection`` header is echoed back, ``0``
            otherwise and ``-1`` if never classified (presumably
            transparent / anonymous / high-anonymity as in IPProxyPool --
            confirm).
        """
        types = -1
        speed = -1
        test_url = self.target_url if is_http else self.target_url_https
        try:
            start = time.time()
            randomUserAgent = RandomUserAgent()
            r = requests.get(url=test_url,
                             headers=randomUserAgent.get_headers(),
                             timeout=self.http_timeout,
                             proxies=proxies)
            logger.debug('请求结果为%s' % r)
            if not r.ok:
                return False, types, speed
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            proxy_connection = headers.get('Proxy-Connection', None)
            if ',' in ip:
                types = 2
            elif proxy_connection:
                types = 1
            else:
                types = 0
            logger.debug('%s 代理有效' % proxies)
            return True, types, speed
        except Exception:
            # Best effort: any failure (connection error, timeout, malformed
            # body) simply marks the proxy as unusable.
            logger.debug('%s 代理无效' % proxies)
            return False, types, speed

    def get_my_ip(self):
        """Look up this machine's own address via httpbin and store it.

        The result is saved in ``self.my_ip``.

        :raises Exception: when the lookup request or its parsing fails.
        """
        try:
            randomUserAgent = RandomUserAgent()
            r = requests.get(url=self.target_url_ip,
                             headers=randomUserAgent.get_headers(),
                             timeout=self.http_timeout)
            ip = json.loads(r.text)
            self.my_ip = ip['origin']
        except Exception:
            raise Exception('访问 %s 失败,请检查网络连接' % self.target_url_ip)

    def run(self):
        """Check every stored proxy, in batches of gevent greenlets.

        Each batch holds at most ``max_check_construct_per_process``
        greenlets and is joined before the next batch is started.

        @see: http://www.cnblogs.com/tkqasn/p/5705338.html
        """
        proxy_list = eieIpService.select()
        spawns = []
        for proxy in proxy_list:
            spawns.append(gevent.spawn(self.detect_proxy, proxy))
            if len(spawns) >= self.max_check_construct_per_process:
                gevent.joinall(spawns)
                spawns = []
        # Wait for the final, partially filled batch.
        if spawns:
            gevent.joinall(spawns)


if __name__ == '__main__':
    # Run the whole stored-proxy check in a child process and wait for it.
    # To probe a single proxy by hand instead, call for example:
    #   checker.check_proxy({"http": "http://59.37.17.202:808"}, 'HTTP')
    checker = CheckIpProxyService()
    worker = Process(target=checker.run)
    worker.start()
    worker.join()

3 gevent
待续…

猜你喜欢

转载自blog.csdn.net/warrah/article/details/73330776