关于scrapy的一些说明

一  scrapy添加代理

  1 内置代理:os.environ。

    格式固定(只能通过环境变量 http_proxy / https_proxy 指定代理,所有请求共用),不推荐

import os

# Proxy URL format: scheme://user:password@host:port/
# `password` is a placeholder; substitute the real proxy password.
os.environ['http_proxy'] = "http://root:password@192.168.11.11:9999/"
os.environ['https_proxy'] = "http://192.168.11.11:9999/"

  2 自定义代理:通过中间件实现

                import six
                import random
                import base64

                from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware



                def to_bytes(text, encoding=None, errors='strict'):
                    """Return ``text`` as a bytes object.

                    Bytes input is returned unchanged. Text input is encoded with
                    ``encoding`` (UTF-8 when ``encoding`` is None) using the given
                    ``errors`` policy. Any other type raises ``TypeError``.
                    """
                    # Already bytes: nothing to convert.
                    if isinstance(text, bytes):
                        return text
                    if isinstance(text, six.string_types):
                        codec = 'utf-8' if encoding is None else encoding
                        return text.encode(codec, errors)
                    raise TypeError('to_bytes must receive a unicode, str or bytes '
                                    'object, got %s' % type(text).__name__)


                class ProxyMiddleware(object):
                    """Downloader middleware that sends each request through a random proxy.

                    For every request one entry of ``PROXIES`` is chosen at random and
                    written to ``request.meta['proxy']``. When the chosen entry has
                    non-empty credentials, a ``Proxy-Authorization`` header using HTTP
                    Basic authentication is added as well.
                    """

                    # Candidate proxies. 'ip_port' is "host:port"; 'user_pass' is
                    # "user:password", or an empty string when no login is required.
                    # Kept on the class so the list is built once, not per request.
                    PROXIES = [
                        {'ip_port': '111.11.228.75:80', 'user_pass': ''},
                        {'ip_port': '120.198.243.22:80', 'user_pass': ''},
                        {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
                        {'ip_port': '101.71.27.120:80', 'user_pass': ''},
                        {'ip_port': '122.96.59.104:80', 'user_pass': ''},
                        {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
                    ]

                    def process_request(self, request, spider):
                        """Attach a randomly chosen proxy to ``request``.

                        Args:
                            request: object exposing dict-like ``meta`` and ``headers``.
                            spider: the spider issuing the request (unused).

                        Returns:
                            None.
                        """
                        proxy = random.choice(self.PROXIES)
                        request.meta['proxy'] = "http://%s" % proxy['ip_port']

                        # Only authenticate when credentials are actually configured;
                        # an empty string must not yield an empty "Basic" header.
                        user_pass = proxy.get('user_pass')
                        if user_pass:
                            # b64encode adds no trailing newline (unlike the removed
                            # base64.encodestring), so the header value stays valid.
                            encoded = base64.b64encode(user_pass.encode('utf-8'))
                            request.headers['Proxy-Authorization'] = b'Basic ' + encoded

                
    
                # Scrapy setting (typically in the project's settings.py): maps the
                # middleware's dotted path to its order value in the downloader chain.
                DOWNLOADER_MIDDLEWARES = {'sp1.proxy.ProxyMiddleware': 666}

猜你喜欢

转载自www.cnblogs.com/654321cc/p/8955915.html
今日推荐