Study Notes, 2019-07-24

Setting up random request headers (User-Agent) and an IP proxy pool

The code for middlewares.py is as follows:

import json,random
import requests
from useragent_randomchange.models import ProxyModel
from twisted.internet.defer import DeferredLock

class UseragentRandomchangeDownloaderMiddleware(object):
    USER_AGENTS=[
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36']

    def process_request(self, request, spider):
        user_agent=random.choice(self.USER_AGENTS)
        request.headers['User-Agent']=user_agent

class IPProxyRandomchangeDownloaderMiddleware(object):
    PROXY_URL = 'xxxxxxxxx'  # proxy API URL
    def __init__(self):
        super(IPProxyRandomchangeDownloaderMiddleware,self).__init__()
        self.current_proxy=None
        self.lock = DeferredLock()  # lock so only one request refreshes the proxy at a time

    def process_request(self, request, spider):
        if 'proxy' not in request.meta or not self.current_proxy or self.current_proxy.is_expiring:
            # fetch a new proxy
            self.update_proxy()
        request.meta['proxy']=self.current_proxy.proxy

    def process_response(self, request, response, spider):
        if response.status != 200 or "captcha" in response.url:
            if not self.current_proxy.blacked:
                self.current_proxy.blacked = True
                print('Proxy %s has been blacklisted' % self.current_proxy.ip)
            self.update_proxy()
            # Reaching here means BOSS直聘 has identified this request as coming from a crawler,
            # so the request effectively fetched nothing. If we do not return the request, it is
            # simply dropped and its data is never scraped, so return it here to put it back
            # into the scheduler and have it sent again later.
            return request
        # For a normal response, remember to return it; otherwise the response never
        # reaches the spider and therefore never gets parsed.
        return response

    def update_proxy(self):
        self.lock.acquire()
        if not self.current_proxy or self.current_proxy.is_expiring or self.current_proxy.blacked:
            response = requests.get(self.PROXY_URL)
            text = response.text
            print("Fetched a new proxy:", text)
            result = json.loads(text)
            if len(result['data']) > 0:
                data = result['data'][0]
                proxy_model = ProxyModel(data)
                self.current_proxy = proxy_model
        self.lock.release()
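
For reference, update_proxy() assumes the proxy API returns JSON with a data list whose entries carry ip, port and expire_time fields, because that is exactly what the code above and ProxyModel below read. Here is a minimal sketch of that assumed shape (all values are made up; your provider's format may differ):

import json

# Hypothetical API response; field names mirror what update_proxy() and ProxyModel expect.
sample_text = '{"data": [{"ip": "123.45.67.89", "port": 8888, "expire_time": "2019-07-24 18:30:00"}]}'

result = json.loads(sample_text)
data = result['data'][0]  # same access pattern as update_proxy()
print(data['ip'], data['port'], data['expire_time'])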

The proxy data returned by the API is wrapped in a models.py:

from datetime import datetime,timedelta

class ProxyModel(object):
    def __init__(self,data):
        self.ip=data['ip']
        self.port=data['port']
        self.expire_str=data['expire_time']
        self.blacked=False

        date_str, time_str = self.expire_str.split(" ")
        year, month, day = date_str.split("-")
        hour, minute, second = time_str.split(":")
        self.expire_time = datetime(year=int(year), month=int(month), day=int(day),
                                    hour=int(hour), minute=int(minute), second=int(second))
        # http://ip:port
        self.proxy="http://{}:{}".format(self.ip,self.port)

    @property
    def is_expiring(self):
        now = datetime.now()
        # treat the proxy as expiring when fewer than 5 seconds of lifetime remain
        return (self.expire_time - now) < timedelta(seconds=5)
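
As a quick sanity check, ProxyModel can be exercised on its own with made-up values (for illustration only):

if __name__ == '__main__':
    sample = {"ip": "123.45.67.89", "port": 8888, "expire_time": "2019-07-24 18:30:00"}
    proxy = ProxyModel(sample)
    print(proxy.proxy)        # -> http://123.45.67.89:8888
    print(proxy.is_expiring)  # True, because the made-up expire_time above has already passed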

You also need to configure the following in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'useragent_randomchange.middlewares.UseragentRandomchangeDownloaderMiddleware': 100,
    'useragent_randomchange.middlewares.IPProxyRandomchangeDownloaderMiddleware': 200,
}
DOWNLOAD_DELAY = 1
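
To check that both middlewares are actually taking effect, a small throwaway spider (the name and target URL below are just for illustration) can crawl httpbin.org, which echoes back the requesting IP and headers:

import scrapy

class ProxyCheckSpider(scrapy.Spider):
    # Hypothetical spider used only to verify the middlewares:
    # http://httpbin.org/get echoes the originating IP ("origin") and the User-Agent header,
    # so each run should show the proxy IP and a rotating User-Agent.
    name = 'proxy_check'
    start_urls = ['http://httpbin.org/get']

    def parse(self, response):
        self.logger.info(response.text)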

Reposted from www.cnblogs.com/jyjoker/p/11241652.html