1、scrapy框架自定义UserAgent
配置UserAgent需要在middlewares.py文件中配置
1.1、自定义UserAgent方法一:
class UserAgentDownloadMiddleware(object):
    """Downloader middleware that attaches a random User-Agent header.

    Each outgoing request gets one entry picked uniformly at random from
    the hand-maintained ``USER_AGENTS`` pool below.
    """

    # Static pool of real-world browser User-Agent strings.
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; TencentTraveler 4.0; Trident/4.0; SLCC1; Media Center PC 5.0; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30618',
        'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; QQDownload 1.7; GTB6.6; TencentTraveler 4.0; SLCC1; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.5.30729; .NET CLR 3.0.30729'
    ]

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent on *request*.

        :param request: the outgoing Scrapy request (headers are mutated).
        :param spider: the spider issuing the request (unused).
        :returns: None, so Scrapy continues processing the request.
        """
        # Local import keeps the snippet self-contained: the original code
        # used `random` without importing it, which raises NameError at
        # runtime.
        import random
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
1.2、自定义UserAgent方法二:
# UserAgent自动生成user-agent的类
from fake_useragent import UserAgent
class RandomUserAgentMiddleware(object):
    """Downloader middleware that fills in a generated User-Agent header.

    Relies on the third-party ``fake_useragent`` package (imported at file
    level) to synthesize a random browser User-Agent string.
    """

    def process_request(self, request, spider):
        # setdefault leaves an already-present User-Agent header untouched.
        generated_ua = UserAgent().random
        request.headers.setdefault("User-Agent", generated_ua)
2、scrapy框架中配置cookie
这里的做法是:先把获取到的cookie存储在redis中,scrapy框架再从redis里面读取cookie;然后在process_request(self, request, spider)方法中给request.cookies赋值。
class CookiesMiddleWare():
    """Downloader middleware that loads cookies from Redis onto requests.

    Cookies are expected to have been stored in Redis beforehand as a
    JSON-encoded string under ``redis_key``; every outgoing request is
    decorated with them when present.
    """

    # self, redis_host="172.16.1.10", redis_port=6379, redis_db=0
    def __init__(self, redis_host, redis_port, redis_db,
                 redis_key="weibo_17513322968"):
        """Create the middleware with a pooled Redis connection.

        :param redis_host: Redis server hostname or IP.
        :param redis_port: Redis server port.
        :param redis_db: Redis logical database number.
        :param redis_key: key holding the JSON cookie blob; defaults to the
            original hard-coded account key for backward compatibility.
        """
        self.logger = logging.getLogger(__name__)
        self.redis_key = redis_key
        redis_pool = redis.ConnectionPool(host=redis_host, port=redis_port,
                                          db=redis_db)
        self.redis = redis.StrictRedis(connection_pool=redis_pool)

    def _get_cookie(self):
        """Return the cookie dict stored in Redis, or None when absent."""
        cookies_str = self.redis.get(self.redis_key)
        if cookies_str:
            return json.loads(cookies_str)
        return None

    def process_request(self, request, spider):
        """Attach the stored cookies to *request* when available.

        :param request: the outgoing Scrapy request (``cookies`` is set).
        :param spider: the spider issuing the request (unused).
        """
        cookies = self._get_cookie()
        if cookies:  # attach cookies to the request
            request.cookies = cookies
            self.logger.debug('Using Cookies ' + json.dumps(cookies))
        else:
            self.logger.debug('No Valid Cookies')

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the middleware from project settings.

        The original spelled this ``from_clawler`` and read
        ``crawler.setting`` / ``"REIDS_DB"`` — three typos that prevented
        Scrapy from ever constructing the middleware from real settings
        (Scrapy only calls a classmethod named ``from_crawler``).
        """
        return cls(
            redis_host=crawler.settings.get("REDIS_HOST"),
            redis_port=crawler.settings.get("REDIS_PORT"),
            redis_db=crawler.settings.get("REDIS_DB"),
        )

    # Backward-compatible alias preserving the original (misspelled) name.
    from_clawler = from_crawler