Python爬虫——Scrapy中使用IP池和用户代理池
本文使用的版本为Python3
# IP池设置
# Proxy pool: one {"ipaddr": "host:port"} entry per proxy server.
IPPOOL = [
    {"ipaddr": proxy}
    for proxy in (
        "101.236.21.22:8866",
        "101.126.18.101:8866",
        "118.31.220.3:8080",
    )
]
# 用户代理(User-Agent)池设置
# User-Agent pool: plain UA strings to be picked at random per request.
UAPOOL = list((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",  # Firefox 60 / Win10
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",                                  # WebKit / Win7
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",        # Firefox 48 / Win7
))
- settings.py中开启IP池和用户代理池设置
# Enable the downloader middlewares (lower number = closer to the engine,
# runs earlier in process_request).
# FIX: the 'scrapy.contrib.downloadermiddleware.*' paths were deprecated in
# Scrapy 1.0 and removed afterwards; every Scrapy release that supports
# Python 3 uses 'scrapy.downloadermiddlewares.*' instead.
DOWNLOADER_MIDDLEWARES = {
    'ScrapyXmlDemo.middlewares.ScrapyxmldemoDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'ScrapyXmlDemo.middlewares.IPPOOLS': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 126,
    'ScrapyXmlDemo.middlewares.Uamid': 127,
}
- middlewares.py中写入IP和用户代理选择
import random

# FIX: 'scrapy.contrib.downloadermiddleware.*' was removed from Scrapy;
# on Python 3 the middlewares live in 'scrapy.downloadermiddlewares.*'.
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from ScrapyXmlDemo.settings import IPPOOL, UAPOOL
class IPPOOLS(HttpProxyMiddleware):
    """Downloader middleware that picks a random proxy from IPPOOL
    and attaches it to each outgoing request via request.meta['proxy']."""

    def __init__(self, ip=""):
        # The attribute is kept only for interface compatibility; the
        # actual proxy is re-chosen on every request.
        self.ip = ip

    def process_request(self, request, spider):
        chosen = random.choice(IPPOOL)
        proxy_addr = chosen["ipaddr"]
        print("当前使用的IP为: " + proxy_addr)
        # Scrapy's HttpProxyMiddleware honours the 'proxy' meta key.
        request.meta["proxy"] = "http://" + proxy_addr
class Uamid(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent from UAPOOL
    on each outgoing request (only if no User-Agent is present yet)."""

    def __init__(self, user_agent=""):
        # Kept for interface compatibility; the UA is re-chosen per request.
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(UAPOOL)
        print("当前使用的User-Agent是: " + ua)
        # setdefault: do not overwrite a User-Agent already set elsewhere.
        request.headers.setdefault("User-Agent", ua)