Python爬虫——Scrapy中使用IP池和用户代理池

本文使用的版本为Python3

  • settings.py中配置IP池和用户代理池
# Proxy IP pool: each entry maps "ipaddr" to a "host:port" string
# consumed by the IPPOOLS downloader middleware.
IPPOOL = [
    {"ipaddr": "101.236.21.22:8866"},
    {"ipaddr": "101.126.18.101:8866"},
    {"ipaddr": "118.31.220.3:8080"},
]

# User-Agent pool: raw User-Agent header strings the Uamid middleware
# rotates through, one per request.
UAPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
]
  • settings.py中开启IP池和用户代理池设置
# Enable the downloader middlewares. The number is the priority: lower runs
# earlier in the request-processing chain.
# FIX: the old 'scrapy.contrib.downloadermiddleware.*' module paths were
# deprecated in Scrapy 1.0 and removed in later releases; the current
# locations are 'scrapy.downloadermiddlewares.*'.
DOWNLOADER_MIDDLEWARES = {
    'ScrapyXmlDemo.middlewares.ScrapyxmldemoDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # Replace 'ScrapyXmlDemo' with your own project name.
    'ScrapyXmlDemo.middlewares.IPPOOLS': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 126,
    # Replace 'ScrapyXmlDemo' with your own project name.
    'ScrapyXmlDemo.middlewares.Uamid': 127,
}
  • middlewares.py中写入IP和用户代理选择
import random

# FIX: 'scrapy.contrib.downloadermiddleware.*' was deprecated in Scrapy 1.0
# and removed in later releases; import from 'scrapy.downloadermiddlewares.*'.
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from ScrapyXmlDemo.settings import IPPOOL
from ScrapyXmlDemo.settings import UAPOOL


# Proxy-rotation downloader middleware: attaches a random proxy from the
# IPPOOL setting to every outgoing request.
class IPPOOLS(HttpProxyMiddleware):
    def __init__(self, ip=""):
        # NOTE(review): the parent __init__ is deliberately not called;
        # the parent's process_request is fully overridden below, so its
        # proxy/auth setup is never used. Confirm against your Scrapy version.
        self.ip = ip

    def process_request(self, request, spider):
        """Pick a random entry from IPPOOL and set it as this request's proxy."""
        thisip = random.choice(IPPOOL)
        # Use the spider's logger rather than print() so the message goes
        # through Scrapy's logging system (level/format/output configurable).
        spider.logger.info("当前使用的IP为: %s", thisip["ipaddr"])
        # Scrapy's HttpProxyMiddleware reads request.meta["proxy"].
        request.meta["proxy"] = "http://" + thisip["ipaddr"]


# User-Agent rotation downloader middleware: sets a random User-Agent from
# the UAPOOL setting on every outgoing request.
class Uamid(UserAgentMiddleware):
    def __init__(self, user_agent=""):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Overwrite this request's User-Agent with a random UAPOOL entry."""
        thisua = random.choice(UAPOOL)
        # Use the spider's logger rather than print() so the message goes
        # through Scrapy's logging system.
        spider.logger.info("当前使用的User-Agent是: %s", thisua)
        # FIX: the original used headers.setdefault(), which is a no-op here —
        # the stock UserAgentMiddleware (priority 126) already sets the
        # User-Agent header before this middleware (priority 127) runs, so
        # the pool was never applied. Assign the header unconditionally.
        request.headers["User-Agent"] = thisua

猜你喜欢

转载自blog.csdn.net/m0_37770300/article/details/81316681