The maximum length of a URL that can be entered in a browser is limited:
Safari: up to 10,000 characters
Internet Explorer (IE): the lowest limit, 2083 characters
Source code of the UrlLength spider middleware (shown below)
Chrome (Google) and Firefox: normally more than 8,000 characters
"""
Url Length Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""

import logging

from scrapy.http import Request
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class UrlLengthMiddleware(object):
    """Spider middleware that discards requests whose URL is too long.

    Anything in the spider output that is a ``Request`` with a URL longer
    than ``maxlength`` characters is dropped (and logged at debug level);
    every other object passes through unchanged.
    """

    def __init__(self, maxlength):
        """Store ``maxlength``, the longest URL (in characters) let through."""
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        """Build the middleware from the ``URLLENGTH_LIMIT`` setting.

        Raises ``NotConfigured`` when the setting reads as a falsy integer
        (e.g. ``0``).
        """
        limit = settings.getint('URLLENGTH_LIMIT')
        if limit:
            return cls(limit)
        raise NotConfigured

    def process_spider_output(self, response, result, spider):
        """Return a generator over ``result`` without over-long requests.

        ``result`` may be falsy (e.g. ``None``), in which case the generator
        yields nothing.
        """

        def _within_limit(item):
            # Only Request objects are subject to the length check.
            if not isinstance(item, Request):
                return True
            if len(item.url) <= self.maxlength:
                return True
            logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
                         {'maxlength': self.maxlength, 'url': item.url},
                         extra={'spider': spider})
            return False

        # Kept as a generator expression so that ``result or ()`` is evaluated
        # when this method is called, while filtering itself stays lazy.
        return (item for item in result or () if _within_limit(item))
Scrapy sets a default URL length limit.
If you want to use your own value, you can add it to your project settings:
URLLENGTH_LIMIT = 60
If the length of a URL exceeds this setting,
the request for that URL is ignored and the following log message is printed at runtime:
logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ", {'maxlength': self.maxlength, 'url': request.url}, extra={'spider': spider})