在scrapy框架Header中使用Content-Length字段使爬虫返回400错误的问题

之前在抓取Danaos网站的时候,发现用requests和Postman发送同样的header和payload都能得到正确的结果,但是用Scrapy发送就会返回400错误。后来发现这是Scrapy底层的Twisted本身存在的问题,在官网上也没有找到解决方法。
Chrome显示的XHR的request headers和payload
spider文件在这里:

class DanspiderSpider(scrapy.Spider):
    """Spider that replays the browser's XHR call for the Danaos fleet details.

    A single JSON POST is sent to the ContentAssetService endpoint and the
    decoded response body is printed.
    """

    name = 'danspider'
    allowed_domains = ['www.danaos.com']

    def start_requests(self):
        """Yield the one POST request carrying the JSON payload."""
        payload = '{"serviceDto":{"ViewType":"2","ViewDate":"","RevisionNumber":"1","LanguageId":"1","ItemCount":-1,"StartIndex":0,"Signature":"","TagList":[]},"year":-1,"assetType":"Fleet_Details"}'

        url = "https://www.danaos.com/Services/ContentAssetService.svc/GetContentAssetList"
        # Content-Length is deliberately NOT set: supplying it by hand made this
        # request come back as HTTP 400, while the same request without it
        # succeeds. Presumably the downloader derives the length from the body
        # itself, so a manual value is redundant at best.
        # NOTE(review): "br" in Accept-Encoding presumably needs brotli support
        # installed to decode the response - confirm, or drop "br".
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=UTF-8",
            # Host used to be fused onto the end of the Cookie value by a
            # copy-paste slip; it is a header of its own.
            "Cookie": "_ga=GA1.2.757680490.1537640028; _gid=GA1.2.1595345749.1537640028; _gat=1",
            "Host": "www.danaos.com",
            "Origin": "https://www.danaos.com",
            "Referer": "https://www.danaos.com/fleet/fleet-details/default.aspx",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
            "X-NewRelic-ID": "VQYBUlRVChABXFNXBAcCXw==",
            "X-Requested-With": "XMLHttpRequest",
        }
        yield scrapy.Request(url=url, body=payload, method="POST", headers=headers, callback=self.parse)

    def parse(self, response):
        """Print the decoded body of the response."""
        print(response.text)

用twisted改写这个爬虫,去掉了content-length字段,只留下了Content-Type字段,是可以获取返回结果的
twisted_danaos.py:

from __future__ import print_function

from pprint import pformat

from twisted.internet import reactor
from twisted.internet.defer import Deferred
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.web.iweb import UNKNOWN_LENGTH

from bytesprod import BytesProducer

class BeginningPrinter(Protocol):
    """Body-delivery protocol that prints only the beginning of a response.

    At most ``1024 * 10`` bytes are echoed; anything after that is received
    but not printed. When the connection is lost, ``finished`` is fired with
    ``None``.
    """

    def __init__(self, finished):
        # Number of bytes that may still be printed.
        self.remaining = 1024 * 10
        self.finished = finished

    def dataReceived(self, bytes):
        # The parameter name shadows the builtin but is kept as-is so the
        # method signature stays unchanged.
        budget = self.remaining
        if not budget:
            return
        head = bytes[:budget]
        print('Some data received:')
        print(head)
        self.remaining = budget - len(head)

    def connectionLost(self, reason):
        message = reason.getErrorMessage()
        print('Finished receiving body:', message)
        # Signal whoever is waiting that the body has been fully delivered.
        self.finished.callback(None)

agent = Agent(reactor)

# JSON payload for the POST, wrapped in a producer that the agent can write out.
body = BytesProducer(
    b'{"serviceDto":{"ViewType":"2","ViewDate":"","RevisionNumber":"1","LanguageId":"1","ItemCount":-1,"StartIndex":0,"Signature":"","TagList":[]},"year":-1,"assetType":"Fleet_Details"}'
)

# Only Content-Type is set explicitly; no Content-Length header is passed in.
headers = {
    "Content-Type": ["application/json; charset=UTF-8"],
}

# Issue the request; ``d`` fires with the response once its headers arrive.
d = agent.request(
    b'POST',
    b'https://www.danaos.com/Services/ContentAssetService.svc/GetContentAssetList',
    headers=Headers(headers),
    bodyProducer=body,
)

def cbRequest(response):
    """Print the response's status fields and headers, then stream its body.

    The body is handed to a BeginningPrinter; the returned Deferred is the one
    that printer fires when the connection is lost, so later callbacks wait
    for the body to finish.
    """
    status_fields = (
        ('Response version:', 'version'),
        ('Response code:', 'code'),
        ('Response phrase:', 'phrase'),
    )
    for label, attribute in status_fields:
        print(label, getattr(response, attribute))

    print('Response headers:')
    raw_headers = list(response.headers.getAllRawHeaders())
    print(pformat(raw_headers))

    body_done = Deferred()
    printer = BeginningPrinter(body_done)
    response.deliverBody(printer)
    return body_done


d.addCallback(cbRequest)

def cbError(failure):
    """Print why the request failed, so an error is not dropped silently."""
    print('Request failed:', failure.getErrorMessage())


def cbShutdown(ignored):
    """Stop the reactor, whatever the outcome of the request chain was."""
    reactor.stop()


# Report a failure first. Previously addBoth(cbShutdown) received the failure,
# ignored it and stopped the reactor, so a failed request produced no output.
d.addErrback(cbError)
d.addBoth(cbShutdown)

reactor.run()

bytesprod.py:

from zope.interface import implementer

from twisted.internet.defer import succeed
from twisted.web.iweb import IBodyProducer

@implementer(IBodyProducer)
class BytesProducer(object):
    """Body producer that writes one in-memory bytes object as the request body.

    ``length`` is set to ``len(body)`` so the consumer knows the body size up
    front.
    """

    def __init__(self, body):
        self.body = body
        self.length = len(body)

    def startProducing(self, consumer):
        """Write the whole body in one call and return an already-fired Deferred."""
        consumer.write(self.body)
        return succeed(None)

    def pauseProducing(self):
        """Do nothing: the body has already been written in full."""
        pass

    def resumeProducing(self):
        """Do nothing: the body has already been written in full.

        Added because IBodyProducer inherits this method from IPushProducer;
        without it, a consumer that pauses and then resumes the producer would
        hit an AttributeError.
        """
        pass

    def stopProducing(self):
        """Do nothing: there is no ongoing production to cancel."""
        pass

但是如果在header中加上Content-Length字段,请求就会失败;加上其他字段则不会影响结果的获取。

猜你喜欢

转载自blog.csdn.net/kekefen01/article/details/83042742
今日推荐