之前在抓一个网站Danaos的时候,发现能用requests和postman发送同样的header和payload的时候都能得到正确的结果,但是scrapy就会返回400错误,后来发现这是Twisted本身存在的问题,看了官网也没找到解决方法
spider文件在这里:
class DanspiderSpider(scrapy.Spider):
    """POST a JSON query to Danaos' ContentAssetService and print the reply.

    Fix: the original request hard-coded a ``Content-Length: 179`` header.
    Scrapy's downloader (built on Twisted) computes Content-Length from the
    request body itself, so supplying it by hand produces a duplicate /
    conflicting header and the server answers 400.  Dropping the header lets
    the framework fill in the correct value.  A paste artifact that fused
    ``,Host: www.danaos.com`` into the Cookie value is removed as well
    (the Host header is set automatically from the URL).
    """

    name = 'danspider'
    allowed_domains = ['www.danaos.com']

    def start_requests(self):
        # Raw JSON body, sent verbatim as the POST payload.
        payload = '{"serviceDto":{"ViewType":"2","ViewDate":"","RevisionNumber":"1","LanguageId":"1","ItemCount":-1,"StartIndex":0,"Signature":"","TagList":[]},"year":-1,"assetType":"Fleet_Details"}'
        url = "https://www.danaos.com/Services/ContentAssetService.svc/GetContentAssetList"
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
            # NOTE: deliberately no "Content-Length" -- Twisted adds it.
            "Content-Type": "application/json; charset=UTF-8",
            "Cookie": "_ga=GA1.2.757680490.1537640028; _gid=GA1.2.1595345749.1537640028; _gat=1",
            "Origin": "https://www.danaos.com",
            "Referer": "https://www.danaos.com/fleet/fleet-details/default.aspx",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
            "X-NewRelic-ID": "VQYBUlRVChABXFNXBAcCXw==",
            "X-Requested-With": "XMLHttpRequest",
        }
        yield scrapy.Request(url=url, body=payload, method="POST",
                             headers=headers, callback=self.parse)

    def parse(self, response):
        """Dump the raw response body (JSON asset list) to stdout."""
        print(response.text)
改用 Twisted 重写这个爬虫:去掉 Content-Length 字段、只保留 Content-Type 字段之后,就可以正常获取返回结果。
twisted_danaos.py:
from __future__ import print_function
from pprint import pformat
from twisted.internet import reactor
from twisted.internet.defer import Deferred
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.web.iweb import UNKNOWN_LENGTH
from bytesprod import BytesProducer
class BeginningPrinter(Protocol):
    """Protocol that prints up to the first 10 KiB of a response body.

    Fires the *finished* Deferred when the connection closes so the
    caller's callback chain can continue.
    """

    def __init__(self, finished):
        self.finished = finished    # Deferred fired from connectionLost
        self.remaining = 1024 * 10  # bytes of body still worth printing

    def dataReceived(self, data):
        # Parameter renamed from ``bytes``: don't shadow the builtin.
        # Twisted invokes this callback positionally, so the rename is
        # backward-compatible.
        if self.remaining:
            display = data[:self.remaining]
            print('Some data received:')
            print(display)
            self.remaining -= len(display)

    def connectionLost(self, reason):
        print('Finished receiving body:', reason.getErrorMessage())
        self.finished.callback(None)
# Drive a single POST through twisted.web's Agent.  Content-Length is
# derived by Agent from the producer's ``length`` attribute, so it is
# never set by hand.
http_agent = Agent(reactor)
post_body = BytesProducer(b'{"serviceDto":{"ViewType":"2","ViewDate":"","RevisionNumber":"1","LanguageId":"1","ItemCount":-1,"StartIndex":0,"Signature":"","TagList":[]},"year":-1,"assetType":"Fleet_Details"}')

request_headers = Headers({
    "Content-Type": ["application/json; charset=UTF-8"],
})

pending = http_agent.request(
    b'POST',
    b'https://www.danaos.com/Services/ContentAssetService.svc/GetContentAssetList',
    request_headers,
    post_body,
)


def on_response(response):
    """Print status line and headers, then stream the body to stdout."""
    print('Response version:', response.version)
    print('Response code:', response.code)
    print('Response phrase:', response.phrase)
    print('Response headers:')
    print(pformat(list(response.headers.getAllRawHeaders())))
    body_done = Deferred()
    response.deliverBody(BeginningPrinter(body_done))
    return body_done


pending.addCallback(on_response)


def on_done(_result):
    """Stop the reactor whether the request succeeded or failed."""
    reactor.stop()


pending.addBoth(on_done)
reactor.run()
from zope.interface import implementer
from twisted.internet.defer import succeed
from twisted.web.iweb import IBodyProducer
@implementer(IBodyProducer)
class BytesProducer(object):
    """Minimal IBodyProducer that writes a fixed bytestring in one shot.

    Because ``length`` is known up front, twisted's Agent can emit a
    correct Content-Length header on its own.
    """

    def __init__(self, body):
        self.length = len(body)  # read by Agent for Content-Length
        self.body = body

    def startProducing(self, consumer):
        # Everything is already in memory: a single write, then an
        # already-fired Deferred.
        consumer.write(self.body)
        return succeed(None)

    def pauseProducing(self):
        pass  # production is synchronous; nothing to pause

    def stopProducing(self):
        pass  # nothing in flight to cancel
但是 header 中如果加上 Content-Length 字段,爬虫就会失败;加上其他字段则不影响结果获取。原因在于 Twisted 的 Agent 会根据 body producer 的 length 属性自动生成 Content-Length,手动再指定一次会产生重复/冲突的头部,服务器因此返回 400。