爬虫总结2——解决POST方法翻页及异步请求问题

在爬取http://www.chinaparking.org/news/1-A007--0-1-0-1-0-1这个网站的时候,点击下一页发现是使用POST方式提交的,并且返回了XHR异步请求。如下:

查看参数信息如下:

多点击几个会发现每次翻页只有pageIndex这个参数会随页数发生变化。具体实现代码如下:

from pyspider.libs.base_handler import *
import re
import json

class Handler(BaseHandler):
    """Crawl chinaparking.org's news section, which pages via POST/XHR.

    Flow:
      on_start    -> POST every list page (pageIndex 1..278) to LIST_URL
      index_page  -> parse the JSON list, queue each article's detail URL
      detail_page -> POST an 'action=byid' request for the article body
      last_page   -> decode the article JSON and return the result dict
    """

    # Endpoint that serves both the paged list ('paginglist') and the
    # article body ('byid'), selected by the 'action' form field.
    LIST_URL = 'http://www.chinaparking.org/WebInfo/News'

    crawl_config = {
        'header': {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # 'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            # 'Connection': 'keep-alive',
            # 'Content-Length': '63',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            # NOTE(review): the Cookie and User-Agent values below were
            # truncated ("…") in the original post — substitute real values
            # captured from the browser before running.
            'Cookie': 'Hm_lvt_41163db3e6bf51ff3afeafa…0ddc415ce0f895822e=1534321147',
            'Host': 'www.chinaparking.org',
            'Referer': 'http://www.chinaparking.org/news/1-A007--0-1-0-1-0-1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0',
            'X-Requested-With': 'XMLHttpRequest',
        }
    }

    @staticmethod
    def _page_data(page_index):
        """Build the POST form body for one list page.

        Only 'pageIndex' varies between pages; everything else is constant
        (observed from the site's own XHR requests).
        """
        return {
            'action': 'paginglist',
            'pageIndex': page_index,
            'pageSize': 15,
            'typeone': 'A007',
            'typetwo': ''
        }

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: queue every list page exactly once.

        The original enqueued pages 2-278 from inside index_page, which
        re-submitted all of them on every list-page callback; queueing them
        here does it a single time.

        NOTE(review): pyspider derives the default taskid from the URL, so
        POSTs to the same URL with different form data may be de-duplicated
        into one task — verify, and override get_taskid to include the POST
        data if pages beyond the first are skipped.
        """
        for page in range(1, 279):
            self.crawl(self.LIST_URL, callback=self.index_page,
                       data=self._page_data(page), method='POST')

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Parse one JSON list page and queue every article's detail page.

        The XHR response is a JSON document; json.loads turns it into a dict
        whose 'data' key holds the list of article records.
        """
        listing = json.loads(response.text)
        for item in listing['data']:
            article_id = item['ID']
            url = 'http://www.chinaparking.org/newsid_' + str(article_id)
            self.crawl(url, callback=self.detail_page,
                       save={'id': article_id, 'url': url})

    @config(priority=2)
    def detail_page(self, response):
        """Request the article body via the 'byid' XHR for this article.

        The article page itself is loaded asynchronously, so the real
        content comes from another POST to LIST_URL with action='byid'.
        """
        article_id = response.save['id']
        url = response.save['url']
        self.crawl(self.LIST_URL, callback=self.last_page,
                   data={'action': 'byid', 'id': str(article_id)},
                   method='POST', save={'url': url})

    def last_page(self, response):
        """Decode the article JSON and return the extracted fields.

        The original computed title/date/content but never returned them,
        so pyspider stored no result; returning the dict fixes that.
        """
        article = json.loads(response.text)
        return {
            'url': response.save['url'],
            'title': article['_title'],
            'create_date': article['_createdate'],
            'content': article['_content'],
        }

这里分析数据的方式与爬虫总结一里不同,使用的是将JSON数据转换成字典,直接通过键对URL等数据进行提取。

猜你喜欢

转载自blog.csdn.net/muumian123/article/details/81747053
今日推荐