目的:对于一些 cookies 值会变化的网页,用 selenium 模拟浏览器拿到 cookies,再用 requests 发送请求拿到数据。
通过一个小代码展示一下:
目标:拿到黑龙江省采购网中标信息。
思路:
1.构造URL,selenium拿到cookies,发送post请求。
2.拿到每一个中标项目的URL,发送get请求,拿到数据。
3.保存数据。
具体代码如下:
import requests
from lxml import etree
import re
import os
from get_cookies import get_cookies
class HeiLongJiang_Spider():
    """Spider for award (中标) notices on the Heilongjiang government
    procurement site (www.hljcg.gov.cn).

    Workflow: POST each paginated listing page (cookies freshly obtained
    via Selenium by ``get_cookies``), extract every award-notice link,
    GET its detail page, and save the extracted text to ``黑龙江/``.
    """

    def __init__(self):
        # POST endpoint that serves the paginated award listings.
        self.url = "http://www.hljcg.gov.cn/xwzs!queryGd.action"
        # Running counter used to build unique output file names.
        self.i = 1

    def _build_headers(self, cookies, for_post=False):
        """Return request headers carrying the given cookie string.

        NOTE: the original code hard-coded ``"Content-Length": "121"``;
        that is removed here — requests computes the correct length
        itself, and a wrong hard-coded value corrupts the request.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Cookie": cookies,
            "Host": "www.hljcg.gov.cn",
            "Origin": "http://www.hljcg.gov.cn",
            "Referer": "http://www.hljcg.gov.cn/index.jsp",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        }
        if for_post:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
        return headers

    def run(self):
        """Crawl every listing page, then fetch and save each award notice."""
        for page in range(1, 1072):
            href_list = self.parse_url(page)
            for href in href_list:
                # The onclick attribute embeds the detail path as a quoted
                # assignment ("...='/path';..."); extract it and prepend the
                # site root to get the full award-notice URL.
                detail_url = ("http://www.hljcg.gov.cn"
                              + href.split(";")[0].split("=", 1)[1].replace("'", ""))
                print(detail_url)
                # Detail-page text.
                text = self.parse_data(detail_url)
                # Output directory (exist_ok avoids the exists()/makedirs() race).
                path = "黑龙江"
                os.makedirs(path, exist_ok=True)
                # Unique output file name.
                filename = path + "/黑龙江_{}.txt".format(self.i)
                self.i += 1
                print(self.i)
                # Persist this notice.
                self.save_info(text, filename, detail_url)
                if self.i % 20 == 0:
                    # Integer division: the original used "/", which prints
                    # a float page number ("第1.0页") under Python 3.
                    num = self.i // 20
                    print("黑龙江省第" + str(num) + "页爬取完毕")

    def parse_url(self, i):
        """POST the listing endpoint for page *i* and return the list of
        ``onclick`` attribute strings of the award-notice links."""
        cookies = get_cookies()
        headers = self._build_headers(cookies, for_post=True)
        data = {
            "xwzsPage.pageNo": i,
            "xwzsPage.pageSize": "20",
            "xwzsPage.pageCount": "1017",
            "lbbh": "5",
            "id": "110",
            "xwzsPage.LBBH": "5",
            "xwzsPage.zlbh": "",
            # Fixed: the original key was "xwzsPage.GJZ: " — the colon and
            # space were accidentally part of the key string itself.
            "xwzsPage.GJZ": "",
        }
        response = requests.post(url=self.url, headers=headers, data=data)
        html = etree.HTML(response.content.decode())
        href_list = html.xpath("/html/body//div[@class='yahoo']/div/span/a/@onclick")
        return href_list

    def parse_data(self, url):
        """GET one award-notice page and return its body text, with the
        Chinese section numerals (一、二、…) replaced by newlines."""
        cookies = get_cookies()
        headers = self._build_headers(cookies)
        response = requests.get(url=url, headers=headers)
        html = etree.HTML(response.content.decode())
        info_p = html.xpath("/html/body//div[@id='cen']//div[@class='xxej']/div[2]//p//text()")
        info_p = ''.join(info_p)
        info_p = re.sub("一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、", "\n", info_p)
        print(info_p)
        return info_p

    def save_info(self, content_list, filename, url):
        """Write one notice's text to *filename* (UTF-8), logging *url*."""
        print("正在保存" + "\n" + url)
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(content_list)
            fp.write("\n")
if __name__ == '__main__':
    # Entry point: build the spider and crawl every listing page.
    spider = HeiLongJiang_Spider()
    spider.run()
拿到cookies的代码如下:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
def get_cookies():
    """Open the site in headless Chrome, click through to the award-notice
    list so the session cookies are issued, and return them as a single
    ``"name=value; name=value"`` string suitable for a ``Cookie`` header.

    Fixes two defects in the original: it returned from inside the loop
    (so only the FIRST cookie was ever sent), and ``driver.quit()`` sat
    after the ``return`` and was never executed, leaking a Chrome process
    on every call.
    """
    url = "http://www.hljcg.gov.cn/home.jsp"
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Click through to the award-notice listing so the server sets
        # the session cookies the spider needs.
        driver.find_element_by_class_name("sbj_btn").click()
        driver.find_element_by_xpath("//div[@class='cen_new04']/div[@class='xx1']/a").click()
        time.sleep(2)
        return "; ".join(
            "{}={}".format(c["name"], c["value"]) for c in driver.get_cookies()
        )
    finally:
        # Always shut the browser down, even if a click or the page load fails.
        driver.quit()
总结:针对 cookies 会变化的网页,这样可以保证稳定性,但是效率会降低;遇到基于 cookies 的反爬措施较严格的网站可以这么做。