Fetching cookies with requests + selenium

Purpose: some pages rotate their cookie values, so a bare requests call gets rejected. The trick is to let selenium drive a real browser to pick up valid cookies, then hand those cookies to requests to actually fetch the data.
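
The core hand-off takes only a few lines. Here is a minimal sketch (example.com stands in for the real site; it assumes Chrome, chromedriver, requests and selenium are installed):

import requests
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://example.com/")  # the page that sets the changing cookies
session = requests.Session()
for c in driver.get_cookies():  # copy every browser cookie into the session
    session.cookies.set(c["name"], c["value"])
driver.quit()
response = session.get("http://example.com/data")  # now carries the browser's cookies

The demo below formats the cookies into a Cookie header string instead of a Session; both approaches work.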
Let's walk through it with a small demo program.
Target: the contract-award announcements on the Heilongjiang government procurement site.
Approach:
1. Build the listing URL, grab cookies with selenium, and send a POST request for each page.
2. Take each award announcement's detail URL and send a GET request for its content.
3. Save the data.
The full code:

import requests
from lxml import etree
import re
import os
from get_cookies import get_cookies

class HeiLongJiang_Spider():
    def __init__(self):
        self.url = "http://www.hljcg.gov.cn/xwzs!queryGd.action"
        self.i = 1  # running counter for output file names

    def run(self):
        # 1. walk every listing page
        for i in range(1, 1072):
            href_list = self.parse_url(i)
            for href in href_list:
                # the onclick value presumably looks like "xx='/xwzs!xx.action?id=N';...":
                # take the chunk before the first ";", the part after the first "=",
                # and strip the quotes to recover the relative URL
                href = "http://www.hljcg.gov.cn" + href.split(";")[0].split("=", 1)[1].replace("'", "")
                # URL of one award announcement
                print(href)
                # its detail text
                text = self.parse_data(href)
                # make sure the output directory exists
                path = "黑龙江"
                if not os.path.exists(path):
                    os.makedirs(path)
                # build the file name
                filename = path + "/黑龙江_{}.txt".format(self.i)
                self.i += 1
                print(self.i)
                # save one announcement
                self.save_info(text, filename, href)
                if (self.i - 1) % 20 == 0:  # self.i starts at 1, so n items saved -> self.i == n + 1
                    num = (self.i - 1) // 20
                    print("Heilongjiang: page {} finished".format(num))

    def parse_url(self, i):
        # fetch listing page i; a fresh cookie string is pulled for every request
        cookies = get_cookies()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Content-Length": "121",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": cookies,
            "Host": "www.hljcg.gov.cn",
            "Origin": "http://www.hljcg.gov.cn",
            "Referer": "http://www.hljcg.gov.cn/index.jsp",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        }

        data = {
            "xwzsPage.pageNo": i,
            "xwzsPage.pageSize": "20",
            "xwzsPage.pageCount": "1017",
            "lbbh": "5",
            "id": "110",
            "xwzsPage.LBBH": "5",
            "xwzsPage.zlbh": "",
            "xwzsPage.GJZ: ":"",

        }
        response = requests.post(url=self.url, headers=headers, data=data)
        html = etree.HTML(response.content.decode())
        # the detail links are hidden in onclick attributes, not href
        href_list = html.xpath("/html/body//div[@class='yahoo']/div/span/a/@onclick")
        return href_list

    def parse_data(self, url):
        # fetch one announcement page and return its cleaned-up text
        cookies = get_cookies()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Cookie": cookies,
            "Host": "www.hljcg.gov.cn",
            "Origin": "http://www.hljcg.gov.cn",
            "Referer": "http://www.hljcg.gov.cn/index.jsp",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        }
        response = requests.get(url=url, headers=headers)
        html = etree.HTML(response.content.decode())
        info_p = html.xpath("/html/body//div[@id='cen']//div[@class='xxej']/div[2]//p//text()")
        info_p = ''.join(info_p)
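        # split the flat text at the Chinese ordinal markers so each field
        # lands on its own line, e.g. "一、项目名称:xx二、中标金额:yy"
        # becomes "\n项目名称:xx\n中标金额:yy"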
        info_p = re.sub("一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、","\n",info_p)
        print(info_p)
        return info_p

    def save_info(self, content, filename, url):
        print("saving " + url)
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(content)
            fp.write("\n")



if __name__ == '__main__':
    heilongjiang = HeiLongJiang_Spider()
    heilongjiang.run()

The code that grabs the cookies:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
def get_cookies():
    url = "http://www.hljcg.gov.cn/home.jsp"
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    # driver = webdriver.Chrome()
    driver.get(url)
    # click through to the announcement list so the server sets the session cookies
    driver.find_element(By.CLASS_NAME, "sbj_btn").click()
    driver.find_element(By.XPATH, "//div[@class='cen_new04']/div[@class='xx1']/a").click()
    time.sleep(2)
    # join every cookie into a single "name=value; name=value" header string
    cookies = "; ".join("{}={}".format(c["name"], c["value"]) for c in driver.get_cookies())
    driver.quit()
    return cookies
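
A quick sanity check for the helper (assuming Chrome and a matching chromedriver are installed):

if __name__ == '__main__':
    # prints one header-ready string, e.g. "name1=value1; name2=value2"
    print(get_cookies())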
    

Summary: for pages whose cookies keep changing, this approach trades speed for reliability: every request spins up a headless browser, which is slow, but the cookies are always fresh. It is worth it on sites with aggressive cookie-based anti-scraping.
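
If the browser launches become the bottleneck, one hypothetical mitigation (not part of the original code) is to cache the cookie string and relaunch the browser only when a request stops working:

_cached_cookies = None  # module-level cache of the cookie header string

def get_cookies_cached(refresh=False):
    # relaunch the headless browser only when asked to refresh
    global _cached_cookies
    if _cached_cookies is None or refresh:
        _cached_cookies = get_cookies()  # the selenium helper above
    return _cached_cookies

The spider would call get_cookies_cached() normally, and get_cookies_cached(refresh=True) after a response that looks like a cookie rejection, then retry the request.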
