拉勾网的两种爬取方法

方法一:

import requests



# HTML list page: visiting it first gives the session the anti-crawler cookies
# that lagou.com requires before it will serve the Ajax endpoint.
url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="

headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/77.0.3865.75 Safari/537.36",
        # Referer must point at the list page or the server rejects the Ajax call.
        "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "X-Anit-Forge-Code": '0',
        "X-Anit-Forge-Token": 'None',
        "X-Requested-With": "XMLHttpRequest",
        "Sec-Fetch-Site": 'same-origin',
        "Sec-Fetch-Mode": 'cors',
        "Origin": "Origin",
        "Host": "www.lagou.com",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Accept": "application/json, text/javascript, */*; q=0.01"
    }

# Form payload for the Ajax endpoint: pn = page number, kd = search keyword.
data = {
    "first": "false",
    "pn": 1,
    "kd": "python"
}

# JSON Ajax endpoint that actually returns the job listings (POST only).
urls = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
s = requests.Session()
# BUGFIX: GET the HTML list page (url) to obtain cookies — the original
# GETed the Ajax endpoint and POSTed the HTML page, i.e. had them swapped.
s.get(url, headers=headers, timeout=3)
cookie = s.cookies
# Fetch pages 1..10 of the search results.
for x in range(1, 11):
    data['pn'] = x
    # POST the Ajax endpoint (urls) with the freshly acquired session cookies.
    response = s.post(urls, data=data, headers=headers, cookies=cookie, timeout=3)
    print(response.text)
    

方法二:

from selenium import webdriver
from lxml import etree
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class LagouSpider(object):
    """Scrape lagou.com python job listings with a real browser (Selenium),
    following pagination and opening each job's detail page in a new tab."""

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="

    def run(self):
        """Walk every results page until the 'next' button is disabled."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager to be present BEFORE grabbing the page
            # source — the original captured page_source first, so the
            # first parse could see a not-yet-loaded page.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            # find_element_by_xpath was removed in Selenium 4; By is already
            # imported, so use the modern find_element(By.XPATH, ...) form.
            next_btn = self.driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
                # Give the Ajax pagination a moment to replace the list,
                # otherwise the next iteration may re-parse the stale page.
                time.sleep(1)

    def parse_list_page(self, source):
        """Extract every job-detail link from a results page and visit it."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle to avoid triggering anti-crawler blocks

    def request_detail_page(self, url):
        """Open one job detail page in a new tab, parse it, then return
        focus to the results tab."""
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Block until the job title is rendered, so page_source is complete.
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab.
        self.driver.close()
        # Switch back to the results tab.
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Print the job description text from a detail page."""
        html = etree.HTML(source)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        print(desc)
        print("*"*50)


if __name__ == '__main__':
    # Build the spider and start crawling immediately.
    LagouSpider().run()
发布了61 篇原创文章 · 获赞 48 · 访问量 4446

猜你喜欢

转载自blog.csdn.net/weixin_45257157/article/details/103416215