# 方法一 (Method 1): plain requests session against the Ajax endpoint
import requests

# HTML list page for the "python" search; visiting it first is what primes
# the session with the anti-crawler cookies the Ajax endpoint requires.
url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/77.0.3865.75 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    # "Anit" is Lagou's own (misspelled) header name — do not "fix" it.
    "X-Anit-Forge-Code": '0',
    "X-Anit-Forge-Token": 'None',
    "X-Requested-With": "XMLHttpRequest",
    "Sec-Fetch-Site": 'same-origin',
    "Sec-Fetch-Mode": 'cors',
    # BUG FIX: Origin must be the site origin, not the literal string "Origin".
    "Origin": "https://www.lagou.com",
    "Host": "www.lagou.com",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
data = {
    "first": "false",
    "pn": 1,        # page number — overwritten inside the loop below
    "kd": "python"  # search keyword
}
# JSON Ajax endpoint that actually returns the job listings.
urls = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
s = requests.Session()
# BUG FIX: the priming GET must hit the HTML list page (`url`), not the Ajax
# endpoint — the original had the two URLs swapped in both requests.
s.get(url, headers=headers, timeout=3)
cookie = s.cookies
for x in range(1, 11):  # fetch result pages 1..10
    data['pn'] = x
    # BUG FIX: POST the form data to the Ajax endpoint (`urls`); the original
    # posted to the HTML list page and got markup instead of JSON back.
    response = s.post(urls, data=data, headers=headers, cookies=cookie, timeout=3)
    print(response.text)
# 方法二 (Method 2): Selenium-driven browser + lxml parsing
from selenium import webdriver
from lxml import etree
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class LagouSpider(object):
    """Scrape Lagou Python job postings with Selenium + lxml.

    Pages through the search-result list, opens each job detail page in a
    new browser tab, and prints the job-description text to stdout.
    """

    def __init__(self):
        # Driving a real Chrome instance sidesteps the Ajax anti-crawler checks.
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="

    def run(self):
        """Open the list page and walk every result page until the last one."""
        self.driver.get(self.url)
        while True:
            # BUG FIX: wait for the pager to be present BEFORE snapshotting
            # page_source — the original read the source first and could
            # therefore parse a half-loaded page.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")
                )
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            # FIX: find_element_by_xpath was removed in Selenium 4; use the
            # By-based API (By is already imported at the top of the file).
            next_btn = self.driver.find_element(
                By.XPATH, "//div[@class='pager_container']/span[last()]"
            )
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break  # the "next" button is disabled on the last page
            next_btn.click()

    def parse_list_page(self, source):
        """Extract every job-detail link from one result page and visit it."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle requests to avoid the anti-crawler ban

    def request_detail_page(self, url):
        """Open *url* in a new tab, parse it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Block until the job title is rendered so the page is fully loaded.
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']")
            )
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab...
        self.driver.close()
        # ...and switch back to the list tab.
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Print the job-description text of one detail page."""
        html = etree.HTML(source)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        print(desc)
        print("*" * 50)
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    LagouSpider().run()