爬虫22-使用selenium爬取信息

1.正常使用cookie爬取拉勾网ajax数据

import requests
from lxml import etree
import time
import re
# Request headers captured from a logged-in browser session.  Lagou's Ajax
# endpoint rejects requests that lack a realistic User-Agent, the Referer of
# the search page, and a valid session Cookie, so all three must be sent.
# NOTE(review): the cookie values are session-bound and expire — refresh them
# from a live browser session before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    "Cookie": "user_trace_token=20200226133453-084540c1-9531-4fa8-873f-0dda32aa3ca4; _ga=GA1.2.836052667.1582695295; LGUID=20200226133454-167deda5-1930-4e79-8834-719427ac01be; index_location_city=%E5%85%A8%E5%9B%BD; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22%24device_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; gate_login_token=5976db005818f45ed7756b1348563965e46f1400511d886af3d4d57dd9d9166a; LG_LOGIN_USER_ID=5b895ff2a4e23c48dc4c9110a6a1361bbf709630b5b17ac6756340fef1babfbf; LG_HAS_LOGIN=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1583857959,1583912708,1583912713; JSESSIONID=ABAAAECABGFABFF1412C84500FD39A23D7C1D5172179D66; WEBTJ-ID=20200315123348-170dc782d0e4cf-05e9fb23740e5e-3a614f0b-2073600-170dc782d0f63d; _gid=GA1.2.1720707822.1584246829; _putrc=387928C58CE0A7D1123F89F2B170EADC; login=true; unick=%E7%90%B3%E7%90%B3; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=0a8830791829a77f99654a1bb3d568ae; LGSID=20200315140707-568ce08c-c655-44b2-9cd4-66632e1bb6f4; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fpython%2Fp-city%5F0%3F%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; _gat=1; SEARCH_ID=79abbbd66c2b4a59b7ca19ee8fb77e01; X_HTTP_TOKEN=9944cc335d13b0d30552524851b568c7665cd1a0ff; LGRID=20200315140911-acf5dfc4-1c8f-4943-a93f-983d364a96db",
    "Origin": "https://www.lagou.com",
    # The "Anit" spelling below matches what the site itself used; keep it.
    'X-Anit-Forge-Code': "0",
    # Fixed: the key previously contained a stray space ("X -Anit-Forge-Token"),
    # which is not a valid HTTP header name.
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest"
}
# Accumulates the {name, salary, city} dicts produced by parse_position_detail().
positions = []

def request_list_page():
    """Page through Lagou's job-search Ajax endpoint.

    POSTs the search form to positionAjax.json, extracts each result's
    positionId, builds the detail-page URL from it and hands that URL to
    parse_position_detail().  The two ``break`` statements limit the run to
    the first position of the first page — a demo/debug limiter; remove them
    to crawl all 9 pages.
    """
    # "needAddtionalResult" is the endpoint's own (misspelled) query key; keep it.
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    data = {
        # Fixed: was misspelled "frist", so the server silently ignored it.
        "first": "false",
        "pn": "1",        # page number; overwritten in the loop below
        "kd": "python"    # search keyword
    }
    for page in range(1, 10):
        data['pn'] = page
        response = requests.post(url, data=data, headers=headers)
        # The endpoint answers with JSON; .json() loads it into a dict.
        result = response.json()
        # Renamed from "positions" so it no longer shadows the module-level list.
        page_positions = result['content']['positionResult']['result']
        for position in page_positions:
            position_id = position['positionId']  # identifies the detail page
            position_url = 'https://www.lagou.com/jobs/%s.html' % position_id
            parse_position_detail(position_url)
            break  # demo limiter: only the first position of each page
        time.sleep(2)  # throttle to stay under the anti-crawler radar
        break  # demo limiter: only the first page

def parse_position_detail(url):
    """Fetch one job-detail page and append {name, salary, city} to positions."""
    page = requests.get(url, headers=headers)
    tree = etree.HTML(page.text)
    # The job title lives in the @title attribute of the job-name div.
    title = tree.xpath("//div[@class='job-name']/@title")[0]
    # The request block's spans hold salary first, then city.
    spans = tree.xpath("//dd[@class='job_request']//span")
    pay = spans[0].xpath('.//text()')[0].strip()
    location = spans[1].xpath(".//text()")[0].strip()
    # City comes wrapped in whitespace and '/' separators; strip them all out.
    location = re.sub(r"[\s/]", "", location)
    positions.append({
        'name': title,
        'salary': pay,
        'city': location
    })

def main():
    """Entry point: crawl the listing pages, then dump everything collected."""
    request_list_page()
    print(positions)


if __name__ == '__main__':
    main()

  

2.使用selenium爬取拉勾网ajax数据

#encoding: utf-8

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class LagouSpider(object):
    """Selenium-driven crawler for Lagou's python job listings.

    Drives a real Firefox instance so the Ajax-rendered listing pages look
    like normal browser traffic, then parses each rendered page with lxml.
    Results accumulate in ``self.positions`` as {name, salary, city} dicts.
    """

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []  # parsed job dicts, appended by parse_detail_page()

    def run(self):
        """Crawl listing pages until the 'next' button is disabled."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager to exist BEFORE snapshotting the page,
            # otherwise we may parse a half-rendered document.
            # Fixed: the original XPath was malformed (missing leading '//',
            # extra trailing ']') and page_source was read before the wait.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath(
                    "//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break  # last page reached
                else:
                    next_btn.click()
            except Exception:
                # Fixed: was a bare except, which would also swallow
                # KeyboardInterrupt/SystemExit.  Dump the page for diagnosis.
                print(source)
            time.sleep(1)  # throttle between listing pages

    def parse_list_page(self, source):
        """Extract every detail-page link from one rendered listing page."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle between detail requests

    def request_detail_page(self, url):
        """Open *url* in a new tab, parse it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % url)  # open a new tab
        self.driver.switch_to.window(self.driver.window_handles[1])  # focus it
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()  # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # back to the list

    def parse_detail_page(self, source):
        """Parse one job-detail page and record {name, salary, city}."""
        html = etree.HTML(source)
        name = html.xpath("//div[@class='job-name']/@title")[0]
        job_span = html.xpath("//dd[@class='job_request']//span")
        salary = job_span[0].xpath('.//text()')[0].strip()
        city = job_span[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)  # drop whitespace and '/' separators
        position = {
            'name': name,
            'salary': salary,
            'city': city
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)


if __name__ == '__main__':
    # Kick off a full crawl; results accumulate on the spider instance.
    LagouSpider().run()

  

猜你喜欢

转载自www.cnblogs.com/wcyMiracle/p/12500036.html
今日推荐