# Scraping BOSS Zhipin (boss直聘) job listings with Python and Selenium WebDriver

import queue
import time

import lxml.html
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
class ItemURL(object):
    """A queued crawl target: a URL tagged with how it should be handled.

    type 0 marks a seed page to load directly; type 1 marks a
    pagination step (click "next page" on the current browser state).
    """

    def __init__(self, url_type, url_str):
        # Handling mode for the crawl loop (0 = seed, 1 = next-page).
        self.type = url_type
        # Target address; only meaningful for type 0 items.
        self.url = url_str

# Launch Chrome via a local chromedriver binary; the path is machine-specific.
# NOTE(review): `executable_path` is deprecated in Selenium 4 — confirm the
# installed Selenium version before upgrading.
browser = webdriver.Chrome(executable_path="/home/nicemoe/software/chromedriver")
#browser.get("https://www.zhipin.com/job_detail/?query=%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0&city=101200100&industry=&position=")

# Results span many pages, so the crawl is a breadth-first traversal:
# a FIFO queue of ItemURL items drives the loop below.
download_queue = queue.Queue()
# Seed item: search results for the query "深度学习" (deep learning,
# URL-encoded) filtered to city code 101200100.
seed_item = ItemURL(0,"https://www.zhipin.com/job_detail/?query=%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0&city=101200100&industry=&position=")
download_queue.put(seed_item)

# BFS crawl loop: pop an item, load or advance the page, then scrape
# the currently rendered job list.
while not download_queue.empty():
    # queue.Queue is thread-safe; get() pops from the front (FIFO).
    item = download_queue.get()
    if item.type == 0:
        # Seed item: navigate straight to its URL.
        browser.get(item.url)
        time.sleep(2)  # crude wait for the JS-rendered list; TODO: use WebDriverWait
    elif item.type == 1:
        # Pagination item: scroll near the bottom so the pager is in
        # view, then click the "next page" link.
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight - 400)")
        time.sleep(2)
        try:
            next_page = browser.find_element_by_xpath("//div[@class='page']/a[@ka='page-next']")
        except NoSuchElementException:
            # No "next" link means we are on the last page: stop
            # cleanly instead of crashing the whole crawl.
            break
        next_page.click()
        time.sleep(3)

    # page_source returns the HTML *after* JavaScript rendering, which
    # is why Selenium is used instead of a plain HTTP fetch.
    parser = lxml.html.fromstring(browser.page_source)
    # Each <li> under the job-list container is one posting.
    job_lists = parser.xpath("//div[@class='job-list']/ul/li")
    for job in job_lists:
        # Guard against layout changes / empty matches rather than
        # indexing [0] into a possibly empty result (IndexError).
        titles = job.xpath(".//span[@class='job-name']/a/@title")
        if titles:
            print(titles[0])
    # Re-enqueue a pagination marker so the next iteration clicks
    # "next page" on the live browser state.
    # NOTE(review): the URL below is a placeholder — type 1 items never
    # read item.url; kept byte-identical to preserve behavior.
    download_queue.put(ItemURL(1,"http://www.baidu.com"))

# Adapted from: https://blog.csdn.net/qq_26018075/article/details/106978710