from selenium import webdriver
import time
import queue
import lxml.html
from lxml import etree
class ItemURL(object):
    """A crawl-queue entry pairing a URL string with its category.

    The crawl loop reads two attributes: ``type`` (0 = seed/listing page
    to load directly, 1 = advance pagination in the already-open browser)
    and ``url`` (the address associated with the entry).
    """

    def __init__(self, url_type, url_str):
        # Attribute names `type` and `url` are part of the public contract
        # — the crawl loop accesses item.type / item.url directly.
        self.type = url_type
        self.url = url_str
# Path to the local ChromeDriver binary — machine-specific, adjust as needed.
CHROMEDRIVER_PATH = "/home/nicemoe/software/chromedriver"
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)

# The search results span many pages, so we do a breadth-first crawl
# driven by a FIFO work queue, seeded with the search-results URL
# (query: "深度学习" / deep learning, city 101200100).
SEED_URL = (
    "https://www.zhipin.com/job_detail/?query=%E6%B7%B1%E5%BA%A6%E5%AD%A6"
    "%E4%B9%A0&city=101200100&industry=&position="
)
download_queue = queue.Queue()
download_queue.put(ItemURL(0, SEED_URL))
# XPath of the "next page" link in the pager; hoisted since it is used twice.
NEXT_PAGE_XPATH = "//div[@class='page']/a[@ka='page-next']"

# Breadth-first walk of the work queue. queue.Queue is thread-safe and
# get() pops from the front, giving FIFO (breadth-first) order.
while not download_queue.empty():
    item = download_queue.get()

    if item.type == 0:
        # Seed/listing entry: navigate to the URL directly.
        browser.get(item.url)
        time.sleep(2)  # crude wait for the JS-rendered job list
    elif item.type == 1:
        # Pagination entry: scroll near the bottom so the pager is in
        # view, then click the "next page" link in the live browser.
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight - 400)")
        time.sleep(2)
        next_links = browser.find_elements_by_xpath(NEXT_PAGE_XPATH)
        if not next_links:
            # Last page has no "next" link — skip instead of crashing
            # with NoSuchElementException (bug in the original).
            continue
        next_links[0].click()
        time.sleep(3)

    # page_source returns the DOM *after* JavaScript rendering.
    parser = lxml.html.fromstring(browser.page_source)

    # One <li> per job posting in the rendered result list.
    job_lists = parser.xpath("//div[@class='job-list']/ul/li")
    for job in job_lists:
        titles = job.xpath(".//span[@class='job-name']/a/@title")
        if titles:  # guard against layout changes / empty cells (was titles[0] unguarded)
            print(titles[0])

    # Enqueue a pagination step only while a "next page" link still
    # exists; the original enqueued one unconditionally, so the loop
    # never terminated. The URL here is a placeholder — type-1 items
    # click "next" in the live browser rather than visiting item.url.
    if browser.find_elements_by_xpath(NEXT_PAGE_XPATH):
        download_queue.put(ItemURL(1, "http://www.baidu.com"))

# Release the ChromeDriver session (the original leaked the browser).
browser.quit()
# Adapted from: blog.csdn.net/qq_26018075/article/details/106978710
# ("python使用webdriver爬取boss直聘招聘" — scraping Boss Zhipin job
# listings with Selenium WebDriver).