Scraping Lagou with Python and Selenium

Key code explained:
1. Use lxml's etree to run XPath queries. XPath is simpler than regular expressions, and it lets us skip BeautifulSoup for locating elements.

from lxml import etree
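
To make the idea concrete, here is a minimal, self-contained sketch (the HTML string and variable names are invented for illustration):

from lxml import etree

doc = etree.HTML("<ul><li class='job'><a href='/jobs/1.html'>Python Engineer</a></li></ul>")
# xpath() always returns a list, even when only one node matches
links = doc.xpath("//li[@class='job']/a/@href")
titles = doc.xpath("//li[@class='job']/a/text()")
print(links[0], titles[0])  # /jobs/1.html Python Engineer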

2. Headless vs. visible UI: running without a visible browser window can only reduce the resources your run consumes.

opt = webdriver.ChromeOptions()
# run Chrome headless (no GUI); works the same on Windows and Linux
opt.add_argument("--headless")  # no visible browser window
self.driver = webdriver.Chrome(options=opt)

3. Wait until the element located by the XPath has loaded, then scrape.

# Roughly: wait up to 20s; as soon as the element located by the XPath appears, stop waiting and start scraping the page content
WebDriverWait(driver=self.driver, timeout=20).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')))
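
Note that the expected_conditions helpers take the locator as a single (By, value) tuple, not as two separate arguments. A minimal standalone sketch of the waiting pattern (the URL and XPath below are just placeholders):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("https://www.lagou.com/zhaopin/Python/")
try:
    # block until the element is present in the DOM, or give up after 20 seconds
    WebDriverWait(driver, timeout=20).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="s_position_list"]')))
except TimeoutException:
    print("element did not appear within 20s")
driver.quit()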

4. For a detailed explanation and examples of Python's join(), strip(), and split() functions, see my other blog post.

content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
'''
Syntax: 'sep'.join(seq)
Parameters
sep: the separator; may be an empty string
seq: the sequence of elements to join - a string, list, tuple, or dict
That is: join all the elements of seq into one new string, using sep as the separator
Returns: the string formed by joining the elements with the separator sep
'''
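
A quick worked example combining join(), strip(), and split() (the input string is made up):

raw = "  3-5 years / Beijing /  "
parts = raw.strip().split("/")        # ['3-5 years ', ' Beijing ', '']
cleaned = [p.strip() for p in parts if p.strip()]
print(",".join(cleaned))              # 3-5 years,Beijing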

from selenium import webdriver
from lxml import etree
import re
import time
import pymysql

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By



class LagouSpider(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        # run Chrome headless (no GUI); works the same on Windows and Linux
        opt.add_argument("--headless")  # no visible browser window
        self.driver = webdriver.Chrome(options=opt)
        self.url="https://www.lagou.com/zhaopin/Python/"
       
    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # Roughly: wait up to 20s; as soon as the element located by the XPath appears, stop waiting and start scraping the page content
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')))

            self.parse_list_page(source)
            # click "next page"
            next_btn = self.driver.find_element(By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')
            # note: the button's class value contains spaces, so we can't locate it by the full class attribute
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
            time.sleep(1)


    # list of job-posting URLs
    def parse_list_page(self,source):
        # run the XPath query through etree
        html = etree.HTML(source)
        links = html.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        for link in links:
            self.request_detail_page(link)
            # print(link)
            #time.sleep(1)


    # visit each extracted URL
    def request_detail_page(self,url):
        # open the link in a new tab
        self.driver.execute_script("window.open('%s')" % url)
        # switch the driver to the newly opened tab
        self.driver.switch_to.window(self.driver.window_handles[1])
        # wait for the job title to render, then scrape
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']")) )

        source=self.driver.page_source
        self.parse_detail_page(source)
        # close the current detail tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        #self.parse_list_page(source)

    # extract the detailed job info
    def parse_detail_page(self,source):
        html=etree.HTML(source)
        positionName=html.xpath("//div[@class='position-head']/div/div[1]/div/span/text()")[0]
        job_request_spans=html.xpath("//div[@class='position-head']/div/div[1]/dd/p[1]/span")
        salary=job_request_spans[0].xpath(".//text()")[0].strip()
        city=job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)  # strip whitespace and the "/" separators
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        #print(positionName)
        #mysql=MySQLPipeline()
        #mysql.process_item(positionName,salary,city,work_years,education,content)
class MySQLPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host="localhost",user="root",password="root",db="lagou", charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self,positionName,salary,city,work_years,education,content):
        insert_sql = '''
                    insert into lagou_table(positionName,salary,city,work_years,education,content)
                    values(%s,%s,%s,%s,%s,%s)
                '''
        self.cursor.execute(insert_sql,(positionName,salary,city,work_years,education,content))
        self.conn.commit()


    def close_spider(self,spider): #TypeError: close_spider() takes 1 positional argument but 2 were given
        self.cursor.close()
        self.conn.close()



if __name__=="__main__":
    spider=LagouSpider()
    spider.run()
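
MySQLPipeline assumes the lagou database and lagou_table already exist; the original post does not show the schema. A plausible one-off setup script matching the INSERT statement might look like this (the column types are guesses):

import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS lagou DEFAULT CHARACTER SET utf8")
    # assumed schema: columns mirror the INSERT in MySQLPipeline.process_item
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS lagou.lagou_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            positionName VARCHAR(255),
            salary VARCHAR(64),
            city VARCHAR(64),
            work_years VARCHAR(64),
            education VARCHAR(64),
            content TEXT
        ) DEFAULT CHARACTER SET utf8""")
conn.commit()
conn.close()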


Reprinted from blog.csdn.net/work_you_will_see/article/details/84638750