Scraping Lagou with Python and Selenium

Key code explained:
1. Use lxml's etree to run XPath queries. XPath is simpler than regular expressions, and it lets us skip BeautifulSoup for locating elements.

from lxml import etree
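
To make the idea concrete, here is a minimal, self-contained sketch (the HTML string and variable names are invented for illustration):

from lxml import etree

doc = etree.HTML("<ul><li class='job'><a href='/jobs/1.html'>Python Engineer</a></li></ul>")
# xpath() always returns a list, even when only one node matches
links = doc.xpath("//li[@class='job']/a/@href")
titles = doc.xpath("//li[@class='job']/a/text()")
print(links[0], titles[0])  # /jobs/1.html Python Engineer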

2. Headless vs. visible UI: running without a visible browser window can only reduce the resources your run consumes.

opt = webdriver.ChromeOptions()
# run Chrome headless (no GUI); works the same on Windows and Linux
opt.add_argument("--headless")  # no visible browser window
self.driver = webdriver.Chrome(options=opt)

3. Wait until the element located by the XPath has loaded, then scrape.

# Roughly: wait up to 20s; as soon as the element located by the XPath appears, stop waiting and start scraping the page content
WebDriverWait(driver=self.driver, timeout=20).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')))
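
Note that the expected_conditions helpers take the locator as a single (By, value) tuple, not as two separate arguments. A minimal standalone sketch of the waiting pattern (the URL and XPath below are just placeholders):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("https://www.lagou.com/zhaopin/Python/")
try:
    # block until the element is present in the DOM, or give up after 20 seconds
    WebDriverWait(driver, timeout=20).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="s_position_list"]')))
except TimeoutException:
    print("element did not appear within 20s")
driver.quit()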

4. For a detailed explanation and examples of Python's join(), strip(), and split() functions, see my other blog post.

content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
'''
Syntax: 'sep'.join(seq)
Parameters
sep: the separator; may be an empty string
seq: the sequence of elements to join - a string, list, tuple, or dict
That is: join all the elements of seq into one new string, using sep as the separator
Returns: the string formed by joining the elements with the separator sep
'''
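
A quick worked example combining join(), strip(), and split() (the input string is made up):

raw = "  3-5 years / Beijing /  "
parts = raw.strip().split("/")        # ['3-5 years ', ' Beijing ', '']
cleaned = [p.strip() for p in parts if p.strip()]
print(",".join(cleaned))              # 3-5 years,Beijing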

from selenium import webdriver
from lxml import etree
import re
import time
import pymysql

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By



class LagouSpider(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        # run Chrome headless (no GUI); works the same on Windows and Linux
        opt.add_argument("--headless")  # no visible browser window
        self.driver = webdriver.Chrome(options=opt)
        self.url="https://www.lagou.com/zhaopin/Python/"
       
    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # Roughly: wait up to 20s; as soon as the element located by the XPath appears, stop waiting and start scraping the page content
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')))

            self.parse_list_page(source)
            # click "next page"
            next_btn = self.driver.find_element(By.XPATH, '//*[@id="s_position_list"]/div[2]/div/a[6]')
            # note: the button's class value contains spaces, so we can't locate it by the full class attribute
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
            time.sleep(1)


    # list of job-posting URLs
    def parse_list_page(self,source):
        # run the XPath query through etree
        html = etree.HTML(source)
        links = html.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        for link in links:
            self.request_detail_page(link)
            # print(link)
            #time.sleep(1)


    # visit each extracted URL
    def request_detail_page(self,url):
        # open the link in a new tab
        self.driver.execute_script("window.open('%s')" % url)
        # switch the driver to the newly opened tab
        self.driver.switch_to.window(self.driver.window_handles[1])
        # wait for the job title to render, then scrape
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']")) )

        source=self.driver.page_source
        self.parse_detail_page(source)
        # close the current detail tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        #self.parse_list_page(source)

    # extract the detailed job info
    def parse_detail_page(self,source):
        html=etree.HTML(source)
        positionName=html.xpath("//div[@class='position-head']/div/div[1]/div/span/text()")[0]
        job_request_spans=html.xpath("//div[@class='position-head']/div/div[1]/dd/p[1]/span")
        salary=job_request_spans[0].xpath(".//text()")[0].strip()
        city=job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)  # strip whitespace and the "/" separators
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        #print(positionName)
        #mysql=MySQLPipeline()
        #mysql.process_item(positionName,salary,city,work_years,education,content)
class MySQLPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host="localhost",user="root",password="root",db="lagou", charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self,positionName,salary,city,work_years,education,content):
        insert_sql = '''
                    insert into lagou_table(positionName,salary,city,work_years,education,content)
                    values(%s,%s,%s,%s,%s,%s)
                '''
        self.cursor.execute(insert_sql,(positionName,salary,city,work_years,education,content))
        self.conn.commit()


    def close_spider(self,spider): #TypeError: close_spider() takes 1 positional argument but 2 were given
        self.cursor.close()
        self.conn.close()



if __name__=="__main__":
    spider=LagouSpider()
    spider.run()
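
MySQLPipeline assumes the lagou database and lagou_table already exist; the original post does not show the schema. A plausible one-off setup script matching the INSERT statement might look like this (the column types are guesses):

import pymysql

conn = pymysql.connect(host="localhost", user="root", password="root", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS lagou DEFAULT CHARACTER SET utf8")
    # assumed schema: columns mirror the INSERT in MySQLPipeline.process_item
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS lagou.lagou_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            positionName VARCHAR(255),
            salary VARCHAR(64),
            city VARCHAR(64),
            work_years VARCHAR(64),
            education VARCHAR(64),
            content TEXT
        ) DEFAULT CHARACTER SET utf8""")
conn.commit()
conn.close()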


Reprinted from blog.csdn.net/work_you_will_see/article/details/84638750