A Selenium example: logging into Lagou, plus manual captcha verification

This example uses Selenium to simulate logging into Lagou (拉勾网). Whenever the site redirects to its security-check page, the script blocks and lets you solve the captcha by hand. It then searches a list of keywords across popular cities, parses the job listings into MongoDB, and keeps a JSON checkpoint file so interrupted runs can resume.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os, json, time
from pwd import username, password  # local credentials file pwd.py, not the Unix stdlib pwd module
from pymongo import MongoClient
# keywords to search for
keywords_ls = ['python','java','web','c']
# popular cities to search in (kept in Chinese: they must match the site's link text)
citys_ls = ['北京','上海','深圳','广州','杭州','成都','南京','武汉','西安','厦门','长沙','苏州','天津']

class LaGouselenium():
    def __init__(self,keywords_ls=keywords_ls,citys_ls=citys_ls):
        self.keywords_ls=keywords_ls
        self.citys_ls=citys_ls
        self.crawled_city=[] # (keyword, city) pairs already crawled
        # resume support: reload the checkpoint file if it exists
        crawledCityPath='./lagou_crawled_city.json'
        if os.path.exists(crawledCityPath):
            with open(crawledCityPath,'r',encoding='utf8') as f:
                # json.dump stores tuples as lists, so convert back on load
                self.crawled_city=[tuple(x) for x in json.load(f)]
        self.col = MongoClient()['selenium']['LGW']
        # number of records stored so far
        print('already crawled',self.col.count_documents({}))
        # create the browser object
        self.chrome = webdriver.Chrome(r'D:\data\chromedriver\chromedriver.exe')
        # implicit wait of three seconds
        self.chrome.implicitly_wait(3)

    def login(self):
        loginUrl='https://passport.lagou.com/login/login.html'
        self.chrome.get(loginUrl)
        # log in with username and password
        self.chrome.find_element_by_xpath('//form[@class="active"]/div[@data-propertyname="username"]/input').send_keys(username)
        self.chrome.find_element_by_xpath('//form[@class="active"]/div[@data-propertyname="password"]/input').send_keys(password)
        self.chrome.find_element_by_xpath('//form[@class="active"]/div[@data-propertyname="submit"]/input').click()
        # if a captcha appears, block here and let the user solve it by hand
        input('If a captcha appears, solve it manually and press Enter; otherwise just press Enter')

    def crawl(self):
        # the red-envelope popup only needs to be dismissed once
        init=True
        for k in self.keywords_ls:
            # which keyword is being crawled
            print(k)
            # e.g. https://www.lagou.com/jobs/list_java/p-city_2?px=default#filterBox
            # sort by newest first
            url = 'https://www.lagou.com/jobs/list_{}/p-city_0?px=new#filterBox'.format(k)
            self.chrome.get(url)
            if 'sec.lagou.com' in self.chrome.current_url: # redirected to the security-check page
                input('Captcha page! Solve it manually, then press Enter')
            # dismiss the red-envelope popup only once
            if init:
                self.chrome.find_element_by_class_name('body-btn').click()
                init = False

            for city in self.citys_ls:
                print(city)
                # resume support: skip (keyword, city) pairs already done
                if (k,city) in self.crawled_city:
                    continue
                # click the city filter
                self.chrome.find_element_by_link_text(city).click()

                if 'sec.lagou.com' in self.chrome.current_url: # redirected to the security-check page
                    input('Captcha page! Solve it manually, then press Enter')

                # page through the results
                while 1:
                    # total number of pages
                    totalpage = int(self.chrome.find_element_by_class_name('totalNum').text.strip())
                    # current page number
                    curpage = int(self.chrome.find_element_by_class_name('curNum').text.strip())
                    print('page',curpage,'/',totalpage)
                    self.parseListPage(k,city)
                    if curpage == totalpage:
                        break
                    else:
                        self.chrome.find_element_by_class_name('next').click()
                        if 'sec.lagou.com' in self.chrome.current_url: # redirected to the security-check page
                            input('Captcha page! Solve it manually, then press Enter')


                # must refresh, or the city link cannot be located on the next iteration
                self.chrome.refresh()
                # this (keyword, city) pair is done; persist the checkpoint
                self.crawled_city.append((k,city))
                with open('./lagou_crawled_city.json','w',encoding='utf8') as f:
                    print('crawled+++++++++++++',city,k)
                    json.dump(self.crawled_city,f,ensure_ascii=False)

        self.checkDetailPage()



    def handleTime(self,timestr):
        '''normalize the publish-time string from the list page (left as a stub)'''
        pass

    def parseListPage(self,keyword,city):
        # explicit wait until the job cards are present
        WebDriverWait(self.chrome,3).until(EC.presence_of_all_elements_located((By.CLASS_NAME,'con_list_item')))
        time.sleep(1)
        for ele in self.chrome.find_elements_by_class_name('con_list_item'):
            item = {'keyword':keyword,'city':city}
            try:
                url = ele.find_element_by_xpath('./div/div/div/a').get_attribute('href')
                jd_id = url.split('.html')[0].split('/')[-1].strip()
                print(jd_id)
            except Exception:
                return
            if self.col.find_one({'jd_id':jd_id}):
                print('duplicate',jd_id)
                # continue # use continue on a first full crawl; break for later incremental crawls
                break
            item['jd_id'] = jd_id
            text = ele.find_element_by_xpath('./div').text
            ls = text.split('\n')
            title = ls[0]
            addr = ls[1]
            # pubtime = self.handleTime(ls[2])
            item['title'] = title
            item['addr'] = addr
            # add further fields yourself as needed
            salaryRange = ls[3].split(' ')[0].split('-')
            item['salaryRange'] = salaryRange
            print(item,'***********')

            self.col.insert_one(item)

    def checkDetailPage(self):
        '''iterate over the database and fill in missing detail-page info'''
        # match documents where the field is absent, not where it equals False
        for item in self.col.find({'job_description':{'$exists':False}}):
            url = 'https://www.lagou.com/jobs/%s.html'%item['jd_id'].strip()
            self.chrome.get(url)
            des = self.chrome.find_element_by_class_name('job-detail').text.strip()
            self.col.update_one({'jd_id':item['jd_id']},{'$set':{'job_description':des}})
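
# Optional (not in the original script): a unique index on jd_id would make the
# duplicate check in parseListPage fast and guard against double inserts, e.g.
#   MongoClient()['selenium']['LGW'].create_index('jd_id', unique=True)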

if __name__ == "__main__":
    lagou = LaGouselenium(keywords_ls,citys_ls)
    lagou.login()
    lagou.crawl()
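
The username and password imported at the top come from a local pwd.py sitting next to the script (note that the name shadows the Unix standard-library pwd module, so rename it if you run this on Linux/macOS). A minimal sketch of that file, with placeholder values:

# pwd.py -- local credentials module imported by the crawler
# (placeholder values; substitute your own Lagou account)
username = '13800000000'    # Lagou login, typically a phone number
password = 'your_password'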

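handleTime is left as a stub above and its call is commented out. If you do want the publish time, the third line of each job card holds a relative string; assuming the two formats Lagou's list page commonly used ('1天前发布', and '15:30发布' for same-day posts), a minimal standalone sketch:

from datetime import datetime, timedelta

def handleTime(timestr):
    '''normalize Lagou's relative publish times to 'YYYY-MM-DD' strings'''
    timestr = timestr.replace('发布', '').strip()
    if '天前' in timestr:               # e.g. '1天前' -> n days ago
        days = int(timestr.replace('天前', ''))
        return (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
    if ':' in timestr:                  # e.g. '15:30' -> posted today
        return datetime.now().strftime('%Y-%m-%d')
    return timestr                      # unknown format: keep as-is
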
Demo

(Screenshots of the crawler in action.)

Reposted from blog.csdn.net/qq_42662411/article/details/104523154