selenium模拟登陆LinkedIn

运行代码前记得先翻墙!另外,你使用的账号需要开通领英会员,否则看不到员工的具体信息!

1、所需的环境如下
        

json(Python 标准库自带,无需额外安装)
selenium

2、代码中需要导入`chromedriver.exe`驱动,下载地址我会放在文章最后

3、总代码如下。

#需要翻墙才能登陆领英国际版
import json
import math
import os
import re
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Optional: run Chrome headless (no visible browser window). To enable it,
# uncomment the two lines below and pass `options=chrome_options` to
# webdriver.Chrome().
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# `Service` object -- confirm against the installed Selenium version.
chrome = webdriver.Chrome(executable_path=r'chromedriver.exe')  # path to the ChromeDriver binary

# URL reached on LinkedIn after searching and clicking "see all people results".
# Requesting it while logged out leads to the sign-in flow handled below.
chrome.get('https://www.linkedin.com/search/results/people/?keywords=%E5%8C%BB%E8%8D%AF%E7%8C%8E%E5%A4%B4&origin=SWITCH_SEARCH_VERTICAL&page=1&sid=CkF')
time.sleep(2)

# Switch to the username/password sign-in form.
password_login = chrome.find_element(By.XPATH, '/html/body/div[1]/main/p[1]/a')
password_login.click()
time.sleep(2)

# Fill in the account name (clear first in case the field is pre-filled).
user_name = chrome.find_element(By.XPATH, '//*[@id="username"]')
user_name.clear()
user_name.send_keys('***')  # your account
time.sleep(2)

# Fill in the password.
pass_word = chrome.find_element(By.XPATH, '//*[@id="password"]')
pass_word.clear()
pass_word.send_keys('***')  # your password
time.sleep(2)

# Submit the sign-in form.
login_icon = chrome.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
login_icon.click()
time.sleep(2)

# Derive the number of result pages from the result-count heading.
result_heading = chrome.find_element(By.XPATH, '//*[@id="main"]/div/div/h2').text
# Join every digit run so a count with thousands separators ("1,234") still
# parses as one number.
digits = ''.join(re.findall(r'\d+', result_heading))
# The script assumes 10 people per page and never requests more than 100
# pages. A heading without any digits yields zero pages instead of crashing
# on int('').
total_page = min(math.ceil(int(digits) / 10), 100) if digits else 0
time.sleep(2)
item = {}  # most recently scraped profile record

# Characters stripped from a person's name before it is used as a file name
# (includes the backslash, which would otherwise act as a path separator on
# Windows).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\?*/|:><"]')

# Anchors linking to each person's profile on a search-results page.
_PROFILE_LINK_XPATH = '//*[@id="main"]/div/div/div[1]/ul/li/div/div/div[2]/div[1]/div[1]/div/span[1]/span/a'

# Directory that receives one JSON file per person; created if missing.
output_dir = r'data/'
os.makedirs(output_dir, exist_ok=True)


def _read_section(anchor_id):
    """Return the text of the profile section anchored by *anchor_id*.

    Looks up the third ``div`` under the parent of the element whose id is
    *anchor_id*, keeps every second line of its text (per the original
    author's note, each line is rendered twice) and joins the kept lines with
    commas. Returns the string ``'None'`` when the section is not on the page.
    """
    try:
        raw_text = chrome.find_element(By.XPATH, f'//*[@id="{anchor_id}"]/../div[3]').text
    except NoSuchElementException:
        return 'None'
    return ','.join(raw_text.split('\n')[1::2])


try:
    for page_no in range(1, total_page + 1):
        # Load one page of search results.
        chrome.get(f'https://www.linkedin.com/search/results/people/?keywords=%E5%8C%BB%E8%8D%AF%E7%8C%8E%E5%A4%B4&origin=SWITCH_SEARCH_VERTICAL&page={page_no}')
        time.sleep(2)  # same fixed wait the script uses after every other page load

        # Collect all hrefs up front: navigating to a profile invalidates the
        # link elements located on the results page.
        profile_links = [
            anchor.get_attribute('href')
            for anchor in chrome.find_elements(By.XPATH, _PROFILE_LINK_XPATH)
        ]

        for profile_url in profile_links:
            if not profile_url:
                continue
            time.sleep(2)
            chrome.get(profile_url)
            time.sleep(2)

            name = chrome.find_element(By.XPATH, '//*[@id="main"]/section[1]/div[2]/div[2]/div[1]/div[1]/h1').text
            title = chrome.find_element(By.XPATH, '//*[@id="main"]//section[1]/div[2]/div[2]/div[1]/div[2]').text
            location = chrome.find_element(By.XPATH, '//*[@id="main"]/section[1]/div[2]/div[2]/div[2]/span[1]').text

            # A fresh dict per person so records never share state.
            item = {
                'name': name,
                'title': title,
                'location': location,
                'info': _read_section('about'),
                'experience': _read_section('experience'),
                'education': _read_section('education'),
                'ability': _read_section('skills'),
            }

            # One file per person, named after them; records are appended so a
            # repeated name accumulates entries instead of overwriting.
            file_stem = _ILLEGAL_FILENAME_CHARS.sub('', name)
            try:
                with open(os.path.join(output_dir, f'{file_stem}.json'), 'a', encoding='utf-8') as f:
                    f.write(json.dumps(item, ensure_ascii=False) + ',\n')
            except OSError as err:
                print('error:' + str(err))
            time.sleep(3)
        time.sleep(2)
    # Pause five seconds before closing the browser.
    time.sleep(5)
finally:
    # Always release the browser, even if scraping failed part-way.
    chrome.quit()

4、chromedriver.exe驱动的下载地址:

链接:https://pan.baidu.com/s/1ATLdFEQfVvCzLpSqd2W3Mg?pwd=5555&_at_=1662003753714 
提取码:5555

猜你喜欢

转载自blog.csdn.net/weixin_39357271/article/details/126639824