Crawling LinkedIn data with Python and Selenium

# -*- coding: utf-8 -*-
import csv
import os
import pickle
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Run Chrome headless; drop the "--headless" argument to watch the browser while debugging.
option = webdriver.ChromeOptions()
option.add_argument("--headless")
browser = webdriver.Chrome(options=option)

def getLinkedinCookies():
    # Log in once and capture the session cookies for later reuse.
    url = 'https://www.linkedin.com/feed/'
    browser.get('https://www.linkedin.com/login/')
    time.sleep(1)
    browser.find_element(By.XPATH, '//*[@id="username"]').send_keys('[email protected]')
    browser.find_element(By.XPATH, '//*[@id="password"]').send_keys('123456yp')
    browser.find_element(By.XPATH, '//*[@id="app__container"]/main/div/form/div[3]/button').click()

    # Wait until the login redirect lands on the feed page, then save the cookies.
    while browser.current_url != url:
        print("Please log in to linkedin.com!")
        time.sleep(1)
    # Keep the browser session open so the caller can continue using it.
    rawCookies = browser.get_cookies()
    cookies = {}
    for item in rawCookies:
        cookies[item['name']] = item['value']
    with open('linkedinCookies.pickle', 'wb') as outputFile:
        pickle.dump(cookies, outputFile)
    return cookies
def readLinkedinCookies():
    # Use the cached cookie file if present; otherwise log in to create it.
    if os.path.exists('linkedinCookies.pickle'):
        with open('linkedinCookies.pickle', 'rb') as inputFile:
            cookies = pickle.load(inputFile)
    else:
        cookies = getLinkedinCookies()
    return cookies
def findTotalPage():
    cookies = readLinkedinCookies()
    browser.get("https://www.linkedin.com")
    browser.delete_all_cookies()
    for name in cookies:
        browser.add_cookie({
            "domain": ".linkedin.com",
            "name": name,
            "value": cookies[name],
            "path": '/',
            "expires": None
        })
    browser.get("https://www.linkedin.com/search/results/all/?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B&origin=GLOBAL_SEARCH_HEADER")
    time.sleep(1)
    # Scroll to the bottom so the pagination bar is rendered.
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(1)
    # Absolute XPath to the last page number; fragile if LinkedIn changes its layout.
    element = browser.find_element(By.XPATH, '/html/body/div[5]/div[7]/div[4]/div/div[2]/div/div[2]/div/div/div/div/div[1]/artdeco-pagination/ul/li[10]/button/span')
    return element.text
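
# A sturdier variant (a sketch, not from the original post): wait for the
# pagination bar explicitly instead of sleeping, and use a shorter CSS
# selector. The selector below is an assumption about LinkedIn's markup at
# the time of writing and may need adjusting.
def findTotalPageRobust(timeout=10):
    last_page = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'artdeco-pagination ul li:last-child button span')))
    return last_page.text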
def getInfo():
    cookies = readLinkedinCookies()
    browser.get("https://www.linkedin.com")
    time.sleep(2)
    browser.delete_all_cookies()
    for name in cookies:
        browser.add_cookie({
            "domain": ".linkedin.com",
            "name": name,
            "value": cookies[name],
            "path": '/',
            "expires": None
        })
    browser.get('https://www.linkedin.com/search/results/all/?keywords=%E6%9F%AF%E9%A9%AC%E5%B7%A5%E7%A8%8B&origin=GLOBAL_SEARCH_HEADER')
    time.sleep(0.5)
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(2)
    # Each search hit is wrapped in a .search-result__wrapper container.
    results = browser.find_elements(By.CSS_SELECTOR, '.search-result__wrapper')
    person = []
    for li in results:
        name = li.find_element(By.CSS_SELECTOR, 'span.actor-name').text
        job = li.find_element(By.CSS_SELECTOR, 'p.subline-level-1').text
        location = li.find_element(By.CSS_SELECTOR, 'p.subline-level-2').text
        # The company snippet is not present on every result card.
        if isElementExist(li, 'p.search-result__snippets'):
            company = li.find_element(By.CSS_SELECTOR, 'p.search-result__snippets').text
        else:
            company = None
        person.append([name, job, location, company])
    with open('data.csv', 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(["name", "job", "location", "company"])
        writer.writerows(person)

def isElementExist(parent, selector):
    # True if a child element matching the CSS selector exists under parent.
    try:
        parent.find_element(By.CSS_SELECTOR, selector)
        return True
    except NoSuchElementException:
        return False
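
# Alternative existence check (a sketch, not in the original post):
# find_elements() returns an empty list instead of raising, so the
# try/except can be avoided entirely.
def isElementExistAlt(parent, selector):
    return len(parent.find_elements(By.CSS_SELECTOR, selector)) > 0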
if __name__ == "__main__":
    getInfo()
    browser.quit()

To build a large dataset, I looked at the tools available online and found that none of them could crawl this data, so I used Selenium to drive the browser and wrote the crawler myself. It searches LinkedIn by company name and crawls each employee's basic information.
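The script above hard-codes one URL-encoded keyword (柯马工程). Below is a minimal sketch of parameterizing the search by company name and walking the result pages; the buildSearchUrl helper and LinkedIn's page query parameter are assumptions for illustration, not part of the original script.

from urllib.parse import quote

def buildSearchUrl(company, page=1):
    # Hypothetical helper: URL-encode the company name; the 'page' parameter
    # is an assumption about LinkedIn's search URL scheme.
    return ('https://www.linkedin.com/search/results/all/'
            '?keywords=' + quote(company)
            + '&origin=GLOBAL_SEARCH_HEADER&page=' + str(page))

# Usage: crawl every result page reported by findTotalPage().
# total = int(findTotalPage())
# for p in range(1, total + 1):
#     browser.get(buildSearchUrl('柯马工程', p))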

Reprinted from: blog.csdn.net/qq_25194685/article/details/97265267