Web crawler >>> Selenium module

Selenium Introduction

  Selenium was originally an automated testing tool; in web crawling it is used to solve the problem that requests cannot execute JavaScript code directly.

  Essence: it drives a real browser and simulates browser operations (navigating, clicking, scrolling down, etc.) in order to obtain the page as it looks after rendering.

  Note: multiple browsers are supported, such as Chrome, Firefox, PhantomJS, Safari, Edge, etc.

installation

pip3 install selenium

You also need to download chromedriver.exe and place it in the Scripts directory of your Python installation (a directory that is on PATH).
Domestic mirror site: http://npm.taobao.org/mirrors/chromedriver/2.38/
Official website: https://sites.google.com/a/chromium.org/chromedriver/downloads
Placing it directly in the root directory of the PyCharm project also works.

 Basic use

from selenium import webdriver

browser = webdriver.Chrome()  # create a browser object, using Google Chrome as the example
browser.get(url)
browser.page_source  # the returned result (HTML page data)
browser.current_url  # the URL of the current page
browser.get_cookies()  # after a login request, returns all cookies
browser.get_cookie(key)  # after a login request, returns the cookie whose name is key

 

 No browser interface

  Headless mode requires Chrome version 59/60 or above; it is used as follows:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')
# set the browser resolution
chrome_options.add_argument('--disable-gpu')
# Google's documentation mentions that this flag is needed to work around a bug
chrome_options.add_argument('--hide-scrollbars')
# hide the scroll bars, to cope with some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')
# do not load images, to improve speed
chrome_options.add_argument('--headless')
# the browser shows no visible window; on Linux, if the system has no display support, startup fails without this flag
chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
# manually specify the location of the browser
browser = webdriver.Chrome(chrome_options=chrome_options)  # pass the chrome_options parameter
browser.get('https://www.baidu.com')
print('hao123' in browser.page_source)
browser.close()  # remember to close the browser to release resources

 Advanced Use

# 1. find_element_by_id: look up by id; an id is unique, so there is exactly one match
browser.find_element_by_id('kw')


# 2. find_element_by_link_text: find the first link whose text matches
# 2.1 find_elements_by_link_text: find all links whose text matches [returns a list]
login = browser.find_element_by_link_text('登录')  # find the login link ('登录' means "log in")
login.click()  # click it


# 3. find_element_by_partial_link_text: find the first link whose text contains the given fragment
# 3.1 find_elements_by_partial_link_text: find all links whose text contains the given fragment
login = browser.find_elements_by_partial_link_text('录')[0]  # '录' is a fragment of the link text '登录'
login.click()



# 4. find_element_by_tag_name: find the first element with the given tag name
# 4.1 find_elements_by_tag_name: find all elements with the given tag name
browser.find_element_by_tag_name('a')


# 5. find_element_by_class_name: find the first element with the given class name
# 5.1 find_elements_by_class_name: find all elements with the given class name
button = browser.find_element_by_class_name('kkk')
button.click()


# 6. find_element_by_name: find the first element whose name attribute matches
# 6.1 find_elements_by_name: find all elements whose name attribute matches
input_user = browser.find_element_by_name('userName')
input_pwd = browser.find_element_by_name('password')
input_user.send_keys('account')
input_pwd.send_keys('password')
submit_button = browser.find_element_by_id('TANGRAM__PSP_10__submit')
submit_button.click()



# 7. find_element_by_css_selector: find the first element matching a CSS selector
# 7.1 find_elements_by_css_selector: find all elements matching a CSS selector
browser.find_element_by_css_selector('#kw')  # select by id
browser.find_element_by_css_selector('.kw')  # select by class

# -----------------------------------------------------------------------------
# 8. find_element_by_xpath: find the first element matching an XPath expression
# 8.1 find_elements_by_xpath: find all elements matching an XPath expression

xpath use

  Detailed usage: https://www.w3school.com.cn/xpath/xpath_syntax.asp

  Basic grammar:

                      

  Wildcards:

                     

   Example:

                      


                       

                        

    例子使用:https://www.cnblogs.com/xiaoyuanqujing/articles/11805718.html


  获取标签属性

from selenium import webdriver
from selenium.webdriver.common.by import By  # lookup strategy, e.g. By.ID, By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

# Demonstrates reading attributes and basic properties of a located element,
# then typing text into an input box.
browser = webdriver.Chrome()
browser.get('https://www.amazon.cn/')
# Equivalent to: browser.find_element_by_css_selector('#cc-lm-tcgShowImgContainer img')
tag = browser.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')

# get_attribute() requires the attribute name; calling it without one raises
# TypeError. 'outerHTML' shows the whole tag with all of its attributes.
print(tag.get_attribute('outerHTML'))
# Read the tag's src attribute
print(tag.get_attribute('src'))

# Element id, location, tag name and size (for reference)
print(tag.id)
print(tag.location)
print(tag.tag_name)
print(tag.size)

# The 'kw' input is the one this tutorial looks up on Baidu, so load Baidu
# first instead of searching for it on the Amazon page.
browser.get('https://www.baidu.com')
input_tag = browser.find_element(By.ID, 'kw')
input_tag.send_keys('美女')  # on Python 2, Chinese input needs a u prefix on the string
input_tag.send_keys(Keys.ENTER)  # press Enter

# quit() ends the whole driver session; close() only closes the current window.
browser.quit()

   隐式/显式等待

# The note below says: Selenium only simulates browser behaviour, and a browser
# needs time to parse a page (running CSS and JS). Some elements may only show
# up after a while, so waiting is required to be sure they can be found.
'''
selenium只是模拟浏览器的行为,而浏览器解析页面是需要时间的(执行css,js)
一些元素可能需要过一段时间才能加载出来,为了保证能查找到元素,必须等待 
'''

# There are two ways to wait.
# 1. Implicit wait: configured before browser.get('xxx') and applied to every
#    element lookup   --------------> recommended
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

browser = webdriver.Chrome()
# Implicit wait: when a looked-up element is not loaded yet, wait up to 10
# seconds (the lookup returns as soon as the element appears; it does not
# always take the full 10 seconds).
browser.implicitly_wait(time_to_wait=10)  # set before get
browser.get(url='https://www.baidu.com')
# ...then carry out the actual operations


# 2. Explicit wait: configured after browser.get('xxx') and applied to one
#    specific element only
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

browser = webdriver.Chrome()
browser.get(url='https://www.baidu.com')
# Created after get: wait at most 10 seconds for the condition below.
wait = WebDriverWait(driver=browser, timeout=10)
wait.until(method=EC.presence_of_element_located(locator=(By.ID, 'content_left')))
# ...then carry out the actual operations

  功能补充 

from selenium import webdriver

# Tour of miscellaneous WebDriver features: history navigation, cookies,
# tab switching and running custom JavaScript.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.back()   # simulate the browser's Back button
browser.forward()   # simulate the browser's Forward button
browser.get_cookies()  # read all cookies
# add_cookie() takes a dict with at least 'name' and 'value' keys, not a string.
browser.add_cookie({'name': 'example_name', 'value': 'example_value'})
browser.delete_all_cookies()  # delete all cookies
handles = browser.window_handles  # handles of all open tabs
# Index 1 is the second tab; only switch when one is actually open, otherwise
# the index lookup raises IndexError.
if len(handles) > 1:
    browser.switch_to.window(handles[1])
# When an interaction is hard to express through the API, write the JavaScript
# yourself and run it with execute_script.
browser.execute_script('alert("hello world")')  # show an alert box
# quit() ends the whole driver session; close() only closes the current window.
browser.quit()

  异常处理

    一般都在 try 里面进行任务的执行,一旦发现报错,会在 except 里面捕获,且最终都要在 finally 里面执行关闭浏览器功能

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

# Start the browser before entering the try block: if startup itself fails
# there is nothing to clean up, and the finally clause must not reference a
# name that was never assigned (which would mask the real startup error).
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): the frame name looks deliberately misspelled so that
    # NoSuchFrameException is raised for the demonstration - confirm.
    browser.switch_to.frame('iframssseResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    # Always release the browser, whether or not an error occurred. quit()
    # ends the whole driver session; close() only closes the current window.
    browser.quit()

练习:爬取京东商城商品信息

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time


def _print_page_goods(driver):
    """Print name, link, price and comment text of every 'gl-item' element on the current page."""
    for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
        detail_url = good.find_element(By.TAG_NAME, 'a').get_attribute('href')

        p_name = good.find_element(By.CSS_SELECTOR, '.p-name em').text.replace('\n', '')
        price = good.find_element(By.CSS_SELECTOR, '.p-price i').text
        p_commit = good.find_element(By.CSS_SELECTOR, '.p-commit a').text

        msg = '''
            商品 : %s
            链接 : %s
            价钱 :%s
            评论 :%s
            ''' % (p_name, detail_url, price, p_commit)

        print(msg, end='\n\n')


def get_goods(driver, max_pages=None):
    """Print the goods on the current result page, then keep following the "next page" link.

    Pages are walked in a loop rather than by recursion, so the number of
    pages is not limited by the interpreter's recursion depth.

    Args:
        driver: a Selenium WebDriver already showing a search-result page.
        max_pages: optional upper bound on the number of pages to print;
            ``None`` (the default) means keep going until no "下一页" link
            is found.

    Returns:
        None. Results are printed to standard output.

    A WebDriver error (for example a product entry that lacks one of the
    expected sub-elements) is printed and ends the crawl; it is not re-raised,
    so the caller can still shut the browser down normally.
    """
    # Imported locally so the function brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    pages_done = 0
    while True:
        try:
            _print_page_goods(driver)
            pages_done += 1
            if max_pages is not None and pages_done >= max_pages:
                return

            # find_elements returns an empty list instead of raising when
            # nothing matches, which makes "no next page" an ordinary branch.
            next_links = driver.find_elements(By.PARTIAL_LINK_TEXT, '下一页')
            if not next_links:
                return
            next_links[0].click()
        except WebDriverException as e:
            # Best effort: report why the crawl stopped instead of hiding it.
            print(e)
            return
        # Give the next page a moment to load before reading it.
        time.sleep(1)

    
def spider(url, keyword):
    """Open *url* in Chrome, search for *keyword* and print the resulting goods.

    Args:
        url: address of the page that contains the search box (element id 'key').
        keyword: text typed into the search box before pressing Enter.

    The browser is always shut down when the function exits, including when
    loading the page or running the search fails.
    """
    driver = webdriver.Chrome()
    try:
        # Set the implicit wait before loading the page so that it applies to
        # every element lookup that follows.
        driver.implicitly_wait(3)
        driver.get(url)
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window.
        driver.quit()

if __name__ == '__main__':
    # Script entry point: search the JD home page for the sample keyword.
    start_url = 'https://www.jd.com/'
    spider(start_url, keyword='iPhone8手机')

 

Guess you like

Origin www.cnblogs.com/pupy/p/11989892.html