Selenium was originally an automated testing tool; in web crawlers it is used to solve the problem that requests cannot execute JavaScript code directly.
Essence: it drives a real browser and simulates browser operations (navigation, clicks, drop-downs, etc.) in order to obtain the result after the page has been rendered.
Note: multiple browsers are supported, such as Chrome, Firefox, PhantomJS, Safari, Edge, etc.
installation
pip3 install selenium. You also need to download chromedriver.exe to use with it, and put it in the Scripts directory under the Python installation path.
Mirror site in China: http://npm.taobao.org/mirrors/chromedriver/2.38/ Official website: https://sites.google.com/a/chromium.org/chromedriver/downloads
Alternatively, it can be placed directly in the root directory of the PyCharm project.
Basic use
from selenium import webdriver
browser = webdriver.Chrome()  # create a browser object, using Google Chrome as the example
browser.get(url)
browser.page_source  # the returned result (HTML data of the page)
browser.current_url  # URL of the current page
browser.get_cookies()  # if this was a login request, returns all of the cookie values
browser.get_cookie(key)  # if this was a login request, returns the cookie whose name is key
No browser interface
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')
# Browser resolution
chrome_options.add_argument('--disable-gpu')
# The Google documentation mentions that this flag is needed to work around a bug
chrome_options.add_argument('--hide-scrollbars')
# Hide the scroll bars, to cope with some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')
# Do not load images, to improve speed
chrome_options.add_argument('--headless')
# Run the browser without a visible window; on Linux, if the system has no display, the browser fails to start without this flag.
chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
# Manually specify the location of the browser executable
browser = webdriver.Chrome(chrome_options=chrome_options)  # pass in the chrome_options settings
browser.get('https://www.baidu.com')
print('hao123' in browser.page_source)
browser.close()  # remember to close the browser so its resources are released
Advanced Use
# 1. find_element_by_id: find the one and only element with the given id
browser.find_element_by_id('kw')

# 2. find_element_by_link_text: find the first link whose text matches
# 2.1 find_elements_by_link_text: find all links whose text matches (returns a list)
login = browser.find_element_by_link_text('login')  # find the login button
login.click()  # click it

# 3. find_element_by_partial_link_text: find the first link whose text contains the given fragment
# 3.1 find_elements_by_partial_link_text: find all links whose text contains the given fragment
login = browser.find_elements_by_partial_link_text('record')[0]
login.click()

# 4. find_element_by_tag_name: find the first element with the given tag name
# 4.1 find_elements_by_tag_name: find all elements with the given tag name
browser.find_element_by_tag_name('a')

# 5. find_element_by_class_name: find the first element with the given class name
# 5.1 find_elements_by_class_name: find all elements with the given class name
button = browser.find_element_by_class_name('KKK')
button.click()

# 6. find_element_by_name: find the first element with the given name attribute
# 6.1 find_elements_by_name: find all elements with the given name attribute
input_user = browser.find_element_by_name('userName')
input_pwd = browser.find_element_by_name('password')
input_user.send_keys('Account')
input_pwd.send_keys('password')
submit_button = browser.find_element_by_id('TANGRAM__PSP_10__submit')
submit_button.click()

# 7. find_element_by_css_selector: find the first element matching a CSS selector
# 7.1 find_elements_by_css_selector: find all elements matching a CSS selector
browser.find_element_by_css_selector('#kw')  # by id
browser.find_element_by_css_selector('.kw')  # by class
# -------------------------------------------------------------------------------------
# 8. find_element_by_xpath: find the first element matching an XPath expression
# 8.1 find_elements_by_xpath: find all elements matching an XPath expression
For detailed usage, see: https://www.w3school.com.cn/xpath/xpath_syntax.asp
Basic syntax:
Wildcards:
Example:
例子使用:https://www.cnblogs.com/xiaoyuanqujing/articles/11805718.html
获取标签属性
from selenium import webdriver
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements to load

# Demo: locate a tag, read its attributes/properties, then type into an input box.
browser = webdriver.Chrome()
try:
    browser.get('https://www.amazon.cn/')

    # Equivalent to browser.find_element_by_css_selector('#cc-lm-tcgShowImgContainer img')
    tag = browser.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')

    # Read a tag attribute. get_attribute() requires the attribute name, so
    # the argument-less call was dropped: it raised TypeError before anything
    # else in the script could run.
    print(tag.get_attribute('src'))  # the tag's src attribute

    # Tag id, location, tag name and size (for reference)
    print(tag.id)
    print(tag.location)
    print(tag.tag_name)
    print(tag.size)

    # NOTE(review): the element id 'kw' is looked up on the page loaded above;
    # confirm that this page actually contains such an element.
    input_tag = browser.find_element(By.ID, 'kw')
    input_tag.send_keys('美女')  # on Python 2, prefix the string with u to enter Chinese text
    input_tag.send_keys(Keys.ENTER)  # press Enter
finally:
    # quit() ends the whole session (all windows and the driver process),
    # and runs even when a lookup above raises.
    browser.quit()
# Selenium only simulates the behaviour of a browser, and a browser needs time
# to process a page (executing CSS and JS). Some elements may only appear
# after a while, so to be sure an element can be found you have to wait.

# There are two ways of waiting:

# 1. Implicit wait: set before browser.get('xxx'); applies to every element
#    lookup  -------------->  recommended
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements to load

browser = webdriver.Chrome()
try:
    # Implicit wait: when looking up any element that has not loaded yet, wait
    # up to 10 seconds (the lookup returns as soon as the element appears; it
    # does not always take 10 seconds).
    browser.implicitly_wait(10)  # before get

    browser.get('https://www.baidu.com')
    # ... then perform the operations
finally:
    # Release this browser before the next example starts another one.
    browser.quit()

# 2. Explicit wait: set after browser.get('xxx'); applies to one specific element only
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements to load

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    wait = WebDriverWait(browser, 10)  # after get
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    # ... then perform the operations
finally:
    browser.quit()
功能补充
from selenium import webdriver

# Demo of assorted browser features: history navigation, cookies, tabs and
# running custom JavaScript.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.back()  # simulate the browser's Back button
    browser.forward()  # simulate the browser's Forward button

    browser.get_cookies()  # get the cookies
    # add_cookie() takes a dict that must contain at least 'name' and 'value';
    # the original passed a placeholder string, which cannot work.
    browser.add_cookie({'name': 'foo', 'value': 'bar'})  # add a cookie
    browser.delete_all_cookies()  # delete all cookies

    handles = browser.window_handles  # get all tabs
    # Index 1 is the second tab; only switch when it exists, otherwise the
    # lookup raises IndexError. switch_to.window matches the switch_to.frame
    # style used elsewhere in this file.
    if len(handles) > 1:
        browser.switch_to.window(handles[1])

    # When an interaction is hard to express otherwise, write your own JS ---> execute_script
    browser.execute_script('alert("hello world")')  # show an alert
finally:
    # quit() ends the whole session and runs even if a step above raises.
    browser.quit()
异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

# Demo: handling Selenium exceptions while switching into a frame.
# The browser is created before the try block: if start-up itself fails there
# is nothing to clean up, and the finally clause must not touch an unbound
# name (which would raise NameError and hide the real error).
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): the frame name looks deliberately misspelled so that
    # NoSuchFrameException is raised for the demo; confirm before "fixing" it.
    browser.switch_to.frame('iframssseResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    # quit() ends the whole session, including the driver process.
    browser.quit()
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements to load
import time


def get_goods(driver, max_pages=None):
    """Print every product on the current result page and on following pages.

    For each element with class ``gl-item`` the link, name, price and comment
    count are printed. The function then clicks the link whose text contains
    '下一页' (next page) and repeats until that link is missing, the click
    fails, or ``max_pages`` pages have been processed.

    Args:
        driver: A Selenium WebDriver already showing a result page.
        max_pages: Optional upper bound on the number of pages to process;
            ``None`` (the default) means no explicit bound.
    """
    page = 0
    # Iterate instead of recursing once per page, so a long result list
    # cannot exhaust the interpreter's recursion limit.
    while max_pages is None or page < max_pages:
        page += 1
        for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
            try:
                detail_url = good.find_element(By.TAG_NAME, 'a').get_attribute('href')
                p_name = good.find_element(By.CSS_SELECTOR, '.p-name em').text.replace('\n', '')
                price = good.find_element(By.CSS_SELECTOR, '.p-price i').text
                p_commit = good.find_element(By.CSS_SELECTOR, '.p-commit a').text
            except (NoSuchElementException, StaleElementReferenceException):
                # Best effort: skip an item that lacks one of the fields or
                # that went stale, rather than aborting the whole crawl.
                continue
            msg = ''' 商品 : %s 链接 : %s 价钱 :%s 评论 :%s ''' % (p_name, detail_url, price, p_commit)
            print(msg, end='\n\n')

        try:
            driver.find_element(By.PARTIAL_LINK_TEXT, '下一页').click()
        except NoSuchElementException:
            # No "next page" link: the last page has been processed.
            return
        except WebDriverException as exc:
            # The link exists but could not be clicked; report it instead of
            # silently swallowing the error, then stop paging.
            print('stopped paging: %s' % exc)
            return
        time.sleep(1)  # give the next page a moment to start loading


def spider(url, keyword):
    """Open ``url``, search for ``keyword`` and print all resulting products.

    Args:
        url: Address of the page containing the search box with id ``key``.
        keyword: Text typed into the search box before pressing Enter.
    """
    driver = webdriver.Chrome()
    try:
        # Implicit wait, set before the page is loaded so that it covers
        # every subsequent element lookup.
        driver.implicitly_wait(3)
        driver.get(url)
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole session (all windows and the driver process).
        driver.quit()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='iPhone8手机')