(六)爬虫之使用selenium

  selenium是使用javascript编写,主要用来进行web应用程序测试,在python爬虫中可以用来进行动态网页爬取,解决爬虫中的javascript渲染(执行js语句)。总结记录下,以备后面学习

1. selenium基本使用

  安装:pip install selenium

  查看支持的浏览器: 安装完后,命令行输入下面语句,可以查看selenium支持的web浏览器,如下图所示

    from selenium import webdriver

    help(webdriver)

  简单使用:这里使用Firefox浏览器,首先得下载Firefox的驱动geckodriver到本地,并将其路径配置到环境变量,下面为简单使用。

from selenium import webdriver

browser = webdriver.Firefox()  #firefox驱动
browser.get("https://www.zhihu.com/signup?next=%2F")  #浏览器打开网页
print(browser.page_source)  #打印网页源代码
browser.close()  # 关闭浏览器

  (报错Message: 'geckodriver' executable needs to be in PATH时,  下载geckodriver,并将其路径配置到环境变量)

  简单使用方法:

# Basic navigation with a WebDriver session
driver = webdriver.Firefox()  # start a Firefox session (geckodriver must be on PATH)
driver.get("http://www.example.com")  # open the page
driver.forward()  # go one step forward in the browser history
driver.back()  # go one step back in the browser history
driver.close()  # close the browser window

  使用cookie:

# Go to the correct domain
driver.get("http://www.example.com")

# Now set the cookie. This one's valid for the entire domain
cookie = {'name': 'foo', 'value': 'bar'}  # plain ASCII quotes; typographic quotes are a syntax error
driver.add_cookie(cookie)  # add the cookie; it applies to every URL under this domain

# And now fetch all the available cookies for the current URL
driver.get_cookies()  # returns the cookies

2.查找元素

        webdriver打开网页后,可以查找网页中的单个或多个元素,并对其操作,下面为常用方法

https://selenium-python.readthedocs.io/locating-elements.html

#单个元素
find_element_by_id()   通过id属性
find_element_by_name()   通过name属性
find_element_by_xpath()      xpath选择器
find_element_by_link_text()  通过超链接文本定位
find_element_by_partial_link_text()  通过部分超链接文本定位
find_element_by_tag_name()   标签名称
find_element_by_class_name()  类选择器
find_element_by_css_selector()    css选择器

#多个元素  (返回一个列表)
find_elements_by_name
find_elements_by_id
find_elements_by_xpath
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector

  通过元素name属性查找:

<html>
 <body>
  <form id="loginForm">
   <input name="username" type="text" />
   <input name="password" type="password" />
   <input name="continue" type="submit" value="Login" />
   <input name="continue" type="button" value="Clear" />
  </form>
</body>
<html>

username = driver.find_element_by_name('username')
password = driver.find_element_by_name('password')
find_element_by_name

  通过超链接文本查找:

<html>
 <body>
  <p>Are you sure you want to do this?</p>
  <a href="continue.html">Continue</a>
  <a href="cancel.html">Cancel</a>
</body>
<html>

continue_link = driver.find_element_by_link_text('Continue')  #Continue为a标签的文本
continue_link = driver.find_element_by_partial_link_text('Conti')
find_element_by_link_text()

  通过Xpath选择器

<html>
 <body>
  <form id="loginForm">
   <input name="username" type="text" />
   <input name="password" type="password" />
   <input name="continue" type="submit" value="Login" />
   <input name="continue" type="button" value="Clear" />
  </form>
</body>
<html>

#查找form元素
login_form = driver.find_element_by_xpath("/html/body/form[1]")
login_form = driver.find_element_by_xpath("//form[1]")
login_form = driver.find_element_by_xpath("//form[@id='loginForm']")

#查找form表单中的username输入框input
username = driver.find_element_by_xpath("//form[input/@name='username']")
username = driver.find_element_by_xpath("//form[@id='loginForm']/input[1]")
username = driver.find_element_by_xpath("//input[@name='username']")

#查找form表单中的clear button
clear_button = driver.find_element_by_xpath("//input[@name='continue'][@type='button']")
clear_button = driver.find_element_by_xpath("//form[@id='loginForm']/input[4]")
find_element_by_xpath

  通过css_selector选择器

<html>
 <body>
  <p class="content">Site content goes here.</p>
</body>
<html>

content = driver.find_element_by_css_selector('p.content')
find_element_by_css_selector()

  还可以通过find_element()和find_elements()两个方法查找,使用如下:

from selenium.webdriver.common.by import By

driver.find_element(By.XPATH, '//button[text()="Some text"]')
driver.find_elements(By.XPATH, '//button')


#By支持的其他属性

ID = "id"                          (By.ID)
XPATH = "xpath"
LINK_TEXT = "link text"
PARTIAL_LINK_TEXT = "partial link text"
NAME = "name"
TAG_NAME = "tag name"
CLASS_NAME = "class name"
CSS_SELECTOR = "css selector"
find_element()

3. 元素操控

   查找到元素后,可以获得元素的相关属性,并对元素进行操控,下面为常用方法:

#元素操作
元素.click() 点击元素
元素.clear()  清除元素的内容
元素.send_keys(value)  给元素赋值
元素.get_attribute("class") 获得元素属性

#元素属性
元素.text                   获得元素文本
元素.id                       获得元素id
元素.location                获得元素位置(坐标)
元素.tag_name          获得元素标签名
元素.size                   获得元素大小

  对于表单中的select标签,selenium也可以进行选中和反选,和获取相应的options,方法如下:

https://blog.csdn.net/huilan_same/article/details/52246012
https://selenium-python.readthedocs.io/navigating.html

#选中select中某一项
from selenium.webdriver.support.ui import Select
select = Select(driver.find_element_by_name('name'))  #初始化select
select.select_by_index(index)    #通过options编号查找,第一个option的index=0
select.select_by_visible_text("text")  #通过选择text=text的值,即在下拉时我们可以看到的文本
select.select_by_value(value)  #通过options的value属性选择


#反选
select = Select(driver.find_element_by_id('id'))
select.deselect_all()
select.deselect_by_index(index)
select.deselect_by_value(value)
select.deselect_by_visible_text(text)

#查看选择的option
select = Select(driver.find_element_by_xpath("//select[@name='name']"))
all_selected_options = select.all_selected_options  #所有选择的options
select.first_selected_option   #第一个选择的option
options = select.options    #所有的options

4.window和frame切换

   一般大型的网页window中都会包含frame,selenium可以在多个window,多个frame,window和frame之间进行切换,使用如下:

#切换窗口
<a href="somewhere.html" target="windowName">Click here to open a new window</a>
driver.switch_to_window("windowName")  #切换到windw::windowName
driver.switch_to_default_content() #切回到父窗口

#切换到frame
driver.switch_to_frame("frameName") #切换到frame
driver.switch_to.frame("app_canvas_frame") #切换到frame,和switch_to_frame()一样
driver.switch_to.parent_frame() #切回到父frame

#切换到弹出窗口
alert = driver.switch_to_alert()

  selenium还支持移动一个元素到一定位置,主要利用drag_and_drop()方法:

element = driver.find_element_by_name("source")
target = driver.find_element_by_name("target")

from selenium.webdriver import ActionChains
action_chains = ActionChains(driver)  #添加责任链
action_chains.drag_and_drop(element, target).perform()  #从element移动到target
drag_and_drop()

5. 显式等待和隐式等待

       由于ajax的使用,当打开网页时,有些元素需要一定的时间通过ajax进行传输才显示在网页中,因此当利用selenium查找元素时,需要等待一段时间,不然会抛出错误(ElementNotVisibleException exception),selenium主要有显式等待(Explicit waits)和隐式等待(implicit waits)两种方式

  Explicit waits : 在某个条件满足前,浏览器会等待一段时间;若等待设定时间后,条件依旧没满足,则抛出错误,若条件提前满足,则结束等待。使用如下:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))  #在ID为myDynamicElement的元素出现前,等待10s
    )
finally:
    driver.quit()

  selenium还支持如下的等待事件:

#等待事件
    title_is
    title_contains
    presence_of_element_located
    visibility_of_element_located
    visibility_of
    presence_of_all_elements_located
    text_to_be_present_in_element
    text_to_be_present_in_element_value
    frame_to_be_available_and_switch_to_it
    invisibility_of_element_located
    element_to_be_clickable
    staleness_of
    element_to_be_selected
    element_located_to_be_selected
    element_selection_state_to_be
    element_located_selection_state_to_be
    alert_is_present

#使用示例
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
element = wait.until(EC.element_to_be_clickable((By.ID, 'someid')))
等待事件

  自定义等待事件:

class element_has_css_class(object):
  """An expectation for checking that an element has a particular css class.

  locator - (by, value) pair used to find the element
  css_class - the class name to look for; several whitespace-separated
              names may be given, in which case all of them must be present
  returns the WebElement once it has the particular css class, otherwise False
  """
  def __init__(self, locator, css_class):
    self.locator = locator
    self.css_class = css_class

  def __call__(self, driver):
    element = driver.find_element(*self.locator)   # Finding the referenced element
    # The attribute value may be None when the element carries no class
    # attribute; treat that as "no classes" instead of raising TypeError.
    class_attr = element.get_attribute("class") or ""
    # Compare whole class tokens so that "myCSS" does not match "myCSSClass".
    wanted = set(self.css_class.split())
    if wanted <= set(class_attr.split()):
        return element
    return False

# Wait until an element with id='myNewInput' has class 'myCSSClass'
wait = WebDriverWait(driver, 10)
element = wait.until(element_has_css_class((By.ID, 'myNewInput'), "myCSSClass"))
View Code

   implicit waits: 查找元素时,若元素没有立即出现,WebDriver会在设定时间内不断轮询DOM,超时后才抛出错误;若元素提前出现则立即返回。默认的等待时间为0,使用如下:

from selenium import webdriver

driver = webdriver.Firefox()
driver.implicitly_wait(10) # set the implicit wait timeout to 10 seconds
driver.get("http://somedomain/url_that_delays_loading")
myDynamicElement = driver.find_element_by_id("myDynamicElement")  # NOTE(review): per the Selenium docs the lookup polls for up to 10s and returns as soon as the element appears; it does not sleep a fixed 10s first

 除上面介绍的方法外,selenium还支持很多API方法,参考:https://selenium-python.readthedocs.io/api.html

6. selenium爬取QQ空间

   练习下selenium的使用,爬取朋友qq空间的所有说说到本地,并生成一张云图。整个思路如下:

    1,登陆网页版QQ,需要selenium输入用户名和密码,并点击登陆

    2,访问朋友的QQ空间,爬取说说。需要selenium下滑进度条,并点击翻页

    3,利用wordcloud模块,将爬取的说说生成一张云图

   爬取代码如下: 

#coding:utf-8
#登陆qq,进入朋友qq空间说说界面,爬取其所有的说说

from selenium import webdriver
import time
from lxml import html
from word_cloud import generate_wd

def login_qq(browser, qq, password):
    """Log in to the web version of QQ with an account number and password.

    browser  - the selenium WebDriver instance to drive
    qq       - the QQ account number (converted with str() before being typed)
    password - the account password

    Returns nothing; when it finishes the driver is focused on the top-level
    document again.
    """
    # Local import so this function does not rely on the file's import header.
    from selenium.webdriver.common.by import By

    browser.get("https://i.qq.com/")

    # The login form lives inside the "login_frame" iframe.
    # switch_to.frame() replaces switch_to_frame(), which Selenium 4 removed.
    browser.switch_to.frame("login_frame")

    # If selenium.common.exceptions.ElementNotInteractableException is raised
    # below, the frame has not finished rendering: add a short wait here.

    browser.find_element(By.ID, "switcher_plogin").click()  # choose account/password login

    browser.find_element(By.ID, "u").send_keys(str(qq))
    browser.find_element(By.ID, "p").send_keys(password)
    browser.find_element(By.ID, "login_button").click()  # submit the login form

    time.sleep(3)  # give the page 3 seconds to load after logging in

    browser.switch_to.default_content()  # leave the iframe, back to the top-level document
    
def crawel_qqzone(browser, friend_qq):
    """Crawl all posts ("shuoshuo") of a friend's QQ zone.

    browser   - a selenium WebDriver instance (login_qq is called on it first
                by this file's entry point)
    friend_qq - the friend's QQ number, used to build the zone URL

    The text of every post is appended, one per line and UTF-8 encoded, to
    qq_content.txt in the current directory; author, text and time are also
    printed. The browser window is closed before returning, including when
    an error interrupts the crawl.
    """
    # Local imports so this function does not rely on the file's import header.
    import io
    from selenium.webdriver.common.by import By

    # The trailing 311 selects the posts page of the zone.
    browser.get("https://user.qzone.qq.com/%s/311" % friend_qq)

    try:
        # io.open takes encoding= on both Python 2 and 3; the with-block
        # guarantees the file is closed even if the crawl fails midway.
        with io.open("qq_content.txt", "a+", encoding="utf-8") as out:
            page = 0
            while True:
                # Scroll down in five steps so the posts get loaded.
                # scrollBy is relative: each pass moves a further 2000*i pixels.
                # (document.body.scrollHeight reports the total page height.)
                for i in range(1, 6):
                    browser.execute_script("window.scrollBy(0,%d)" % (2000 * i))
                    time.sleep(3)

                # The posts are rendered inside the "app_canvas_frame" iframe.
                browser.switch_to.frame("app_canvas_frame")
                page_source = browser.page_source
                tree = html.fromstring(page_source.encode("utf-8"))

                for li_tag in tree.xpath("//ol[@id='msgList']/li"):
                    qq_name = li_tag.xpath("./div[3]/div[2]/a/text()")
                    qq_content = li_tag.xpath("./div[3]/div[2]/pre/text()")
                    qq_time = li_tag.xpath("./div[3]/div[4]/div[1]/span[1]/a/text()")
                    # Each xpath returns a list; keep the first hit or fall back to "".
                    qq_name = qq_name[0] if qq_name else ""
                    qq_content = qq_content[0] if qq_content else ""
                    qq_time = qq_time[0] if qq_time else ""
                    print(u"%s %s %s" % (qq_name, qq_content, qq_time))
                    out.write(qq_content + u"\n")

                # On the last page the "next page" link carries no id, so its
                # absence from the page source ends the crawl.
                next_id = "pager_next_%s" % page
                if next_id not in page_source:
                    break
                browser.find_element(By.ID, next_id).click()
                page += 1
                # Back to the parent frame so the next pass can scroll the page.
                browser.switch_to.parent_frame()
    finally:
        browser.close()
    
if __name__ == "__main__":
    # Log in first, then crawl the friend's posts with the same browser session.
    browser = webdriver.Firefox()
    login_qq(browser, 1368884216, "密码")
    crawel_qqzone(browser, 993342902)
    # Uncomment to render the crawled posts as a word cloud:
    #generate_wd("qq_content.txt")
爬取qq空间

     生成云图代码如下:

#coding:utf-8

from wordcloud import WordCloud
from matplotlib import pyplot as plt
import jieba
import chardet
import os

# Segment the text with jieba so that Chinese words are rendered correctly
def generate_wd(filename):
    """Render the text in *filename* as a word cloud, display it and save it.

    filename - path of a UTF-8 text file (the crawler in this tutorial writes
               qq_content.txt in that encoding)

    The image is shown with matplotlib and then written next to the input
    file, using the same base name with a .png extension.
    """
    # Local import so this function does not rely on the file's import header;
    # io.open takes encoding= on both Python 2 and 3.
    import io

    with io.open(filename, encoding="utf-8") as f:
        text = f.read()
    # Full-mode segmentation; the tokens are joined with spaces before being
    # handed to WordCloud.
    text = " ".join(jieba.cut(text, cut_all=True))
    wc = WordCloud(
        background_color="white",
        max_words=2000,
        font_path=r"C:\Windows\Fonts\STFANGSO.TTF",  # a Chinese font file is needed to draw Chinese text
        height=1200,
        width=1600,
        max_font_size=100,
        random_state=30,
    )

    myword = wc.generate(text)
    plt.imshow(myword)
    plt.axis("off")
    plt.show()
    # Same directory and base name as the input, with the extension swapped.
    # The split must go through os.path.splitext: str has no splitext method.
    wc.to_file(os.path.splitext(filename)[0] + ".png")
    
    
#使用字体编码来解决中文乱码
# def generate_wd(filename):
    
    # with open(filename) as f:
        # text = f.read()
    # code = chardet.detect(text)  #检测文件编码格式
    # wc = WordCloud(
        # background_color = "white",
        # max_words = 2000,
        # font_path =r"C:\Windows\Fonts\STFANGSO.TTF",  #需要中文字体文件来显示中文
        # height = 1200,
        # width = 1600,
        # max_font_size = 100,
        # random_state = 30,
    # )
    # myword = wc.generate(text.decode(code["encoding"]))  #中文需传入unicode字符
    # plt.imshow(myword)
    # plt.axis("off")
    # plt.show()
    # dirname,basename = os.path.split(filename)
    # fn,ext = basename.splitext(basename)
    # wc.to_file(os.path.join(dirname,fn+".png"))
    
if __name__ == "__main__":
    # Build the word cloud from the text file named here.
    generate_wd("qq_content.txt")
生成云图

 参考: https://www.cnblogs.com/zhaof/p/6953241.html

猜你喜欢

转载自www.cnblogs.com/silence-cho/p/10639331.html