Selenium+PhantomJS使用

selenium使用PhantomJS,主要有两个原因:

1、使用“无界面”浏览器操作;2、调用一些js函数实现一些功能,比如网页的长截图。

一、下载

1、selenium下载直接通过pip安装即可

但是新版本的selenium已经不再支持PhantomJS,因此需要安装旧版本的selenium(但是太旧也不行)

例如:

pip install selenium==2.48.0

2、下载Chromedriver
下载与自己浏览器版本相对应的Chromedriver版本:

http://chromedriver.storage.googleapis.com/index.html

下载windows版本的32位,下载后解压,将.exe放入python安装路径

3、PhantomJS下载安装

1、PhantomJS需要下载安装(有时还需要加入环境变量);

Download PhantomJS

2、下载后解压,将路径加入环境变量Path;

3、再将.exe复制到python的安装路径内;

4、pip 安装phantomjs库

二、测试代码

from selenium import webdriver
import selenium

# Smoke test: start PhantomJS through Selenium and shut it down again.
driver = selenium.webdriver.PhantomJS()
# When phantomjs is not on PATH, point executable_path at the binary instead:
# driver = selenium.webdriver.PhantomJS(executable_path=r'D:\softs\phantomjs-2.1.1\phantomjs-2.1.1-windows/bin/phantomjs.exe')
# quit() ends the whole WebDriver session; close() would only close the
# current window and leave the session behind.
driver.quit()

# Smoke test: drive headless Chrome through chromedriver.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)
try:
    browser.get('https://www.baidu.com/')
finally:
    # Always end the session, even if loading the page raised; quit() (unlike
    # close()) shuts the driver down rather than just closing the window.
    browser.quit()

三、selenium、phantomjs执行js代码,替换浏览器里网页的节点的源码


# Replace the <body> content of the page loaded in the webdriver browser
# (i.e. rewrite the page source in place). The markup is passed as a script
# argument instead of being formatted into the JavaScript text, so quotes,
# backslashes and newlines inside new_pagesource cannot break the script.
driver.execute_script('document.querySelector("body").innerHTML = arguments[0];', new_pagesource)

四、长截图

"""截网页长图"""
def get_long_png(web, no, wait_seconds=10):
    """Save a screenshot of the page currently loaded in ``web``.

    The image is written to ``imgs/<no>.png`` relative to the current working
    directory. Progress, failures and the elapsed time are printed; errors are
    reported rather than raised so a batch of pages can keep going.

    Args:
        web: Selenium WebDriver whose current page should be captured.
        no: Value used as the file name (formatted into ``<no>.png``).
        wait_seconds: Seconds to sleep before taking the screenshot
            (presumably to let the page finish rendering). Defaults to 10,
            the delay that used to be hard-coded.

    Returns:
        None.
    """
    import os

    started = time.time()
    # os.path.join yields '.\\imgs\\<no>.png' on Windows (as before) and a
    # valid './imgs/<no>.png' elsewhere.
    target = os.path.join('.', 'imgs', '{}.png'.format(no))
    try:
        time.sleep(wait_seconds)
        # save_screenshot reports failure through its return value, so only
        # announce success when it is truthy.
        if web.save_screenshot(target):
            print('{} saved!'.format(target))
        else:
            print('{} could not be saved'.format(target))
    except Exception as msg:
        # Exception, not BaseException: Ctrl-C / SystemExit must still be able
        # to stop the program.
        print(msg)
    print("get pic use:" + str(time.time() - started))

 五、获取文本叶子节点及tag_path(Xpath)


"""获取文本叶子节点及tag_path  text_node"""
def get_one_text_node(web, file_name):
    """Collect the text-bearing nodes of the page loaded in ``web``.

    The page source is parsed with lxml after HTML comments are stripped, and
    nodes that are not wanted as text sources (scripts, styles, inputs,
    anything whose attributes mention "hidden", ...) are pruned. For each
    remaining element under ``<body>`` that directly owns non-blank text, its
    path is computed with ``getEtreeXPath`` and looked up in the live browser.
    Elements rendered with a 0x0 box are skipped; for the rest a
    ``TextNodeInfo`` is built and, when it has text content, printed via
    ``TextNodeInfo.print(file_name)``.

    Args:
        web: Selenium WebDriver with the target page already loaded.
        file_name: Forwarded to ``TextNodeInfo.print`` and used in the final
            log line.

    Returns:
        The list of ``TextNodeInfo`` objects that were built (previously the
        list was collected and then discarded).
    """
    # Non-greedy match up to the first '-->'. The former pattern used nested
    # quantifiers that backtrack exponentially on an unclosed '<!--'.
    comment_re = re.compile(r'<!--[\s\S]*?-->')
    whitespace_re = re.compile(r'\s+')

    root = etree.HTML(comment_re.sub('', web.page_source))

    # Prune script/style/etc. and nodes flagged as hidden.
    # NOTE(review): contains(@style,"hidden") also matches "overflow:hidden",
    # and contains(@*[n],"hidden") matches e.g. class="hidden-xs" - confirm
    # this breadth is intended.
    del_nodes = root.xpath('//style|//script|//img|//base|//head|//link|//input|'
                           '//*[contains(@*[1],"hidden")]|'
                           '//*[contains(@*[2],"hidden")]|'
                           '//*[contains(@*[3],"hidden")]|'
                           '//*[contains(@*[4],"hidden")]|'
                           '//*[contains(@style,"hidden")]|'
                           '//*[contains(@type,"hidden")]')
    for node in del_nodes:
        _remove_keeping_tail(node)

    body = root.xpath('//body')[0]
    all_text_nodes = body.xpath('.//*[text()]')

    # Use the larger of the <html> element's box and the document's scroll
    # size in each dimension.
    html_size = web.find_element_by_xpath('//html').size
    docu_height = web.execute_script('return document.body.scrollHeight')
    docu_width = web.execute_script('return document.body.scrollWidth')
    window_size = {
        'width': max(html_size.get('width'), docu_width),
        'height': max(html_size.get('height'), docu_height),
    }

    text_node_info_list = []
    for node in all_text_nodes:
        node_text = whitespace_re.sub(' ', "".join(node.xpath('./text()'))).strip()
        if not node_text:
            continue

        tag_path = getEtreeXPath(node, root)
        # NOTE(review): if getEtreeXPath emits positional indices, the pruning
        # above may shift them relative to the live DOM - verify.
        webelement = web.find_element_by_xpath(tag_path)

        # A 0x0 box means the browser does not display the element.
        size = webelement.size
        if size.get('height') == 0 and size.get('width') == 0:
            continue

        one_node_info = TextNodeInfo(dom_node=node, tag_path=tag_path, webelement=webelement, window_size=window_size,
                                     label_xpath_set=node_label_xpath.zw_xpath_set)
        if len(one_node_info.text_content) > 0:
            one_node_info.print(file_name)
        text_node_info_list.append(one_node_info)
    print("{} write finish".format(file_name))
    return text_node_info_list


def _remove_keeping_tail(node):
    """Detach ``node`` from its parent without losing the text that follows it.

    In lxml the text after an element's closing tag is stored on that element
    as ``tail``, so a plain ``parent.remove(node)`` drops it as well. The tail
    is moved to the previous sibling's tail or, if there is none, to the
    parent's text before the node is removed.
    """
    parent = node.getparent()
    if parent is None:
        return
    tail = node.tail
    if tail:
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(node)

六、js获取页面文档的 高度和宽度 

# Ask the page, via JavaScript, for the height and width of the document
# itself (not the size of the browser window). Height is queried first,
# then width.
docu_height, docu_width = (
    web.execute_script(script)
    for script in (
        'return document.body.scrollHeight',
        'return document.body.scrollWidth',
    )
)

猜你喜欢

转载自blog.csdn.net/qq_38767359/article/details/125287930