一、Selenium+PhantomJS
最新版本的 Selenium 已不再支持 PhantomJS,详情请参考 Selenium 官网。
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


def getSource(url):
    """Open ``url`` in PhantomJS with custom request headers and return its HTML.

    Images are not loaded. The browser is always shut down before returning,
    even when loading the page fails.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``driver.page_source`` after ``driver.get(url)``.
    """
    # Request headers to attach to the page requests.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400',
        'referer':'http://www.taobao.com'
    }
    # Work on a copy so the shared DesiredCapabilities.PHANTOMJS dict is not
    # modified.
    cap = DesiredCapabilities.PHANTOMJS.copy()
    for key, value in headers.items():
        cap['phantomjs.page.customHeaders.{}'.format(key)] = value
    # Do not load images.
    cap["phantomjs.page.settings.loadImages"] = False

    driver = webdriver.PhantomJS(desired_capabilities=cap)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always terminate the PhantomJS process, even if the load failed.
        driver.quit()


# Example target; the same site is used as the 'referer' header above.
url = "http://www.taobao.com"
getSource(url)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Set the User-Agent through the capabilities before the browser is started.
# dict(...) makes a copy, so the shared PHANTOMJS defaults stay untouched.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
)

# Start PhantomJS exactly once, with the customised capabilities. Starting a
# second driver and rebinding the name would leave the first process running.
driver = webdriver.PhantomJS(desired_capabilities=dcap)
try:
    driver.get("http://www.baidu.com")
    # assert u"百度" in driver.title
    elem = driver.find_element_by_name("wd")

    # Optional follow-up steps, left disabled:
    # elem.clear()
    # elem.send_keys(u"网络")
    # Press the Enter key:
    # elem.send_keys(Keys.ENTER)
    # time.sleep(3)
    # assert u"网络爬虫." not in driver.page_source
    # data = driver.page_source
    # Save the current page as an image:
    # driver.save_screenshot("3.png")

    print(elem)
finally:
    # quit() vs close(): quit() ends the whole WebDriver session and closes
    # every window, while close() only closes the current window.
    driver.quit()
二、Selenium+Chrome
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# Customise the Chrome start-up arguments, including the User-Agent.
options = webdriver.ChromeOptions()
# NOTE(review): these are passed as Chrome command-line arguments, not as HTTP
# request headers - confirm they have the intended effect.
options.add_argument("Accept-Language=zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
options.add_argument("Accept=text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36"')

# Image loading policy:
#   1: allow all images; 2: block all images; 3: block third-party images.
# NOTE(review): the value 1 allows images; use 2 if the goal is to skip them.
prefs = {"profile.managed_default_content_settings.images":1}
options.add_experimental_option("prefs",prefs)

# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=options)
try:
    driver.get("http://www.taobao.com")

    # Set a cookie. This must happen after a page on the target domain is
    # loaded, and add_cookie() takes ONE dict holding both 'name' and 'value'.
    driver.add_cookie({'name': 'key-aaaaaaa', 'value': 'value-bbbbb'})
    # driver.delete_all_cookies()  # delete all cookies

    # Maximise the browser window.
    driver.maximize_window()

    # JavaScript that moves the scroll bar down the page.
    js = "document.documentElement.scrollTop=11800"
    driver.execute_script(js)

    # NOTE(review): the page source is captured before the 10 s pause, so any
    # content loaded in response to the scroll is presumably not included -
    # confirm whether the wait should come first.
    data2 = driver.page_source.encode("utf-8")
    time.sleep(10)
    with open("taobao02.html","wb") as f:
        f.write(data2)
    time.sleep(5)
finally:
    # Always shut the browser down, even if a step above failed.
    driver.quit()