【代码】第7章 动态渲染页面爬取

7.1 selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
import time

# Quick tour: open Baidu, type "python" into the element with id "kw",
# press Enter, wait for the element with id "content_left" to appear,
# then print the current URL, the cookies and the page source.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # find_element(By.ID, ...) works on both old and new Selenium releases;
    # the find_element_by_* helpers were removed in Selenium 4.3.
    elem = browser.find_element(By.ID, 'kw')
    elem.send_keys('python')
    elem.send_keys(Keys.ENTER)
    # Explicit wait: poll for up to 10 seconds until the node is in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() would only close the current window.
    browser.quit()


# Basic operations: locate the same node (id "q") on the Taobao home page
# in three different ways, then locate several nodes at once, then
# interact with an input box and a button.
browser = webdriver.Chrome()
try:
    browser.get('https://www.taobao.com')
    print(browser.page_source)
    # find_element(By.<strategy>, value) replaces the find_element_by_*
    # helpers that were removed in Selenium 4.3.
    in_1 = browser.find_element(By.ID, 'q')
    in_2 = browser.find_element(By.CSS_SELECTOR, '#q')
    in_3 = browser.find_element(By.XPATH, "//*[@id='q']")
    print(in_1, in_2, in_3)

    # Multiple nodes: find_elements returns a list of every match.
    in_4 = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')
    print(in_4)

    # Node interaction: type, clear, type again, then click the button
    # carrying the class "btn-search".
    in_5 = browser.find_element(By.ID, 'q')
    in_5.send_keys('iphone')
    time.sleep(1)
    in_5.clear()
    in_5.send_keys('ipad')
    button = browser.find_element(By.CLASS_NAME, 'btn-search')
    button.click()
    time.sleep(1)
finally:
    # Always end the session so the browser and driver process are not
    # leaked when a later section rebinds the name `browser`.
    browser.quit()

# Action chains: drag the "#draggable" node onto the "#droppable" node.
browser = webdriver.Chrome()
try:
    url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    browser.get(url)
    # The two nodes are looked up after switching into the frame named
    # "iframeResult"; element lookups only see the currently selected frame.
    browser.switch_to.frame('iframeResult')
    # find_element(By.CSS_SELECTOR, ...) replaces the
    # find_element_by_css_selector helper removed in Selenium 4.3.
    source = browser.find_element(By.CSS_SELECTOR, '#draggable')
    target = browser.find_element(By.CSS_SELECTOR, '#droppable')
    action = ActionChains(browser)
    # Actions are only queued here; perform() actually runs them.
    action.drag_and_drop(source, target)
    action.perform()
    time.sleep(1)
finally:
    # End the session so the browser is not leaked.
    browser.quit()

# Execute JavaScript: scroll the page and pop up an alert.
browser = webdriver.Chrome()
try:
    browser.get('https://www.zhihu.com/explore')
    # Scrolls to one quarter of the document height.
    browser.execute_script('window.scrollTo(0, (document.body.scrollHeight)/4)')
    browser.execute_script('alert("To Bottom")')
    time.sleep(1)
    # Accept the alert opened above so that no user prompt is pending
    # when the session is shut down.
    browser.switch_to.alert.accept()
finally:
    # quit() ends the whole session; close() would only close the window.
    browser.quit()

# Read attributes, text and other properties of elements, using the
# links under <ul class="service-bd"> on the Taobao home page as the example.
browser = webdriver.Chrome()
try:
    url = 'https://www.taobao.com'
    browser.get(url)
    # find_elements(By.XPATH, ...) replaces the find_elements_by_xpath
    # helper removed in Selenium 4.3.
    elem = browser.find_elements(By.XPATH, '//ul[@class="service-bd"]/li/a')
    # For every located element: .text gives the text content and
    # .get_attribute() reads an attribute; id, location, tag_name and size
    # are further properties of the WebElement.
    for i in elem:
        print(i.id, i.location, i.tag_name, i.size, i.text, i.get_attribute('href'), sep=" ")
finally:
    # End the session so the browser is not leaked.
    browser.quit()

# Waiting, implicit flavour.
browser = webdriver.Chrome()
try:
    url = 'https://www.zhihu.com/explore'
    # Implicit wait: element lookups keep retrying for up to 20 seconds
    # before failing when the node is not in the DOM yet.
    browser.implicitly_wait(20)
    browser.get(url)
    # find_element(By.CLASS_NAME, ...) replaces the
    # find_element_by_class_name helper removed in Selenium 4.3.
    elem = browser.find_element(By.CLASS_NAME, 'zu-top-add-question')
    print(elem)
finally:
    # End the session so the browser is not leaked.
    browser.quit()

# Back and forward: visit three pages in a row, then step one page back
# in the browser history and one page forward again.
browser = webdriver.Chrome()
try:
    url0 = 'https://www.taobao.com'
    url1 = 'https://www.zhihu.com/explore'
    url2 = 'https://www.baidu.com'
    for page in (url0, url1, url2):
        browser.get(page)
    browser.back()
    browser.forward()
finally:
    # End the session so the browser is not leaked.
    browser.quit()

# Tab handling: open a second tab via JavaScript and load a different
# page in each tab.
browser = webdriver.Chrome()
try:
    url = 'https://www.taobao.com'
    browser.get(url)
    browser.execute_script('window.open()')
    print(browser.window_handles)
    # After JavaScript creates the new tab, the driver must switch to its
    # window handle before a URL can be loaded there.
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.baidu.com')
    time.sleep(1)
    # Switch back to the first tab and navigate it elsewhere.
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.zhihu.com')
finally:
    # quit() closes every tab and ends the session.
    browser.quit()

7.2 splash
在 Ubuntu 下先安装 Docker,然后直接用 Docker 运行 Splash;如果本地还没有安装 Splash,运行时会自动进行安装。

7.3 splash负载均衡
需要安装Nginx,未尝试

7.4 Selenium 爬取淘宝商品(登录时滑块验证出错,待继续)

# Taobao product search via Selenium. The search URL leads to a login form;
# this script fills it in and attempts the slider verification.
# NOTE: unfinished - the slider step is rejected by the site (see below).
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://s.taobao.com/search?q=ipad'
# Browser start-up options: language and a custom user agent.
options = webdriver.ChromeOptions()
options.add_argument('lang=zh_CN.UTF-8')
# No quotes around the value: the argument is handed to Chrome as a single
# token, so surrounding quotes would become part of the user-agent string.
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                     'Chrome/69.0.3497.100 Safari/537.36')
# The options only take effect when they are passed to the driver.
browser = webdriver.Chrome(options=options)
# Maximize the window so the page content is displayed.
browser.maximize_window()
browser.get(url)
# Enter the user name. The element "J_Quick2Static" is clicked first
# (presumably to switch to the account/password form - verify on the page).
# find_element(By.ID, ...) replaces find_element_by_id, removed in Selenium 4.3.
elem = browser.find_element(By.ID, 'J_Quick2Static')
elem.click()
elem = browser.find_element(By.ID, 'TPL_username_1')
# NOTE(review): this literal looks like an e-mail obfuscation placeholder left
# by the hosting page; replace it with the real account name.
elem.send_keys('[email protected]')
# Enter the password, read from standard input (it is echoed while typing).
elem = browser.find_element(By.ID, 'TPL_password_1')
pw = input()
elem.send_keys(pw)
# Drag the slider 400 px to the right. At this point the site answers:
# "Oops, something went wrong, click refresh and try again (error:H8gqCi)".
button = browser.find_element(By.ID, 'nc_1_n1z')
action = webdriver.ActionChains(browser)
action.drag_and_drop_by_offset(button, 400, 0).perform()
# Submit the login form.
elem = browser.find_element(By.ID, 'J_SubmitStatic')
elem.click()

猜你喜欢

转载自blog.csdn.net/C_Python_/article/details/83108253