1. Introduction to selenium
Selenium is an automated testing tool for websites. It supports mainstream browsers with a graphical interface, such as Chrome, Firefox, and Safari, as well as headless (interface-less) browsers such as PhantomJS.
2. The use of selenium
1. Installation
pip install selenium
2. Download the browser driver
Chrome browser driver
Firefox browser
Find your browser version to download the driver
3. Example: the browser automatically plays the second song in Jay Chou's song list on Migu Music
import time

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# NOTE: this example is written against the Selenium 3 API
# (executable_path=..., find_element_by_*).


def _switch_to_tab(driver, index, timeout=30):
    """Wait until the tab at position *index* (0-based) exists, then switch to it.

    New tabs open asynchronously after a click, so indexing window_handles
    right away can fail. Polling through WebDriverWait avoids both a busy
    loop and waiting forever: selenium's TimeoutException is raised if the
    tab has not appeared within *timeout* seconds.
    """
    WebDriverWait(driver, timeout).until(
        lambda d: len(d.window_handles) > index
    )
    driver.switch_to.window(driver.window_handles[index])


# Create the browser object; executable_path is the driver downloaded above
bro = webdriver.Chrome(executable_path="./chromedriver")
try:
    bro.maximize_window()  # maximize the window
    bro.get(url="https://music.migu.cn/v3")  # open Migu Music
    bro.find_element_by_id("search_ipt").send_keys("周杰伦")  # type the artist name into the search box
    bro.find_element_by_xpath('//*[@id="header"]/div[1]/div/div[2]/div[1]/div/span').click()  # click the search button
    _switch_to_tab(bro, 1)  # switch to the second tab
    # Scroll down to the second song; only for visual effect, may be omitted
    bro.execute_script("window.scrollTo(0,400)")
    time.sleep(1)
    bro.find_element_by_xpath('//*[@id="J_PageSonglist"]/div[2]/div[2]/div[3]/a').click()  # click the second song
    _switch_to_tab(bro, 2)  # switch to the third tab
    time.sleep(1)
    bro.find_element_by_xpath('/html/body/div[2]/div[2]/div[2]/div[3]/div[1]').click()  # play the song
    # The player page opens slowly, so wait for the fourth tab before switching to it
    _switch_to_tab(bro, 3)
    bro.find_element_by_xpath('//*[@id="header"]/div/div[2]/div[1]/div').click()  # click the immersive-mode button
    time.sleep(240)  # let the song play for 4 minutes
finally:
    # Always close the browser, even if one of the steps above failed
    bro.quit()
4. Some methods to control browser operation
method | Description |
---|---|
set_window_size() | Set the size of the browser |
back() | Control browser back |
forward() | Control the browser forward |
refresh() | Refresh the current page |
clear() | Clear text |
send_keys(value) | Simulate keyboard input |
click() | Click element |
submit() | Used to submit the form |
get_attribute(name) | Get element attribute value |
is_displayed() | Return whether the element is visible to the user |
size | Returns the size of the element |
text | Get the text of the element |
5. Mouse events
method | Description |
---|---|
ActionChains(driver) | Construct ActionChains object |
context_click() | Right click |
move_to_element(above) | Perform mouse hover |
double_click() | Double click |
drag_and_drop() | drag |
move_to_element(above) | Perform mouse hover |
context_click() | Used to simulate the operation of the right mouse button, you need to specify the positioning of the element when calling |
perform() | Execute all the behaviors stored in ActionChains, which can be understood as a submission action for the entire operation |
6. Action chain and iframe processing
i. Action chain
Simulate complex mouse actions, such as long-press and drag the mouse
ii.iframe processing
If the current page contains an iframe, the tags inside the iframe cannot be located directly. You must first switch into the iframe, and only then locate the tags inside it.
iii. Code
from time import sleep

from selenium import webdriver
# Class that implements action chains
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # An element that lives inside an <iframe> can only be located after
    # switching the driver's search scope into that frame.
    bro.switch_to.frame('iframeResult')
    div = bro.find_element_by_id('draggable')

    # Action chain
    action = ActionChains(bro)
    # Queue "press and hold" on the target element
    action.click_and_hold(div)
    for _ in range(5):
        # perform() executes the queued actions immediately
        # move_by_offset(x, y): x is horizontal, y is vertical
        action.move_by_offset(17, 0).perform()
        sleep(0.5)
    # release() only queues the mouse-up; without perform() it is never sent.
    # A fresh chain is used so that nothing but the release is executed here.
    ActionChains(bro).release().perform()
finally:
    # Always close the browser, even if one of the steps above failed
    bro.quit()
7. Headless browsers and anti-detection methods
i. Headless browser
When we use selenium to crawl a page, a browser window always pops up and runs automatically. If we only care about the data and do not want to see the browser running, we can run it headless, i.e. without a visible window.
ii. Anti-detection measures
Some servers detect that the request comes from a browser driven by selenium and block it, so selenium needs to be disguised.
iii. Code
from time import sleep

from selenium import webdriver
# Options class used for the headless (no visible window) setup
from selenium.webdriver.chrome.options import Options
# ChromeOptions is an alias of the Options class imported above
from selenium.webdriver import ChromeOptions

# Use ONE options object for both concerns. Passing two separate objects as
# chrome_options= and options= makes the driver keep only one of them, so
# either the headless flags or the anti-detection switch is silently lost.
options = ChromeOptions()
# Headless: run without a visible browser window
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Anti-detection: drop the switch that marks the browser as automated
options.add_experimental_option('excludeSwitches', ['enable-automation'])

bro = webdriver.Chrome(executable_path='./chromedriver', options=options)
try:
    # Headless browser (PhantomJS is another headless option)
    bro.get('https://www.baidu.com')
    print(bro.page_source)
    sleep(2)
finally:
    # Always close the browser, even if one of the steps above failed
    bro.quit()
8. Automatic login to 12306 with selenium
#下述代码为超级鹰提供的示例代码
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """HTTP client for the Chaojiying captcha-recognition service.

    Every request carries the account name, the MD5 hex digest of the
    password and the software id as form fields.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and build the fields shared by all requests.

        username: account name, sent as 'user'
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as 'pass2')
        soft_id: software id, sent as 'softid'
        timeout: seconds to wait for each HTTP request; without a timeout
            requests.post can block indefinitely on an unresponsive server
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image bytes
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON response.

        im_id: image id of the question being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
# chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370') #用户中心>>软件ID 生成一个替换 96001
# im = open('12306.jpg', 'rb').read() #本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
# print(chaojiying.PostPic(im, 9004)['pic_str'])
#上述代码为超级鹰提供的示例代码
#使用selenium打开登录页面
# Open the login page with selenium
import time

from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains

# NOTE(security): the account names and passwords below are hard-coded sample
# values; replace them with your own and keep real credentials out of source.

bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://kyfw.12306.cn/otn/login/init')
    time.sleep(1)
    # save_screenshot captures the current page and saves it to a file
    bro.save_screenshot('aa.png')

    # Work out the top-left and bottom-right corners of the captcha image;
    # together they define the region to crop out of the page screenshot.
    code_img_ele = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = code_img_ele.location  # x, y of the captcha image's top-left corner
    print('location:', location)
    size = code_img_ele.size  # width and height of the captcha element
    print('size:', size)
    # (left, top, right, bottom)
    crop_box = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )

    # Crop the captcha out of the full-page screenshot
    code_img_name = './code.png'
    with Image.open('./aa.png') as page_img:
        page_img.crop(crop_box).save(code_img_name)

    # Submit the captcha image to Chaojiying for recognition.
    # The third argument is the software id generated in the user centre.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    with open(code_img_name, 'rb') as img_file:
        im = img_file.read()
    # Call the service once only: each call is a separate recognition
    # request, so printing one call and using another could disagree.
    result = chaojiying.PostPic(im, 9004)['pic_str']
    print(result)

    # Points to click, as [[x1, y1], [x2, y2], ...]. The answer is parsed as
    # 'x1,y1|x2,y2'; splitting on '|' also covers the single-point case.
    all_list = []
    for point in result.split('|'):
        coords = point.split(',')
        all_list.append([int(coords[0]), int(coords[1])])
    print(all_list)

    # Click each point, using it as an offset from the captcha image element
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
        time.sleep(0.5)

    # NOTE(review): the user name below looks like a redacted placeholder - confirm
    bro.find_element_by_id('username').send_keys('[email protected]')
    time.sleep(2)
    bro.find_element_by_id('password').send_keys('bobo_15027900535')
    time.sleep(2)
    bro.find_element_by_id('loginSub').click()
    time.sleep(30)
finally:
    # Always close the browser, even if one of the steps above failed
    bro.quit()