07. Introduction to Python browser automation module selenium

1. Introduction to selenium

Selenium is an automated testing tool for websites. It supports mainstream browsers with a graphical interface, such as Chrome, Firefox, and Safari, as well as headless (no-GUI) browsers such as PhantomJS.

2. Using selenium

1. Installation

pip install selenium

2. Download the browser driver

Chrome browser driver
Firefox browser

Find your browser version to download the driver

3. The browser automatically plays the second song in the Migu music Jay Chou song list

from selenium import webdriver

import time

# NOTE: this example is written against the Selenium 3 API
# (executable_path=..., find_element_by_*).

# Create the browser object; executable_path is the driver downloaded above.
bro = webdriver.Chrome(executable_path="./chromedriver")
try:
    bro.maximize_window()  # Maximize the window
    bro.get(url="https://music.migu.cn/v3")  # Open Migu Music
    bro.find_element_by_id("search_ipt").send_keys("周杰伦")  # Type "周杰伦" (Jay Chou) into the search box
    bro.find_element_by_xpath('//*[@id="header"]/div[1]/div/div[2]/div[1]/div/span').click()  # Click the search button
    bro.switch_to.window(bro.window_handles[1])  # Switch to the second tab
    # Scroll down to Jay Chou's second song; only for a clearer visual effect, can be omitted
    bro.execute_script("window.scrollTo(0,400)")
    time.sleep(1)
    bro.find_element_by_xpath('//*[@id="J_PageSonglist"]/div[2]/div[2]/div[3]/a').click()  # Click the second song
    bro.switch_to.window(bro.window_handles[2])  # Switch to the third tab
    time.sleep(1)
    bro.find_element_by_xpath('/html/body/div[2]/div[2]/div[2]/div[3]/div[1]').click()  # Play the song

    # The player tab opens slowly, so the newest handle is not available right
    # away. Poll for it with a short sleep and a deadline instead of spinning
    # forever at full CPU.
    deadline = time.monotonic() + 30
    while len(bro.window_handles) < 4:
        if time.monotonic() > deadline:
            raise TimeoutError("the player tab did not open within 30 seconds")
        time.sleep(0.2)
    bro.switch_to.window(bro.window_handles[3])
    bro.find_element_by_xpath('//*[@id="header"]/div/div[2]/div[1]/div').click()  # Click the immersive-mode button

    time.sleep(240)  # Let the song play for 4 minutes
finally:
    # Always close the browser, even if one of the steps above failed.
    bro.quit()

4. Some methods to control browser operation

method Description
set_window_size() Set the size of the browser
back() Control browser back
forward() Control the browser forward
refresh() Refresh the current page
clear() Clear text
send_keys(value) Simulate keyboard input
click() Click element
submit() Used to submit the form
get_attribute(name) Get element attribute value
is_displayed() Return whether the element is visible to the user
size Returns the size of the element
text Get the text of the element

5. Mouse events

method Description
ActionChains(driver) Construct ActionChains object
context_click() Right click
move_to_element(above) Perform mouse hover
double_click() Double click
drag_and_drop() drag
move_to_element(above) Perform mouse hover
context_click() Used to simulate the operation of the right mouse button, you need to specify the positioning of the element when calling
perform() Execute all the behaviors stored in ActionChains, which can be understood as a submission action for the entire operation

6. Action chain and iframe processing

i. Action chain

Simulate complex mouse actions, such as long-press and drag the mouse

ii.iframe processing

If the current page contains an iframe, the tags inside the iframe cannot be located directly. You must first switch into the iframe, and only then locate the tags inside it.

iii. Code

from selenium import webdriver
from time import sleep
# Import the class that implements action chains
from selenium.webdriver import ActionChains
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # A tag that lives inside an iframe can only be located after switching
    # the driver's locating scope into that iframe.
    bro.switch_to.frame('iframeResult')
    div = bro.find_element_by_id('draggable')

    # Each step uses a fresh ActionChains and is performed immediately, so one
    # perform() sends exactly one step. (Depending on the Selenium version, a
    # reused chain keeps its queued actions and would replay them.)

    # Press and hold the mouse button on the target tag
    ActionChains(bro).click_and_hold(div).perform()

    for _ in range(5):
        # move_by_offset(x, y): x is the horizontal offset, y the vertical one
        ActionChains(bro).move_by_offset(17, 0).perform()
        sleep(0.5)

    # Release the mouse button. release() only queues the action; without
    # perform() it is never sent to the browser.
    ActionChains(bro).release().perform()
finally:
    # Always close the browser, even if one of the steps above failed.
    bro.quit()

7. Headless browsers and anti-detection methods

i. Headless browser

When selenium is used to crawl a page, a browser window always pops up and runs automatically. If we only care about the data and do not want to watch the browser run, we can hide the window by running the browser in headless mode.

ii. Anti-detection measures

Some servers detect that a request comes from a browser driven by selenium and block it, so you need to disguise selenium.

iii. Code

from selenium import webdriver
from time import sleep
# Options class used to run Chrome without a visible window
from selenium.webdriver.chrome.options import Options
# ChromeOptions is the same class, re-exported by selenium.webdriver
from selenium.webdriver import ChromeOptions

# One options object carries both groups of settings. The driver accepts a
# single options object: passing chrome_options= and options= together makes
# Selenium 3 use only chrome_options (dropping the anti-detection setting),
# and later releases no longer accept the chrome_options keyword.
chrome_options = Options()

# Run without a visible window
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Reduce the risk of selenium being detected
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

bro = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
try:
    # No visible window (headless browser), as an alternative to PhantomJS
    bro.get('https://www.baidu.com')

    print(bro.page_source)
    sleep(2)
finally:
    # Always close the browser, even if one of the steps above failed.
    bro.quit()

8. Using selenium to log in to 12306 automatically

#下述代码为超级鹰提供的示例代码
import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Args:
        username: Chaojiying account name.
        password: Plain-text account password. Only its MD5 hex digest is
            stored and sent (as the ``pass2`` form field).
        soft_id: Software ID generated in the Chaojiying user centre.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error. Pass ``None`` to wait indefinitely, which
            was the behaviour before this parameter existed.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields sent with every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image bytes
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,  # do not hang forever on a stalled upload
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON response.

        im_id: image ID of the question being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,  # do not hang forever on a stalled request
        )
        return r.json()

# chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')	#用户中心>>软件ID 生成一个替换 96001
# im = open('12306.jpg', 'rb').read()													#本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
# print(chaojiying.PostPic(im, 9004)['pic_str'])
#上述代码为超级鹰提供的示例代码

# Open the login page with selenium
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains


def _parse_click_points(pic_str):
    """Convert a recognition result such as 'x1,y1|x2,y2' into [[x1, y1], [x2, y2]].

    A result without '|' describes a single point and yields a one-item list.
    Raises ValueError if the result is empty or a pair is not 'x,y' integers.
    """
    if not pic_str:
        raise ValueError('the captcha service returned no click coordinates')
    points = []
    for pair in pic_str.split('|'):
        x, y = pair.split(',')[:2]
        points.append([int(x), int(y)])
    return points


bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://kyfw.12306.cn/otn/login/init')
    time.sleep(1)

    # save_screenshot captures the current page and saves it to a file
    bro.save_screenshot('aa.png')

    # Work out the top-left and bottom-right corners of the captcha image,
    # which together define the region to crop.
    code_img_ele = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = code_img_ele.location  # x, y of the captcha image's top-left corner
    print('location:',location)
    size = code_img_ele.size  # width and height of the captcha tag
    print('size:',size)
    # (left, top, right, bottom)
    rangle = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )
    # The captcha region is now fully determined.

    code_img_name = './code.png'
    # crop cuts the given region out of the screenshot; the context manager
    # closes the screenshot file afterwards.
    with Image.open('./aa.png') as screenshot:
        screenshot.crop(rangle).save(code_img_name)

    # Submit the captcha image to Chaojiying for recognition.
    # NOTE(review): the account values below are the article's hard-coded
    # samples; replace them with your own.
    # User centre >> Software ID: generate one to replace 96001
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    with open(code_img_name, 'rb') as code_file:
        im = code_file.read()
    # Call the service once only: every call uploads the image again, and a
    # second call could return a different answer from the one printed.
    result = chaojiying.PostPic(im, 9004)['pic_str']
    print(result)

    # Coordinates of the points to click: [[x1, y1], [x2, y2]]
    all_list = _parse_click_points(result)
    print(all_list)

    # Click every returned point, using an offset relative to the captcha image.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
        time.sleep(0.5)

    # NOTE(review): '[email protected]' looks like an e-mail-protection
    # placeholder left by the page scrape; replace it and the password
    # with real account values.
    bro.find_element_by_id('username').send_keys('[email protected]')
    time.sleep(2)
    bro.find_element_by_id('password').send_keys('bobo_15027900535')
    time.sleep(2)
    bro.find_element_by_id('loginSub').click()
    time.sleep(30)
finally:
    # Always close the browser, even if one of the steps above failed.
    bro.quit()








9. More details

Selenium detailed

Guess you like

Origin blog.csdn.net/qq_40837794/article/details/109782814