来一套爬虫儿

写一起太过冗长,那就分开写吧

PyQuery库

安装&调用

pip install pyquery
from pyquery import PyQuery

走你

'字符串初始化'
from pyquery import PyQuery as pq
html = '''
            <select node-type="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option value="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
print(temp('option'))				#打印option标签
-->
<option value="zh-cn" selected="selected">中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>


'url初始化'
from pyquery import PyQuery as pq
url = 'http://www.baidu.com'
temp = pq(url)
print(temp('head'))

-->
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下，你就知道</title></head>


最基本的选择查找
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
from pyquery import PyQuery as pq
temp = pq(html)
print(temp('#changeLanguage .zh-cn'))  #id=changeLanguage标签下的class=zh-cn的标签

-->
<option class="zh-cn" selected="selected">中文(简体)</option>


查找子元素
from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
lis1 = temp('#changeLanguage')
print(lis1)
-->
<select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected="selected">中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select>

lis2 = lis1.find('option')
print(lis2)
-->
<option class="zh-cn" selected="selected">中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>

父元素查找
from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
lis1 = temp('option')
parent = lis1.parent()  #lis1.parents()可以查找所有的父类
print(parent)
-->
<select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected="selected">中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select>

兄弟标签
XXXX.siblings('关键字')


遍历
XXX.items()

获取信息

获取文本
from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
lis1 = temp('option').text()
print(lis1)
-->
中文(简体) 中文(臺灣) 中文(香港) English

lis2 = temp('option').eq(1).text()
print(lis2)
-->
中文(臺灣)

lis3 = temp('option[class]').text()
print(lis3)
-->
中文(简体)


获取属性
'遇到 class 用 .    遇到 id  用 #'
'目标:获取<option value="zh-tw">中文(臺灣)</option>中的value属性'
from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
print(temp('option[value]'))	#打印属性含有value的option标签
-->
<option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>

print(temp('option[value]').eq(0).attr.value)	#含有value属性的option标签有三个,eq(0)是选第一个标签,之后用attr选取其中的value属性
-->
zh-tw

print(temp('option[value]').eq(1).attr.value)
-->
zh-hk

DOM操作

增、删 addClass removeClass

from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
cla = temp('option[class]')
print(cla)								#原代码
-->
<option class="zh-cn" selected="selected">中文(简体)</option>

remcla = cla.removeClass("zh-cn")		#删除class属性下的"zh-cn"
print(remcla)
-->
<option class="" selected="selected">中文(简体)</option>

addcla = cla.addClass("zh-cn")			#增加class属性下的"zh-cn"
print(addcla)
-->
<option class="zh-cn" selected="selected">中文(简体)</option>

增、改 attr css

print(cla)
-->
<option class="zh-cn" selected="selected">中文(简体)</option>	#原代码

att = cla.attr('a','b')
print(att)
-->
<option class="zh-cn" selected="selected" a="b">中文(简体)</option>	#加入a,b后代码

att1 = cla.attr('class','b')
print(att1)
-->
<option class="b" selected="selected" a="b">中文(简体)</option>		#修改class属性

style = cla.css('lala','hehe')				#增加style属性
print(style)
-->
<option class="b" selected="selected" a="b" style="lala: hehe">中文(简体)</option>

删除 remove(前面的修改指令是 removeClass)

from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
dell = temp.find('option[class]').remove()
print(temp)
-->
<select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                #少了一行 
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select>

伪类选择器

from pyquery import PyQuery as pq
html = '''
            <select id="changeLanguage" suda-data="key=tblog_home_click&amp;value=language_versions_click">
                <option class="zh-cn" selected>中文(简体)</option>
                <option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>
              </select></p>
'''
temp = pq(html)
fir = temp('option:first-child')
print(fir)
-->
<option class="zh-cn" selected="selected">中文(简体)</option>	#打印第一个option

temp('option:last-child')			#最后一个
temp('option:nth-child(2)')			#第二个
temp('option:gt(2)')				#从0开始,除去0,1,2个
-->
<option value="en">English</option>

temp('option:gt(0)')				#除去第0个
-->
<option value="zh-tw">中文(臺灣)</option>
                <option value="zh-hk">中文(香港)</option>
                <option value="en">English</option>

temp('option:nth-child(2n)')         #获取第偶数个标签(nth-child从1开始计数)  2,4,6
temp('option:contains(关键字)')		 #获取带有关键字内容的标签
	

selenium库

自动化测试工具
解决爬虫的js渲染的问题

基本操作

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()				#chrome为驱动对象
try:
    url = 'https://www.baidu.com'			
    browser.get(url)						#.get  获取网址
    input = browser.find_element_by_id('kw')#寻找关键字'kw'赋值为input
    input.send_keys('Python')				#输入关键字'Python'
    input.send_keys(Keys.ENTER)				#Keys.ENTER = 回车键
    wait = WebDriverWait(browser,10)		
    wait.until(EC.presence_of_element_located((By.ID,'content_left')))	#等待ID为'content_left'的元素加载出来
    print(browser.current_url)				
    print(browser.get_cookies())
    print(browser.page_source)				#网页源码
finally:
    browser.close()
-->
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=Python&rsv_pq=a6b5304700084dbf&rsv_t=dd6dDj90wfh%2F0Id0HfaFrnzUALKv%2Bg9e%2BrAiJ%2BZpwh%2BVZfgw%2BJHHqS2kFOw&rqlang=cn&rsv_enter=1&rsv_sug3=6&rsv_sug2=0&inputT=157&rsv_sug4=157		#current_url
[{'domain': '.baidu.com', 'httpOnly': False, 'name': 'H_PS_PSSID', 'path': '/', 'secure': False, 'value': '1444_28777_21098_28775_28724_28839_28585_28604_22160'}, {'domain': '.baidu.com', 'httpOnly': False, 'name': 'delPer', 'path': '/', 'secure': False, 'value': '0'}, {'domain': '.baidu.com', 'expiry': 3703655028.667185, 'httpOnly': False, 'name': 'BAIDUID', 'path': '/', 'secure': False, 'value': '7F845AF3F32D33D833EFABFC88D4D009:FG=1'}, {'domain': '.baidu.com', 'expiry': 3703655028.667236, 'httpOnly': False, 'name': 'BIDUPSID', 'path': '/', 'secure': False, 'value': '7F845AF3F32D33D833EFABFC88D4D009'}, {'domain': '.baidu.com', 'expiry': 3703655028.66726, 'httpOnly': False, 'name': 'PSTM', 'path': '/', 'secure': False, 'value': '1556171380'}, {'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BD_HOME', 'path': '/', 'secure': False, 'value': '0'}, {'domain': '.baidu.com', 'expiry': 1556257784.182555, 'httpOnly': False, 'name': 'BDORZ', 'path': '/', 'secure': False, 'value': 'B490B5EBF6F3CD402E515D22BCDA1598'}, {'domain': 'www.baidu.com', 'expiry': 1557035382, 'httpOnly': False, 'name': 'BD_UPN', 'path': '/', 'secure': False, 'value': '12314353'}, {'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BD_CK_SAM', 'path': '/', 'secure': False, 'value': '1'}, {'domain': '.baidu.com', 'httpOnly': False, 'name': 'PSINO', 'path': '/', 'secure': False, 'value': '1'}, {'domain': 'www.baidu.com', 'httpOnly': False, 'name': 'BDSVRTM', 'path': '/', 'secure': False, 'value': '731'}, {'domain': 'www.baidu.com', 'expiry': 1556173976, 'httpOnly': False, 'name': 'H_PS_645EC', 'path': '/', 'secure': False, 'value': '49c1oN5vV6lzznYyhwr%2F6WBMA1K2llgLh7Zv98vKt2Knh478J2E8jbbSaFs'}]	#cookies
<!DOCTYPE html><!--STATUS OK--><html xmlns="http://www.w3.org/1999/xhtml"><head><script charset="utf-8" async="" src="https://ss0.bdstatic.com/-0U0bnSm1A5BphGlnYG/tam-ogel/5d4e9b24-dcc5-483a-b6da-be1e9e621891.js"></script>
    
    <meta http-equiv="content-type" content="text/html;charset=utf-8" /><style data-for="result" id="css_result" type="text/css">body{color:#333;background:#fff;padding:6px 0 0;margin:0;position:relative;min-width:900px}body,th,td,.p1,.p2{font-family:arial}p,form,ol,ul,li,dl,dt,dd,h3{margin:0;padding:0;list-style:none}input{padding-top:0;padding-bottom:0;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;box-sizing:border-box}table,img{border:0}td{font-size:9pt;line-height:18px}em{font-style:normal;color:#c00}a em{text-decoration:underline}cite{font-style:normal;color:green}.m,a.m{color:#666}a.m:visited{color:#606}.g,a.g{color:green}.c{color:#77c}.f14{font-size:14px}.f10{font-size:10.5pt}.f16{font-size:16px}.f13{font-size:13px}.bg{background-image:url(https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/protocol/https/global/img/icons_5859e57.png);_background-image:url(https://ss1.bdstatic.com/5eN		#源码
只复制了部分


声明浏览器对象
from selenium import webdriver

browser = webdriver.Chrome()		#各种浏览器
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()

访问页面
from selenium import webdriver

browser = webdriver.Chrome()
url = 'http://www.taobao.com'
browser.get(url)
print(browser.page_source)
browser.close()
-->  部分结果
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="zh-CN" class="ks-webkit537 ks-webkit ks-chrome73 ks-chrome"><head><script charset="utf-8" src="https://tce.taobao.com/api/mget.htm?callback=jsonpXctrl107&amp;tce_sid=1947787&amp;tce_vid=0&amp;tid=&amp;tab=&amp;topic=&amp;count=&amp;env=online&amp;cna=undefined" async=""></script><script src="https://ald.taobao.com/recommend2.htm?appId=20140506002%2C20140506001%2C03014&amp;_ksTS=1556172174027_94&amp;callback=jsonp95" async=""></script><script src="https://textlink.simba.taobao.com/?name=tbhs&amp;cna&amp;nn=&amp;count=13&amp;pid=430266_1006&amp;_ksTS=1556172173993_74&amp;callback=jsonp75" async=""></script>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" co  

查找元素
单个元素
from selenium import webdriver

url = 'https://www.taobao.com'
browser = webdriver.Chrome()
browser.get(url)
input_first = browser.find_element_by_id('q')					#find_element_by_id 查找id为q的元素
input_second = browser.find_element_by_css_selector('#q')		#css_selector查找id=q的元素
input_third = browser.find_element_by_xpath('//*[@id="q"]')		#xpath查找属性为id=q的元素
print(input_first,input_second,input_third)
browser.close()
-->
<selenium.webdriver.remote.webelement.WebElement (session="f0cdb89a7f77fa887553eff4ecfbe60a", element="0.5708762229589062-1")> <selenium.webdriver.remote.webelement.WebElement (session="f0cdb89a7f77fa887553eff4ecfbe60a", element="0.5708762229589062-1")> <selenium.webdriver.remote.webelement.WebElement (session="f0cdb89a7f77fa887553eff4ecfbe60a", element="0.5708762229589062-1")>

单个元素的通用查找方式
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'http://www.taobao.com'
browser = webdriver.Chrome()
browser.get(url)
input_first = browser.find_element(By.ID,'q')
print(input_first)
browser.close()
-->
<selenium.webdriver.remote.webelement.WebElement (session="2cbc003859a694da29b57bc925f5c0a0", element="0.47061304187449315-1")>


查找多个元素
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'http://www.taobao.com'
browser = webdriver.Chrome()
browser.get(url)
li = browser.find_elements(By.CSS_SELECTOR,'.service-bd li')
print(li)
browser.close()
-->部分结果
[<selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-1")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-2")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-3")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.38451802051432304-4")>, <selenium.webdriver.remote.webelement.WebElement (session="4c13a3e387cbaf384f1b903d3adedc23", element="0.3845


元素交互操作
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input = browser.find_element(By.ID,'q')		#搜索框id 为 q
input.send_keys('iphone')					
time.sleep(1)								#1s之后
input.clear()								#搜索框清空
input.send_keys('ipad')						
button = browser.find_element_by_class_name('btn-search')	#btn-search为搜索按钮坐标
button.click()

交互动作
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
browser.switch_to.frame('iframeResult')
source = browser.find_element(By.ID,'draggable')
target = browser.find_element(By.ID,'droppable')
actions = ActionChains(browser)			#声明actions 动作链
actions.drag_and_drop(source,target)	
actions.perform()						#执行actions

执行js
实现滚动条下拉(滚动到页面底部)
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com')
browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
browser.execute_script('alert("To Bottom")')

获取元素信息

获取属性
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
logo = browser.find_element(By.XPATH,'//a[@class="zu-top-link-logo"]')
print(logo)
print(logo.get_attribute('id'))
-->
<selenium.webdriver.remote.webelement.WebElement (session="329421fa6d6f9596906140f0a924a4a4", element="0.5485149819316133-1")>
zh-top-link-logo

获取文本、ID、位置、标签名、大小
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
a = browser.find_elements(By.XPATH,'//li/a[@class="zu-top-nav-link"]')
for li in a:
    print(li.text)
    print(li.id)
    print(li.location)
    print(li.tag_name)
    print(li.size)
-->
首页
0.4477185237510821-1
{'x': 486, 'y': 0}
a
{'height': 45, 'width': 54}
话题
0.4477185237510821-2
{'x': 540, 'y': 0}
a
{'height': 45, 'width': 54}
发现
0.4477185237510821-3
{'x': 594, 'y': 0}
a
{'height': 45, 'width': 54}

元素等待

隐式等待
'如果查找元素的时候,没有立即得到响应,隐式等待会等待一段时间再查找DOM,默认时间为0'
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.baidu.com')
style = browser.find_element(By.XPATH,'//area[@style="outline:none;"]')
print(style)
-->
<selenium.webdriver.remote.webelement.WebElement (session="95a2fbc651f41f0604be1d5b385bc56a", element="0.5739750677363769-1")>

显式等待
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('http://www.taobao.com')
wait = WebDriverWait(browser,10)
input = wait.until(EC.presence_of_element_located((By.ID,'q')))
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.btn-search')))
print(input,button)
-->

<selenium.webdriver.remote.webelement.WebElement (session="b2ff6e85088f6f38025aef18a6a79a17", element="0.7102273460896695-1")> <selenium.webdriver.remote.webelement.WebElement (session="b2ff6e85088f6f38025aef18a6a79a17", element="0.7102273460896695-2")>

前进、后退

time.sleep(1)				#等待1s
browser.forward()		#前进
browser.back()			#后退

cookies

browser.get_cookies()			#获取cookies
browser.add_cookie({'name':'name','value':'XX'})			#add一个字典形式的cookie,必须包含name和value
browser.delete_all_cookies()			#删掉

通用的切换选项卡
使用js和window.open完成切换

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')				#第一个选项卡打开百度
browser.execute_script('window.open()')		#新建选项卡
browser.execute_script('window.open()')		#再来新建一个方便学习
print(browser.window_handles)
browser.switch_to.window(browser.window_handles[2])		#切换到第三个选项卡
browser.get('https://www.taobao.com')				#输入淘宝
time.sleep(1)													#等一下
browser.switch_to.window(browser.window_handles[0])		#切换到第一个选项卡
browser.get('https://python.org')						#输入python

猜你喜欢

转载自blog.csdn.net/dh0805dh/article/details/89491250
今日推荐