Infi-chu:
http://www.cnblogs.com/Infi-chu/
pyquery is a library for parsing HTML that provides CSS selectors and a jQuery-like API for querying and manipulating nodes.
1. Initialize
string initialization
from pyquery import PyQuery as pq

# Sample markup so the snippet runs on its own.
html = '''
<ul>
    <li class="item-0">first item</li>
    <li class="item-1">second item</li>
</ul>
'''

doc = pq(html)  # pass in the HTML text
print(doc('li'))
URL initialization
# Method 1: let pyquery download the page itself.
# The URL must include its scheme (http:// or https://).
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
print(doc('title'))

# Method 2: download the page with requests and hand the response body
# (a string) to pyquery -- not the Response object itself.
import requests
from pyquery import PyQuery as pq

doc = pq(requests.get('http://www.baidu.com').text)
print(doc('title'))
file initialization
from pyquery import PyQuery as pq

# Parse the HTML stored in a local file.
doc = pq(filename='text.html')
print(doc('li'))
2. Basic CSS selectors
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')

# The selector is an ordinary CSS selector passed as a string:
# every <a> inside .head_wrapper inside #head.
links = doc('#head .head_wrapper a')
print(links)
print(type(links))
3. Find node
child nodes
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
items = doc('.head_wrapper')
print(type(items))
print(items)

# find() returns all descendant nodes that match the selector;
# use children() instead to look at direct child nodes only.
lis = items.find('a')
print(type(lis))
print(lis)
Parent nodes
Use the parent() method to get the direct parent of a node.
Use the parents() method to get the ancestors of a node.
Sibling nodes
Use the siblings() method to get the sibling nodes of a node.
4. Traverse
from pyquery import PyQuery as pq

# Sample markup so the snippet runs on its own.
html = '''
<ul>
    <li class="item-0">first item</li>
    <li class="item-1">second item</li>
</ul>
'''

doc = pq(html)

# items() wraps every matched node in its own PyQuery object.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li, type(li))
5. Get information
Get attributes
Use the attr() method to get attributes (values)
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
items = doc('.head_wrapper')
print(items.attr('href'))
# can also be written as
print(items.attr.href)

# Get the href attribute of every <a>
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
a = doc('a')
# Iterate with items(): looping over the selection directly yields raw lxml
# elements, which do not have the .attr accessor.
for i in a.items():
    print(i.attr.href)
Get text
Use the text() method to get the plain-text content of the selected nodes.
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
a = doc('a')
# No need to traverse: text() is called on the whole selection.
print(a.text())
Use the html() method to get the inner HTML of a node, keeping the tags inside it.
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
a = doc('a')
# Traverse with items() so each node is a PyQuery object that has html();
# looping over the selection directly yields raw lxml elements.
for i in a.items():
    print(i)
    print(i.html())
6. Node operations
addClass and removeClass
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0 active"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
'''

doc = pq(html)

# ".item-0.active" (no space) matches nodes carrying BOTH classes.
# With a space, the selector would look for an <active> tag inside .item-0.
li = doc('.item-0.active')
print(li)

li.removeClass('active')
print(li)

li.addClass('active')
print(li)
attr, text and html
from pyquery import PyQuery as pq

html = '''
<div class="div">
    <p>ASD</p>
    <ul class="list">
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    </ul>
</div>
'''

doc = pq(html)

# ".item-0.active" (no space) matches the node carrying BOTH classes.
li = doc('.item-0.active')
print(li)

# attr(name, value) sets an attribute on the node.
li.attr('name', 'link')
print(li)

# text(value) replaces the node's content with plain text.
li.text('changed item')
print(li)

# html(value) replaces the node's content with the given markup.
li.html('<span>changed item</span>')
print(li)
remove()
from pyquery import PyQuery as pq

# `html` is the markup string defined in the "attr, text and html" example
# (the <div class="div"> document).
doc = pq(html)
res = doc('.div')

# remove() detaches the matched <ul> from the document. text() is then called
# on whatever remove() returns -- presumably the removed selection; verify
# against the pyquery docs.
print(res.find('ul').remove().text())
7. Pseudo-class selectors
To be improved.