Python3 crawler (7): using the pyquery parsing library

 Infi-chu:

http://www.cnblogs.com/Infi-chu/

pyquery is a parsing library that lets you query and manipulate HTML with CSS selectors and a jQuery-style API

1. Initialize
string initialization

from pyquery import PyQuery as pq

# Build the document straight from an HTML string (`html` holds the markup text).
doc = pq(html)
# Calling the document with a CSS selector returns the matching nodes.
li_nodes = doc('li')
print(li_nodes)

URL initialization

from pyquery import PyQuery as pq
import requests

# Let pyquery download the page itself; the URL must include the scheme,
# otherwise the request cannot be made.
doc = pq(url='http://www.baidu.com')
print(doc('title'))

# another method: fetch the page with requests and hand the HTML text over.
# PyQuery expects markup, not a Response object, hence `.text`.
doc = pq(requests.get('http://www.baidu.com').text)
print(doc('title'))

file initialization

from pyquery import PyQuery as pq

# Parse a local HTML file, then print every <li> node found in it.
doc = pq(filename='text.html')
li_nodes = doc('li')
print(li_nodes)

2. Basic CSS selectors

from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
# The selector is an ordinary string and must be quoted: <a> nodes inside
# .head_wrapper, which is itself inside the element with id "head".
links = doc('#head .head_wrapper a')
print(links)
print(type(links))

3. Find node
child nodes

from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
wrapper = doc('.head_wrapper')
# find() returns every descendant node that matches the selector;
# children() can be used instead to look at direct child nodes only.
links = wrapper.find('a')
# Show the type and the content of each selection in turn.
for selection in (wrapper, links):
    print(type(selection))
    print(selection)

Parent nodes
Use the parent() method to get the direct parent of a node
Use the parents() method to get all ancestor nodes of a node

Sibling nodes
Use the siblings() method to get the sibling nodes of a node

4. Traverse

from pyquery import PyQuery as pq

doc = pq(html)
# items() yields each matched node wrapped as its own PyQuery object.
li_nodes = doc('li').items()
print(type(li_nodes))
for node in li_nodes:
    print(node, type(node))

5. Get information
Get attributes
Use the attr() method to get attributes (values)

from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
# Select the links themselves: href is an attribute of the <a> nodes,
# not of the surrounding .head_wrapper container.
items = doc('.head_wrapper a')
# When several nodes match, attr() reads the attribute of the first one only.
print(items.attr('href'))
# can also be written as
print(items.attr.href)

# Get the href attribute of every a node
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
a = doc('a')
# Iterating a PyQuery object directly yields raw lxml elements, which have no
# .attr; items() yields PyQuery-wrapped nodes instead.
for i in a.items():
    print(i.attr.href)

Get text
Use the text() method to get the plain-text content of a node

from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
a = doc('a')
# text() already joins the text of every matched node, so no need to traverse.
print(a.text())

Use the html() method to get the inner HTML of a node, keeping the tags inside it

from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
a = doc('a')
# html() only returns the inner HTML of the first matched node, so traverse
# with items() (which yields PyQuery-wrapped nodes) to see every link.
for i in a.items():
    print(i)
    print(i.html())

6. Node operations
addClass and removeClass

from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0 active"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# Two classes on the same element are chained without a space; with a space,
# '.item-0 active' would look for an <active> tag inside .item-0 and match nothing.
li = doc('.item-0.active')
print(li)
# removeClass()/addClass() modify the selected nodes in place.
li.removeClass('active')
print(li)
li.addClass('active')
print(li)

attr, text and html

from pyquery import PyQuery as pq

html = '''
<div class="div">
<p>ASD</p>
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
</div>
'''
doc = pq(html)
# Two classes on the same element are chained without a space; with a space,
# '.item-0 active' would look for an <active> tag inside .item-0 and match nothing.
li = doc('.item-0.active')
print(li)
# With a second argument, attr() sets the attribute instead of reading it.
li.attr('name', 'link')
print(li)
# With an argument, text() replaces the node's content with plain text ...
li.text('changed item')
print(li)
# ... and html() replaces it with the given markup.
li.html('<span>changed item</span>')
print(li)

remove()

from pyquery import PyQuery as pq

doc = pq(html)
res = doc('.div')
# remove() detaches the matched <ul> from the document; the call is chained,
# so the text printed here is that of the removed list.
removed = res.find('ul').remove()
print(removed.text())

7. Pseudo-class selectors
To be improved

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325326819&siteId=291194637