Python web crawler 8 - the pyquery parsing library

1.pyquery Introduction

The advantage of pyquery is its very powerful CSS selectors.

 

2. Initialization

2.1 Initializing from a string

html = '''
<div id="container">
<ul class="list">
<li class="li li-first" name="item"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))

2.2 URL initialization

from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com/')
print(doc('title'))

# Equivalent to the following:
import requests
res = requests.get('http://www.baidu.com/')
doc = pq(res.text)
print(doc('title'))

2.3 Initializing from a local file

from pyquery import PyQuery as pq
doc = pq(filename='./test.html')
print(doc('li'))

 

3. Basic usage of CSS selectors

from pyquery import PyQuery as pq
doc = pq(filename='./test.html')
print(doc('#container .list li'))

 

4. Finding nodes

4.1 Child nodes

items = doc('.list')
print(items.find('li'))        # find() finds all descendant nodes
print(items.children('li'))    # children() finds only the direct child nodes
print(items.children('.li-first'))

4.2 Parent nodes

items = doc('.list')
print(items.parent())     # parent() finds the direct parent node
print(items.parents())    # parents() finds all ancestor nodes
print(items.parents('#container'))

4.3 Sibling nodes

items = doc('.list .item-1.active')
print(items.siblings())    # returns all sibling nodes
print(items.siblings('.item-0'))

 

5. Traversal

lis = doc('li').items()   # calling items() returns a generator; each li it yields is a new PyQuery object, so you can keep searching within it
print(type(lis))    # <class 'generator'>
for li in lis:
    print(li)
    print(type(li))   # <class 'pyquery.pyquery.PyQuery'>

 

6. Getting information

6.1 Getting attributes

a = doc('a')
print(a.attr('href'))    # calling attr() returns only the attribute of the first matching node
for item in a.items():
    print(item.attr('href'))      # iterate over the nodes to get all of them

6.2 Getting text

a = doc('a')
print(a.text())    # gets the plain text of all matched nodes, separated by spaces, returned as a single string
print(type(a.text()))    # <class 'str'>
print(a.html())    # returns only the inner HTML of the first node
for item in a.items():
    print(item.html())    # iterate over the nodes to get the HTML of every one

 

7. Node operations

pyquery provides a series of methods for modifying nodes dynamically, such as adding or removing a class, adding nodes, and so on.

7.1 removeClass() and addClass(): removing and adding values of the class attribute

li = doc('.item-1.active')    # multiple classes on the same node are written without a space between them; with a space, the selector would search descendants instead
print(li)                     # <li class="item-1 active"><a href="link3.html">third item</a></li>
li.removeClass('active')
print(li)                     # <li class="item-1"><a href="link3.html">third item</a></li>
li.addClass('active')
print(li)                     # <li class="item-1 active"><a href="link3.html">third item</a></li>

7.2 attr, text and html

li = doc('.item-1.active')
print(li)                          # <li class="item-1 active"><a href="link3.html">third item</a></li>
li.attr('name', 'link')            # modify an attribute: the first argument is the attribute name, the second is its value; if the second argument is omitted, the attribute value is returned instead
print(li)                          # <li class="item-1 active" name="link"><a href="link3.html">third item</a></li>
li.text('changed item')            # modify the text; if no argument is passed, the text content is returned instead
print(li)                          # <li class="item-1 active" name="link">changed item</li>
li.html('<span>changed item</span>')
print(li)                          # <li class="item-1 active" name="link"><span>changed item</span></li>

7.3 remove()

html = '''
<div id="container">
first
<p>second</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
div = doc('#container')
print(div.text())
div.find('p').remove()
print(div.text())       # now only the div's own direct text content is returned

Other node-manipulation methods include append(), empty(), prepend(), and so on; their usage is consistent with jQuery's.

 

8. Pseudo-class selectors

Guess you like

Origin www.cnblogs.com/rong1111/p/12164678.html