Day03 --- BeautifulSoup

1. bs4 installation and use

'''
Install the parser:
pip3 install lxml
Install the parsing library:
pip3 install bs4
'''
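A minimal sketch (assuming both packages installed without errors) to confirm that bs4 imports and that lxml is usable as its parser:

# quick check that bs4 and lxml work together;
# BeautifulSoup raises FeatureNotFound if lxml is missing
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello</p>", "lxml")
print(soup.p.text)  # hello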

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup

# Python's built-in parser
# soup = BeautifulSoup(html_doc, 'html.parser')

# build a bs4 object and call it soup
soup = BeautifulSoup(html_doc, 'lxml')

# the bs4 object
print(soup)
# the bs4 type
print(type(soup))

# prettified output
html = soup.prettify()
print(html)
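A small aside on the commented-out line above (standard bs4 usage, not specific to this tutorial): the built-in html.parser needs no extra install, while lxml is generally faster and more tolerant of broken markup; both return the same kind of object:

# the parser choice only affects how the markup is parsed
soup_builtin = BeautifulSoup(html_doc, 'html.parser')
print(type(soup_builtin))  # <class 'bs4.BeautifulSoup'>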

2. Traversing the document tree with bs4

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc,'lxml')

# Traversing the document tree

# 1. access tags directly
print(soup.html)
print(type(soup.html))
print(soup.a)
print(soup.p)

# 2. get the tag name
print(soup.a.name)

# 3. get the tag's attributes
print(soup.a.attrs)           # get all attributes of the a tag
print(soup.a.attrs['href'])

# 4. get the tag's text
print(soup.p.text)

# 5. nested selection
print(soup.html.body.p)

# 6. child nodes, descendant nodes
print(soup.p.children)
print(list(soup.p.children))

# 7. parent node, ancestor nodes
print(soup.b.parent)
print(soup.b.parents)
print(list(soup.b.parents))

# 8. sibling nodes
print(soup.a)
# get the next sibling
print(soup.a.next_sibling)
# get all following siblings; returns a generator
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))
# get the previous sibling
print(soup.a.previous_sibling)
# get all preceding siblings; returns a generator
print(list(soup.a.previous_siblings))
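Point 6 above only iterates direct children; a short sketch (reusing the same soup) showing the difference between .children and .descendants:

# .children yields only the direct child nodes of the p tag,
# while .descendants also yields nested nodes (the b tag and its "$37" text)
for child in soup.p.children:
    print('child:', child)

for node in soup.p.descendants:
    print('descendant:', node)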

3. Searching the document tree with bs4

'''
find: returns the first match
find_all: returns all matches

Tag lookup and attribute lookup:
    Tags:
        - string filter
            name matches the tag name
            attrs matches the attributes
            text matches the text
        - regex filter
            matches with the re module
        - list filter
            matches any item in the list
        - bool filter
            True matches anything
        - method filter
            used for lookups that the built-in attribute matching cannot express
    Attributes:
        - class_
        - id
'''

html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
"""


look and find find_all#text matches the text
#attrs find matching attribute
#name attribute matches#)'lxml'= the BeautifulSoup (html_doc,
Soupthe BeautifulSoupImportBS4from






'''string filter'''
p = soup.find(name='p')
ps = soup.find_all(name='p')

print(p)
print(ps)

# name + attrs
p = soup.find(name='p',attrs={"id":"p"})
print(p)


# name + text
tag = soup.find(name='title',text="The Dormouse's story")
print(tag)


# name + text + attrs
tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
print(tag)


'''regex filter'''
'''matching with the re module'''
# name
# match tag names via the re module
import re
a = soup.find(name=re.compile('a'))
print(a)
a_s = soup.find_all(name=re.compile('a'))
print(a_s)

# attrs
a = soup.find(attrs={"id": re.compile('link')})
print(a)

'''list filter'''
'''matches any item in the list'''
import re
print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))


'''bool filter'''
'''True matches anything'''
print(soup.find(name=True, attrs={"id": True}))


'''method filter'''
'''used for lookups that the built-in attribute matching cannot express'''
def has_class_no_id(tag):
    print(tag.name)
    if tag.name == 'p' and tag.has_attr("class") and not tag.has_attr("id"):
        return tag

print(soup.find_all(name=has_class_no_id))  # name = a function object


# supplementary knowledge
# id
a = soup.find(id='link2')
print(a)

# class
p = soup.find(class_='sister')
print(p)
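One behaviour worth keeping in mind (standard bs4 behaviour, shown with the same soup): find returns None when nothing matches, while find_all returns an empty list, so guard before accessing attributes:

# find -> None on no match; find_all -> empty list
missing = soup.find(name='table')
print(missing)                       # None
print(soup.find_all(name='table'))   # []

# avoid AttributeError on a missing tag
if missing is not None:
    print(missing.text)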

 

Origin www.cnblogs.com/gumball/p/11128663.html