day03.2: installing and using bs4, traversing the document tree with the bs4 parsing library, and searching the document tree with bs4

bs4 Installation and Use

 

"""
Install the parser:
pip3 install lxml

Install the parsing library:
pip3 install bs4
"""
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Alternative: the parser that ships with Python (no extra install needed)
# soup = BeautifulSoup(html_doc, 'html.parser')

# Call the bs4 BeautifulSoup class with the lxml parser to get a soup object
soup = BeautifulSoup(html_doc, 'lxml')

# The bs4 object (prints the parsed document)
print(soup)

# The type of the bs4 object
print(type(soup))

# Pretty-print the document with indentation
html = soup.prettify()
print(html)

 

 Traversing the document tree with the bs4 parsing library

 

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# print(soup)
# print(type(soup))

# Traversing the document tree
# 1. Direct attribute access (returns the first matching tag) *****
print(soup.html)
print(type(soup.html))
print(soup.a)
print(soup.p)

# 2. Get the tag name
print(soup.a.name)

# 3. Get the tag attributes *****
print(soup.a.attrs)  # all attributes of the first <a> tag
print(soup.a.attrs['href'])

# 4. Get the tag text *****
print(soup.p.text)  # $37

# 5. Nested selection
print(soup.html.body.p)

# 6. Child nodes and descendant nodes
print(soup.p.children)  # returns an iterator object
print(list(soup.p.children))  # [<b>$37</b>]

# 7. Parent node and ancestor nodes
print(soup.b.parent)
print(soup.b.parents)  # returns a generator
print(list(soup.b.parents))

# 8. Sibling nodes
print(soup.a)
# Get the next sibling node
print(soup.a.next_sibling)

# Get all following sibling nodes; returns a generator
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))

# Get the previous sibling node
print(soup.a.previous_sibling)
# Get all preceding sibling nodes; the return value is a generator
print(list(soup.a.previous_siblings))

 

Searching the document tree with bs4

 

"""
find: return the first match
find_all: return all matches

Tags can be looked up by tag name and by attributes:

    name   match on the tag name
    attrs  match on the tag attributes
    text   match on the tag text

    Filters:
        - string filter
            match the whole string exactly

        - regex filter
            match with the re module

        - list filter
            match any item in the list

        - bool filter
            True matches

        - method (function) filter
            for lookups that need some attributes present and others absent

    Attribute shortcuts:
        - class_
        - id
"""
import re

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# Searching the document with find and find_all:
#   name   match on the tag name
#   attrs  match on the tag attributes
#   text   match on the tag text
# NOTE: newer Beautiful Soup releases call the text argument `string`;
# `text` is the older spelling, kept here to match the notes above.

# String filter
p = soup.find(name='p')
p_s = soup.find_all(name='p')

print(p)
print(p_s)

# name + attrs
p = soup.find(name='p', attrs={"id": "p"})
print(p)

# name + text
tag = soup.find(name='title', text="The Dormouse's story")
print(tag)

# name + text + attrs
tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
print(tag)

# Regex filter: match with the re module
# name: match tag names against a compiled pattern
a = soup.find(name=re.compile('a'))
print(a)

a_s = soup.find_all(name=re.compile('a'))
print(a_s)

# attrs: match attribute values against a compiled pattern
a = soup.find(attrs={"id": re.compile('link')})
print(a)

# List filter: a tag matches if it matches any item in the list
print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# Bool filter: True matches any tag name / any value of the attribute
print(soup.find(name=True, attrs={"id": True}))

# - Method (function) filter
# Use this when the lookup needs some attributes to be present and others to be absent.

def have_id_not_class(tag):
    """Match <p> tags that have an ``id`` attribute but no ``class`` attribute.

    Intended to be passed as a filter function to ``find`` / ``find_all``.

    Args:
        tag: A tag-like object exposing ``name`` and ``has_attr(attr_name)``.

    Returns:
        The tag itself when it matches, otherwise ``None``.
    """
    # print(tag.name)
    if tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class"):
        return tag
    return None

# Method filter: pass the function object itself as `name`, i.e.
# print(soup.find_all(name=<function object>))
print(soup.find_all(name=have_id_not_class))


# Supplementary notes: keyword shortcuts for attribute lookups
# id: look up a tag by its id attribute
a = soup.find(id='link2')
print(a)

# class: written as `class_` because `class` is a reserved word in Python
p = soup.find(class_='sister')
print(p)

 

Guess you like

Origin www.cnblogs.com/jiangbaoyabo/p/11129776.html