1. Installing and using bs4
# Install the parser:
#     pip3 install lxml
# Install the parsing library:
#     pip3 install bs4

from bs4 import BeautifulSoup

# Sample document used by the demo. The text is kept on one logical line
# (adjacent literals are concatenated) so the whitespace between tags is
# exactly what the parser receives.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> "
    '<body> '
    '<p class="sister"><b>$37</b></p> '
    '<p class="story" id="p">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" >Elsie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)

# Alternative: the parser that ships with Python.
# soup = BeautifulSoup(html_doc, 'html.parser')

# Calling BeautifulSoup on the markup yields a soup object (lxml parser).
soup = BeautifulSoup(html_doc, 'lxml')

# The bs4 object itself.
print(soup)

# The type of the bs4 object.
print(type(soup))

# Pretty-printed markup.
html = soup.prettify()
print(html)
2. Traversing the document tree with the bs4 parsing library
from bs4 import BeautifulSoup

# Sample document used by the demo. The text is kept on one logical line
# (adjacent literals are concatenated) so the whitespace between tags, which
# shows up as text-node siblings below, is exactly what the parser receives.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> "
    '<body> '
    '<p class="sister"><b>$37</b></p> '
    '<p class="story" id="p">Once upon a time there were three little sisters; '
    'and their names were '
    '<a href="http://example.com/elsie" class="sister" >Elsie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</p> '
    '<p class="story">...</p> '
)

soup = BeautifulSoup(html_doc, 'lxml')

# --- Traversing the document tree ---

# 1. Direct access by tag name (returns the first matching tag).
print(soup.html)
print(type(soup.html))
print(soup.a)
print(soup.p)

# 2. Name of a tag.
print(soup.a.name)

# 3. Attributes of a tag.
print(soup.a.attrs)  # all attributes of the first <a> tag
print(soup.a.attrs['href'])

# 4. Text of a tag.
print(soup.p.text)

# 5. Nested selection.
print(soup.html.body.p)

# 6. Child nodes.
print(soup.p.children)  # lazy iterable; wrap in list() to see the nodes
print(list(soup.p.children))

# 7. Parent and ancestor nodes.
print(soup.b.parent)
print(soup.b.parents)  # lazy iterable of every ancestor
print(list(soup.b.parents))

# 8. Sibling nodes.
print(soup.a)
# The next sibling.
print(soup.a.next_sibling)
# All following siblings (lazy iterable).
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))
# The previous sibling.
print(soup.a.previous_sibling)
# All preceding siblings (lazy iterable, materialised here).
print(list(soup.a.previous_siblings))
3. Searching the document tree with the bs4 parsing library
# find:     return the first match
# find_all: return every match
#
# Both support tag lookup and attribute lookup.
#
# Tag filters:
#   - string filter: exact string match
#       name   match on the tag name
#       attrs  match on attributes
#       text   match on the tag's text
#   - regex filter:  match with a compiled ``re`` pattern
#   - list filter:   match any entry of the list
#   - bool filter:   ``True`` matches anything
#   - method filter: a predicate, for lookups the filters above cannot express
#
# Attribute shortcuts:
#   - class_
#   - id

import re

from bs4 import BeautifulSoup

# Sample document used by the demo. The text is kept on one logical line
# (adjacent literals are concatenated); there is deliberately no whitespace
# between most tags.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head>"
    '<body>'
    '<p class="sister"><b>$37</b></p>'
    '<p class="story" id="p">Once upon a time there were three little sisters; '
    'and their names were'
    '<a href="http://example.com/elsie" class="sister" >Elsie</a>'
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and'
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>'
    'and they lived at the bottom of a well.</p>'
    '<p class="story">...</p> '
)

soup = BeautifulSoup(html_doc, 'lxml')

# --- String filter ---

# name
p = soup.find(name='p')
ps = soup.find_all(name='p')
print(p)
print(ps)

# name + attrs
p = soup.find(name='p', attrs={"id": "p"})
print(p)

# name + text
tag = soup.find(name='title', text="The Dormouse's story")
print(tag)

# name + text + attrs
tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
print(tag)

# --- Regex filter: match with the re module ---

# name: any tag whose name matches the pattern
a = soup.find(name=re.compile('a'))
print(a)

a_s = soup.find_all(name=re.compile('a'))
print(a_s)

# attrs: any tag whose id matches the pattern
a = soup.find(attrs={"id": re.compile('link')})
print(a)

# --- List filter: match any entry of the list ---

print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# --- Bool filter: True matches ---

# Any tag name, as long as the tag carries an id attribute.
print(soup.find(name=True, attrs={"id": True}))


# --- Method filter: for lookups the other filters cannot express ---

def has_class_no_id(tag):
    """Return *tag* if it is a <p> with a class attribute and no id.

    Used as a predicate for ``find_all``; a falsy result (``None``) rejects
    the tag.
    """
    # NOTE(review): the condition in the source text was truncated
    # (only `has_attr("id" ... class")` survived); it is reconstructed here
    # from the function name. Confirm against the intended behaviour.
    print(tag.name)  # trace every tag the filter is asked about
    if tag.name == 'p' and tag.has_attr("class") and not tag.has_attr("id"):
        return tag
    return None


print(soup.find_all(name=has_class_no_id))  # name accepts a function object

# --- Supplementary: attribute keyword shortcuts ---

# id
a = soup.find(id='link2')
print(a)

# class (spelled class_ because `class` is a reserved word in Python)
p = soup.find(class_='sister')
print(p)