bs4 Installation and Use
"""Install bs4 and build a first BeautifulSoup object.

Install the parser:          pip3 install lxml
Install the parsing library: pip3 install bs4
"""
from bs4 import BeautifulSoup

# Sample document used throughout this example.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Alternative: the parser that ships with Python.
# soup = BeautifulSoup(html_doc, 'html.parser')

# Call bs4 to obtain a soup object (here using the lxml parser).
soup = BeautifulSoup(html_doc, 'lxml')

# The bs4 object itself.
print(soup)

# The type of the bs4 object.
print(type(soup))

# Pretty-print the parsed HTML.
html = soup.prettify()
print(html)
Traversing the document tree with bs4
from bs4 import BeautifulSoup

# Sample document used throughout this example.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# print(soup)
# print(type(soup))

# --- Traversing the document tree ---

# 1. Direct attribute access *****
print(soup.html)
print(type(soup.html))
print(soup.a)
print(soup.p)

# 2. Get the tag name
print(soup.a.name)

# 3. Get tag attributes *****
print(soup.a.attrs)  # all attributes of the <a> tag
print(soup.a.attrs['href'])

# 4. Get the tag text *****
print(soup.p.text)  # $37

# 5. Nested selection
print(soup.html.body.p)

# 6. Child nodes, descendant nodes
print(soup.p.children)  # returns an iterator object
print(list(soup.p.children))  # [<b>$37</b>]

# 7. Parent node, ancestor nodes
print(soup.b.parent)
print(soup.b.parents)
print(list(soup.b.parents))

# 8. Sibling nodes
print(soup.a)
# Get the next sibling node
print(soup.a.next_sibling)
# Get all following sibling nodes; returns a generator
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))

# Get the previous sibling node
print(soup.a.previous_sibling)
# Get all preceding sibling nodes; what is returned is a generator
print(list(soup.a.previous_siblings))
Searching the document tree with bs4
"""Searching the document tree with find / find_all.

find:     return the first match
find_all: return all matches

Lookup by tag and by attribute:
    Tag arguments:
        name   match on the tag name
        attrs  match on attributes
        text   match on the tag text

    Filters:
        - string filter: the string must match in full
        - regex filter:  match using the re module
        - list filter:   match any item of the list
        - bool filter:   True matches
        - method filter: for lookups that want some attributes
                         present and others absent

    Attribute keywords:
        - class_
        - id
"""
import re

from bs4 import BeautifulSoup

# Compact sample document: no whitespace between the tags.
html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# name   tag name
# attrs  match on attributes
# text   match on text
# find and find_all search the document

# --- String filter ---
p = soup.find(name='p')
p_s = soup.find_all(name='p')
print(p)
print(p_s)

# name + attrs
p = soup.find(name='p', attrs={"id": "p"})
print(p)

# name + text
# NOTE: newer bs4 releases spell this argument `string=`; `text=` is kept
# here as in the original example.
tag = soup.find(name='title', text="The Dormouse's story")
print(tag)

# name + text + attrs
tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
print(tag)

# --- Regex filter: matching with the re module ---
# name
# Match nodes by tag name using the re module
a = soup.find(name=re.compile('a'))
print(a)

a_s = soup.find_all(name=re.compile('a'))
print(a_s)

# attrs
a = soup.find(attrs={"id": re.compile('link')})
print(a)

# --- List filter ---
# Match any of the items in the list
print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# --- Bool filter ---
# True matches
print(soup.find(name=True, attrs={"id": True}))


# --- Method filter ---
# For lookups that want some attributes present and others absent.
def have_id_not_class(tag):
    """Return ``tag`` if it is a <p> with an ``id`` but no ``class``.

    Falls through (returning None) for every other tag, so it can be
    passed to find / find_all as the ``name`` filter.
    """
    # print(tag.name)
    if tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class"):
        return tag


# print(soup.find_all(name=<function object>))
print(soup.find_all(name=have_id_not_class))

# --- Additional notes ---
# id
a = soup.find(id='link2')
print(a)

# class
p = soup.find(class_='sister')
print(p)