Code section:
"""Walk through Beautiful Soup's node-selection and tree-navigation attributes.

Every example below runs against the same small HTML document and prints a
numbered label followed by the value of the attribute being demonstrated.
Iterator/generator-valued attributes are printed as-is (not expanded) so the
output shows what kind of object each attribute returns.
"""
from bs4 import BeautifulSoup

# All of the examples below are tested against this document.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# --- Selecting nodes by tag name -------------------------------------------
print("1; get the head tag")
print(soup.head)
print("2; get the b node under the p node")
print(soup.p.b)

# --- Node name and attributes ----------------------------------------------
# The name attribute gives the node's tag name.
print("4; the name attribute gets the node name")
print(soup.body.name)
# attrs returns the node's attributes as a dict; a value may be a list or a
# string depending on the attribute (class comes back as a list here).
print("5; get all attributes of the p node")
print(soup.p.attrs)
print("6; get the class attribute of the p node")
print(soup.p.attrs['class'])
# Indexing the tag directly is shorthand for going through attrs.
print("7; get the class attribute of the p node directly")
print(soup.p['class'])

# --- Text content ------------------------------------------------------------
# soup.p is the first <p> in the document, so only its text is printed.
print("8; get the text under a tag (only the first match)")
print(soup.p.string)

# --- Children and descendants ----------------------------------------------
# contents returns the direct children as a list.
print("9; the contents attribute gets the direct children as a list")
print(soup.body.contents)
# children yields the same direct children lazily; the iterator object itself
# is printed to show the return type.
print("10; the children attribute gets the direct children as an iterator")
print(soup.body.children)
# descendants returns a generator over the descendant nodes.
print("11; the descendants attribute gets all descendant nodes as a generator")
print(soup.body.descendants)

# --- Parents -----------------------------------------------------------------
# parent is the enclosing node; parents is a generator over the ancestors.
print("12; parent gets the parent node, parents gets the ancestor nodes as a generator")
print(soup.b.parent)
print(soup.b.parents)

# --- Siblings ----------------------------------------------------------------
# next_sibling returns the next sibling node.
print("13; next_sibling returns the next sibling node")
print(soup.a.next_sibling)
# previous_sibling returns the previous sibling node; text (including
# newlines) between tags counts as a node too.
print("14; previous_sibling returns the previous sibling node; note that a newline is also a node")
print(soup.a.previous_sibling)
# next_siblings / previous_siblings are generators over the siblings in each
# direction.
print("15; next_siblings returns all following sibling nodes")
print(soup.a.next_siblings)
print("16; previous_siblings returns all preceding sibling nodes; note that a newline is also a node")
print(soup.a.previous_siblings)

# --- Parse order -------------------------------------------------------------
# next_element / previous_element give the next or previous parsed object.
print("17; next_element and previous_element get the next or previous parsed object")
print(soup.a.next_element)
print(soup.a.previous_element)
# next_elements / previous_elements are generators that walk the parsed
# document forward or backward from the node.
print("18; next_elements and previous_elements iterate forward or backward over the parsed document")
print(soup.a.next_elements)
print(soup.a.previous_elements)
Output:
/home/aaron/Desktop/Python3-Test/venv/bin/python /home/aaron/Desktop/Python3-Test/bs4-study.py
1; get the head tag
<head><title>The Dormouse's story</title></head>
2; get the b node under the p node
<b>The Dormouse's story</b>
4; the name attribute gets the node name
body
5; get all attributes of the p node
{'class': ['title']}
6; get the class attribute of the p node
['title']
7; get the class attribute of the p node directly
['title']
8; get the text under a tag (only the first match)
The Dormouse's story
9; the contents attribute gets the direct children as a list
['\n', <p class="title"><b>The Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>, '\n']
10; the children attribute gets the direct children as an iterator
<list_iterator object at 0x7f0b1bd17750>
11; the descendants attribute gets all descendant nodes as a generator
<generator object Tag.descendants at 0x7f0b19e17d50>
12; parent gets the parent node, parents gets the ancestor nodes as a generator
<p class="title"><b>The Dormouse's story</b></p>
<generator object PageElement.parents at 0x7f0b19e17d50>
13; next_sibling returns the next sibling node
,

14; previous_sibling returns the previous sibling node; note that a newline is also a node
Once upon a time there were three little sisters; and their names were

15; next_siblings returns all following sibling nodes
<generator object PageElement.next_siblings at 0x7f0b19e17d50>
16; previous_siblings returns all preceding sibling nodes; note that a newline is also a node
<generator object PageElement.previous_siblings at 0x7f0b19e17d50>
17; next_element and previous_element get the next or previous parsed object
Elsie
Once upon a time there were three little sisters; and their names were

18; next_elements and previous_elements iterate forward or backward over the parsed document
<generator object PageElement.next_elements at 0x7f0b19e17d50>
<generator object PageElement.previous_elements at 0x7f0b19e17d50>

Process finished with exit code 0