Introduction:
I have recently been learning web scraping (crawlers) with Python. These are my study notes on bs4 (Beautiful Soup 4), a module for parsing the scraped data.
Usage:
bs4 is a library for parsing markup documents; it only handles XML and HTML.
Official bs4 documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Study notes:
from bs4 import BeautifulSoup

# Sample HTML document parsed by the examples in these notes.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Create the BeautifulSoup object. The parser is named explicitly: when it is
# omitted, bs4 picks the best parser installed, which can differ between
# platforms/environments (the original note says it could be left out on Linux).
soup = BeautifulSoup(html_doc, 'html.parser')
# Whole-document output, then access to the <title> tag.
# Lines starting with "# " directly under a print() show the output recorded in these notes.
print(soup.prettify())  # pretty-printed (indented) rendering of the document
print(soup.get_text())  # all text content of html_doc (lines separated by \n)
print('')
print(type(soup.title))
print(dir(soup.title))
print(soup.title)  # the <title> tag
# <title>The Dormouse's story</title>
print(soup.title.text)  # the text inside the <title> tag
# The Dormouse's story
# Accessing the first <a> tag and its attributes.
# Lines starting with "# " directly under a print() show the output recorded in these notes.
print(soup.a)  # the first <a> tag
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.a.attrs)  # all attributes of the first <a> tag, as a dict
# {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
print(soup.a.attrs['href'])  # the href attribute of the first <a> tag
# http://example.com/elsie
print(soup.a.has_attr('class'))  # whether the tag has a class attribute
# True
# Accessing the first <p> tag and its child nodes.
# Lines starting with "# " directly under a print() show the output recorded in these notes.
print(soup.p)  # the first <p> tag
# <p class="title"><b>The Dormouse's story</b></p>
print(soup.p.children)  # .children is an iterator over the child nodes, not a list
# <list_iterator object at 0x7fe8185261d0>   (the address varies from run to run)
print(list(soup.p.children))  # materialize the iterator to see the children
# [<b>The Dormouse's story</b>]
print(list(soup.p.children)[0])
# <b>The Dormouse's story</b>
print(list(soup.p.children)[0].text)
# The Dormouse's story
# Searching the tree with find_all() and find().
# Lines starting with "# " directly under a print() show the output recorded in these notes.
print(soup.find_all('a'))  # every <a> tag, as a list
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
for link in soup.find_all('a'):  # iterate over every <a> tag
    print(link.attrs['href'])
# The keyword must be lowercase `id`: keyword arguments filter on the attribute
# of that exact name.
print(soup.find(id='link3'))  # the tag whose id is "link3"
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print('#'*150)
# CSS selectors are supported through select().
class_selectors = (
    '.story',    # nodes whose class name is "story"
    '.story a',  # <a> tags nested inside nodes whose class name is "story"
)
for css in class_selectors:
    print(soup.select(css))
    print('')
# Find the node with id=link1.
print(soup.select('#link1'))