Python crawler data parsing basics: the bs4 module

Introduction:

I've recently been studying Python web crawlers; these are my study notes on bs4, the module used here for parsing the data a crawler fetches.


Use:

bs4 is a library for parsing XML and HTML documents.
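
To follow along, bs4 can be installed from PyPI, where the package is published as beautifulsoup4 (assuming pip is available):

pip install beautifulsoup4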


Official bs4 documentation:

https://www.crummy.com/software/BeautifulSoup/bs4/doc/


Study notes:


from bs4 import BeautifulSoup


html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>


<p class="story">...</p>

"""


soup = BeautifulSoup(html_doc, 'html.parser')    # create a BeautifulSoup object and name the html parser explicitly; the default parser can differ between platforms, so this argument can be omitted on Linux
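
A different parser can be passed by name; for example, if the third-party lxml package is installed (just an alternative, not required for these notes):

# soup = BeautifulSoup(html_doc, 'lxml')    # same interface, different underlying parser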

print(soup.prettify())    # pretty-print the parsed document

print(soup.get_text())    # print the entire text content of html_doc (lines separated by \n)

print('')


print(type(soup.title))

print(dir(soup.title))


print(soup.title)    # get the <title> tag

    <title>The Dormouse's story</title>

print(soup.title.text)    # get the text inside the <title> tag

    "The Dormouse's story"


print(soup.a)       # get the first <a> tag

    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(soup.a.attrs)    # get all attributes of the first <a> tag as a dictionary

    {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

print(soup.a.attrs['href'])    # get the href attribute of the first <a> tag

    'http://example.com/elsie'

print(soup.a.has_attr('class'))      # check whether the tag has a class attribute

    True
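
Attributes can also be read with dictionary-style indexing, or with .get(), which returns None instead of raising an error when the attribute is missing:

print(soup.a['href'])    # shorthand for soup.a.attrs['href']

    'http://example.com/elsie'

print(soup.a.get('id'))    # .get() returns None if the attribute does not exist

    'link1'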


print(soup.p)    # get the first <p> tag

    <p class="title"><b>The Dormouse's story</b></p>

print(soup.p.children)    # get all child nodes of the first <p> tag (returns an iterator)

    <list_iterator object at 0x7fe8185261d0>

print(list(soup.p.children))

    [<b>The Dormouse's story</b>]

print(list(soup.p.children)[0])

    <b>The Dormouse's story</b>

print(list(soup.p.children)[0].text)

    "The Dormouse's story"


print(soup.find_all('a'))    # get all <a> tags

    [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for a in soup.find_all('a'):    # loop over all <a> tags

    print(a.attrs['href'])
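
find_all() also accepts attribute filters and a limit on the number of results:

print(soup.find_all('a', class_='sister'))    # filter by class (class_ avoids clashing with the Python keyword)

print(soup.find_all('a', limit=2))    # return only the first two matches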


print(soup.find(id='link3'))    # get the tag with id="link3"

    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

print('#'*150)


# CSS selectors are supported

# find nodes with class "story"

print(soup.select('.story'))

print('')

print(soup.select('.story a'))

print('')

# find the node with id="link1"

print(soup.select('#link1'))
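
select() always returns a list; select_one() returns only the first node matching the selector (or None if nothing matches):

print(soup.select_one('#link1'))

    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>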

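
As a wrap-up, a minimal sketch of how these notes fit into a crawler: fetch a page with the third-party requests library (not used above and assumed to be installed) and parse the response with bs4. The URL is only a placeholder.

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://example.com')         # fetch the page; example.com is just a placeholder URL
page = BeautifulSoup(resp.text, 'html.parser')    # parse the response body
for a in page.find_all('a'):                      # print the href of every link on the page
    print(a.get('href'))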