初探python爬虫（五）——bs4

安装
cmd -> pip install beautifulsoup4 lxml
（注意：PyPI 上的包名是 beautifulsoup4，而不是 BeautifulSoup；下文使用的 'lxml' 解析器还需要安装 lxml。）
使用
import requests
from bs4 import BeautifulSoup

# Sample HTML document used by every example below.
html_doc = """
<html><head><title abc="123">The Dormouse's story</title></head> <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Build the BeautifulSoup object from the document, parsed with 'lxml'.
soup = BeautifulSoup(markup=html_doc, features='lxml')

# #1.通过tag标签对象获取文档数据
# r = soup .title
# print(r)    #<title abc="123">The Dormouse's story</title>
#
# r = soup.title['abc']
# print(r)    #123
#
# r = soup.p
# print(r)    #<p class="title"><b>The Dormouse's story</b></p>
#
# r = soup.p['class']
# print(r)    #['title']
#
# r = soup.title.text
# print(r)    #The Dormouse's story
#
# r = soup.p.parent.name
# print(r)    #body
#
#
# #2.通过搜索获取页面中的元素，find，find_all
# r= soup.find('a')
# print(r)    #<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#
# r=soup.find_all('a')
# print(r)
# # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
#
# r=soup.find('title')
# print(r)    #<title abc="123">The Dormouse's story</title>
# print(r,type(r))    #<title abc="123">The Dormouse's story</title> <class 'bs4.element.Tag'>
# print(r.text)   #The Dormouse's story
# print(r.get_text())  #The Dormouse's story

# 3-7. Look up elements with CSS selectors via soup.select(), which returns a
# list of matches. The selectors are tried in this order:
#   'title'        3. by tag name
#   '.title'       4. by class name
#                     [<p class="title"><b>The Dormouse's story</b></p>]
#   '#link2'       5. by id
#                     [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
#   'html body p'  6. space-separated hierarchy; yields the three <p> tags:
#                     [<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
#                     <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#                     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#                     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#                     and they lived at the bottom of a well.</p>, <p class="story">...</p>]
#   'a,title'      7. comma-separated group of selectors
for css in ('title', '.title', '#link2', 'html body p', 'a,title'):
    r = soup.select(css)

# Only the result of the last selector ('a,title') is printed.
print(r)
#[<title abc="123">The Dormouse's story</title>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
HadwinLing
发布了42 篇原创文章 · 获赞 12 · 访问量 6111
私信关注
初探python爬虫（五）——bs4

猜你喜欢