python - BeautifulSoup教程

BeautifulSoup

BeautifulSoup将网页元素的正则化查找简单化。

from bs4 import BeautifulSoup

# Sample HTML document that the lookups below are run against.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

# Parse the document with the lxml parser.
soup = BeautifulSoup(html_doc, features="lxml")
# Find the first <a> tag; returns a single tag object.
tag1 = soup.find(name='a')
# Find every <a> tag; returns a collection of tag objects.
tag2 = soup.find_all(name='a')
# Find the tag(s) with id="link2" using CSS selector syntax; select() returns a list of matches.
tag3 = soup.select('#link2')

这里写图片描述

使用示例

from bs4 import BeautifulSoup

# NOTE(review): the <body> is abbreviated to "..." here; the numbered snippets
# that follow presumably assume the full sample document — confirm, since
# soup.find('a') would find nothing in this literal.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

# Parse the document with the lxml parser.
soup = BeautifulSoup(html_doc, features="lxml")

1. 获取和设置标签名称

# Read the name of the first <a> tag, then rename it to <span>.
tag = soup.find('a')
name = tag.name # read the tag name
print(name)
tag.name = 'span' # rename the tag in place
# Print the whole document to show the effect of the rename.
print(soup)

2. 获取和设置标签属性

# Read the attributes of the first <a> tag, then replace and extend them.
tag = soup.find('a')
attrs = tag.attrs    # read the attributes
print(attrs)
tag.attrs = {'ik':123} # replace the whole attribute mapping
tag.attrs['id'] = 'iiiii' # set a single attribute
# Print the whole document to show the modified attributes.
print(soup)

3. 按名称查找某标签

# Look up the first tag with the given tag name.
tag = soup.find('a')
print (tag)

4. 按类查找某标签

# Look up a tag by its class name (class is an attribute).
# Option 1: the class_ keyword (trailing underscore because `class` is a
# reserved word in Python).
tag = soup.find(class_= 'ConsTi')
print(tag)
# Option 2: pass the attribute through the attrs dictionary.
tag = soup.find(attrs={'class': 'ConsTi'})
print(tag)

4. 按ID查找某标签

# Look up a tag by its id (id is an attribute).
# Option 1: the id keyword argument.
tag = soup.find(id = 'banner')
print(tag)
# Option 2: pass the attribute through the attrs dictionary.
tag = soup.find(attrs = {'id':'banner'})
print(tag)

5. 获取子节点和子孙节点

# Iterate over the direct children and tell Tag nodes apart from strings.
# Children are either Tag objects or NavigableString objects; the latter
# often hold nothing but whitespace such as newlines.
from bs4.element import Tag
tags = tag.children
for i in tags:
    # isinstance() is the idiomatic type check (rather than type(i) == Tag).
    if isinstance(i, Tag):
        print(i)
    else:
        print('string type')

# Collect all descendants (children, grandchildren, ...), depth-first.
tags = tag.descendants
print(list(tags))

6. 清除节点下内容

# Three ways to remove content; each call is shown as a separate option.
# 1. Empty the tag's contents but keep the tag itself.
tag.clear()
# 2. Remove the tag together with everything inside it.
tag.decompose()
# 3. Remove the tag together with everything inside it, and return what was removed.
tag.extract()

7. 将tag转化为字符串

# Convert a tag object to its textual markup.
tag.decode() # returns the markup as a string
str(tag) # same as above: returns a string
tag.encode()# returns the markup as bytes

8. find方法细说

tag = soup.find('a')
print(tag)
# recursive=True: search all descendants of the starting tag (the default).
# recursive=False: only consider the starting tag's direct children.
# text: match on the string content of the tag.
# NOTE(review): newer Beautiful Soup releases document this argument as
# `string`; `text` is the older spelling — confirm against the installed version.
tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# Same query, using the class_ keyword instead of the attrs dictionary.
tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
print(tag)

9. find_all方法细说

上述find()的用法同样适用于find_all()

# Values given as a list are OR-ed: an element matches if it matches any entry.
# Match tags named either <a> or <div>.
v = soup.find_all(name=['a','div'])
print(v)

# Match tags whose class is either "sister0" or "sister".
v = soup.find_all(class_=['sister0', 'sister'])
print(v)

# Match by text; the type() print shows what kind of object a text match yields.
v = soup.find_all(text=['Tillie'])
print(v, type(v[0]))


# Match tags whose id is either "link1" or "link2".
v = soup.find_all(id=['link1','link2'])
print(v)

# Match tags by href value.
# NOTE(review): 'link1'/'link2' are id values in the sample document, not
# href values, so this presumably matches nothing — confirm the intended values.
v = soup.find_all(href=['link1','link2'])
print(v)

10.正则查找

####### regular expressions #######
import re
# NOTE: this first pattern is overwritten by the next line before it is used.
rep = re.compile('p')
# Tag names that start with "p".
rep = re.compile('^p')
v = soup.find_all(name=rep)
print(v)

# Tags whose class matches the pattern "sister.*".
rep = re.compile('sister.*')
v = soup.find_all(class_=rep)
print(v)

# Tags whose href matches the given URL pattern.
# NOTE(review): the dots in the host name are unescaped, so each one matches
# any character, not only a literal ".".
rep = re.compile('http://www.oldboy.com/static/.*')
v = soup.find_all(href=rep)
print(v)

11.其他补充

# Check whether the tag has a given attribute.
tag.has_attr('id')

# Get the text content inside the tag.
v = tag.get_text()

# Get the value of a tag attribute.
tag = soup.find('a')
v = tag.get('id')
print(v)

12.查找索引

# Iterate over <body> and print each item together with its index.
tag = soup.find('body')
for i,v in enumerate(tag):
    print(i,v)

13. 判断自闭合标签

常见的自闭合标签:'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'

# Look up the first <br> tag and report whether it is a self-closing (empty) element.
tag = soup.find('br')
v = tag.is_empty_element
print(v)

14. CSS选择器

# Every <title> tag.
soup.select("title")

# Every <p> that is the third <p> among its siblings.
# The ":" is required: without it "nth-of-type(3)" is not read as a pseudo-class.
soup.select("p:nth-of-type(3)")

# Every <a> that is a descendant of <body>.
soup.select("body a")

# Every <title> nested under <head> nested under <html>.
soup.select("html head title")

# Selector list: every <span> or <a>.
tag = soup.select("span,a")

# Every <title> that is a direct child of <head>.
soup.select("head > title")

# Every <a> that is a direct child of a <p>.
soup.select("p > a")

# Direct-child <a> of a <p> that is the second <a> among its siblings.
soup.select("p > a:nth-of-type(2)")

# The element with id="link1" when it is a direct child of a <p>.
soup.select("p > #link1")

# Every <a> that is a direct child of <body>.
soup.select("body > a")

# Every later sibling of #link1 that has class "sister".
soup.select("#link1 ~ .sister")

# The sibling immediately after #link1, if it has class "sister".
soup.select("#link1 + .sister")

# Every element with class "sister".
soup.select(".sister")

# Every element whose class attribute contains the word "sister".
soup.select("[class~=sister]")

# The element with id="link1".
soup.select("#link1")

参考:https://www.cnblogs.com/wupeiqi/articles/6283017.html

猜你喜欢

转载自blog.csdn.net/ZenG_xiangt/article/details/81713035
今日推荐