简介

Beautiful Soup 是一个可以从 HTML 或 XML 文件中提取数据的 Python 库。它能够通过你喜欢的解析器，以符合习惯的方式对文档进行导航、查找和修改。Beautiful Soup 会帮你节省数小时甚至数天的工作时间。

安装

$ pip install beautifulsoup4

Beautiful Soup 不仅支持 Python 标准库自带的 HTML 解析器（html.parser），还支持一些第三方解析器，如 lxml（可解析 HTML 和 XML）和 html5lib，但使用前需要先安装相应的库。

$ pip install lxml

$ pip install html5lib

使用

示例一

__author__ = 'MrChen'
 
from bs4 import BeautifulSoup
# Sample HTML document that the examples below parse and query.
# Note: the markup stops after the last <p>; the closing </body> and </html>
# tags are absent from the string itself.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Build the parse tree. The first argument may be a markup string or an open
# file object, e.g. open('mydoc.html').
# The parser is named explicitly: when it is omitted, bs4 picks whichever
# parser happens to be installed and emits a GuessedAtParserWarning, so the
# resulting tree can differ from one machine to another.
soup = BeautifulSoup(html_doc, 'html.parser')
 
# --- Navigating by tag name -------------------------------------------------

# Attribute access returns the first tag with that name.
print(soup.title)
# Output: <title>The Dormouse's story</title>

# .parent walks one level up the tree.
print(soup.title.parent)
# Output: <head><title>The Dormouse's story</title></head>

print(soup.title.parent.parent)
# Output:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="story">...</p>
# </body></html>

# .name is the tag's name as a string.
print(soup.title.name)
# Output: title

print(soup.title.parent.name)
# Output: head

print(soup.title.parent.parent.name)
# Output: html

print(soup.p)
# Output: <p class="title"><b>The Dormouse's story</b></p>

# Attributes are read with dict-style indexing; "class" yields a list.
print(soup.p['class'])
# Output: ['title']

print(soup.a)
# Output: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# --- Searching the tree -----------------------------------------------------

# find_all() returns every matching tag.
print(soup.find_all('a'))
# Output:
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# find() returns only the first match; keyword arguments filter on attributes.
print(soup.find(id='link3'))
# Output: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

# Print the href of every link in the document.
for link in soup.find_all('a'):
    print(link.get('href'))
# Output:
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

# Print all of the document's text with the markup removed.
# get_text() is the documented name; getText() is only a legacy alias.
print(soup.get_text())
# Output:
# The Dormouse's story
#
# The Dormouse's story
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...

# find_all(True) matches every tag in the document.
print('all tags : <<<<<<')
for tag in soup.find_all(True):
    print(tag.name)
# Output:
# html
# head
# title
# body
# p
# b
# p
# a
# a
# a
# p

# Filtering by CSS class, shown for syntax only (the sample document has no
# <ul>). Use class_ with a trailing underscore because "class" is a Python
# keyword, or pass an attribute dict. findAll is the legacy alias of find_all.
# print(soup.find_all('ul', class_="sub-menu"))
# print(soup.find_all('ul', {"class": "sub-menu"}))
# print(soup.findAll('ul', class_="sub-menu"))
# print(soup.findAll('ul', {"class": "sub-menu"}))

示例二

# The Python 2 idiom reload(sys) / sys.setdefaultencoding('utf-8') has been
# removed: on Python 3, reload is not a builtin (NameError) and
# sys.setdefaultencoding does not exist. Python 3 str is Unicode already,
# so no default-encoding override is needed.
import sys

import requests
from bs4 import BeautifulSoup


# Sample markup for this example: a page titled "首页 - 简书" whose body holds
# one <ul class="article-list thumbnails"> with a single <li class=have-img>
# entry (thumbnail link, author line with timestamp, title, footer counters).
html_doc = """
<head>
      <meta charset="utf-8">
      <meta http-equiv="X-UA-Compatible" content="IE=Edge">
    <title>首页 - 简书</title>
</head>

<body class="output fluid zh cn win reader-day-mode reader-font2 " data-js-module="recommendation" data-locale="zh-CN">

<ul class="article-list thumbnails">

  <li class=have-img>
      <a class="wrap-img" href="/p/49c4728c3ab2"><img src="http://upload-images.jianshu.io/upload_images/2442470-745c6471c6f8258c.jpg?imageMogr2/auto-orient/strip%7CimageView2/1/w/300/h/300" alt="300" /></a>
    <div>
      <p class="list-top">
        <a class="author-name blue-link" target="_blank" href="/users/0af6b163b687">阿随向前冲</a>
        <em>·</em>
        <span class="time" data-shared-at="2016-07-27T07:03:54+08:00"></span>
      </p>
      <h4 class="title"><a target="_blank" href="/p/49c4728c3ab2"> 只装了这六款软件，工作就高效到有时间逛某宝刷某圈</a></h4>
      <div class="list-footer">
        <a target="_blank" href="/p/49c4728c3ab2">
          阅读 1830
</a>        <a target="_blank" href="/p/49c4728c3ab2#comments">
           · 评论 35
</a>        <span> · 喜欢 95</span>
          <span> · 打赏 1</span>
        
      </div>
    </div>
  </li>
</ul>

</body>
"""

# html_doc is already a str (Unicode), so from_encoding is not passed: bs4
# ignores that argument for str markup and only emits a warning about it.
# from_encoding is meant for bytes input whose encoding is guessed wrongly.
soup = BeautifulSoup(html_doc, 'html.parser')

# Collect every <li> whose class list contains "have-img".
tags = soup.find_all('li', class_="have-img")

for tag in tags:
    # tag.<name> returns the first descendant with that tag name.
    image = tag.img['src']                      # thumbnail URL
    article_user = tag.p.a.get_text()           # text of the first <a> in the first <p>
    article_user_url = tag.p.a['href']          # href of that same <a>
    created = tag.p.span['data-shared-at']      # timestamp attribute on the first <span>
    article_url = tag.h4.a['href']              # link inside the <h4> title

    # find_all() can also be called on a tag to search only beneath it.
    # tag.div is the first <div> under the <li>; tag.div.div is the first
    # <div> nested inside that one (the "list-footer" div in the sample).
    tag_span = tag.div.div.find_all('span')

    # strip=True trims surrounding whitespace from the extracted text.
    likes = tag_span[0].get_text(strip=True)

更多用法请参阅官方文档：https://www.crummy.com/software/BeautifulSoup/bs4/doc/
参考文章：
https://www.jianshu.com/p/2b783f7914c6
https://blog.csdn.net/bruce_6/article/details/80764000

python：BeautifulSoup 模块使用指南

文章目录

简介

安装

使用

猜你喜欢