day065 BeautifulSoup

Today's topic is the use of another HTML/XML parser: BeautifulSoup.

  • So far we have seen three ways to parse web-page text: re (regular expressions), lxml, and today's BeautifulSoup. They compare as follows:

    • re: the fastest (written in C), and the hardest to use
    • lxml: fairly fast, moderately easy to use
    • BeautifulSoup: the slowest of the three (though not enough to matter in practice), and by far the easiest and most convenient. A short comparison sketch follows this list.
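
To make the comparison concrete, here is a minimal sketch extracting the same <title> text with each of the three tools (illustrative only; assumes lxml and bs4 are installed):

# -*- coding:utf-8 -*-
import re
from lxml import etree
from bs4 import BeautifulSoup

html = "<html><head><title>The Dormouse's story</title></head></html>"

# re: fastest, but a raw text pattern with no notion of the document tree
print re.search(r'<title>(.*?)</title>', html).group(1)

# lxml: fast, queried with XPath
print etree.HTML(html).xpath('//title/text()')[0]

# BeautifulSoup: slowest of the three, but the friendliest API
print BeautifulSoup(html, 'lxml').title.string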

Basic Usage

Steps:

  • 1. Import the module: from bs4 import BeautifulSoup
  • 2. Create a soup object from the page text (a string): soup = BeautifulSoup(<page data>, 'lxml')
  • 3. Match data with soup.find() / soup.find_all() / soup.select()
  • 4. Display or store the returned data (a minimal end-to-end sketch follows this list)
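
A minimal sketch of the four steps (the URL is a stand-in; any page works):

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# steps 1 and 2: fetch a page and build the soup object from its text
html = requests.get('http://example.com').content
soup = BeautifulSoup(html, 'lxml')

# step 3: match data -- here, every <a> tag
links = soup.find_all('a')

# step 4: display (or store) the results
for link in links:
    print link.get('href')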

Four Objects

  • Tag: a tag as a whole
  • NavigableString: the text content of a tag
  • Comment: the text of a comment with the comment markers stripped
  • BeautifulSoup: the document as a whole

    This feels a bit abstract: a tag is a tag and text is text, so why the hard-to-grasp names? The distinction does matter in practice: for example, .string comes back as a Comment rather than a NavigableString when a tag's content is a comment, so checking type() keeps you from treating comment text as real content.

# -*- coding:utf-8 -*-
import bs4


def tag():
    html = """
      <html><head><title>The Dormouse's story</title></head>
      <body>
      <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
      <p class="story">Once upon a time there were three little sisters; and their names were
      <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
      <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
      <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
      and they lived at the bottom of a well.</p>
      <p class="story">...</p>
      """

    # create a soup object
    soup = bs4.BeautifulSoup(html, 'lxml')

    # extract tag objects
    # <p>, <a>, <body>
    print soup.p  # returns the first p tag
    print soup.body  # returns the body tag
    print soup.a  # returns the first a tag

    print type(soup.p)  # <class 'bs4.element.Tag'>
    print type(soup.body)  # <class 'bs4.element.Tag'>
    print type(soup.a)  # <class 'bs4.element.Tag'>

    print soup.p.attrs
    # all attributes of the first p tag as a dict: attribute name -> value;
    # note the class value is a list, since class can hold more than one value

    print soup.a['href']  # get the 'href' attribute value of the first a tag
    print soup.a.get('href')  # same effect as above

    soup.a['href'] = 'http://www.baidu.com'
    print soup.a['href']  # a tag's attribute values can be modified

    del soup.a['href']  # an attribute can be deleted
    print soup.a
    # <a class="sister" id="link1"><!-- Elsie --></a>

    print soup.p.string
    # The Dormouse's story
    print type(soup.p.string)
    # <class 'bs4.element.NavigableString'>

    # BeautifulSoup object: represents the document as a whole
    print soup.name
    # [document]
    print type(soup.name)
    # <type 'unicode'>

    # Comment object: a special kind of NavigableString with the comment markers stripped
    print soup.a.string
    #  Elsie
    print type(soup.a.string)
    # <class 'bs4.element.Comment'>

Three Methods

find

  • Returns the first tag/text node that matches the given conditions.
    Usage:
from bs4 import BeautifulSoup
import re


# usage of the find() method
def find_method():
    html = """
              <html><head><title>The Dormouse's story</title></head>
              <body>
              <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
              <p class="story">Once upon a time there were three little sisters; and their names were
              <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
              <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
              <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
              and they lived at the bottom of a well.</p>
              <p class="story">...</p>
              """
    soup = BeautifulSoup(html, 'lxml')
    soup.prettify()  # note: prettify() only returns a formatted string; it does not modify soup

    # find() accepts: 1. a tag name, 2. attributes (attrs), 3. tag text (text)
    # note: find() returns only the first match

    # I. matching by tag: 1. a single tag name 2. a list of tag names 3. a compiled regex pattern
    # 1.1 a single tag name
    p = soup.find('p')
    print p
    # returns the first p tag


    print '-----' * 20
    # 1.3 a compiled regex pattern
    # match a tag whose name starts with 'h'
    startwith_h = soup.find(re.compile(r'^h'))
    print startwith_h
    # returns the html tag

    print '-----'*20
    # II. matching by attribute
    # 2.1 pass the attribute directly as a keyword argument; class gets a
    # trailing underscore (class_) because class is a reserved word in Python
    res = soup.find(class_="title")
    print res
    # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    print '-----'*20
    # 2.2 passing several attributes
    # intended to match tags with class="title" and class="story"; note that a
    # dict literal keeps only the last duplicate key, so this actually searches
    # class="story" alone -- to match either, pass a list: attrs={'class': ['title', 'story']}
    res = soup.find(attrs={'class': "title", 'class': "story"})
    print res
    # returns the first matched tag with class='story'

    print '-----' * 20
    # 2.3 regex on an attribute value
    # match a tag whose class value starts with 't'
    res = soup.find(class_=re.compile(r'^t'))
    print res
    # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    print '-----' * 20
    # III. matching by tag text
    # 3.1 match a single string
    # match the text "The Dormouse's story"
    res = soup.find(text="The Dormouse's story")
    print res
    # The Dormouse's story


    # 3.2 match any of several strings
    print '-----' * 20
    # match the names Elsie, Lacie, Tillie
    res = soup.find(text=['Elsie', 'Lacie', 'Tillie'])
    print res
    # Lacie -- 'Elsie' misses because that text node is the space-padded comment ' Elsie '


if __name__ == '__main__':
    find_method()

Output:

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
----------------------------------------------------------------------------------------------------
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
              <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
              <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
              <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
              and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
----------------------------------------------------------------------------------------------------
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
----------------------------------------------------------------------------------------------------
<p class="story">Once upon a time there were three little sisters; and their names were
              <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
              <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
              <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
              and they lived at the bottom of a well.</p>
----------------------------------------------------------------------------------------------------
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
----------------------------------------------------------------------------------------------------
The Dormouse's story
----------------------------------------------------------------------------------------------------
Lacie
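
One more behavior worth noting before moving on to find_all(): when nothing matches, find() returns None rather than an empty list, so guard before chaining attribute access. A small sketch reusing the soup object from the example above:

    # find() yields None when nothing matches; chaining on None raises AttributeError
    tag = soup.find('span')
    if tag is not None:
        print tag.get_text()
    else:
        print 'no match'  # the sample html has no span, so this branch runs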

find_all()

  • Returns every matching tag/text node, collected in a list.
    Usage (reusing the same soup object as above):

    # find_all() accepts the same arguments: 1. a tag name, 2. attributes (attrs), 3. tag text (text)
    # I. matching by tag: 1. a single tag name 2. a list of tag names 3. a compiled regex pattern
    # 1.1 a single tag name
    # match all p tags
    p = soup.find_all('p')
    print p
    # returns a list of all p tags

    print '-----'*20
    # 1.2 a list of tag names
    # match all p and a tags
    p_a = soup.find_all(['p', 'a'])
    print p_a
    # returns a list of all p and a tags

    print '-----' * 20
    # 1.3 a compiled regex pattern
    # match all tags whose name starts with 'h'
    startwith_h = soup.find_all(re.compile(r'^h'))
    print startwith_h
    # returns a list of the html tag (the whole document) and the head tag

    print '-----'*20
    # II. matching by attribute
    # 2.1 pass the attribute directly; the underscore in class_ is needed
    # because class is a reserved word in Python
    res = soup.find_all(class_="title")
    print res

    print '-----'*20
    # 2.2 passing several attributes
    # as with find(): duplicate dict keys collapse to the last one, so only
    # class="story" is matched; use attrs={'class': ['title', 'story']} for both
    res = soup.find_all(attrs={'class': "title", 'class': "story"})
    print res

    print '-----' * 20
    # 2.3 regex on an attribute value
    # match all tags whose class value starts with 't'
    res = soup.find_all(class_=re.compile(r'^t'))
    print res
    # [<p class="title" name="dromouse"><b>The Dormouse's story</b></p>]

    print '-----' * 20
    # III. matching by tag text
    # 3.1 match a single string
    # match the text "The Dormouse's story"
    res = soup.find_all(text="The Dormouse's story")
    print res
    # [u"The Dormouse's story", u"The Dormouse's story"]


    # 3.2 match any of several strings
    print '-----' * 20
    # match the names Elsie, Lacie, Tillie
    res = soup.find_all(text=['Elsie', 'Lacie', 'Tillie'])
    print res
    # [u'Lacie', u'Tillie'] -- the space-padded ' Elsie ' misses the exact match

    print '-----' * 20
    # 3.3 regex on text
    # intended: match all text containing 'e'; the anchors on a leading and
    # trailing space mean only the padded comment text ' Elsie ' can match
    res = soup.find_all(text=re.compile(r'^ .*?e.*? $'))
    print res
    # [u' Elsie '] -- see the corrected sketch after the output below

Output:

[<p class="title" name="dromouse"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were\n              <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,\n              <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n              <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\n              and they lived at the bottom of a well.</p>, <p class="story">...</p>]
----------------------------------------------------------------------------------------------------
[<p class="title" name="dromouse"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were\n              <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,\n              <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n              <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\n              and they lived at the bottom of a well.</p>, <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>]
----------------------------------------------------------------------------------------------------
[<html><head><title>The Dormouse's story</title></head>\n<body>\n<p class="title" name="dromouse"><b>The Dormouse's story</b></p>\n<p class="story">Once upon a time there were three little sisters; and their names were\n              <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,\n              <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n              <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\n              and they lived at the bottom of a well.</p>\n<p class="story">...</p>\n</body></html>, <head><title>The Dormouse's story</title></head>]
----------------------------------------------------------------------------------------------------
[<p class="title" name="dromouse"><b>The Dormouse's story</b></p>]
----------------------------------------------------------------------------------------------------
[<p class="story">Once upon a time there were three little sisters; and their names were\n              <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,\n              <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n              <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\n              and they lived at the bottom of a well.</p>, <p class="story">...</p>]
----------------------------------------------------------------------------------------------------
[<p class="title" name="dromouse"><b>The Dormouse's story</b></p>]
----------------------------------------------------------------------------------------------------
[u"The Dormouse's story", u"The Dormouse's story"]
----------------------------------------------------------------------------------------------------
[u'Lacie', u'Tillie']
----------------------------------------------------------------------------------------------------
[u' Elsie ']
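
On the open question at 3.3: the pattern r'^ .*?e.*? $' anchors on a leading and a trailing space, so the only text node it can match is the space-padded comment text ' Elsie '. To find every text node containing an 'e', an unanchored pattern is enough:

    # match every text node that contains the letter 'e'
    res = soup.find_all(text=re.compile(r'e'))
    print res
    # returns the title strings, the story paragraphs, u' Elsie ', u'Lacie' and u'Tillie'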

select()

  • The CSS selector method.
  • Returns all matching tags/text as a list. Compared with find_all(), the advantage is CSS selector syntax: you can target attributes directly instead of spelling out full attribute matches.
    Usage:
    # kinds of CSS selectors: class, id, hierarchy (descendant), attribute,
    # pseudo-class, tag, and group selectors

    # the CSS selector lookup method is .select(); like find_all() it returns a
    # list. The advantage is that selectors use the same syntax as front-end CSS

    # 1. class selector:
    # select tags whose class is 'title'
    res = soup.select('.title')
    print res
    # [<p class="title" name="dromouse"><b>The Dormouse's story</b></p>]

    # 2. id selector:
    # select the tag whose id is 'link1'
    res = soup.select('#link1')
    print res
    # [<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

    # 3. look up by tag name:
    # find all a tags
    res = soup.select('a')
    print res
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

    # 4. attribute selector
    # find the a tag with href="http://example.com/lacie"
    res = soup.select('a[href="http://example.com/lacie"]') # note: the tag name goes inside the selector string
    print res
    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

    # 5. group selector: a comma selects the union of the selectors
    # select tags with class="sister" OR href="http://example.com/lacie"
    res = soup.select('.sister, a[href="http://example.com/lacie"]')
    print res

    # 6. hierarchy (descendant) selector
    # select the title tag under head
    res = soup.select('head title')
    print res

    # extracting content: select() returns a list, so index into it for a tag
    # and call get_text() on it to read the tag's text value
    # get the text of the b tag under class='title'
    text = soup.select('.title b')[0].get_text()
    print text
    # The Dormouse's story

Output:

[<p class="title" name="dromouse"><b>The Dormouse's story</b></p>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<title>The Dormouse's story</title>]
The Dormouse's story
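
Since select() always returns a list, indexing with [0] is the usual pattern. bs4 also offers select_one() (available since version 4.4), which returns the first matching tag directly, or None when nothing matches:

    # select_one() spares the [0] indexing
    text = soup.select_one('.title b').get_text()
    print text
    # The Dormouse's story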

Examples

Example 1: scrape all job postings from the Tencent recruitment site, then process and store them

(screenshot: 腾讯社招网.png)

Code:

# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# scrape all job postings from https://hr.tencent.com/
import requests, re, json
from bs4 import BeautifulSoup
import logging


class SpiderTencentEmployPage(object):
    def __init__(self):
        self.base_url = 'https://hr.tencent.com/position.php?'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.list_data = []  # each row is one unit stored as a dict; all units go in this list so it can be converted to JSON later


    # request the data
    def send_request(self, params):
        try:
            response = requests.get(url=self.base_url, params=params, headers=self.headers)
            data = response.content
            return data
        except Exception as err:
            logging.error(err)
            print err

    # get the total number of pages
    def get_amount_pages(self):
        params = {
            'start': '0',
            'lid': '2175'
        }
        data = self.send_request(params)
        # get_amount_page_judge_num is passed on to filter_data and checked there:
        # 1 means we only want the maximum page count, 0 means normal data filtering
        get_amount_page_judge_num = 1
        amount_page_num = self.filter_data(data, get_amount_page_judge_num)
        print type(amount_page_num)  # <type 'unicode'>

        return int(amount_page_num)

    # parse and filter the data
    def filter_data(self, data, get_amount_page_judge_num):
        soup = BeautifulSoup(data, 'lxml')
        # note: prettify() only returns a formatted string; it does not modify soup
        soup.prettify()
        # get the maximum page count
        if get_amount_page_judge_num == 1:
            # read the page count out of the pagination links
            amount_page_num = soup.select('.pagenav a')[9].get_text()
            return amount_page_num  # a unicode string; int() it before use
        else:
            # 1. each row lives in one enclosing tag; match all of them, returning a list
            datas = soup.select('.even, .odd')
            # 2. each row is stored as a dict; iterate over the row tags
            for row_tag in datas:
                # 3. pull each cell's text out of the row tag into a dict
                row = {}
                row['name'] = row_tag.select('td a')[0].get_text()
                row['type'] = row_tag.select('td')[1].get_text()
                row['count'] = row_tag.select('td')[2].get_text()
                row['addr'] = row_tag.select('td')[3].get_text()
                row['date'] = row_tag.select('td')[4].get_text()
                # append the complete row (a dict) to the overall data list
                self.list_data.append(row)
                # no return needed; list_data is saved directly later

    # save the data locally
    def save_data(self):
        # convert the list to JSON, then store it
        json_data = json.dumps(self.list_data)  # dumps serializes to a JSON-formatted string

        # write to a local file
        with open('08.tencent_employ_data_02.json', 'w') as f:
            f.write(json_data)

    # main logic
    def main(self):
        # scrape the first page to learn the total page count
        amount_page_num = self.get_amount_pages()

        # loop over all the pages, scraping each one
        page_num = 1
        while page_num <= amount_page_num:
            params = {
                'start': (page_num - 1) * 10,  # note: pass the number itself, not a format placeholder!
                'lid': '2175'
            }
            # request the data
            data = self.send_request(params)

            # parse and process the data
            get_amount_page_judge_num = 0
            self.filter_data(data, get_amount_page_judge_num)

            print 'page %s scraped' % page_num

            page_num += 1

        # save the data locally
        self.save_data()

if __name__ == '__main__':
    spider_tencent = SpiderTencentEmployPage()

    spider_tencent.main()

Output

(screenshot: tecent 社招.png)

  • JSON data only reads well inside a viewer. At work, to present results to other people, it should be converted into something more direct; here we convert the JSON into a CSV file.

  • CSV is a file format that can represent a table structure in plain text, as roughly illustrated below.
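
    For this example the converted file takes roughly this shape: one header row built from the dict keys, then one data row per dictionary (the values below are placeholders, not real output):

    count,date,type,name,addr
    <count>,<date>,<type>,<position name>,<city>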

Usage steps and the conversion code for the previous example

# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import csv, json

# Steps:
# 1. import the csv and json modules: import csv, json
# 2. open the source JSON file and the csv file (created if it does not exist)
# 3. create a csv writer: csv.writer(<file object>)
# 4. convert the data to be written back into a list first: json.load(<source file object>)
#    (the opened JSON file is a file object; json.load() deserializes it back into Python lists/dicts)
# 5. write the converted list with the writer: .writerow(<flat list>) writes each value of a flat list into one row;
#    .writerows(<list of lists>) writes each inner list as its own row, one row per list, with automatic line breaks
# 6. close the files: <file object>.close()
def json_to_csv():
    # 1. open the JSON file for reading
    json_file = open('08.tencent_employ_data_02.json', 'r')
    print type(json_file) # <type 'file'>

    # 2. open the csv file for writing
    csv_file = open('08.tencent_employ_data_02.csv', 'w')
    print type(csv_file) # <type 'file'>

    # 3. deserialize the JSON in json_file back into the original list
    data_list = json.load(json_file)  # json.load() turns a file object's JSON back into the original data types
    print type(data_list) # <type 'list'>

    # 4. take the header row from the dict keys
    sheets = data_list[0].keys()
    print sheets #  [u'count', u'date', u'type', u'name', u'addr']

    # 5. take the content rows
    content_list = []
    for data_dict in data_list:
        content_list.append(data_dict.values())
    print content_list  # an outer list holding one inner list per data row

    # 6. create the csv writer
    csv_writer = csv.writer(csv_file)
    print csv_writer # <_csv.writer object at 0x10cf2f1b0>
    print type(csv_writer) # <type '_csv.writer'>

    # 7. write the header row
    csv_writer.writerow(sheets)

    # 8. write the content rows
    csv_writer.writerows(content_list)

    # one list per row, line breaks handled automatically -- the result is a table

    # 9. close the csv file
    csv_file.close()

    # 10. close the json file
    json_file.close()


if __name__ == '__main__':
    json_to_csv()
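
A note on the design above: building the header from keys() and the rows from values() relies on every dict yielding its fields in the same order. CPython 2 happens to do that for dicts with identical keys, but the language does not guarantee it. csv.DictWriter matches fields by name instead; a minimal sketch using the same file names (and assuming the same setdefaultencoding workaround at the top of the file for the non-ASCII values):

def json_to_csv_dictwriter():
    # read the scraped rows back as a list of dicts
    with open('08.tencent_employ_data_02.json', 'r') as json_file:
        data_list = json.load(json_file)

    with open('08.tencent_employ_data_02.csv', 'w') as csv_file:
        # fields are looked up by key name, so dict ordering never matters
        writer = csv.DictWriter(csv_file, fieldnames=['name', 'type', 'count', 'addr', 'date'])
        writer.writeheader()
        writer.writerows(data_list)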

Result after conversion:

(screenshot: json_to_csv.png)

Example 2: scrape selected fields from Qiushibaike (糗事百科)

(screenshot: 糗事百科.png)

Code:

# -*- coding:utf-8 -*-
# scrape each post's username, gender, age, body text and vote count from Qiushibaike, for the first 10 pages
import requests, bs4, json, logging
import sys

reload(sys)
sys.setdefaultencoding("utf-8")


class SpiderQiushibaike(object):
    def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/8hr/page/'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.data_list = []  # holds every scraped data unit

    # send the request
    def send_request(self, url):
        try:
            response = requests.get(url=url, headers=self.headers)
            # return inside the try: if the request raised, response would be unbound
            return response.content
        except Exception as err:
            logging.error(err)
            print err

    # filter the data
    def filter_data(self, data):
        # create the soup object
        soup = bs4.BeautifulSoup(data, 'lxml')
        # note: prettify() only returns a formatted string; it does not modify soup
        soup.prettify()
        # take every post block
        div_list = soup.select('#content-left .article')
        print 'number of post blocks: %d' % len(div_list)
        # pull the required fields out of each post
        for div in div_list:
            temp_dict = {}  # temporary store for one post's fields
            temp_dict['user_name'] = div.select('.author img')[0]['alt']
            author_div = div.select('.author div')
            if author_div:
                temp_dict['age'] = author_div[0].string
                temp_dict['gender'] = author_div[0]['class'][1].replace('Icon', '')
            else:
                temp_dict['age'] = 'n/a'
                temp_dict['gender'] = 'n/a'
            temp_dict['artical'] = div.select('.content span')[0].get_text().strip()
            temp_dict['likenum'] = div.select('.stats-vote i')[0].get_text().strip()
            self.data_list.append(temp_dict)

    # save the data locally
    def save_data(self):
        # convert the list into JSON data
        data_json = json.dumps(self.data_list)
        with open('10_qiushibaike_data.json', 'w') as f:
            f.write(data_json)

    # main logic
    def main(self):
        # build the URL for each of the first 10 pages
        for page in range(1, 11):
            url = self.base_url + '%d/' % page

            # send the request
            data = self.send_request(url)

            # filter the data
            self.filter_data(data)

        # save the data locally
        self.save_data()


if __name__ == '__main__':
    spider_qiushibaike = SpiderQiushibaike()
    spider_qiushibaike.main()

Output

(screenshot: 糗事百科结果.png)


Reposted from blog.csdn.net/michael_cool/article/details/79943961