python爬虫三：bs4库lxml

转：https://zhuanlan.zhihu.com/p/26701898

# -*- coding: utf-8 -*-
import bs4
# Parse demo.html with the lxml parser.
# The file is opened in a `with` block so the handle is closed as soon as
# BeautifulSoup has read it, instead of staying open for the rest of the
# script.
with open("demo.html") as html_file:
    soup = bs4.BeautifulSoup(html_file, "lxml")

# Pretty-print the parsed tree, one tag or string per line.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(soup.prettify())
'''
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>

'''


#设置编码格式
'''
如何具体的使用？

bs4 库首先将传入的字符串或文件句柄转换为 Unicode的类型，这样，我们在抓取中文信息的时候，就不会有很麻烦的编码问题了。
当然，有一些生僻的编码 如：‘big5’，就需要我们手动设置编码：
soup = BeautifulSoup(markup, from_encoding="编码方式")
'''


#内容解析
'''
对象的种类：

bs4 库将复杂的html文档转化为一个复杂的树形结构，每个节点都是Python对象 ，所有对象可以分为以下四个类型：
Tag , NavigableString , BeautifulSoup , Comment
我们来逐一解释：

Tag： 和html中的Tag基本没有区别，可以简单上手使用
NavigableString： 被包裹在tag内的字符串
BeautifulSoup： 表示一个文档的全部内容，大部分时候可以把它看作一个tag对象，支持遍历文档树和搜索文档树的方法。
Comment：这是一个特殊的NavigableString对象，出现在html文档中时，会以特殊的格式输出，比如注释类型。
'''


# Accessing tags by name
# Print the <head> element
print(soup.head)
'''
<head>
<meta charset="utf-8"/>
</head>
'''

# Print the <title> element (prints None when the document has no <title>)
print(soup.title)

# Print the first <b> tag found inside <body>
print(soup.body.b)  # <b>The Dormouse's story</b>

# Collect every <a> tag once and reuse the result for both prints.
# find_all is the current bs4 method name; findAll is only a legacy alias.
tag = soup.find_all('a')
print(tag)

# Index into the result list to get a single link (the second one here)
print(tag[1])  # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


# Navigating the children of a tag
head_tag = soup.head
print(head_tag)

# .contents lists the direct children, whitespace text nodes included,
# e.g. [u'\n', <meta charset="utf-8"/>, u'\n'] for the sample document.
print(head_tag.contents)

# contents[0] is therefore the u'\n' text node, and text nodes do not have
# .contents / .children / .descendants (accessing them raises
# AttributeError). Pick the first direct child that is an element instead.
content = head_tag.find(True, recursive=False)
print(content)  # <meta charset="utf-8"/> for the sample document

if content is not None:
    # Direct children as a list ([] for an empty tag such as <meta>)
    print(content.contents)

    # The .children generator iterates over the direct children only
    for child in content.children:
        print(child)

    # The .descendants generator also walks grandchildren and deeper nodes
    for child in content.descendants:
        print(child)



#获取页面的所有的文本内容
'''
如果该tag只有一个子节点（NavigableString类型）：直接使用tag.string就能找到。
如果tag有很多个子、孙节点，并且每个节点里都有string：可以用 .strings 生成器循环取出全部文本：
'''

# Walk every text node in the document and print its repr, so that
# whitespace-only strings and the unicode prefix stay visible.
for s in soup.strings:
    print("%r" % (s,))


'''
u"The Dormouse's story"
u"The Dormouse's story"
u' Lacie '
u' Lacie '
u' Lacie '
'''

#官方文档https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id12

python爬虫三：bs4库lxml

猜你喜欢