Web Scraping: the lxml Library -- Parsing and Extracting HTML/XML Data

1. Using the lxml Library

We can use lxml's etree module to parse HTML code:

from lxml import etree

text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
     </ul>
 </div>
'''

html = etree.HTML(text)  # parse the string into an HTML document; etree.HTML() auto-completes the missing </li> on the fifth item and adds the <html> and <body> wrapper tags
print(html)        # <Element html at 0x25be0d7adc8>
print(type(html))  # <class 'lxml.etree._Element'>

result = etree.tostring(html)  # serialize the HTML document into a bytes string
print(result.decode())         # decode the bytes string into a str and print it
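
To see the auto-completion on a smaller input, parse a deliberately broken fragment; a minimal sketch (the exact serialized output can vary with the lxml/libxml2 version):

from lxml import etree

broken = '<div><b>bold text</div>'  # the <b> tag is never closed
print(etree.tostring(etree.HTML(broken)).decode())
# prints something like: <html><body><div><b>bold text</b></div></body></html>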
 
 

Reading from a file

Besides parsing a string directly, lxml can also read content from a file:

from lxml import etree

# read the external file hello.html (etree.parse returns an ElementTree)
html = etree.parse('./hello.html')
# serialize the tree into a bytes string
result = etree.tostring(html)
# re-parse the serialized bytes into an HTML document (an Element)
html = etree.HTML(result)
# serialize the HTML document back into a bytes string
result = etree.tostring(html)
# decode it as UTF-8 and print
print(result.decode("utf-8"))
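
One caveat: etree.parse() uses a strict XML parser by default, so it will raise an error on HTML that is not well-formed. For real-world HTML files you can pass an explicit HTML parser instead; a minimal sketch, reusing the hello.html file from above:

from lxml import etree

# use the lenient HTML parser instead of the strict default XML parser
parser = etree.HTMLParser()
html = etree.parse('./hello.html', parser)
print(etree.tostring(html).decode())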

2. Filtering Data with XPath and lxml

The file hello.html contains:
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">haha<span class="bold">third item</span></a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
Note that xpath() returns its results as a list, so you generally iterate over it.

from lxml import etree

html_obj = etree.parse('hello.html')  # xpath() can be called directly on the parsed tree

result_list = html_obj.xpath('//li')  # get all <li> elements in the document
for result in result_list:
    print(etree.tostring(result).decode())  # serialize each <li> element and decode it into an HTML string

result_list = html_obj.xpath('//li/@class')  # get the class attribute values of all <li> tags
print(result_list)
for result in result_list:
    print(result)

result_list = html_obj.xpath('//li/a[@href="link1.html"]/text()')  # get the text of the <a> under <li> whose href is "link1.html"
print(result_list)
for result in result_list:
    print(result)

# result_list = html_obj.xpath('//li/a[@href="link1.html"]')  # get the <a> element itself instead of its text
# for result in result_list:
#     print(etree.tostring(result).decode())  # tostring() works on elements, not on the strings that text() returns

# result_list = html_obj.xpath('//li//span/text()')  # get the text of <span> descendants anywhere under <li>
result_list = html_obj.xpath('//li/a/span/text()')  # get the text of <span> tags under <li>/<a>
# result_list = html_obj.xpath('//li/a/text()')  # get the direct text of <a> tags, excluding text inside their descendant tags
for result in result_list:
    print(result)

result_list = html_obj.xpath('//li[last()]/a/@href')  # get the href attribute of the <a> in the last <li>
print(result_list)

result_list = html_obj.xpath('//li[last()-1]/a')  # get the <a> in the second-to-last <li>
print(result_list[0].text)

result_list = html_obj.xpath('//li[last()-2]//text()')  # get all text inside the third-from-last <li>, including the text of its descendant tags
print(result_list)


result_list = html_obj.xpath('//*/@class')  # get the class attribute values of all elements
for result in result_list:
    print(result)

result_list = html_obj.xpath('//*[@class="bold"]/text()')  # get the text of elements whose class attribute equals "bold"
print(result_list)

result_list = html_obj.xpath('//*[contains(@href,"html")]')  # get all elements whose href attribute contains "html"
for result in result_list:
    print(etree.tostring(result).decode())
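
XPath queries can also be run relative to an element you have already selected by starting the expression with "./"; a minimal sketch, reusing the html_obj above:

# run a relative XPath from each <li> element instead of from the document root
for li in html_obj.xpath('//li'):
    hrefs = li.xpath('./a/@href')  # "./" makes the query relative to this <li>
    texts = li.xpath('.//text()')  # all text inside this <li>
    print(hrefs, texts)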

3. Scraping Web Page Data

Scraping images from a Baidu Tieba forum:

import requests
from lxml import etree
import os

def save_image(result):
    # download one image URL and write it into the images/ directory
    response = requests.get(result)
    if not os.path.exists('images'):
        os.makedirs('images')
    image_file_name = result[-10:]  # use the last 10 characters of the URL as the file name
    with open('images/' + image_file_name, 'wb') as f:
        f.write(response.content)
    print('saved', image_file_name)

def get_image_url(tieba_detail_url):
    response = requests.get(tieba_detail_url)
    html_obj = etree.HTML(response.content.decode())  # response.text would also work here
    result_list = html_obj.xpath('//div[@class="d_post_content j_d_post_content "]/img/@src')
    for result in result_list:
        save_image(result)


def tieba_spider(kw, url, start_page, end_page):
    for page in range(start_page, end_page + 1):
        pn = (page - 1) * 50  # tieba paginates in steps of 50 posts; pn is the offset
        params = {
            'pn': str(pn),
            'kw': kw
        }
        # headers = {
        #     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
        # }
        response = requests.get(url, params=params)
        html = response.content.decode()
        print(response.url)
        html_obj = etree.HTML(html)
        result_list = html_obj.xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
        for result in result_list:
            tieba_detail_url = "https://tieba.baidu.com" + result
            get_image_url(tieba_detail_url)

def main():
    kw = input("Enter the name of the tieba (forum) to scrape: ")
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    url = "https://tieba.baidu.com/f?" + "&ie=utf-8"  # requests appends the params after the existing query string
    tieba_spider(kw, url, start_page, end_page)


if __name__ == '__main__':
    main()
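
In practice, Tieba may refuse requests that lack a browser User-Agent (which is why the commented-out headers appear above), and individual downloads can fail. A minimal hardening sketch, with an example User-Agent string (any realistic value should work):

import requests

HEADERS = {
    # example desktop-browser User-Agent; substitute any realistic value
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36"
}

def fetch(url, **kwargs):
    # wrap requests.get() with headers, a timeout, and basic error handling
    try:
        response = requests.get(url, headers=HEADERS, timeout=10, **kwargs)
        response.raise_for_status()  # raise for 4xx/5xx responses
        return response
    except requests.RequestException as e:
        print('request failed:', url, e)
        return None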

Reposted from blog.csdn.net/qq_41654985/article/details/81015520