python reptile learning (X) bs4 analytical data

lxml installation is a pit

# -*- coding: utf-8 -*-
import lxml
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # # UA伪装:将对应的User-Agent封装到字典中
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    # }
    # url = "https://www.2717.com/tag/434.html"
    # # 对指定url发起请求,对应的url是携带参数的,并且请求过程中处理了参数
    # response = requests.get(url=url, headers=headers)
    # # 接受返回数据
    # page_text = response.text
    # #从网页提取对象
    # #soup = BeautifulSoup(page_text, 'lxml')
    # # 存储
    # with open('./text.html', 'w', encoding='utf-8') as fp:
    #     fp.write(page_text)
    # print("html保存成功")

    #将本地html文档中的数据加载到该对象中
    fp = open('./text.html','r',encoding='utf-8')
    soup = BeautifulSoup(fp,'lxml')
    #print(soup)
    # soup.tagName返回的文档中第一次出现的tagName的标签
    #print(soup.a)
    #find('tagName')等同于soud.a
    #print(soup.find('li'))

    # 第二种用法,属性定位
    #属性名称加下划线 class_
    #print(soup.find('div',class_="warp"))
    #find_all 找到符合要求的所有标签 (列表) 也可以进行属性定位
    #print(soup.find_all('li'))
    #select('某种选择器 (id/class/标签/。。。)'),返回的是一个列表
    #print(soup.select('.warp'))

    #多层选择
    # >表示一个层级 空格表示多个层级
    #print(soup.select('.w1200 ul'))
    print(soup.select('.w1200 > ul img')[0]) #第【0】个标签

    #获取标签中的文本数据
    # soup.a.text/string/get_text()
    #text和get_text可以获取该标签下所有内容
    #string只能获取该标签下直系的文本内容
    #print(soup.select('.w1200 > ul a')[0].get_text())

    #获取标签中的属性值 soup.a['属性名称']
    print(soup.select('.w1200 > ul img')[0]['src'])
Published 23 original articles · won praise 0 · Views 671

Guess you like

Origin blog.csdn.net/haimian_baba/article/details/103754542