Web Scraping: Selectors

BeautifulSoup

  • Import the library
from bs4 import BeautifulSoup  # suited to simple pages
  • Create the soup object
soup = BeautifulSoup(test_data, 'lxml')
  • Tag attributes and text
# print(soup.a)                                        # get the first matching tag
# print(soup.a['href'])                                # get a specific attribute of that tag
# print(soup.a.contents, type(soup.a.contents))        # get the tag's child nodes; the result is a list
# print(soup.a.text, type(soup.a.text))                # get the tag's text; the result is a string
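
A minimal runnable sketch of these accessors; the HTML string below is invented for illustration, since the original test_data is not shown:
from bs4 import BeautifulSoup

# Hypothetical sample page standing in for test_data
test_data = '<div><a href="link1.html" id="places_neighbours__row">Link 1</a><a href="link2.html">Link 2</a></div>'
soup = BeautifulSoup(test_data, 'lxml')

print(soup.a)              # first <a> tag in the document
print(soup.a['href'])      # link1.html
print(soup.a.contents)     # ['Link 1'] -- a list of child nodes
print(soup.a.text)         # Link 1 -- a plain string
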
  • Finding tags
    • Find all a tags
    • Find a tags with a specific id and get their href
    • Find all tags with a specific id
# Find all a tags
# a_list = soup.find_all('a')
# for a in a_list:
#     print(a['href'])

# Find a tags whose id is places_neighbours__row and get their href
# a_place = soup.find_all('a', id='places_neighbours__row')
# for a in a_place:
#     print(a['href'])
# Broader than the above: no tag name is given, so every tag whose id is places_neighbours__row is found
# attrs_place = soup.find_all(attrs={'id': 'places_neighbours__row'})
# for attrs in attrs_place:
#     print(attrs.name)           # name is the tag's name
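
A self-contained sketch of the three find_all patterns above, again on an invented snippet (the id value is taken from the queries above):
from bs4 import BeautifulSoup

test_data = '''<div>
<a href="link1.html" id="places_neighbours__row">Link 1</a>
<a href="link2.html">Link 2</a>
<p id="places_neighbours__row">10</p>
</div>'''
soup = BeautifulSoup(test_data, 'lxml')

for a in soup.find_all('a'):                                        # every <a> tag
    print(a['href'])                                                # link1.html, link2.html

for a in soup.find_all('a', id='places_neighbours__row'):           # only <a> tags with that id
    print(a['href'])                                                # link1.html

for tag in soup.find_all(attrs={'id': 'places_neighbours__row'}):   # any tag with that id, regardless of name
    print(tag.name)                                                 # a, p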

lxml

/ selects from the root tag; // selects from anywhere below the current node (a relative path); * is a wildcard that matches everything. For example, //div/book[2]/title[@lang="zh"] selects, under a div, the second book element (indexing starts at 1) and then its title child whose lang attribute equals "zh".

  • Import the library
import lxml.html
  • Create the lxml object
html = lxml.html.fromstring(test_data)
  • Relative paths with //
# The following three queries return the same result; each relative path ultimately points at title. The less specific the path, the more of the document has to be searched.
# html_data = html.xpath('//div/book/title')
# html_data = html.xpath('//book/title')
# html_data = html.xpath('//title')
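
A sketch of this equivalence on an invented list-style fragment (the section's real test_data is not shown); the same idea applies to the //div/book/title example:
import lxml.html

test_data = '''<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
</ul>
</div>'''
html = lxml.html.fromstring(test_data)

# All three point at the same <a> elements; the shorter paths force the engine to scan more of the tree
print(html.xpath('//div/ul/li/a/text()'))   # ['first item', 'second item']
print(html.xpath('//ul/li/a/text()'))       # ['first item', 'second item']
print(html.xpath('//a/text()'))             # ['first item', 'second item']
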
  • The * wildcard
# Select every title that has at least one attribute
# html_data = html.xpath('//book/title[@*]')
# Select all of title's attribute values; the result is the attribute values themselves, not text
# html_data = html.xpath('//book/title/@*')
  • The built-in text() function

Extracts the text content of title as a list, without having to touch each element's .text attribute.

# html_data = html.xpath('//book/title/text()')
# html_data = html.xpath('//div/ul/li[1]/a/text()') 
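
A sketch of [@*], /@* and text() on one invented fragment; the id and href values are borrowed from later examples:
import lxml.html

test_data = '''<div>
<ul>
<li class="item-0"><a href="link1.html" id="places_neighbours__row">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
</ul>
</div>'''
html = lxml.html.fromstring(test_data)

print(html.xpath('//li/a[@*]'))                 # elements: every <a> that carries at least one attribute
print(html.xpath('//li/a/@*'))                  # only the attribute values, no element text
print(html.xpath('//li/a/text()'))              # ['first item', 'second item']
print(html.xpath('//div/ul/li[1]/a/text()'))    # ['first item'] -- indexing starts at 1
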
  • Logical operator and
# html_data = html.xpath('//a[@href="link1.html" and @id="places_neighbours__row"]/text()') 
  • Logical operator or
# html_data = html.xpath('//li[@class="item-1" or @class="item-0"]/a/text()') 
  • Not equal (!=)
# html_data = html.xpath('//li[@class!="item-1" and @class!="item-0"]/a/text()') 
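
The three predicates sketched together on an invented fragment; the attribute values mirror the queries above:
import lxml.html

test_data = '''<ul>
<li class="item-0"><a href="link1.html" id="places_neighbours__row">first</a></li>
<li class="item-1"><a href="link2.html">second</a></li>
<li class="item-2"><a href="link3.html">third</a></li>
</ul>'''
html = lxml.html.fromstring(test_data)

# and: both conditions must hold on the same element
print(html.xpath('//a[@href="link1.html" and @id="places_neighbours__row"]/text()'))  # ['first']
# or: either condition is enough
print(html.xpath('//li[@class="item-1" or @class="item-0"]/a/text()'))                # ['first', 'second']
# !=: note that elements with no class attribute at all would not match either
print(html.xpath('//li[@class!="item-1" and @class!="item-0"]/a/text()'))             # ['third']
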
  • last()

Selects the last of the matched tags; offsets such as last()-1 count back from the end.

# html_data = html.xpath('//div/book[last()-1]/title/text()')
  • Comparison operators
# html_data = html.xpath('//div/book[price > 39]/title/text()')
# html_data = html.xpath('//div/book[price >= 39.95]/title/text()') 
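
Since the section's bookstore-style sample data is not shown, here is a sketch of last() and the numeric comparison on an invented XML snippet via lxml.etree; the XPath behaves the same on lxml.html trees:
from lxml import etree

test_data = '''<div>
<book><title lang="en">Book One</title><price>29.99</price></book>
<book><title lang="zh">Book Two</title><price>39.95</price></book>
<book><title lang="en">Book Three</title><price>49.99</price></book>
</div>'''
html = etree.fromstring(test_data)

# last() counts from the end, so last()-1 is the second-to-last book
print(html.xpath('//div/book[last()-1]/title/text()'))        # ['Book Two']
# price > 39 converts the text of each <price> child to a number before comparing
print(html.xpath('//div/book[price > 39]/title/text()'))      # ['Book Two', 'Book Three']
print(html.xpath('//div/book[price >= 39.95]/title/text()'))  # ['Book Two', 'Book Three']
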
  • starts-with
# html_data = html.xpath('//li[starts-with(@class,"item")]/a/text()')
# html_data = html.xpath('//li[starts-with(@class,"g")]/a/text()')
  • contains
# html_data = html.xpath('//li[contains(@class,"te")]/a/text()')
# html_data = html.xpath('//title[contains(@lang,"n")]/text()') 
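
Both functions sketched on an invented fragment; note how the substrings are matched:
import lxml.html

test_data = '''<ul>
<li class="item-0"><a href="link1.html">first</a></li>
<li class="item-1"><a href="link2.html">second</a></li>
<li class="good"><a href="link3.html">third</a></li>
</ul>'''
html = lxml.html.fromstring(test_data)

print(html.xpath('//li[starts-with(@class,"item")]/a/text()'))  # ['first', 'second'] -- class begins with "item"
print(html.xpath('//li[contains(@class,"te")]/a/text()'))       # ['first', 'second'] -- "item-..." contains "te"
print(html.xpath('//li[contains(@class,"oo")]/a/text()'))       # ['third']           -- "good" contains "oo"
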
  • Ancestor and descendant nodes
# html_data = html.xpath('//book/descendant::*/text()')   # text of every descendant element of book
# html_data = html.xpath('//book/ancestor::*')             # select book's ancestor nodes
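
A sketch of the two axes on a small invented fragment; the exact ancestor list depends on any wrapper elements the parser adds, so the output shown is only an example:
import lxml.html

test_data = '<div><ul><li class="item-0"><a href="link1.html"><span>first</span></a></li></ul></div>'
html = lxml.html.fromstring(test_data)

# descendant::* walks every element below <li>, at any depth
print(html.xpath('//li/descendant::*/text()'))          # ['first'] -- text carried by the <span> inside <a>
# ancestor::* walks upward from <a>; print the tag names of those ancestors
print([e.tag for e in html.xpath('//a/ancestor::*')])   # e.g. ['div', 'ul', 'li'] (plus any html/body wrappers)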


Scraping Qiushibaike articles

Before scraping, inspect the page source to work out where the target articles live.

# url_base = 'https://www.qiushibaike.com/8hr/page/2/'
# result = requests.get(url_base,headers=headers)
# html = lxml.html.fromstring(result.text)
# html_data = html.xpath('//div[@class="content"]/span[1]/text()')
# # print(html_data)
# for i in html_data:
#     with open('./qiushi.txt','ab') as f:
#         f.write(i.encode('utf-8'))
# print(result.text)
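
A self-contained version of the snippet above, adding the imports and a hypothetical headers dict (the User-Agent value is an assumption, and the site's markup may have changed since the original post):
import requests
import lxml.html

headers = {'User-Agent': 'Mozilla/5.0'}    # assumed; the original headers dict is not shown

url_base = 'https://www.qiushibaike.com/8hr/page/2/'
result = requests.get(url_base, headers=headers)
html = lxml.html.fromstring(result.text)

# Each post's text sits in <div class="content"><span>...</span> per the analysis above
html_data = html.xpath('//div[@class="content"]/span[1]/text()')
for i in html_data:
    with open('./qiushi.txt', 'ab') as f:   # append bytes so repeated runs accumulate
        f.write(i.encode('utf-8'))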

Scraping Qiushibaike images

Before scraping, inspect the page source to work out where the target images live.

 
# url_base = 'https://www.qiushibaike.com/imgrank/page/2/'
# result = requests.get(url_base,headers=headers)
# html = lxml.html.fromstring(result.text)
# html_data = html.xpath('//div[@class="thumb"]/a/img/@src')     # extract the image URLs
# # print(html_data[0][2:],type(str(html_data[0][2:])))
# for i in html_data:
#     photo_url = 'https:'+str(i)
#     photo = requests.get(photo_url)
#     p_num = html_data.index(i) + 1
#     with open('./picture/'+str(p_num)+'.jpg','wb') as f:
#         f.write(photo.content)
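
Likewise, a self-contained version of the image snippet; the headers dict is assumed, and the picture directory is created up front instead of being required to exist:
import os
import requests
import lxml.html

headers = {'User-Agent': 'Mozilla/5.0'}    # assumed; the original headers dict is not shown
os.makedirs('./picture', exist_ok=True)

url_base = 'https://www.qiushibaike.com/imgrank/page/2/'
result = requests.get(url_base, headers=headers)
html = lxml.html.fromstring(result.text)

# The img src values are protocol-relative (start with //), hence the "https:" prefix
html_data = html.xpath('//div[@class="thumb"]/a/img/@src')
for p_num, src in enumerate(html_data, start=1):   # enumerate replaces the index() lookup in the original
    photo = requests.get('https:' + str(src), headers=headers)
    with open('./picture/' + str(p_num) + '.jpg', 'wb') as f:
        f.write(photo.content)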






Reposted from www.cnblogs.com/siplips/p/9689473.html