Basic usage (基本使用)
# Sample HTML document used as input for the find()/find_all() examples.
html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="sister"><b>$37</b></p> <p class="story" id="p">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" >Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """
find: returns the first match
find_all: returns all matches
Finding tags and tag attributes with find and find_all:
Matching with the name and attrs arguments
name: the tag name to match
attrs: the attributes to match
name: the tag name to match
attrs: the attributes to match
text: the text content to match
Tags:
- String filter: matches the whole string exactly
Tags:
- String filter: matches the whole string exactly
- Regular-expression filter
Matches using the re module
Matches using the re module
- List filter
Matches any of the items in the list
- Bool filter
True matches anything that is present
True matches anything that is present
- Method filter
Used to find tags that have some attributes you want and lack others you do not want.
Used to find tags that have some attributes you want and lack others you do not want.
Properties:
- class_
- class_
- id
# Worked examples of the filter kinds accepted by BeautifulSoup's find()
# (first match) and find_all() (every match).
import re

from bs4 import BeautifulSoup

html_doc = """ <html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p> """

soup = BeautifulSoup(html_doc, 'lxml')

# --- String filter: the tag name must equal the string exactly. ---
p = soup.find(name='p')
p_s = soup.find_all(name='p')
print(p)
print(p_s)

p = soup.find(name='p', attrs={"id": "p"})
print(p)

# NOTE: `text` is the older spelling of the `string` argument (see the
# Beautiful Soup documentation); it is kept here as in the original example.
tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
print(tag)

# --- Regular-expression filter (re module). ---
# The pattern is applied with search(), so 'a' matches every tag whose name
# contains an "a" (html, head, a, ...), not only <a>.
a = soup.find(name=re.compile('a'))
print(a)
a_s = soup.find_all(name=re.compile('a'))
print(a_s)
a = soup.find(attrs={"id": re.compile('link')})
print(a)

# --- List filter: a tag matches if it matches any item of the list. ---
# The comma between 'p' and 'html' was missing, which silently merged the two
# literals into the single string 'phtml'.
print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# --- Bool filter: True matches any tag name / any value of the attribute. ---
print(soup.find(name=True, attrs={"id": True}))


# --- Method filter: a function that receives each tag and returns a bool. ---
def have_id_not_class(tag):
    """Return True for a <p> tag that has an ``id`` but no ``class`` attribute."""
    return tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class")


print(soup.find_all(name=have_id_not_class))

# --- Keyword-argument filters on attributes: id and class_. ---
a = soup.find(id='link2')
print(a)
p = soup.find(class_='sister')
print(p)
# Worked examples of the string, regular-expression, list and bool filters
# accepted by BeautifulSoup's find() (first match) and find_all() (every match).
import re

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# --- String filter: the tag name must equal the string exactly. ---
# Tag names, attribute names and attribute values are lowercase to match the
# markup in html_doc.
p = soup.find(name='p')
p_s = soup.find_all(name='p')
print(p)
print(p_s)

p = soup.find(name='p', attrs={"id": "p"})
print(p)

# NOTE: `text` is the older spelling of the `string` argument (see the
# Beautiful Soup documentation); it is kept here as in the original example.
tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
print(tag)

# --- Regular-expression filter (re module). ---
# The pattern is applied with search(), so 'a' matches every tag whose name
# contains an "a" (html, head, a, ...), not only <a>.
a = soup.find(name=re.compile('a'))
print(a)
a_s = soup.find_all(name=re.compile('a'))
print(a_s)
a = soup.find(attrs={"id": re.compile('link')})
print(a)

# --- List filter: a tag matches if it matches any item of the list. ---
# The comma between 'p' and 'html' was missing, which silently merged the two
# literals into the single string 'phtml'.
print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# --- Bool filter: True matches any tag name / any value of the attribute. ---
print(soup.find(name=True, attrs={"id": True}))
def have_id_not_class(tag):
    """Filter function: accept <p> tags that carry an ``id`` but no ``class``.

    Intended to be passed as the ``name`` argument of ``find``/``find_all``.

    Args:
        tag: An object exposing a ``name`` attribute and a ``has_attr(key)``
            method (a BeautifulSoup tag in the examples).

    Returns:
        True if the tag matches, False otherwise.
    """
    return tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class")
# Method filter: the function is passed as the tag-name filter and is called
# for each tag.
print(soup.find_all(have_id_not_class))

# Keyword-argument filters on attributes: look up by id, then by CSS class
# (spelled class_ because `class` is a reserved word in Python).
a = soup.find(id='link2')
print(a)

p = soup.find(class_='sister')
print(p)