xpath和css及BeautifulSoup选择器的使用

'''
/从根标签开始
//不限层级,从文档任意位置选取匹配的节点(descendant-or-self),无须从根逐级写起
*通配符,选择所有
//div/book[1]/title 选择div下第一个book标签的title元素
//div/book/title[@lang='zh'] 选择title属性含有lang且内容是zh的title元素
//div/book/title //book/title //title 在本例文档中结果相同,因为相对路径最终都定位到title(文档结构不同时结果可能不同)
//book/title/@* 将title所有属性值选择出来
//book/title/text() 将title的内容选择出来,使用内置text()函数
//a[@href="link1.html" and @id="places_neighbours_row"]
//a[@href="link1.html" or @id="places_neighbours_row"]
//div/book[last()]/title/text() 将最后一个book元素选出
//div/book[price > 39]/title 将book子标签price数值大于39的选择出来
//li[starts-with(@class,'item')] 将class属性前缀是item的li选择出来
//title[contains(@lang,'eng')] 将title属性lang含有eng关键字的标签选出来
'''
import lxml.html
import requests

网页内容:
'<tr id="places_population__row"><td class="w2p_fl"><label class="readonly" for="places_population" id="places_population__label">Population: </label></td><td class="w2p_fw">84,000</td><td class="w2p_fc"></td></tr>'

# Demo: download the Andorra country page and pull out the population cell
# with an XPath expression (row id + cell class narrow it to one <td>).
response = requests.get('http://example.webscraping.com/places/default/view/Andorra-6')
tree = lxml.html.fromstring(response.content.decode('utf-8'))
population_cells = tree.xpath('//tr[@id="places_population__row"]/td[@class="w2p_fw"]')
print(population_cells)
for cell in population_cells:
    print(cell.text)

css选择器(可以在浏览器检查中直接复制css选择器)

def parse_lxml(html_str):
    '''
    Parse a country page with lxml and a CSS selector.

    CSS selector cheat sheet (usable with cssselect, or copied straight
    from the browser's inspector):
        *             -> every tag
        a.home        -> <a> tags with class "home"
        a#game        -> the <a> tag with id "game"
        a > span      -> <span> tags that are direct children of an <a>
        a span        -> <span> tags anywhere under an <a>
        a[title=home] -> <a> tags whose title attribute equals "home"

    :param html_str: raw HTML text of the page
    :return: text content of the second cell in the population row
    '''
    document = lxml.html.fromstring(html_str)
    # nth-child(2) picks the value cell next to the "Population:" label.
    cell = document.cssselect('#places_population__row > td:nth-child(2)')[0]
    return cell.text_content()

# Demo: fetch China's page and print the population parsed via the CSS selector.
china_page = requests.get('http://example.webscraping.com/places/default/view/China-47')
print(parse_lxml(china_page.text))

BeautifulSoup

# Fix: the import was commented out, so calling parse_bs4 raised NameError.
from bs4 import BeautifulSoup

def parse_bs4(html_str):
    '''
    Parse a country page with BeautifulSoup (lxml backend).

    Commonly used bs4 tools: contents, find, find_all, text.

    :param html_str: raw HTML text of the page
    :return: text of the area value cell
    :raises AttributeError: if the page has no 'places_area__row' row
                            (find returns None in that case)
    '''
    soup = BeautifulSoup(html_str, 'lxml')
    # Locate the "Area" table row by its id, then its value cell by class.
    tr = soup.find(attrs={'id': 'places_area__row'})
    area = tr.find(attrs={'class': 'w2p_fw'})
    return area.text

# Demo: fetch China's page and print the area parsed via BeautifulSoup.
page = requests.get('http://example.webscraping.com/places/default/view/China-47')
print(parse_bs4(page.text))

猜你喜欢

转载自blog.csdn.net/qq_33722246/article/details/85134146