python接口自动化--lxml解析

 1 from lxml import etree
 2 import urllib3
 3 import requests
 4 urllib3.disable_warnings()
 5 url="https://www.cnblogs.com/mvc/blog/news.aspx?blogApp=xiaoyujuan"
 6 
 7 r = requests.get(url,verify=False)
 8 # print(r.text)
 9 
10 dom = etree.HTML(r.content.decode("utf-8"))
11 block = dom.xpath("//*[@id='profile_block']")
12 t = etree.tostring(block[0],encoding='utf-8',pretty_print=True)
13 print(t.decode("utf-8"))
14 
15 t1 = block[0].xpath("text()")#获取当前节点文本元素
16 print(t1)
17 t2 = block[0].xpath('a')#定位a标签
18 for i,j in zip(t1,t2):
19     print("%s%s" %(i,j.text))
 1 from lxml import etree
 2 htmldemo = ''' 
 3 <meta charset="UTF-8"> <!-- for HTML5 -->
 4 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 5 <html><head><title>yoyo ketang</title></head><body><b><!--Hey, this in comment!--></b>
 6 <p class="title"><b>yoyoketang</b></p><p class="yoyo">这里是我的微信公众号:yoyoketang <br>
 7 <a href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" class="sister" id="link1">fiddler教程</a><br>
 8 <a href="http://www.cnblogs.com/yoyoketang/tag/python/" class="sister" id="link2">python笔记</a><br>
 9 <a href="http://www.cnblogs.com/yoyoketang/tag/selenium/" class="sister" id="link3">selenium文档</a><br>
10 快来关注吧!</p>
11 <p class="story">...</p>
12 '''
13 #etree.HTMLz解析html内容
14 demo = etree.HTML(htmldemo)
15 #打印解析之后的html内容,可用etree.tosting方法
16 #encoding="utf-8"参数可以正常输出html里面的中文内容
17 #pretty_print=True是以标准格式输出
18 t = etree.tostring(demo,encoding='utf-8',pretty_print=True)
19 print(t.decode('utf-8'))

猜你喜欢

转载自www.cnblogs.com/xiaoyujuan/p/11304355.html