bs4库学习

 1 # -*- coding:utf-8 -*-
 2 import bs4
 3 import requests
 4 
 5 def tags_val(tag, key='', index=0):
 6     '''
 7     tag指HTML元素，如：<a href="http://meilizhichengwk027.fang.com/chengjiao/-p11-t12/" class="" id="rent">出租</a>，
 8     通过bs4的select获取元素，tag = bs4.BeautifulSoup(response.text,'html.parser')；
 9     key指元素的属性，如：key='id'；
10     index指元素序号，如果tag有多个相同的，通过序号获取精确的其中一个；
11     最终返回元素属性指，如id的值为rent。或key不传入参，返回tag文本，如'出租'
12     '''
13     if len(tag) == 0 or len(tag) <= index:
14         return ''
15     elif key:
16         txt = tag[index].get(key)
17         return txt.strip(' \t\r\n') if txt else ''
18     else:
19         txt = tag[index].text
20         return txt.strip(' \t\r\n') if txt else ''
21 
def tag_val(tag, key=''):
    '''
    Return an attribute value or the text of a single HTML element.

    Works like tags_val but takes one element rather than a list plus an
    index, so the selector used must identify a unique element.

    tag: the element, or None when nothing was matched.
    key: attribute name to read; when empty, the element's text is returned.

    Returns the value with surrounding spaces, tabs and newlines stripped,
    or '' when tag is None or the attribute/text is missing or empty.
    '''
    if tag is None:
        return ''
    value = tag.get(key) if key else tag.text
    # bs4 returns multi-valued attributes (e.g. class) as a list; join them
    # back into the space-separated form so strip() below does not fail.
    if isinstance(value, (list, tuple)):
        value = ' '.join(value)
    return value.strip(' \t\r\n') if value else ''
34 
# A requests.Session() keeps the cookies returned by each response and sends
# them along with the next request made through the same session.
session = requests.Session()
# Explicit timeout: without one, requests can wait indefinitely on a stalled
# server.
response = session.get('http://meilizhichengwk027.fang.com/chengjiao/', timeout=10)
html = bs4.BeautifulSoup(response.text, 'html.parser')
# Author's note: at the time of writing,
# html.select('.dealSent.sentwrap tr:nth-child(2) p:nth-child(1)') raised an
# error because that form was not supported, although the same selector did
# find the element in Chrome's console; nth-of-type is used instead.
tags = html.select('.dealSent.sentwrap tr:nth-of-type(2) p:nth-of-type(1)')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(tags)
print(tags_val(tags))
结果：
[<p><b><a href="http://esf.wuhan.fang.com/chengjiao/433513_1.htm" target="_blank">1\u5ba41\u5385</a></b></p>]
1室1厅
猜你喜欢