import requests
import re
# Compiled once at import time instead of on every call. The pattern is written
# in re.VERBOSE form and captures four groups per listing, in order: title,
# layout (room type), area, and rental price. re.DOTALL lets "." match the
# newlines that separate the HTML tags.
_HOUSE_PATTERN = re.compile(r"""
<div.+?ershoufang-list"
.+? #匹配任意字符 .任意字符
<a.+?js-title.+?>
(.+?) #分组形式获取标题信息
</a> #结束标志
.+?<dd.+?dd-item.+?<span>(.+?)</span> #获取房型
.+?<span.+?<span>(.+?)</span> #获取面积
.+?<div.+?price.+?<span.+?>(.+?)</span> #租房价格
""", re.VERBOSE | re.DOTALL)


def parse_page(page_url, timeout=10):
    """Download one listing page and extract the listings found in its HTML.

    Args:
        page_url: URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of ``(title, layout, area, price)`` string tuples, one per
        listing matched by the pattern; an empty list when nothing matches.
        Each tuple is also printed to stdout.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            ``requests.exceptions.Timeout`` when ``timeout`` is exceeded.
    """
    # NOTE(review): the Cookie value embeds a captured browser session; it will
    # eventually expire and probably should not be kept in source control.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        'Cookie': 'ganji_uuid=3984569194922329389162; _gl_tracker=%7B%22ca_source%22%3A%22www.baidu.com%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A35065526370%7D; ganji_xuuid=a9e45a92-73d5-4e3f-d7bf-278ee97c1527.1600652665525; GANJISESSID=p0u4fb9s622s632ur98hrcaqfp; citydomain=tj; ganji_login_act=1600652969366',
    }
    resp = requests.get(page_url, headers=headers, timeout=timeout)

    # findall with four capture groups yields one 4-tuple per matched listing.
    houses = _HOUSE_PATTERN.findall(resp.text)
    for house in houses:
        print(house)
    return houses
def main(max_pages=1):
    """Scrape consecutive listing pages, starting from page 1.

    Args:
        max_pages: Number of pages to fetch, counted from page 1. Defaults to
            1, so only the first page is requested unless the caller asks for
            more.
    """
    base_url = 'http://tj.ganji.com/zufang/pn{}/'
    # The page number is substituted into the "pn{}" segment of the URL.
    for page_number in range(1, max_pages + 1):
        parse_page(base_url.format(page_number))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
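'''
1. For "." to match every character, pass re.DOTALL to the regex function; otherwise "." does not match \n.
2. Use "?" to make quantifiers non-greedy when capturing data.
3. If the regex is wrong, nothing is printed and the program appears to hang.
4. If the regex is wrong, do not get stuck on it; try a different approach.
'''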