[Python web crawler] Notes from the paid course "Master Python Web Crawlers in 150 Lectures", chapter 12 - regular expressions in practice: scraping Ganji.com rental listings

import requests
import re

def _extract_houses(text):
    """Return the rental listings found in the HTML of a listing page.

    Args:
        text: HTML source of the page to scan.

    Returns:
        A list of ``(title, layout, area, price)`` string tuples, one per
        regex match, in the order they occur in ``text``.
    """
    # The pattern is kept verbatim. Its four capture groups are, in order:
    # listing title, house layout, floor area and rental price.
    # re.VERBOSE allows the multi-line layout and the inline "#" comments;
    # re.DOTALL makes "." match newlines as well, so one match can span
    # several lines of HTML. All quantifiers are non-greedy ("+?").
    return re.findall(r"""
        <div.+?ershoufang-list"
        .+? #匹配任意字符 .任意字符
        <a.+?js-title.+?>
        (.+?) #分组形式获取标题信息
        </a>   #结束标志
        .+?<dd.+?dd-item.+?<span>(.+?)</span>   #获取房型
        .+?<span.+?<span>(.+?)</span>       #获取面积
        .+?<div.+?price.+?<span.+?>(.+?)</span> #租房价格
    """, text, re.VERBOSE|re.DOTALL)  # "|" combines the two flags


def parse_page(page_url, timeout=10):
    """Download one listing page and print every rental entry found in it.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The list of ``(title, layout, area, price)`` tuples that were
        printed (empty when nothing matched).

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        # NOTE(review): hard-coded session cookie, presumably captured from a
        # browser session; it may have expired - confirm before relying on it.
        'Cookie': 'ganji_uuid=3984569194922329389162; _gl_tracker=%7B%22ca_source%22%3A%22www.baidu.com%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A35065526370%7D; ganji_xuuid=a9e45a92-73d5-4e3f-d7bf-278ee97c1527.1600652665525; GANJISESSID=p0u4fb9s622s632ur98hrcaqfp; citydomain=tj; ganji_login_act=1600652969366'
    }

    resp = requests.get(page_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently scanning an error page.
    resp.raise_for_status()

    houses = _extract_houses(resp.text)
    for house in houses:
        print(house)
    return houses

def main(page_count=1):
    """Crawl the first ``page_count`` rental listing pages of tj.ganji.com.

    Args:
        page_count: Number of consecutive pages to fetch, starting at page 1.
            Defaults to 1, which matches the previous behaviour: the old
            loop over ``range(1, 10)`` ended with an unconditional ``break``
            and therefore only ever fetched the first page.
    """
    base_url = 'http://tj.ganji.com/zufang/pn{}/'
    for page_number in range(1, page_count + 1):
        parse_page(base_url.format(page_number))


if __name__ == '__main__':
    main()


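'''
1. To make "." match every character, pass the re.DOTALL flag to the regex function; otherwise "." does not match a newline.
2. Use non-greedy matching when capturing data: append "?" to the quantifier.
3. If the regex is wrong, nothing is printed and the program may appear to hang.
4. If the regex is wrong, do not get stuck on it; try a different approach.

'''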

 

Guess you like

Origin blog.csdn.net/weixin_44566432/article/details/108707568