爬虫08-链家

import requests
import re

# Scrape second-hand housing listings from sh.lianjia.com for a user-supplied
# page range. For every listing on a result page the detail page is fetched
# and its "label / value" pairs are printed; afterwards the page-level total
# prices, unit prices, community names and house-info strings are printed.

# Seconds to wait on each HTTP request; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# All patterns are loop-invariant, so they are compiled once up front.
listing_block_re = re.compile('(?<=<div class="info clear">).*?(?=</div>)', re.S | re.M)
title_re = re.compile(r'(?<=data-sl=\"\">).*?(?=</a>)')
href_re = re.compile(r'(?<=href=\").*?(?=\")')
detail_item_re = re.compile('<span class="label">(.*?)</span>(.*?)</li>')
total_price_re = re.compile(r'<div class="totalPrice"><span>(.*?)</span>', re.S | re.M)
unit_price_re = re.compile(r'<div class="unitPrice" .*?<span>(.*?)</span>')
community_re = re.compile(r'<div class="houseInfo">.*?region\">(.*?)</a>', re.S | re.M)
house_info_re = re.compile(r'<div class=\"houseInfo\">.*?</a>(.*?)</div>', re.S | re.M)

start = int(input('起始页码:'))
end = int(input('终止页码:'))
for page in range(start, end + 1):
    url = 'https://sh.lianjia.com/ershoufang/pg' + str(page)
    html = requests.get(url, timeout=REQUEST_TIMEOUT).content.decode('utf-8', 'ignore')

    # The original discarded the last matched block with pop(); slicing drops
    # the same element but does not raise when the page yields no matches.
    for d in listing_block_re.findall(html)[:-1]:
        # Listing title (extracted as in the original; not printed).
        titles = title_re.findall(d)
        title = titles[0] if titles else None
        # Detail-page URL: the first href inside the block.
        hrefs = href_re.findall(d)
        if not hrefs:
            # Without a link there is no detail page to request.
            continue
        href = hrefs[0]
        # Data from the second (detail) page.
        second = requests.get(href, timeout=REQUEST_TIMEOUT).content.decode('utf-8', 'ignore')
        second_con = detail_item_re.findall(second)
        print(second_con)

    # Total price: the captured figure with the unit suffix appended. The
    # original built this string but never emitted it; print it like the
    # other page-level fields.
    for t in total_price_re.findall(html):
        total_price = t + '万'
        print(total_price)

    # Unit price.
    for unit in unit_price_re.findall(html):
        print(unit)

    # Community name.
    for house in community_re.findall(html):
        print(house)

    # Layout, area, orientation, decoration, elevator (the text following the
    # community link inside the houseInfo div).
    for so in house_info_re.findall(html):
        print(so)

猜你喜欢

转载自blog.csdn.net/qwerLoL123456/article/details/83515041