import requests
import re
# Lianjia (Shanghai) second-hand housing scraper.
#
# Asks for a first and last results-page number, downloads each results page,
# follows every listing link found on it and prints the label/value pairs of
# the detail page, then prints the unit prices, community names and house
# descriptions found on the results page itself.

LIST_URL = 'https://sh.lianjia.com/ershoufang/pg'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10

# All patterns are compiled once here instead of on every loop iteration.
INFO_BLOCK_RE = re.compile(
    r'(?<=<div class="info clear">).*?(?=</div>)', re.S | re.M)
TITLE_RE = re.compile(r'(?<=data-sl=\"\">).*?(?=</a>)')
HREF_RE = re.compile(r'(?<=href=\").*?(?=\")')
DETAIL_FIELD_RE = re.compile(r'<span class="label">(.*?)</span>(.*?)</li>')
TOTAL_PRICE_RE = re.compile(
    r'<div class="totalPrice"><span>(.*?)</span>', re.S | re.M)
UNIT_PRICE_RE = re.compile(r'<div class="unitPrice" .*?<span>(.*?)</span>')
COMMUNITY_RE = re.compile(
    r'<div class="houseInfo">.*?region\">(.*?)</a>', re.S | re.M)
HOUSE_DETAILS_RE = re.compile(
    r'<div class=\"houseInfo\">.*?</a>(.*?)</div>', re.S | re.M)


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped ('ignore'). Network errors and timeouts
    raised by ``requests`` propagate to the caller.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf-8', 'ignore')


def parse_listings(html):
    """Return ``(title, href)`` pairs for the listing blocks in *html*.

    Blocks without both a title anchor and an href are skipped rather than
    raising IndexError.
    """
    blocks = INFO_BLOCK_RE.findall(html)
    # The original script discarded the last matched block with div.pop();
    # presumably it is not a real listing -- TODO confirm. Slicing keeps that
    # behaviour without failing when the page has no matches at all.
    listings = []
    for block in blocks[:-1]:
        title = TITLE_RE.search(block)
        href = HREF_RE.search(block)
        if title is None or href is None:
            continue
        listings.append((title.group(0), href.group(0)))
    return listings


def parse_detail_fields(detail_html):
    """Return the ``(label, value)`` pairs found on a listing's detail page."""
    return DETAIL_FIELD_RE.findall(detail_html)


def parse_total_prices(html):
    """Return each total price on a results page with the '万' unit appended."""
    return [price + '万' for price in TOTAL_PRICE_RE.findall(html)]


def parse_unit_prices(html):
    """Return the unit-price strings found on a results page."""
    return UNIT_PRICE_RE.findall(html)


def parse_community_names(html):
    """Return the community (region link) names found on a results page."""
    return COMMUNITY_RE.findall(html)


def parse_house_details(html):
    """Return the text following the community link in each houseInfo block.

    Per the original comment this text holds layout, area, orientation,
    decoration status and elevator availability.
    """
    return HOUSE_DETAILS_RE.findall(html)


def crawl_page(page):
    """Scrape results page number *page* and print what was found."""
    html = fetch_html(LIST_URL + str(page))

    # Data from the second-level (detail) page of every listing.
    for _title, href in parse_listings(html):
        print(parse_detail_fields(fetch_html(href)))

    # NOTE(review): the source's indentation was lost; the page-level
    # summaries below are assumed to run once per results page, after the
    # per-listing loop -- TODO confirm.
    # Total prices were computed but never printed by the original script, so
    # they are deliberately not emitted here; use parse_total_prices(html)
    # if they are needed.

    # Unit price
    for unit_price in parse_unit_prices(html):
        print(unit_price)

    # Community name
    for community in parse_community_names(html):
        print(community)

    # Layout, area, orientation, decoration status, elevator
    for details in parse_house_details(html):
        print(details)


def main():
    """Prompt for the page range (inclusive) and crawl every page in it."""
    start = int(input('起始页码:'))
    end = int(input('终止页码:'))
    for page in range(start, end + 1):
        crawl_page(page)


if __name__ == '__main__':
    main()
# 爬虫08-链家
# 猜你喜欢
# 转载自blog.csdn.net/qwerLoL123456/article/details/83515041
# 今日推荐
# 周排行