# A simple script that scrapes housing listings and writes them to JSON files.
# (简单做了个脚本,将房源信息写入 JSON 文件。)
import json
from urllib import request,parse
import re
from lxml import etree
from bs4 import BeautifulSoup,element
def get_url():
    """
    Build the Anjuke listing URL for each requested page and crawl it.

    Reads the number of pages from stdin, then fetches pages 1..page.
    Relies on the module-level ``headers`` dict defined under the
    ``__main__`` guard.

    :return: None
    """
    page = int(input('请输入页数:'))
    for i in range(1, page + 1):
        url = 'https://bj.fang.anjuke.com/loupan/all/p{}/'.format(i)
        # Fetch and extract this page. This calls the local ``parse``
        # function below, which shadows ``urllib.parse`` imported at the
        # top of the file.
        parse(url, headers, i)
def parse(url, headers, i):
    """
    Fetch one listing page, extract each listing, and dump the records
    to ``anjuke<i>.json``.

    NOTE: this function shadows ``urllib.parse`` imported at the top of
    the file; within this module the name resolves to this function.

    :param url: page URL to request
    :param headers: request headers (browser User-Agent spoofing)
    :param i: page number, used in the output file name
    :return: None
    """
    # Build and send the request, then decode the response body.
    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req)
    res = response.read().decode("utf-8")
    # Parse the HTML with BeautifulSoup backed by lxml.
    soup = BeautifulSoup(res, 'lxml')
    content = soup.select('.item-mod')
    lis = []
    for c in content:
        # Some ``.item-mod`` nodes are filter widgets rather than real
        # listings, so any field's select() may come back empty — fall
        # back to a default value instead of storing a raw empty list.
        dic = {}
        # 名字 (listing name)
        title = c.select('.infos a h3 .items-name')
        title = title[0].get_text() if title else ''
        # 地址 (address) — check address itself, not title
        address = c.select('.infos .address span')
        address = address[0].get_text() if address else ''
        # 户型 (floor plan)
        huxing = c.select('.infos .huxing span')
        huxing = huxing[0].get_text() if huxing else ''
        # 均价 (average price); missing means price not yet announced
        average_price = c.select('.favor-pos span')
        average_price = average_price[0].get_text() if average_price else '售价待定'
        dic['title'] = title
        dic['address'] = address
        dic['huxing'] = huxing
        dic['average_price'] = average_price
        lis.append(dic)
    # Write one JSON file per page. Open in 'w' (not 'a') so re-running
    # the script does not append a second JSON document to the same file,
    # and use ``with`` so the handle is always closed.
    with open('anjuke' + str(i) + '.json', 'w', encoding="utf-8") as f:
        json.dump(lis, f, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Browser-like User-Agent so the site does not reject the requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.16 Safari/537.36',
    }
    get_url()