爬取安居客新房(urllib+bs4)

简单写了个脚本，将房源信息写入 JSON 文件。
代码注释很全，这里就不再另作解释了。
import json
from urllib import request,parse
import re
from lxml import etree
from bs4 import BeautifulSoup,element

def get_url():
    """
    Prompt for a page count and scrape every listing page up to it.

    Reads the number of pages from standard input, builds the URL of each
    page from 1 to that number (inclusive) and passes it to ``parse``.

    The request headers are taken from a module-level ``headers`` name,
    which must be assigned before this function is called.

    :return: None
    :raises ValueError: if the entered page count is not an integer
    """
    page = int(input('请输入页数:'))
    for i in range(1, page + 1):
        url = 'https://bj.fang.anjuke.com/loupan/all/p{}/'.format(i)
        # Fetch and process this page; the page number is passed along too.
        parse(url, headers, i)

def _first_text(node, selector, default):
    """Return the text of the first element under *node* matching *selector*, or *default*."""
    matches = node.select(selector)
    return matches[0].get_text() if matches else default


def parse(url, headers, i):
    """
    Download one listing page and save its entries to ``anjuke<i>.json``.

    Each element matching ``.item-mod`` becomes one dict with the keys
    ``title``, ``address``, ``huxing`` and ``average_price``; the list of
    dicts is written as a JSON array.

    NOTE: this definition rebinds the name ``parse`` that the file also
    imports from ``urllib``; the urllib module is not used in this function.

    :param url: address of the page to request
    :param headers: HTTP request headers
    :param i: page number, used to build the output file name
    :return: None
    """
    # Build and send the request; the context manager closes the connection.
    req = request.Request(url=url, headers=headers)
    with request.urlopen(req) as response:
        res = response.read().decode("utf-8")

    # Parse the HTML with BeautifulSoup using the lxml parser.
    soup = BeautifulSoup(res, 'lxml')

    lis = []
    for c in soup.select('.item-mod'):
        # The '.item-mod' selector also picks up the page's filter widgets,
        # which have none of the listing fields. For those, title, address
        # and huxing fall back to an empty list (the placeholder the script
        # produced before), so every entry stays JSON-serialisable.
        # Each field is now checked on its own: previously the address
        # branch tested the title by mistake.
        lis.append({
            'title': _first_text(c, '.infos a h3 .items-name', []),
            'address': _first_text(c, '.infos .address span', []),
            'huxing': _first_text(c, '.infos .huxing span', []),
            'average_price': _first_text(c, '.favor-pos span', '售价待定'),
        })

    # Write the JSON file. Mode 'w' replaces the page's file on each run:
    # appending would add a second top-level array and make it invalid JSON.
    with open('anjuke' + str(i) + '.json', 'w', encoding="utf-8") as fp:
        json.dump(lis, fp, ensure_ascii=False, indent=4)

# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # Request headers sent with every page request. Assigned at module level
    # because get_url() reads the global name ``headers``.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.16 Safari/537.36',
    }
    get_url()

猜你喜欢

转载自blog.csdn.net/weixin_44220464/article/details/94470215