python爬取安居客储存到csv或者mongo数据库
目的
爬取安居客房产进行信息储存
准备
安居客地址:https://shuyang.anjuke.com/map/sale/
pycharm
寻找安居客的返回数据的url
1.经过分析了原网页和json和js发现,真正返回数据的url在json中:https://shuyang.anjuke.com/v3/ajax/map/sale/1158/prop_list/?room_num=-1&price_id=-1&area_id=-1&floor=-1&orientation=-1&is_two_years=0&is_school=0&is_metro=0&order_id=0&p=1&zoom=12&lat=34.052874_34.181985&lng=118.368882_119.251952&kw=&maxp=99&et=f7908e&ib=1&bst=pem632
对url进行浏览器访问,查看数据,发现中文全部以 Unicode 转义(十六进制 \uXXXX)形式输出,是明显的反爬手段
由于要获取房产信息,必须获取里面的数据信息,然后进行分析筛选信息,得到自己想要的数据
获取请求头
获取请求返回对象 r 的 json 数据
import requests

# Map-search AJAX endpoint that returns the listing data as JSON.
url = "https://shuyang.anjuke.com/v3/ajax/map/sale/1158/prop_list/?room_num=-1&price_id=-1&area_id=-1&floor=-1&orientation=-1&is_two_years=0&is_school=0&is_metro=0&order_id=0&p=1&zoom=12&lat=34.052874_34.181985&lng=118.368882_119.251952&kw=&maxp=99&et=f7908e&ib=1&bst=pem632"
headers = {
    # Desktop-browser User-Agent so the server does not reject the request.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
}
# Fetch the URL; a timeout prevents hanging forever, and raise_for_status()
# fails loudly on an HTTP error instead of trying to parse an error page.
r = requests.get(url=url, headers=headers, timeout=10)
r.raise_for_status()
jsn = r.json()
# Print the decoded JSON payload.
print(jsn)
获得了json数据,我们就可以对数据进行分析,查看里面的数据,发现props下面的每个字典就对应着每个房产的信息
要想取到里面的数据,就必须用jsonpath获取,获取以下九个数据,以列表格式输出 list=[ ]
所以要定义一个列表来储存数据,用jsonpath来获取数据
import jsonpath
import requests

# Map-search AJAX endpoint returning the listings of page 1 as JSON.
url = "https://shuyang.anjuke.com/v3/ajax/map/sale/1158/prop_list/?room_num=-1&price_id=-1&area_id=-1&floor=-1&orientation=-1&is_two_years=0&is_school=0&is_metro=0&order_id=0&p=1&zoom=12&lat=34.052874_34.181985&lng=118.368882_119.251952&kw=&maxp=99&et=f7908e&ib=1&bst=pem632"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
}
# Fetch the URL and decode the JSON response.
r = requests.get(url=url, headers=headers, timeout=10)
r.raise_for_status()
jsn = r.json()
# print(jsn)

# The nine per-listing fields we extract, in output order.
FIELDS = ["region_name", "block_name", "rhval", "area", "price",
          "long_title", "comm_name", "house_orient_name", "floor_tag"]
# jsonpath.jsonpath() returns False (not []) when nothing matches, so
# substitute an empty list to keep the transposition below well-behaved.
columns = [jsonpath.jsonpath(jsn, "$..%s" % field) or [] for field in FIELDS]
# Transpose the per-field columns into one row per listing.
# (Avoid naming the result "list" — that shadows the builtin.)
rows = [list(row) for row in zip(*columns)]
print(rows)
打印了当前页面的房产数据(第一页)
好不容易爬一次,不能只爬一页啊!所以我们分析每一页的url的变化
经分析发现每一页只有查询参数 p 不同,所以定义一个页码参数 p,从 1 开始循环遍历每一页
import requests
import jsonpath
from pymongo import MongoClient

# MongoDB connection: database "stu", collection "real estate".
client = MongoClient('127.0.0.1', 27017)
collection = client['stu']['real estate']

# Base endpoint WITHOUT the page number.  The original URL already carried
# "p=1", so also passing p via params= appended a duplicate "p" query key
# and the server could keep serving page 1; the page is now supplied only
# through params.
url = "https://shuyang.anjuke.com/v3/ajax/map/sale/1075/prop_list/?room_num=-1&price_id=-1&area_id=-1&floor=-1&orientation=-1&is_two_years=0&is_school=0&is_metro=0&order_id=0&zoom=12&lat=34.060052_34.174817&lng=118.368882_119.251952&kw=&maxp=99&et=b4e7df&ib=1&bst=pem576"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
}

# JSONPath field names extracted from every listing, and the document keys
# they are stored under in MongoDB.
FIELDS = ["region_name", "block_name", "rhval", "area", "price",
          "long_title", "comm_name", "house_orient_name", "floor_tag"]
KEYS = ['region', 'pblock', 'rhval1', 'area1', 'price1',
        'title', 'comm', 'house', 'floor']

# Crawl pages 1..5.
for page in range(1, 6):
    params = {"p": "%s" % page}
    print(params)
    r = requests.get(url=url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    jsn = r.json()
    # jsonpath returns False when nothing matches; fall back to [].
    columns = [jsonpath.jsonpath(jsn, "$..%s" % field) or [] for field in FIELDS]
    rows = [list(row) for row in zip(*columns)]
    # One document per listing; insert_many batches the whole page in a
    # single round-trip instead of one insert_one per row.
    docs = [dict(zip(KEYS, row)) for row in rows]
    if docs:
        collection.insert_many(docs)
    print(rows)
第一页的房产信息
第二页的房产信息
接下来还有第三页第四页第五页…(循环五次)
数据出来了,接下来该储存数据了,方法有两种
1.csv储存
2.mongo数据库储存
csv储存,完整代码
先定义一个表头列表,作为 csv 文件第一行的列名
import requests
import jsonpath
import csv

# Base endpoint WITHOUT the page number.  The original URL already carried
# "p=1", so also passing p via params= appended a duplicate "p" query key
# and the server could keep serving page 1; the page is now supplied only
# through params.
url = "https://shuyang.anjuke.com/v3/ajax/map/sale/1075/prop_list/?room_num=-1&price_id=-1&area_id=-1&floor=-1&orientation=-1&is_two_years=0&is_school=0&is_metro=0&order_id=0&zoom=12&lat=34.060052_34.174817&lng=118.368882_119.251952&kw=&maxp=99&et=b4e7df&ib=1&bst=pem576"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
}

# CSV header row (column names).
HEADER = ['region', 'pblock', 'rhval1', 'area1', 'price1',
          'title', 'comm', 'house', 'floor']
# JSONPath field names extracted from every listing, in HEADER order.
FIELDS = ["region_name", "block_name", "rhval", "area", "price",
          "long_title", "comm_name", "house_orient_name", "floor_tag"]

# Crawl pages 1..5, writing one CSV file per page.
for page in range(1, 6):
    params = {"p": "%s" % page}
    print(params)
    r = requests.get(url=url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    jsn = r.json()
    # jsonpath returns False when nothing matches; fall back to [].
    columns = [jsonpath.jsonpath(jsn, "$..%s" % field) or [] for field in FIELDS]
    rows = [list(row) for row in zip(*columns)]
    print(rows)
    # newline="" is required by the csv module (otherwise blank rows appear
    # on Windows); "utf-8-sig" adds a BOM so Excel/WPS detect UTF-8 and the
    # Chinese text is not garbled.
    with open("anjuke%s.csv" % page, 'w', encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(HEADER)
        writer.writerows(rows)
mongo数据库储存,完整代码
import requests
import jsonpath
from pymongo import MongoClient

# MongoDB connection: database "stu", collection "real estate".
client = MongoClient('127.0.0.1', 27017)
collection = client['stu']['real estate']

# Base endpoint WITHOUT the page number.  The original URL already carried
# "p=1", so also passing p via params= appended a duplicate "p" query key
# and the server could keep serving page 1; the page is now supplied only
# through params.
url = "https://shuyang.anjuke.com/v3/ajax/map/sale/1075/prop_list/?room_num=-1&price_id=-1&area_id=-1&floor=-1&orientation=-1&is_two_years=0&is_school=0&is_metro=0&order_id=0&zoom=12&lat=34.060052_34.174817&lng=118.368882_119.251952&kw=&maxp=99&et=b4e7df&ib=1&bst=pem576"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
}

# JSONPath field names extracted from every listing, and the document keys
# they are stored under in MongoDB.
FIELDS = ["region_name", "block_name", "rhval", "area", "price",
          "long_title", "comm_name", "house_orient_name", "floor_tag"]
KEYS = ['region', 'pblock', 'rhval1', 'area1', 'price1',
        'title', 'comm', 'house', 'floor']

# Crawl pages 1..5 and store every listing in MongoDB.
for page in range(1, 6):
    params = {"p": "%s" % page}
    print(params)
    r = requests.get(url=url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    jsn = r.json()
    # jsonpath returns False when nothing matches; fall back to [].
    columns = [jsonpath.jsonpath(jsn, "$..%s" % field) or [] for field in FIELDS]
    rows = [list(row) for row in zip(*columns)]
    # One document per listing; insert_many batches the whole page in a
    # single round-trip instead of one insert_one per row.
    docs = [dict(zip(KEYS, row)) for row in rows]
    if docs:
        collection.insert_many(docs)
    print(rows)