爬取 https://bj.lianjia.com/ershoufang/ 这一页的房屋信息和图片数据,并插入到 MongoDB。
简单示例脚本:
from pymongo import MongoClient
from gridfs import *
import requests
from lxml import etree
import lxml,time
# Request headers sent when fetching the listing page on bj.lianjia.com.
head = {
    # Content types the client claims to accept.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # Desktop Chrome user-agent string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36",
    "Host": "bj.lianjia.com",
    "Referer": "https://bj.lianjia.com/ershoufang/",
}
# Request headers sent when downloading a listing's thumbnail image.
#
# Deliberately NOT included:
#   * 'If-Modified-Since' / 'If-None-Match' -- these are cache validators
#     captured from one browser session; replaying them can make the server
#     answer "304 Not Modified" with an empty body, which would then be stored
#     as the image.
#   * 'Host' -- the image URL is read from the page, so the host must be
#     derived from that URL (requests does this automatically) rather than
#     pinned to a single CDN node.
#   * 'Accept-Encoding' -- requests advertises only the encodings it can
#     actually decode; hard-coding 'br' risks receiving Brotli data that is
#     left undecoded when no Brotli package is installed.
head_img = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
}
# Scratch document for the listing currently being processed.
mongo_dict = {}

# Download the listing page. The timeout keeps the script from hanging forever
# on a stalled connection, and raise_for_status() stops early on an HTTP error
# instead of parsing an error page and silently finding no listings.
res = requests.request(
    method='get',
    url='https://bj.lianjia.com/ershoufang',
    headers=head,
    timeout=10,
)
res.raise_for_status()

# res.content is the raw byte payload, so no res.encoding override is needed
# (that attribute only influences res.text).
root = etree.HTML(res.content)

# One <li class="clear"> element per listing on the page.
fangyuanxinxi_list = root.xpath('//li[@class="clear"]')

# Local MongoDB: listing documents go to test7.test, image bytes go to a
# GridFS bucket named 'coll_image' in the same database.
mongo_client = MongoClient('127.0.0.1', 27017)
mongo_db = mongo_client.test7
mongo_col = mongo_db.test
mongo_fs = GridFS(mongo_db, collection='coll_image')
# Store one MongoDB document per listing, plus its thumbnail in GridFS when
# one can be downloaded.
for fangyuanxinxi in fangyuanxinxi_list:
    # Pause between listings to throttle the image requests.
    time.sleep(3)

    title = fangyuanxinxi.xpath('div/div[@class="title"]/.')[0][0].text
    address = fangyuanxinxi.xpath('div/div[@class="address"]/div/.')[0][0].text
    flood = fangyuanxinxi.xpath('div/div[@class="flood"]/div/.')[0].text

    # followInfo is the node's leading text joined with the tail text that
    # follows its first child element; evaluate the XPath only once.
    follow_node = fangyuanxinxi.xpath('div/div[@class="followInfo"]/.')[0]
    followInfo = str(follow_node.text) + ',' + str(follow_node[0].tail)

    # A listing may have no lazy-loaded image; fall back to '' instead of
    # raising IndexError so the text fields are still saved.
    img_urls = fangyuanxinxi.xpath('a/img/@data-original')
    img_url = img_urls[0] if img_urls else ''

    # Build a fresh document each iteration: insert_one() adds an '_id' key to
    # the dict it is given, so a dict must never be reused across listings.
    mongo_dict = {
        'title': title,
        'address': address,
        'flood': flood,
        'followInfo': followInfo,
        'img_url': img_url,
    }

    if img_url:
        img_res = requests.request(
            method='get',
            url=img_url,
            headers=head_img,
            allow_redirects=True,
            timeout=10,
        )
        image = img_res.content
        # Skip error responses and empty bodies (e.g. "304 Not Modified") so
        # that no bogus or zero-byte file is written to GridFS.
        if img_res.ok and image:
            # File name: title without spaces + integer Unix timestamp + the
            # extension taken from the image URL.
            filename = (
                (title or '').replace(' ', '')
                + str(int(time.time()))
                + '.'
                + img_url.split('.')[-1]
            )
            img_id = mongo_fs.put(image, filename=filename)
            mongo_dict['img_id'] = img_id

    # Collection.insert() was deprecated in PyMongo 3 and removed in PyMongo 4;
    # insert_one() is the supported single-document API.
    mongo_col.insert_one(mongo_dict)
房屋信息(保存在 test7 库的 test 集合中):
图片(通过 GridFS 保存在 test7 库的 coll_image 集合中):