使用 Python 爬取中介房屋信息和图片,并插入到 MongoDB 中

爬取 https://bj.lianjia.com/ershoufang/ 这一页的房屋信息和图片数据,并插入到 MongoDB。

简单示例脚本:

from pymongo import MongoClient
from gridfs import *
import requests
from lxml import etree
import lxml,time

# Browser-like request headers used when fetching the listing page on
# bj.lianjia.com.
head = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
    ),
    'Host': 'bj.lianjia.com',
    'Referer': 'https://bj.lianjia.com/ershoufang/',
}

# Request headers used when downloading listing images.
#
# Deliberately omitted compared with a raw browser capture:
#   * 'If-Modified-Since' / 'If-None-Match' -- these turn the download into a
#     conditional request, so the server may reply "304 Not Modified" with an
#     empty body, which would then be stored as the image.
#   * 'Host' -- requests derives it from the URL; a hard-coded value is wrong
#     for any image (or redirect target) served from a different host.
#   * 'br' in 'Accept-Encoding' -- requests can only decode brotli when an
#     optional brotli package is installed; gzip/deflate are always supported.
head_img = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
    ),
}

# Module-level placeholders for the scraped values; they are reassigned later
# in the script by the page fetch and the per-listing loop.
fangyuanxinxi_list = []
title = address = flood = followInfo = img_url = ''
image = None
mongo_dict = {}

# Download the second-hand housing listing page and parse it into an lxml tree.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops early on an HTTP error instead of parsing an error
# page that contains no listings.
res = requests.get('https://bj.lianjia.com/ershoufang', headers=head, timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
root = etree.HTML(res.content)
# Every <li class="clear"> element is treated as one listing entry.
fangyuanxinxi_list = root.xpath('//li[@class="clear"]')

# Open the MongoDB connection on localhost and select the targets:
# database "test7", collection "test" for listing documents, and a GridFS
# bucket named "coll_image" for the image binaries.
mongo_client = MongoClient(host='127.0.0.1', port=27017)
mongo_db = mongo_client['test7']
mongo_col = mongo_db['test']
mongo_fs = GridFS(mongo_db, collection='coll_image')

# For each listing element: extract its text fields, download its thumbnail
# into GridFS, and insert one document describing the listing.
for fangyuanxinxi in fangyuanxinxi_list:
    # Pause between listings so requests are not sent in rapid succession.
    time.sleep(3)

    # Build a fresh document per listing so no field leaks between iterations.
    mongo_dict = {}

    # Title text lives in the first child of the "title" div.
    title = fangyuanxinxi.xpath('div/div[@class="title"]/.')[0][0].text
    mongo_dict['title'] = title

    # Address text lives in the first child of the div inside "address".
    address = fangyuanxinxi.xpath('div/div[@class="address"]/div/.')[0][0].text
    mongo_dict['address'] = address

    flood = fangyuanxinxi.xpath('div/div[@class="flood"]/div/.')[0].text
    mongo_dict['flood'] = flood

    # followInfo is the div's leading text plus the text that trails its
    # first child; evaluate the XPath once and reuse the node.
    follow_node = fangyuanxinxi.xpath('div/div[@class="followInfo"]/.')[0]
    followInfo = str(follow_node.text) + ',' + str(follow_node[0].tail)
    mongo_dict['followInfo'] = followInfo

    # A listing without a lazy-load image attribute yields an empty result;
    # fall back to '' instead of raising IndexError on [0].
    img_urls = fangyuanxinxi.xpath('a/img/@data-original')
    img_url = img_urls[0] if img_urls else ''
    mongo_dict['img_url'] = img_url

    if img_url:
        response = requests.get(
            img_url,
            headers=head_img,
            allow_redirects=True,
            timeout=10,
        )
        # Only store a real payload: a 304/404/empty response is not an image.
        if response.status_code == 200 and response.content:
            image = response.content
            # File name: title without spaces + integer Unix time + the text
            # after the last '.' of the image URL.
            filename = (
                (title or '').replace(' ', '')
                + str(int(time.time()))
                + '.'
                + img_url.split('.')[-1]
            )
            img_id = mongo_fs.put(image, filename=filename)
            mongo_dict['img_id'] = img_id

    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported single-document API.
    mongo_col.insert_one(mongo_dict)

房屋信息:


图片:



猜你喜欢

转载自blog.csdn.net/qq284489030/article/details/80840994
今日推荐