mongodb操作

有需要Python学习资料的小伙伴吗?小编整理【一套Python资料、源码和PDF】，感兴趣者可以加学习群：548377875，反正闲着也是闲着呢，不如学点东西啦~~

import pymongo

#连接数据库实例(连接数据库)---》获取相应数据库---》获取相应collection集合(表)
client = pymongo.MongoClient(host='localhost', port=27017)

# Pick the database with subscript access; attribute access is equivalent.
db = client["test"]
# db = client.test

# Pick the collection (table) the same way; attribute access is equivalent.
collection = db["students"]
# collection = db.students

# Two sample documents used by the insert/query examples further down.
student1 = dict(id='001', name='haha', age=20, gender='male')
student2 = dict(id='002', name='Mike', age=41, gender='male')
#--------------------------------------------------------------------------
        #插入 insert into students(...) values('002',...)
        #若不指定 _id 字段，系统默认会生成一个ObjectId
        #可插入一条或多条数据(列表形式)，pymongo 3.x起不推荐使用insert(已弃用)
# collection.insert([student1,student2])
# collection.insert(student1)

        #官方推荐，分开使用，返回值不是ObjectId，而是InsertOneResult对象，我们可以调用其inserted_id属性获取_id。
# result = collection.insert_one(student2)
# print(result)
# print(result.inserted_id)

# result = collection.insert_many([student1,student2])
# print(result)
# print(result.inserted_ids)

#------------------------------------------------------------------
        #查询 select * from students where id=002
        #查询条件使用字典，可使用多字段,find是多条查询
# result_find = collection.find({"name":"lijingbo","age":20})
# print(result_find.next())   #返回一个游标，游标相当于迭代器，可使用next()获取一条结果，或者使用循环遍历等，遍历结果是字典
        #find_one:单个查询，返回字典类型
# result = collection.find_one({'age':20})
# print(result,type(result))
        #结合关系符进行查询:$gt,$lt,$gte,$lte,$ne,$in,$nin
# result = collection.find({'age':{'$gt':18}})
# result = collection.find({'age':{'$in':[18,41]}})
        #结合特殊符号查询：$regex
# result = collection.find({'name':{'$regex':'^M.*'}})  #正则
# result = collection.find({'name':{'$exists':True}})     #查询含有name属性的
# result = collection.find({'age':{'$mod':[5,0]}})        #求模，对5取余=0
# result = collection.find({'$where':'obj.age==20'})       #查询age为20的，obj是自身
# result = collection.find({'age':20}).count()                #统计
# result = collection.find().sort('age',pymongo.ASCENDING)      #按照指定字段升序排列
# result = collection.find().sort('age',pymongo.DESCENDING)     #按照指定字段降序排列
# result = collection.find().sort('age',pymongo.DESCENDING).skip(2)     #按照指定字段降序排列，偏移2个(就是把最前面两个跳过去了)
# result = collection.find().sort('age',pymongo.DESCENDING).skip(2).limit(5)    #降序排列并跳过前2条后，限制最多返回5条结果
# print(result)
# for r in result:
#     print(r['name'],r['age'])

#----------------------------------------------------------
        #更新 update students set name=haha where id=001
        #参数1：查询条件(字典)；参数2：更新值(字典，键：'$set'，值：字典【也可直接使用外部字典】)
        #其他：upsert默认为False，为True时——若更新的原数据不存在，则插入数据
                #multi——默认为False只更新查询到的第一条数据，为True时：更新全部查询到的数据
        # $set：是mongodb的更新操作符，只覆盖指定字段的值
# collection.update({"id":"001"},{'$set':{'age':34}},upsert=True,multi=True)
# print(collection.find().next())
        #上面的官方也不推荐，可以使用下面的
# result = collection.update_one({'name':'haha'},{'$set':{'age':18}})
# result = collection.update_many({'name':'haha'},{'$set':{'age':18}})
# print(result)   #update_one只修改匹配到的第一条数据；若该数据的值本来就与要设置的值相同，则修改数(modified_count)可能为0
# print(result.matched_count,result.modified_count)


#-----------------------------------------------------
        #删除,remove方法官方不推荐
# collection.remove({"id":"001"},justOne=1)
# result = collection.delete_one({'name':'Mike'})
# result = collection.delete_many({'name':'Mike'})
# print(result)
# print(result.deleted_count)

#---------------------------------------------------
        #组合方法
# result = collection.find_one_and_delete({'name':'haha'})
# result = collection.find_one_and_update({'name':'haha'},{'$set':{'age':45}})
# result = collection.find_one_and_replace({'name':'haha'})
# print(result)

MongoCache

将数据以字典的特性存储缓存到mongodb数据库

导入类库

import pickle,zlib  #对象序列化    压缩数据
from datetime import datetime,timedelta     #设置缓存超时间间隔
from pymongo import MongoClient
from bson.binary import Binary      #MongoDB存储二进制的类型

创建MongoCache类

初始化init
- 连接mongodb数据库
- 连接数据库cache实例(没有则创建)
- 连接集合webpage(没有则创建)
- 创建timestamp索引，设置超时时间为30天

重写__setitem__
- 数据经过pickle序列化
- zlib压缩
- 经Binary转化为mongodb需要的格式
- 添加格林威治时间
- 网址为键_id，结果为值，存入mongodb

使用下载的url(路由)作为key，存入系统默认的_id字段，更新数据库，若存在则更新，不存在则插入，_id唯一就可实现爬取的数据去重

用字典的形式向数据库添加一条缓存(数据)

重写__getitem__
- 将缓存数据按照item作为key取出(key仍然是下载的url)
- 根据_id(url)查找(find_one)结果
- 解压缩，反序列化
重写__contains__
- 当调用in，not in ，会自动调用该方法判断链接对应网址是否在数据库中
- 可通过字典的查找方式__getitem__直接查找(self[item])
- 该函数返回布尔值
方法clear
- 清空该集合中的数据

import pickle,zlib  #对象序列化    压缩数据
from datetime import datetime,timedelta     #设置缓存超时间间隔
from pymongo import MongoClient
from bson.binary import Binary      #MongoDB存储二进制的类型
from http_ljb.tiebaspider import TiebaSpider
from http_ljb.qiushispider import QiushiSpider

class MongoCache:
    """Dict-like cache that keeps pickled, zlib-compressed values in MongoDB.

    Records live in the ``webpage`` collection of the ``cache`` database.
    The cache key (the downloaded URL in this tutorial) is stored as ``_id``,
    so writing the same key again updates the existing record instead of
    adding a duplicate.
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """Connect to MongoDB and create the TTL index on ``timestamp``.

        :param client: existing MongoClient to reuse; when ``None`` a client
            for ``localhost:27017`` is created
        :param expires: expiry interval applied to the ``timestamp`` index
        """
        # Honour a caller-supplied client instead of always opening a new one.
        if client is None:
            client = MongoClient('localhost', 27017)
        self.client = client
        self.db = self.client.cache  # database named "cache"
        # TTL index on "timestamp"; total_seconds() converts the timedelta
        # into the number of seconds expected by expireAfterSeconds.
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` using ``cache[key] = value`` syntax.

        :param key: cache key, saved as the document ``_id``
        :param value: any picklable object
        :return: None
        """
        # value -> pickle -> zlib -> Binary (BSON binary type), stamped with UTC time
        record = {
            'result': Binary(zlib.compress(pickle.dumps(value))),
            'timestamp': datetime.utcnow(),
        }
        # Upsert on _id: update the record when the key exists, insert otherwise.
        # update_one replaces the legacy Collection.update call.
        self.db.webpage.update_one({'_id': key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        """Return the cached value stored under ``item``.

        :param item: cache key (the document ``_id``)
        :return: the original object, decompressed and unpickled
        :raises KeyError: when no record exists for ``item``
        """
        record = self.db.webpage.find_one({'_id': item})
        if record is None:
            # Format the key instead of concatenating, so non-string keys
            # raise KeyError (not TypeError) and the message reads correctly.
            raise KeyError('{} does not exist'.format(item))
        # NOTE(review): pickle.loads can execute code embedded in the stored
        # bytes; this is only safe while the database contents are trusted.
        return pickle.loads(zlib.decompress(record['result']))

    def __contains__(self, item):
        """Support ``item in cache`` / ``item not in cache``.

        :param item: cache key (downloaded URL)
        :return: True when a record exists for ``item``, otherwise False
        """
        try:
            self[item]  # delegates to __getitem__, which raises KeyError on a miss
        except KeyError:
            return False
        else:
            return True

    def clear(self):
        """Remove every record from the ``webpage`` collection.

        :return: None
        """
        # delete_many keeps the collection and its TTL index; drop() would
        # discard the index together with the data.
        self.db.webpage.delete_many({})

爬取实例

调用贴吧爬取代码和糗事百科爬取代码，使用mongodb存储爬取数据

导入爬取类
创建新类并继承自爬取类
重写保存方法
- 创建MongoCache对象
- 网址为键，数据为值，以字典形式存入mongodb
重写run方法
- 在保存时，需多传一个网址参数(为了在保存方法中对应保存)

import pickle,zlib  #对象序列化    压缩数据
from datetime import datetime,timedelta     #设置缓存超时间间隔
from pymongo import MongoClient
from bson.binary import Binary      #MongoDB存储二进制的类型
from http_ljb.tiebaspider import TiebaSpider
from http_ljb.qiushispider import QiushiSpider

class MongoCache:
    """Dict-like cache that keeps pickled, zlib-compressed values in MongoDB.

    Records live in the ``webpage`` collection of the ``cache`` database.
    The cache key (the downloaded URL in this tutorial) is stored as ``_id``,
    so writing the same key again updates the existing record instead of
    adding a duplicate.
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """Connect to MongoDB and create the TTL index on ``timestamp``.

        :param client: existing MongoClient to reuse; when ``None`` a client
            for ``localhost:27017`` is created
        :param expires: expiry interval applied to the ``timestamp`` index
        """
        # Honour a caller-supplied client instead of always opening a new one.
        if client is None:
            client = MongoClient('localhost', 27017)
        self.client = client
        self.db = self.client.cache  # database named "cache"
        # TTL index on "timestamp"; total_seconds() converts the timedelta
        # into the number of seconds expected by expireAfterSeconds.
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` using ``cache[key] = value`` syntax.

        :param key: cache key, saved as the document ``_id``
        :param value: any picklable object
        :return: None
        """
        # value -> pickle -> zlib -> Binary (BSON binary type), stamped with UTC time
        record = {
            'result': Binary(zlib.compress(pickle.dumps(value))),
            'timestamp': datetime.utcnow(),
        }
        # Upsert on _id: update the record when the key exists, insert otherwise.
        # update_one replaces the legacy Collection.update call.
        self.db.webpage.update_one({'_id': key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        """Return the cached value stored under ``item``.

        :param item: cache key (the document ``_id``)
        :return: the original object, decompressed and unpickled
        :raises KeyError: when no record exists for ``item``
        """
        record = self.db.webpage.find_one({'_id': item})
        if record is None:
            # Format the key instead of concatenating, so non-string keys
            # raise KeyError (not TypeError) and the message reads correctly.
            raise KeyError('{} does not exist'.format(item))
        # NOTE(review): pickle.loads can execute code embedded in the stored
        # bytes; this is only safe while the database contents are trusted.
        return pickle.loads(zlib.decompress(record['result']))

    def __contains__(self, item):
        """Support ``item in cache`` / ``item not in cache``.

        :param item: cache key (downloaded URL)
        :return: True when a record exists for ``item``, otherwise False
        """
        try:
            self[item]  # delegates to __getitem__, which raises KeyError on a miss
        except KeyError:
            return False
        else:
            return True

    def clear(self):
        """Remove every record from the ``webpage`` collection.

        :return: None
        """
        # delete_many keeps the collection and its TTL index; drop() would
        # discard the index together with the data.
        self.db.webpage.delete_many({})

class TiebaMongo(TiebaSpider):
    """Tieba spider variant that saves each downloaded page into MongoCache."""

    def save_result(self, result, url_str):
        """Override the parent method: store the data in MongoDB keyed by URL.

        :param result: data downloaded for ``url_str``
        :param url_str: URL the data came from; used as the cache key
        :return: None
        """
        # Build the cache once per spider and reuse it; constructing a
        # MongoCache for every URL opens a new MongoClient and re-issues
        # create_index each time, and none of those clients is ever closed.
        cache = getattr(self, '_mongo_cache', None)
        if cache is None:
            cache = self._mongo_cache = MongoCache()
        cache[url_str] = result

    def run(self):
        """Download every URL produced by ``make_url`` and save each result.

        The URL is passed to ``save_result`` together with the data so the
        record can be stored under that URL.
        """
        for url_str in self.make_url():
            result_str = self.download_url(url_str)
            self.save_result(result=result_str, url_str=url_str)

# class QiushiMongo(QiushiSpider):
#     def save_result(self, result,url_str):
#         mc = MongoCache()
#         mc[url_str] = result
#
#     def run(self):
#         url_lists = self.make_url()
#         for url_str in url_lists:
#             result_str = self.download_url(url_str)
#             self.save_result(result=result_str,url_str=url_str)

# if __name__ == '__main__':
        #爬取贴吧并存到MongoDB
    # test = TiebaMongo('lol')
    # test.run()
        #爬取糗事并存到MongoDB
    # qiushi = QiushiMongo()
    # qiushi.run()
        #查询MongoDB
    # mc = MongoCache()
    # print(mc['https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=2'])
    # print('https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=3' in mc)
    # cha = MongoCache()
    # print(cha[test.url_base])
    # print(mc["https://www.qiushibaike.com/8hr/page/2/"])

有需要Python学习资料的小伙伴吗?小编整理【一套Python资料、源码和PDF】，感兴趣者可以加学习群：548377875，反正闲着也是闲着呢，不如学点东西啦~~

Python爬虫实战完整版

mongodb操作

MongoCache

导入类库

创建MongoCache类

爬取实例

猜你喜欢