Python MongoDB aggregation (co-)tables

First, the original table structures

1, the imsi table

MongoDB Enterprise > db.trs_action_dzwl_zm.findOne()
{
        "_id" : {
                "imsi" : "460029380018855",
                "start_time" : "2019-03-13 15:37:07"
        },
        "site_address" : "织里-大港路与G318交叉口",
        "xnetbar_wacode" : "EG-MIX-WL-4C-006",
        "imei" : "000000052052052",
        "device_longitude" : "120.275424",
        "device_latitude" : "30.838656",
        "tmsi" : "1552462627",
        "rssi" : "140",
        "band" : "40",
        "plmn" : "46000",
        "VENDOR_NAME" : "Nanjing Schengen",
        "device_name" : "weaving - Dagang Road and the intersection G318-4G",
        "tel_number": "1,595,028",


        "province": "Jiangsu Province",
        "City": "Yancheng"
}

2, car table

MongoDB Enterprise > db.trs_action_car_info.findOne()
{
        "_id" : {
                "license_number" : "苏A39NX7",
                "start_time" : "2019-05-16 23:03:13"
        },
        "site_address" : "湖织大道-香圩桥东侧",
        "site_location_id" : "",
        "unlawful_act" : "",
        "driving_direct" : "其它",
        "lane_id" : "001",
        "netbar_wacode" : "904",
        "license_color" : "002",
        "photo_cnt" : "",
        "monitor_type" : "卡口式监控",
        "photo_path" : "/pic?did=12ffaa00-78a3-1037-921c-54c4150760be&bid=486472&pid=4294966623&ptime=1558018994",
        "speed" : "0",
        "stat" : "0",
        "vehicle_brand1" : "0",
        "vehicle_brand2" : "0",
        "car_length" : "",
        "car_color" : "其它颜色",
        "shade" : "000",
        "car_type" : "轿车",
        "license_type" : "92式民用车",
        "vehicle_feature_path" : "",
        "device_name" : "湖织大道-香圩桥东侧",
        "monitor_direct" : "未知",
        "lane" : "001",
        "device_longitude" : "120.308512",
        "device_latitude" : "30.881026",
        "site_name" : "Weaving Lake Avenue - Hong Wei east side of the bridge",
        "site_latitude" : "30.881026",
        "site_longitude" : "120.308512",
        "road_segment_direct" : "unknown"
}


3, face table

MongoDB Enterprise > db.trs_action_face_info.findOne()
{
        "_id" : {
                "pid" : "0120_1561570383884_d61beb5b9e644ed081f4ffc5e362ece7",
                "start_time" : "2019-06-13 12:32:59"
        },
        "site_address" : "融泰宾馆",
        "img_mode" : "",
        "obj_img_url" : "/pic?=d4=i778z096as091-706105m6ep=t1i5i*d1=*ipd7=*9s8=42b8i2d05*717540c14-a563e27-1579*d-d0i806d8e42",
        "quality_score" : "0.883593",
        "netbar_wacode" : "33052802001310942740",
        "device_name" : "融泰宾馆",
        "device_longitude" : "120.262211",
        "device_latitude" : "30.841749",
        "age" : "",
        "gender" : "1",
        "race" : "",
        "beard" : "",
        "eye_open" : "",
        "eye_glass" : "",
        "sun_glass" : "1",
        "mask" : "",
        "mouth_open" : "",
        "smile" : "1",
        "similarity" : "0.97059",
        "image_id" : "0120_1561570383884_d61beb5b9e644ed081f4ffc5e362ece7",
        "bkg_url" : "/pic?=d4=i778z096as091-706105m6ep=t1i5i*d1=*ipd7=*9s8=42b8i2d05*717540c14-a563e27-1579*d-d0i806d8e42"
}

4, MAC table

Second, the structure of the aggregated (co-)table — the collecsites collection

Requirement: from the four tables — imsi, car, face, MAC (MAC temporarily omitted) — extract each table's key fields

1) grouped by site

2) in two-minute intervals

3) one document stores at most 200 key records for a two-minute window

MongoDB Enterprise > db.collecsites.findOne()
{
        "_id" : ObjectId("5e159ef831d840f9482b2adc"),
        "timeline" : "2019-03-13 15:34:00",
        "site" : "织里-大港路与G318交叉口",
        "face" : [ ],
        "lpn" : [ ],
        "mac" : [ ],
        "nsamples" : 200,
        "imsi" : [
                {
                        "start_time" : "2019-03-13 15:35:56",
                        "imsi" : "460078995442766"
                },
                {
                        "start_time" : "2019-03-13 15:35:56",
                        "imsi" : "460006254007976"
                }
        ]
}

Third, script development

1, Python modules used

from multiprocessing import Pool (process pool)

from pymongo import MongoClient (Python MongoDB driver connection)

import pandas as pd (splits a time range into multiple periods; in this example each period is 2 minutes)

2, script

1) The MongoDB connection script

[root@mongodb07 python3]# cat mongodbclient.py
#coding=utf-8
from multiprocessing import Pool
import os, time, random
import json
from datetime import datetime
from pymongo import MongoClient
import sys
import datetime
class Database(object):
    """Thin helper around a pymongo MongoClient bound to a single database.

    Every operation guards on get_state() so calls on a torn-down wrapper
    fail softly (empty/zero/None results) instead of raising.
    """

    def __init__(self, address, port, database):
        # MongoClient connects lazily; no network I/O happens here.
        self.conn = MongoClient(host=address, port=port)
        self.db = self.conn[database]

    def get_state(self):
        """Return True when both the client and database handles are set."""
        return self.conn is not None and self.db is not None

    def insert_one(self, collection, data):
        """Insert one document and return its _id ("" when not connected)."""
        if self.get_state():
            ret = self.db[collection].insert_one(data)
            return ret.inserted_id
        else:
            return ""

    def insert_many(self, collection, data):
        """Insert many documents and return their _ids ("" when not connected)."""
        if self.get_state():
            ret = self.db[collection].insert_many(data)
            # Fix: InsertManyResult exposes inserted_ids (plural);
            # ret.inserted_id does not exist and raised AttributeError.
            return ret.inserted_ids
        else:
            return ""

    def update(self, collection, data):
        """Bulk update.

        data format: {key: [old_value, new_value]} — old values become the
        filter, new values the $set payload. Returns the modified count,
        or 0 when not connected.
        """
        data_filter = {}
        data_revised = {}
        for key in data.keys():
            data_filter[key] = data[key][0]
            data_revised[key] = data[key][1]
        if self.get_state():
            return self.db[collection].update_many(data_filter, {"$set": data_revised}).modified_count
        return 0

    def updateOne(self, collection, data_filter, data_revised):
        """Update the first document matching data_filter (upsert enabled)."""
        if self.get_state():
            # Fix: Collection.update() is deprecated (removed in PyMongo 4);
            # update_one(..., upsert=True) is the equivalent single-doc call.
            return self.db[collection].update_one(data_filter, data_revised, upsert=True)
        return 0

    def find(self, col, condition, column=None):
        """Query *col*; *column* is an optional projection. None when not connected."""
        if self.get_state():
            if column is None:
                return self.db[col].find(condition)
            else:
                return self.db[col].find(condition, column)
        else:
            return None

    def aggregate(self, col, condition):
        """Run an aggregation pipeline, allowing spill to disk for large stages."""
        if self.get_state():
            options = {'allowDiskUse': True}
            result = self.db[col].aggregate(condition, **options)
            return result
        else:
            return None

    def delete(self, col, condition):
        """Delete all documents matching *condition*; return the deleted count."""
        if self.get_state():
            return self.db[col].delete_many(filter=condition).deleted_count
        return 0

    def close_connect(self):
        # Fix: original line was garbled to "close_connect DEF (Self):".
        self.conn.close()
2) The script performing the actual CRUD operations on the MongoDB collections
[root@mongodb07 python3]# cat collection_curd.py
#coding:utf-8
from multiprocessing import Pool
import os, time, random
import json
from datetime import datetime
from pymongo import MongoClient
import sys
import datetime
import mongodbclient
import pandas as pd
def max_number(num1, num2, num3):  # reconstructed from garbled source
    """Return the largest of the three numbers."""
    return max(num1, num2, num3)
def site_cursor_to_list(myresult, colum):  # reconstructed from garbled source
    """Extract field *colum* from every document yielded by a MongoDB cursor.

    Returns the values as a plain Python list (also works on any iterable
    of dicts, e.g. an already-materialized result list).
    """
    sitelist = []
    for doc in myresult:
        sitelist.append(doc[colum])
    return sitelist
def list_Duplicate_removal(inlist):  # reconstructed from garbled source
    """Remove duplicate elements; note the set round-trip does not preserve order."""
    outlist = list(set(inlist))
    return outlist
def get_time_interval(str_start_time, str_end_time):  # reconstructed from garbled source
    """Split [start, end] into a pandas DatetimeIndex at two-minute frequency."""
    time_interval = pd.date_range(str_start_time, str_end_time, freq='2min')
    return time_interval
def get_site(collection_name, str_start_time, str_end_time):
    """Return the site_address of every record whose start time is in [start, end).

    Covers one two-minute slice of the imsi/face/lpn source collections.
    """
    db = mongodbclient.Database("172.16.102.15", 27017, "idpad_zl")
    try:
        cursor = db.find(collection_name,
                         {"_id.start_time": {"$gte": str_start_time, "$lt": str_end_time}})
        # Fix: materialize while the connection is still open — a pymongo
        # cursor is lazy, and the original closed the connection first.
        return site_cursor_to_list(cursor, "site_address")
    finally:
        db.close_connect()
def get_site_data(collection_name, str_start_time, str_end_time, site, colums):
    """Fetch documents for one site within [start, end), projected to *colums*.

    Used to pull the _id sub-documents (imsi/license/pid + start_time) that
    are copied into the aggregation collection.
    """
    db = mongodbclient.Database("172.16.102.15", 27017, "idpad_zl")
    try:
        cursor = db.find(collection_name,
                         # Fix: operator was garbled to "$ GTE" in the source.
                         {"_id.start_time": {"$gte": str_start_time, "$lt": str_end_time},
                          "site_address": site},
                         colums)
        # Fix: consume the cursor before closing the connection instead of
        # returning a lazy cursor bound to a closed client.
        return list(cursor)
    finally:
        db.close_connect()

def sitetime_insert(collection_name, site, str_start_time,
                    imsi_sitetime, face_sitetime, car_sitetime, mac_sitetime):
    """Insert one aggregation document for a site / two-minute slot."""
    document = {
        "site": site,
        "timeline": str_start_time,
        "nsamples": 200,
        "imsi": imsi_sitetime,
        "face": face_sitetime,
        "lpn": car_sitetime,
        "mac": mac_sitetime,
    }
    db = mongodbclient.Database("172.16.102.15", 27017, "idpad_zl")
    db.insert_one(collection_name, document)
    db.close_connect()
def sitetime_updateOne(collection_name, site, str_start_time, key, value):
    """Fill the still-empty *key* array of one matching site/time document."""
    selector = {"site": site, "timeline": str_start_time, "nsamples": 200, key: []}
    db = mongodbclient.Database("172.16.102.15", 27017, "idpad_zl")
    db.updateOne(collection_name, selector, {"$set": {key: value}})
    db.close_connect()
#def sit_colse():
#    db.close_connect()
 
3) The driver script
[root@mongodb07 python3]# cat collection_insert.py
#coding:utf-8
from multiprocessing import Pool
import os, time, random
import json
from datetime import datetime
from pymongo import MongoClient
import sys
import datetime
import mongodbclient
import pandas as pd
import collection_curd as curd
from multiprocessing import Pool
#update_exec(imsi_outlen_flo,"collecsites",'imsi',imsidata,imsi_outlen_int,imsi_max_len)
def update_exec(type_outlen_flo, collectionname, site, str_start_time,
                typelist, datalist, type_outlen_int, type_max_len):
    """Write *datalist* into the site/time documents in chunks of at most 200.

    A single update when everything fits in one document; otherwise one
    update per 200-element slice, with the final slice running to
    type_max_len.
    """
    if type_outlen_flo <= 1.0:
        curd.sitetime_updateOne(collectionname, site, str_start_time, typelist, datalist)
        return
    for chunk_no in range(type_outlen_int + 1):
        lo = chunk_no * 200
        hi = type_max_len if chunk_no == type_outlen_int else (chunk_no + 1) * 200
        curd.sitetime_updateOne(collectionname, site, str_start_time,
                                typelist, datalist[lo:hi])
def data_exec(nums, time_interval):  # reconstructed from heavily garbled source
    """Process one two-minute slice and populate the collecsites collection.

    nums indexes the slice inside time_interval (a DatetimeIndex at 2-minute
    frequency). For every site active in the slice, the key _id data of the
    imsi/car/face tables is copied into pre-created aggregation documents,
    at most 200 records per document.
    """
    # Slice boundaries, formatted like the string timestamps stored in
    # _id.start_time of the source collections.
    str_start_time = datetime.datetime.strftime(time_interval[nums], '%Y-%m-%d %H:%M:%S')
    str_end_time = datetime.datetime.strftime(time_interval[nums + 1], '%Y-%m-%d %H:%M:%S')
    # Sites that produced imsi/car/face records inside this slice.
    myresult_imsi_sit = curd.get_site("trs_action_dzwl_zm", str_start_time, str_end_time)
    myresult_car_sit = curd.get_site("trs_action_car_info", str_start_time, str_end_time)
    myresult_face_sit = curd.get_site("trs_action_face_info", str_start_time, str_end_time)
    # Union of all sites, de-duplicated.
    myresult = curd.list_Duplicate_removal(myresult_imsi_sit + myresult_car_sit + myresult_face_sit)
    if not myresult:
        return
    for site in myresult:  # fix: original shadowed the outer loop index 'i'
        # The _id sub-document ({imsi|license_number|pid, start_time}) is the
        # key data copied into the aggregation table.
        my_imsi_site_data = curd.get_site_data("trs_action_dzwl_zm", str_start_time, str_end_time, site, {"_id"})
        my_car_site_data = curd.get_site_data("trs_action_car_info", str_start_time, str_end_time, site, {"_id"})
        my_face_site_data = curd.get_site_data("trs_action_face_info", str_start_time, str_end_time, site, {"_id"})
        imsidata = curd.site_cursor_to_list(my_imsi_site_data, "_id")
        cardata = curd.site_cursor_to_list(my_car_site_data, "_id")
        facedata = curd.site_cursor_to_list(my_face_site_data, "_id")
        imsi_max_len = len(imsidata)
        car_max_len = len(cardata)
        face_max_len = len(facedata)
        # Full 200-record chunks per type. Fix: use // — the original '/'
        # was Python 2 integer division and yields a float under python3.
        imsi_outlen_int = imsi_max_len // 200
        car_outlen_int = car_max_len // 200
        face_outlen_int = face_max_len // 200
        # Float form tells us whether a partial trailing chunk remains.
        imsi_outlen_flo = imsi_max_len / 200.0
        car_outlen_flo = car_max_len / 200.0
        face_outlen_flo = face_max_len / 200.0
        max_mod_200 = max(imsi_outlen_int, car_outlen_int, face_outlen_int) + 1
        # Pre-create empty documents: one per chunk that the largest type
        # needs, minus one when every type divides evenly by 200.
        if (imsi_outlen_flo > imsi_outlen_int or car_outlen_flo > car_outlen_int
                or face_outlen_flo > face_outlen_int):
            ndocs = max_mod_200
        else:
            ndocs = max_mod_200 - 1
        for _ in range(ndocs):
            curd.sitetime_insert("collecsites", site, str_start_time, [], [], [], [])
        # Fill the empty arrays chunk by chunk, one field per call.
        update_exec(imsi_outlen_flo, "collecsites", site, str_start_time, 'imsi', imsidata, imsi_outlen_int, imsi_max_len)
        update_exec(car_outlen_flo, "collecsites", site, str_start_time, 'lpn', cardata, car_outlen_int, car_max_len)
        update_exec(face_outlen_flo, "collecsites", site, str_start_time, 'face', facedata, face_outlen_int, face_max_len)

if __name__ == '__main__':
    # Wall-clock timing for the whole batch run.
    start = time.time()
    # Pool of 30 worker processes; each call handles one 2-minute slice.
    p=Pool(30)
    #print("start_time : ",start)
    # Split the whole date range into 2-minute boundaries.
    time_interval=curd.get_time_interval('20190310','20191230')
    for i in range(len(time_interval)-1):  ## pick each slice out of the time partition
        #print(i)
        #res=p.apply_async(data_exec,args=(i,))
        result=p.apply_async(data_exec, args=(i,time_interval))
    p.close()
    p.join()
    end = time.time()
    print("end_time : ",end)
    print('ALL Insert Task runs %s(ms).' % ((end - start)*1000))
 

Fourth, problems encountered during development

1. How to split a time range into two-minute slices
2. After instantiating the MongoDB connection, how to close it while the script runs
3. How to use a process pool

Guess you like

Origin www.cnblogs.com/xibuhaohao/p/12167940.html