"""Daily per-user traffic aggregation.

Reads a network-flux CSV and, per (user, terminal type, service, app, day):
  1. drops completely empty rows,
  2. drops rows whose 'user' column is null,
  3. sums the upstream/downstream traffic columns,
  4. counts the number of records per group.
"""
import pandas as pd

# Column meanings: internet account, terminal type, service, app,
# record time, upstream traffic, downstream traffic.
COL_LIST = ['user', 'tm_type', 'serv', 'app', 'record_time',
            'up_flux', 'down_flux']
GROUP_KEYS = ['user', 'tm_type', 'serv', 'app', 'record_time']
FILEPATH = 'C://Baidu network disk//20181007_flux_40.csv'


def aggregate_flux(df_flux):
    """Clean *df_flux* and aggregate traffic per group.

    Parameters
    ----------
    df_flux : pandas.DataFrame
        Raw flux records with the columns in COL_LIST.

    Returns
    -------
    pandas.DataFrame
        Indexed by GROUP_KEYS with columns 'counts', 'up_flux_sum'
        and 'down_flux_sum'.
    """
    df = df_flux.copy()  # never mutate the caller's frame
    df.dropna(how='all', inplace=True)        # 1. fully empty rows
    df.dropna(subset=['user'], inplace=True)  # 2. rows without a user
    # The source file holds a single day of data, so record_time is fixed.
    df['record_time'] = '2019-5-28'
    # Use a list indexer: groupby(...)['a', 'b'] is deprecated/removed.
    sums = df.groupby(GROUP_KEYS)[['up_flux', 'down_flux']].sum()
    sums = sums.rename(columns={'up_flux': 'up_flux_sum',
                                'down_flux': 'down_flux_sum'})
    counts = (df.groupby(GROUP_KEYS).count()
              .drop(['down_flux'], axis=1)
              .rename(columns={'up_flux': 'counts'}))
    # Both frames share the same group index, so concat aligns rows.
    return pd.concat([counts, sums], axis=1)


if __name__ == '__main__':
    # DB setup lives inside the entry guard so importing this module
    # neither requires sqlalchemy/pymysql nor creates a connection pool.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker
    engine = create_engine(
        'mysql+pymysql://root:123456@localhost/languid?charset=utf8')
    db = scoped_session(sessionmaker(bind=engine))

    # NOTE(review): error_bad_lines was removed in pandas >= 2.0;
    # use on_bad_lines='skip' there.
    df_flux = pd.read_csv(FILEPATH, sep=',', error_bad_lines=False,
                          usecols=[3, 10, 11, 12, 15, 16, 17],
                          names=COL_LIST, engine='python',
                          encoding='utf-8', nrows=22222)
    print(aggregate_flux(df_flux))
Tasks: 1. Drop rows that are completely empty. 2. Drop rows whose 'user' column is null. 3. For each user, per day, per terminal type, service and app, compute the total upstream and downstream traffic. 4. Count the number of records per user, per day, per terminal type, service and app.
"""Per-student traffic statistics.

Extends the daily aggregation: after summing traffic per
(user, terminal type, service, app, day), rows whose user is an
internal '10.' IP are filtered out, the terminal type is normalised
('mobile', 'PC', 'multi-terminal', 'unknown'), and the result is
joined with the student table to produce per-class totals and counts.
"""
import pandas as pd

# Column meanings: internet account, terminal type, service, app,
# record time, upstream traffic, downstream traffic.
COL_LIST = ['user', 'tm_type', 'serv', 'app', 'record_time',
            'up_flux', 'down_flux']
STUDENT_LIST = ['user', 'age', 'low', 'high', 'time']
GROUP_KEYS = ['user', 'tm_type', 'serv', 'app', 'record_time']
FILEPATH = 'C://Baidu network disk//20181007_flux_40.csv'
FILEPATH2 = 'C://Baidu network disk//v_student_net.csv'


def summarize_flux(df_flux):
    """Clean *df_flux* and aggregate traffic per group.

    Returns a DataFrame with GROUP_KEYS as ordinary columns
    (as_index=False) plus 'up_flux_sum', 'down_flux_sum' and 'counts'.
    """
    df = df_flux.copy()  # never mutate the caller's frame
    df.dropna(how='all', inplace=True)        # fully empty rows
    df.dropna(subset=['user'], inplace=True)  # rows without a user
    # The source file holds a single day of data, so record_time is fixed.
    df['record_time'] = '2019-5-28'
    # Use a list indexer: groupby(...)['a', 'b'] is deprecated/removed.
    sums = df.groupby(GROUP_KEYS, as_index=False)[['up_flux', 'down_flux']].sum()
    sums = sums.rename(columns={'up_flux': 'up_flux_sum',
                                'down_flux': 'down_flux_sum'})
    counts = (df.groupby(GROUP_KEYS, as_index=False).count()
              .drop(['down_flux'], axis=1)
              .rename(columns={'up_flux': 'counts'}))
    # as_index=False gives both frames the same RangeIndex, so the
    # 'counts' column lines up row-for-row with the sums.
    return pd.concat([sums, counts['counts']], axis=1)


def clean_users(summary):
    """Drop internal-IP users and normalise the terminal-type column.

    Works on a copy so the filtered slice is never written through
    (avoids pandas SettingWithCopy issues).
    """
    # Keep only real accounts: drop rows whose user contains a '10.'
    # IP prefix (regex; matches anywhere in the string, like the original).
    out = summary[~summary['user'].str.contains(r'10\.')].copy()
    out['down_flux_sum'] = out['down_flux_sum'].astype(float)
    # NOTE(review): the original replacement patterns were Chinese text
    # mangled by machine translation; the patterns below reconstruct the
    # documented mapping — confirm them against the real data.
    out['tm_type'] = out['tm_type'].replace(
        r'/mobile terminal/\w* system mobile terminal', 'mobile', regex=True)
    out.loc[out['tm_type'].str.contains('multi-terminal'),
            'tm_type'] = 'multi-terminal'
    out.loc[out['tm_type'].str.contains('unknown type'),
            'tm_type'] = 'unknown'
    out['tm_type'] = out['tm_type'].replace(r'/PC/MAC PC', 'PC', regex=True)
    return out


def per_class_stats(v_student, flux):
    """Join students to the cleaned flux and aggregate per class.

    Parameters
    ----------
    v_student : pandas.DataFrame
        Student table with at least 'username' and 'class_code'.
    flux : pandas.DataFrame
        Output of clean_users().

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Per-class downstream totals and per-class record counts.
    """
    students = v_student.rename(columns={'username': 'user'})
    merged = pd.merge(students, flux, how='inner')  # joins on shared 'user'
    # BUG FIX: the original built the groupby selection but never
    # aggregated it; apply .sum() to get the per-class downstream total.
    down = merged.groupby(['class_code'], as_index=False)['down_flux_sum'].sum()
    counts = (merged.groupby(['class_code'], as_index=False)['up_flux_sum']
              .count()
              .rename(columns={'up_flux_sum': 'counts'}))
    return down, counts


if __name__ == '__main__':
    # DB setup lives inside the entry guard so importing this module
    # neither requires sqlalchemy/pymysql nor creates a connection pool.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker
    engine = create_engine(
        'mysql+pymysql://root:123456@localhost/languid?charset=utf8')
    db = scoped_session(sessionmaker(bind=engine))

    # NOTE(review): error_bad_lines was removed in pandas >= 2.0;
    # use on_bad_lines='skip' there.
    df_flux = pd.read_csv(FILEPATH, sep=',', error_bad_lines=False,
                          usecols=[3, 10, 11, 12, 15, 16, 17],
                          names=COL_LIST, engine='python', encoding='utf-8')
    result_1 = clean_users(summarize_flux(df_flux))

    v_student = pd.read_csv(FILEPATH2, sep=',', error_bad_lines=False,
                            engine='python', encoding='utf-8',
                            header=0, index_col=[0])
    unique_value = v_student['username'].nunique()  # distinct students seen
    student_group, student_group_3 = per_class_stats(v_student, result_1)
    print(student_group)
    print(student_group_3)
1. Clean the tm_type column with regular expressions and .loc, applying the following mapping:
   - mobile terminal (any OS) -> 'mobile'
   - PC / MAC -> 'PC'
   - multiple terminals -> 'multi-terminal'
   - unknown type -> 'unknown'
2. Filter out data rows whose 'user' column is an internal IP address (matching '10.').
3. Type conversion: cast the downstream-traffic total column to float.