pandas data cleaning (part 2)

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect('localhost', 'root', '123456', 'languid')
engine = create_engine('mysql+pymysql://root:123456@localhost/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))


col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']  # internet account, terminal type, service, app, record time, upstream traffic, downstream traffic

filepath = 'C://Baidu network disk//20181007_flux_40.csv'
# def data_deal(filepath):
if __name__ == '__main__':
    df_flux = pd.read_csv(filepath, sep=',', error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list, engine='python', encoding='utf-8', nrows=22222)
    df_flux.dropna(how='all',inplace=True)
    df_flux.dropna(subset=['user'],inplace=True,axis=0)
    df_flux['record_time']='2019-5-28'
    df_flux2 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time'])[['up_flux', 'down_flux']].sum()
    df_flux3 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time']).count()
    df_flux4 = df_flux3.drop(['down_flux'], axis=1)
    df_flux5 = df_flux4.rename(columns={'up_flux': 'counts'}, inplace=False)
  
    df_flux2 = DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={'up_flux': 'up_flux_sum','down_flux':'down_flux_sum'})
   
    result = pd.concat([df_flux5, df_flux2], axis=1)
    
    print(result)

What the first script does:

1. Drop the rows that are completely empty.
2. Drop the rows where the user column is null.
3. Sum the upstream and downstream traffic per user, per day, per terminal type, per service and per app.
4. Count the number of records per user, per day, per terminal type, per service and per app (see the sketch below).
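A minimal, self-contained sketch of those four steps on invented data (the column names mirror col_list above; every value is made up):

import numpy as np
import pandas as pd

# Invented rows standing in for the real CSV: two normal records,
# one record with no user, and one completely empty row.
demo = pd.DataFrame({
    'user':      ['u1',   'u1',   None,     np.nan],
    'tm_type':   ['PC',   'PC',   'mobile', np.nan],
    'serv':      ['http', 'http', 'dns',    np.nan],
    'app':       ['web',  'web',  'game',   np.nan],
    'up_flux':   [10.0,   20.0,   5.0,      np.nan],
    'down_flux': [100.0,  200.0,  50.0,     np.nan],
})

demo.dropna(how='all', inplace=True)        # step 1: fully empty rows go
demo.dropna(subset=['user'], inplace=True)  # step 2: rows without a user go
demo['record_time'] = '2019-5-28'           # collapse everything onto one day

keys = ['user', 'tm_type', 'serv', 'app', 'record_time']
sums = demo.groupby(keys)[['up_flux', 'down_flux']].sum()        # step 3: traffic totals
counts = demo.groupby(keys)['up_flux'].count().rename('counts')  # step 4: record counts
print(pd.concat([counts, sums], axis=1))
# one surviving group: u1/PC/http/web/2019-5-28 with counts=2, up 30.0, down 300.0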

 

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect('localhost', 'root', '123456', 'languid')
engine = create_engine('mysql+pymysql://root:123456@localhost/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))


col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']  # internet account, terminal type, service, app, record time, upstream traffic, downstream traffic
student_list = ['user', 'age', 'low', 'high', 'time']

filepath = 'C://Baidu network disk//20181007_flux_40.csv'
filepath2 = 'C://Baidu network disk//v_student_net.csv'
# def data_deal(filepath):
if __name__ == '__main__':
    df_flux = pd.read_csv(filepath, sep=',', error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list, engine='python', encoding='utf-8')
    df_flux.dropna(how='all',inplace=True)
    df_flux.dropna(subset=['user'],inplace=True,axis=0)
    df_flux['record_time']='2019-5-28'
    df_flux2 = df_flux.groupby(['user', 'tm_type', 'serv', 'app', 'record_time'], as_index=False)[['up_flux', 'down_flux']].sum()
    df_flux3 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time'],as_index=False).count()
    df_flux4 = df_flux3.drop(['down_flux'], axis=1)
    df_flux5 = df_flux4.rename(columns={'up_flux': 'counts'}, inplace=False)
    df_flux2 = DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={'up_flux': 'up_flux_sum','down_flux':'down_flux_sum'})
    result = pd.concat([df_flux2, df_flux5['counts']], axis=1)
    result_1 = df_flux2[~df_flux2['user'].str.contains(r'10\.')].copy()  # drop rows whose user field is an IP address (10.*); .copy() avoids SettingWithCopyWarning
    result_1['down_flux_sum'] = result_1['down_flux_sum'].astype(float)
    # result_1['user'] = result_1['user'].astype(float)
    # qqq = result_1[result_1['user']]
    # Normalise the terminal-type labels; the regex patterns below are translated from the original Chinese label text:
    result_1['tm_type'].replace(r'\/mobile terminal\/\w* system mobile terminal', 'mobile', regex=True, inplace=True)
    result_1.loc[result_1['tm_type'].str.contains('multi-terminal'), 'tm_type'] = 'multi-terminal'
    result_1.loc[result_1['tm_type'].str.contains('unknown type'), 'tm_type'] = 'unknown'
    result_1['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True, inplace=True)




    v_student = pd.read_csv(filepath2,sep=',',error_bad_lines=False,engine='python',encoding='utf-8',header=0,index_col=[0])
    unique_value = v_student['username'].nunique()
    v_student = v_student.rename(columns={'username': 'user'}, inplace=False)
    student_merge=pd.merge(v_student,result_1,how='inner')
    student_group = student_merge.groupby(['class_code'], as_index=False)['down_flux_sum'].sum()
    student_group_2 =student_merge.groupby(['class_code'],as_index=False)['up_flux_sum'].count()
    student_group_3 = student_group_2.rename(columns={'up_flux_sum': 'counts'}, inplace=False)

What the second script adds:

1. Use regular expressions and loc to clean the tm_type column (see the sketch after this list), mapping the raw labels as follows:
   - "... system mobile terminal" labels become mobile
   - PC/MAC labels become PC
   - multi-terminal labels become multi-terminal
   - unknown-type labels become unknown
2. IP filtering: drop the data rows whose user column holds an IP address instead of an account.
3. Type conversion: convert the downstream traffic sum column to float.
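A minimal sketch of those three steps on invented values (the raw tm_type strings here only approximate the real labels, which were originally Chinese):

import pandas as pd

# Invented rows approximating the traffic summary before cleaning.
df = pd.DataFrame({
    'user': ['u1', '10.0.3.7', 'u2', 'u3', 'u4'],
    'tm_type': ['/mobile terminal/Android system mobile terminal',
                '/mobile terminal/iOS system mobile terminal',
                '/PC/MAC PC', 'multi-terminal device', 'unknown type device'],
    'down_flux_sum': ['100', '40', '250', '80', '30'],  # arrives as strings
})

# 2. IP filtering: drop rows whose user field looks like a 10.* address.
df = df[~df['user'].str.contains(r'10\.')]

# 3. Type conversion: the traffic sum becomes a float column.
df['down_flux_sum'] = df['down_flux_sum'].astype(float)

# 1. Normalise the terminal-type labels with regex replace and loc.
df['tm_type'] = df['tm_type'].replace(r'\/mobile terminal\/\w* system mobile terminal',
                                      'mobile', regex=True)
df['tm_type'] = df['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True)
df.loc[df['tm_type'].str.contains('multi-terminal'), 'tm_type'] = 'multi-terminal'
df.loc[df['tm_type'].str.contains('unknown type'), 'tm_type'] = 'unknown'
print(df)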

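Both scripts build a SQLAlchemy engine and session at the top but never use them, so presumably the cleaned frames get written back to the languid database in a later step. A minimal sketch of that step, appended to the end of the first script; the table name flux_daily is an assumption, not something from the original post:

    # Hypothetical persistence step: `result` and `engine` are the variables
    # defined earlier in the script; 'flux_daily' is an assumed table name.
    result.to_sql('flux_daily', con=engine, if_exists='append', index=True)
    # index=True because the groupby keys live in result's MultiIndex.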