pandas数据清洗(二)

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# SQLAlchemy engine for the local `languid` MySQL database (PyMySQL driver).
engine = create_engine(
    'mysql+pymysql://root:123456@localhost/languid?charset=utf8'
)
# Scoped session registry whose sessions are bound to the engine above.
db = scoped_session(
    sessionmaker(bind=engine)
)

# Names assigned to the CSV columns loaded by the script.
col_list = [
    'user',         # internet account
    'tm_type',      # terminal type
    'serv',         # service
    'app',          # application
    'record_time',  # record time
    'up_flux',      # upstream traffic
    'down_flux',    # downstream traffic
]

# Location of the raw traffic-log CSV.
filepath = 'C://百度网盘//20181007_flux_40.csv'
def summarize_flux(df_flux, record_date='2019-5-28'):
    """Clean a raw traffic frame and aggregate it per user/terminal/service/app.

    Steps:
      1. Drop rows in which every column is empty.
      2. Drop rows whose ``user`` value is missing.
      3. Overwrite ``record_time`` with ``record_date`` so that all rows fall
         into the same day bucket.
      4. Group by user, tm_type, serv, app and record_time and compute the
         number of non-null ``up_flux`` records plus the traffic sums.

    Args:
        df_flux: DataFrame with the columns ``user``, ``tm_type``, ``serv``,
            ``app``, ``record_time``, ``up_flux`` and ``down_flux``.
        record_date: Value written into ``record_time`` before grouping.

    Returns:
        A new DataFrame indexed by the five grouping keys with the columns
        ``counts``, ``up_flux_sum`` and ``down_flux_sum``. The input frame is
        left unmodified.
    """
    keys = ['user', 'tm_type', 'serv', 'app', 'record_time']
    cleaned = (
        df_flux.dropna(how='all')
        .dropna(subset=['user'])
        .assign(record_time=record_date)
    )
    # One named aggregation replaces the former sum/count/drop/rename/concat
    # chain; the tuple selection ['up_flux', 'down_flux'] it relied on is
    # rejected by current pandas releases.
    return cleaned.groupby(keys).agg(
        counts=('up_flux', 'count'),
        up_flux_sum=('up_flux', 'sum'),
        down_flux_sum=('down_flux', 'sum'),
    )


if __name__ == '__main__':
    # on_bad_lines='skip' is the replacement for the removed
    # error_bad_lines=False: malformed rows are skipped instead of raising.
    df_flux = pd.read_csv(
        filepath,
        sep=',',
        on_bad_lines='skip',
        usecols=[3, 10, 11, 12, 15, 16, 17],
        names=col_list,
        engine='python',
        encoding='utf-8',
        nrows=22222,
    )
    result = summarize_flux(df_flux)

    print(result)

1. 清洗数据中的全空行。2. 清洗 user 列为空值的行。3. 按当天每人、每终端、每服务、每 app 统计上行流量列和下行流量列的总量。4. 按每人、每天、每终端、每服务、每 app 统计记录次数。

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# SQLAlchemy engine for the local `languid` MySQL database (PyMySQL driver).
engine = create_engine(
    'mysql+pymysql://root:123456@localhost/languid?charset=utf8'
)
# Scoped session registry whose sessions are bound to the engine above.
db = scoped_session(
    sessionmaker(bind=engine)
)

# Names assigned to the traffic-log CSV columns loaded by the script.
col_list = [
    'user',         # internet account
    'tm_type',      # terminal type
    'serv',         # service
    'app',          # application
    'record_time',  # record time
    'up_flux',      # upstream traffic
    'down_flux',    # downstream traffic
]
# Column names for the student table.
student_list = [
    'user',
    'age',
    'low',
    'high',
    'time',
]

# Raw traffic-log CSV.
filepath = 'C://百度网盘//20181007_flux_40.csv'
# Student roster CSV.
filepath2 = 'C://百度网盘//v_student_net.csv'
def aggregate_flux(df_flux, record_date='2019-5-28'):
    """Clean a raw traffic frame and aggregate it per user/terminal/service/app.

    Rows that are entirely empty or lack a ``user`` value are dropped,
    ``record_time`` is overwritten with ``record_date``, and the remaining
    rows are grouped by user, tm_type, serv, app and record_time.

    Args:
        df_flux: DataFrame with the columns ``user``, ``tm_type``, ``serv``,
            ``app``, ``record_time``, ``up_flux`` and ``down_flux``.
        record_date: Value written into ``record_time`` before grouping.

    Returns:
        A new flat DataFrame holding the five key columns followed by
        ``up_flux_sum``, ``down_flux_sum`` and ``counts`` (number of non-null
        ``up_flux`` records per group). The input frame is not modified.
    """
    keys = ['user', 'tm_type', 'serv', 'app', 'record_time']
    cleaned = (
        df_flux.dropna(how='all')
        .dropna(subset=['user'])
        .assign(record_time=record_date)
    )
    # Named aggregation replaces the tuple selection ['up_flux', 'down_flux'],
    # which current pandas releases reject.
    return cleaned.groupby(keys, as_index=False).agg(
        up_flux_sum=('up_flux', 'sum'),
        down_flux_sum=('down_flux', 'sum'),
        counts=('up_flux', 'count'),
    )


def clean_flux_summary(summary, ip_pattern=r'10\.'):
    """Filter IP-like users, cast the downstream sum and normalise tm_type.

    Args:
        summary: Aggregated frame with at least the columns ``user``,
            ``tm_type`` and ``down_flux_sum``.
        ip_pattern: Regular expression; rows whose ``user`` contains a match
            are removed. The default keeps the original script's behaviour of
            dropping every user containing ``10.``.

    Returns:
        A new DataFrame (the input is not modified) in which
        ``down_flux_sum`` is float and ``tm_type`` has been rewritten:
        the mobile-terminal pattern becomes ``mobile``, values containing
        ``多终端`` become ``多终端``, values containing ``未知类型`` become
        ``Unknown`` and ``/PC/MAC PC`` becomes ``PC``.
    """
    # .copy() gives an independent frame, so the assignments below do not
    # write into a slice of `summary`.
    cleaned = summary[~summary['user'].str.contains(ip_pattern, na=False)].copy()
    cleaned['down_flux_sum'] = cleaned['down_flux_sum'].astype(float)

    # The result is assigned back explicitly; an in-place replace on a column
    # selection is not guaranteed to reach the parent frame.
    tm_type = cleaned['tm_type'].replace(
        r'\/移动终端\/\w*系统移动终端', 'mobile', regex=True
    )
    tm_type = tm_type.mask(tm_type.str.contains('多终端', na=False), '多终端')
    tm_type = tm_type.mask(tm_type.str.contains('未知类型', na=False), 'Unknown')
    tm_type = tm_type.replace(r'\/PC\/MAC PC', 'PC', regex=True)
    cleaned['tm_type'] = tm_type
    return cleaned


if __name__ == '__main__':
    # on_bad_lines='skip' is the replacement for the removed
    # error_bad_lines=False: malformed rows are skipped instead of raising.
    df_flux = pd.read_csv(
        filepath,
        sep=',',
        on_bad_lines='skip',
        usecols=[3, 10, 11, 12, 15, 16, 17],
        names=col_list,
        engine='python',
        encoding='utf-8',
    )
    result = aggregate_flux(df_flux)
    # As in the original script, the cleaning step works on the sums only;
    # the 'counts' column stays available on `result`.
    result_1 = clean_flux_summary(result.drop(columns=['counts']))

    v_student = pd.read_csv(
        filepath2,
        sep=',',
        on_bad_lines='skip',
        engine='python',
        encoding='utf-8',
        header=0,
        index_col=[0],
    )
    unique_value = v_student['username'].nunique()
    v_student = v_student.rename(columns={'username': 'user'})
    # No `on=` is given, so the join uses every column name the two frames
    # share.
    student_merge = pd.merge(v_student, result_1, how='inner')
    by_class = student_merge.groupby(['class_code'], as_index=False)
    # NOTE(review): the original left this as an un-aggregated GroupBy object;
    # a per-class total is assumed to be the intent -- confirm.
    student_group = by_class['down_flux_sum'].sum()
    student_group_2 = by_class['up_flux_sum'].count()
    student_group_3 = student_group_2.rename(columns={'up_flux_sum': 'counts'})

1.用正则表达式以及loc清洗tm_type列的数据,做以下更改:

“/移动终端/…系统移动终端” → mobile

“/PC/MAC PC” → PC

含“多终端”的值 → 多终端

含“未知类型”的值 → Unknown

2.IP数据过滤:将user列中为IP的数据行过滤掉(代码中按是否包含“10.”来判断)

3.类型转换:将下行流量合计(down_flux_sum)转换为float类型

猜你喜欢

转载自www.cnblogs.com/languid/p/10960559.html