# Common library imports and pandas display configuration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
# Show every column and render floats with two decimals in dataframe output.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
# 1. Data integration (concat): stack the train and test sets row-wise.
train_data = pd.read_csv('training30.csv')
test_data = pd.read_csv('test30.csv')
total_data = pd.concat((train_data, test_data), axis=0)
total_data.info()
# 2. Data filtering: keep one sales bureau and three channel categories.
cdma = pd.read_csv('cdma.xls', encoding='gbk', sep='\t')
print(cdma.shape)
# Named boolean masks make the combined filter easier to read.
is_target_bureau = cdma['销售区局'] == '浦东电信局'
is_target_channel = cdma['渠道管理细分'].isin(['专营渠道', '中小渠道', '开放渠道'])
cdma = cdma[is_target_bureau & is_target_channel]
print(cdma.shape)
# 3. Data matching (merge): left-join the department mapping table, then
# keep only the dedicated-channel rows.
match_table = pd.read_excel('数据说明与匹配公式.xlsx', sheet_name='部门匹配表')
new_cdma = pd.merge(cdma, match_table, how='left', on=['发展部门名称', '渠道管理细分'])
new_cdma = new_cdma.loc[new_cdma['渠道管理细分'] == '专营渠道']
new_cdma[['统计日期', '订单号', '所属部门', '所属代理商', '所属分局', '渠道经理']].head()
# 4. Pivot table: order counts per agent (rows) by branch (columns),
# with a grand-total row/column labelled 合计.
cdma_pivot = new_cdma.pivot_table(
    index='所属代理商',
    columns='所属分局',
    values='订单号',
    aggfunc='count',
    fill_value=0,
    margins=True,
    margins_name='合计',
)
cdma_pivot
# 5. Sorting (sort_values): agents with the most orders first.
cdma_pivot = cdma_pivot.sort_values(by='合计', ascending=False)
cdma_pivot
# 6. Value replacement (replace): '?' placeholders become NaN; then a
# regex substitution rewrites 'Tai' substrings to 'Cy'.
train_data = train_data.replace(to_replace='?', value=np.nan)
train_data.head(10)
train_data2 = train_data.replace(to_replace='Tai', value='Cy', regex=True)
train_data2.head(10)
# 7. Row deletion (dropna): drop rows missing gender or age.
print(train_data.shape)
train_data3 = train_data.dropna(axis=0, subset=['gender', 'age'])
print(train_data3.shape)
# 8. Down-sampling
def lower_sample_data(df, labelname, percent=1):
    """Down-sample the majority class (label 0) of a binary data set.

    Args:
        df: DataFrame containing a binary label column.
        labelname: name of the label column; 1 marks the minority class,
            0 the majority class.
        percent: size of the sampled majority relative to the minority
            count (e.g. percent=1 gives balanced classes).

    Returns:
        DataFrame of the sampled majority rows followed by all minority rows.
    """
    minority = df[df[labelname] == 1]
    majority = df[df[labelname] == 0]
    # np.random.randint samples WITH replacement, so the original version
    # could return the same majority row several times; choice(replace=False)
    # draws distinct rows. min() guards against requesting more rows than the
    # majority class has; int() tolerates a fractional `percent`.
    n_keep = min(int(percent * len(minority)), len(majority))
    keep_idx = np.random.choice(len(majority), size=n_keep, replace=False)
    return pd.concat([majority.iloc[keep_idx], minority])
# Class distribution before and after down-sampling.
label_col = "'Purchase or not'"
print(train_data[label_col].value_counts())
train_data4 = lower_sample_data(train_data, label_col, percent=1)
print(train_data4[label_col].value_counts())
# 9. Missing-value handling (fillna): impute every feature column
# (everything except the ID and the target) with its own mean.
train_data5 = pd.read_csv('cs-training.csv')
per_columns = set(train_data5.columns) - {'CustomerID', 'SeriousDlqin2yrs'}
for column in per_columns:
    train_data5[column] = train_data5[column].fillna(train_data5[column].mean())
train_data5.describe()
# 10. Noise handling
# Method 1: quantile (winsorizing) approach
def cap(x, quantile=(0.05, 0.95)):
    """Cap outliers of a continuous Series at the given quantiles.

    Args:
        x: pd.Series of a continuous variable.
        quantile: (lower, upper) quantile pair bounding the kept range.

    Returns:
        Series with values outside the quantile bounds clamped to them;
        the caller's Series is never modified.
    """
    # The default is a tuple rather than a list: a mutable default argument
    # is shared across calls and is a classic Python pitfall.
    q_low, q_high = x.quantile(list(quantile)).values.tolist()
    capped = x.copy()  # copy once up front instead of once per branch
    if q_low > capped.min():
        capped.loc[capped < q_low] = q_low
    if q_high < capped.max():
        capped.loc[capped > q_high] = q_high
    return capped
# Winsorize every feature column, then re-attach ID/target and restore
# the original column order.
train_data6 = train_data5[per_columns].apply(cap)
train_data7 = pd.concat(
    [train_data5[['CustomerID', 'SeriousDlqin2yrs']], train_data6], axis=1
)
train_data7 = train_data7[train_data5.columns]
train_data7.describe()
# Method 2: mean ± 3·std approach
def cap_mean(x):
    """Cap outliers of a continuous Series at mean ± 3 standard deviations.

    Args:
        x: pd.Series of a continuous variable.
    """
    mu, sigma = x.mean(), x.std()
    lo, hi = mu - 3 * sigma, mu + 3 * sigma
    result = x
    # Copy lazily: the input is only duplicated when a bound actually bites,
    # so the caller's Series is never mutated.
    if lo > result.min():
        result = result.copy()
        result[result < lo] = lo
    if hi < result.max():
        result = result.copy()
        result[result > hi] = hi
    return result
# Apply the mean ± 3·std capping to every feature column, then re-attach
# ID/target and restore the original column order.
train_data8 = train_data5[per_columns].apply(cap_mean)
train_data9 = pd.concat(
    [train_data5[['CustomerID', 'SeriousDlqin2yrs']], train_data8], axis=1
)
train_data9 = train_data9[train_data5.columns]
train_data9.describe()
# 11. Normalization / standardization: min-max scale `age`,
# z-score scale `MonthlyIncome`.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mm_scaler = MinMaxScaler()
ss_scaler = StandardScaler()

print(train_data9['age'].head())
# fit_transform expects a 2-D input, hence the double brackets.
train_data9['age'] = mm_scaler.fit_transform(train_data9[['age']])
print(train_data9['age'].head())
print('-------------------------------------------------')
print(train_data9['MonthlyIncome'].head())
train_data9['MonthlyIncome'] = ss_scaler.fit_transform(train_data9[['MonthlyIncome']])
print(train_data9['MonthlyIncome'].head())
# 12. Data generalization (map): collapse fine-grained channel types
# into a small number of channel groups.
print(cdma['发展渠道小类'].value_counts())
qd_map = {
    '自营营业厅': '自营渠道',
    '专营店': '专营渠道',
    '合作营业厅': '专营渠道',
    '核心渠道专区专柜': '专营渠道',
    '天翼小店': '中小渠道',
    '外包营业厅': '专营渠道',
    '全国连锁卖场': '开放渠道',
    '全网通(专营)': '专营渠道',
    '商圈店': '专营渠道',
    '天翼合作店': '中小渠道',
    '终端零售店(开放)': '中小渠道',
}
cdma_2 = cdma.copy()
cdma_2['渠道统计归类'] = cdma_2['发展渠道小类'].map(qd_map)
print(cdma_2['渠道统计归类'].value_counts())
# 13. Ordinal encoding (LabelEncoder): map each channel group to an integer.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# LabelEncoder requires a 1-D array. The original passed the (n, 1)
# DataFrame cdma_2[['渠道统计归类']], which newer scikit-learn rejects
# ("y should be a 1d array"), so select the column as a Series instead.
cdma_2['渠道统计归类'] = le.fit_transform(cdma_2['渠道统计归类'])
cdma_2['渠道统计归类'].value_counts()
# 14. Discretization (cut / qcut)
# Method 1: hand-picked bin edges, 10-year-wide age bands [0, 10), ..., [100, 110).
age_range = list(range(0, 111, 10))
train_data5['age_cut1'] = pd.cut(
    train_data5['age'], bins=age_range, include_lowest=True, right=False
)
train_data5['age_cut1'].value_counts().sort_index()
# Method 2: equal-width binning into 10 automatically-computed bins.
train_data5['age_cut2'] = pd.cut(
    train_data5['age'], bins=10, include_lowest=True, right=False, precision=0
)
train_data5['age_cut2'].value_counts().sort_index()
# Method 3: equal-frequency binning into deciles.
train_data5['age_cut3'] = pd.qcut(train_data5['age'], q=10, precision=1)
train_data5['age_cut3'].value_counts().sort_index()