Missing Values

Copyright notice: this is an original post by the author; do not repost without permission. https://blog.csdn.net/u013317445/article/details/84963069

Getting a feel for the missing values

import pandas as pd
data = pd.read_csv(r'G:\kaggle\melb_data.csv')
# count the missing values in each column
missing_val_count_by_column = data.isnull().sum()
missing_val_count_by_column  # a pandas Series
Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64
type(missing_val_count_by_column)
pandas.core.series.Series
# print only the columns that do have missing values
print(missing_val_count_by_column[missing_val_count_by_column > 0])
Car               62
BuildingArea    6450
YearBuilt       5375
CouncilArea     1369
dtype: int64
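
It can also help to see these counts as a share of the rows. A minimal sketch on the same data (isnull().mean() gives the fraction of missing entries per column):

# fraction of missing values per column, shown as a percentage
missing_ratio = data.isnull().mean()
print((missing_ratio[missing_ratio > 0] * 100).round(2))
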
# does the 'Car' column contain any missing values?
data['Car'].isnull().any()  # True if the column has at least one missing value
True
# how many missing values does the column have?
data['Car'].isnull().sum()
62
# look at exactly which entries of the Car column are missing
data['Car'].isnull()
0        False
1        False
2        False
3        False
4        False
         ...  
13550     True
13551    False
         ...  
13579    False
Name: Car, Length: 13580, dtype: bool

Solutions

1. The simple option: drop columns with missing values

This is only really worthwhile when most of the values in a column are missing; otherwise you throw away a lot of usable data along with the missing entries.

data_without_missing_values = original_data.dropna(axis=1)
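
As a quick illustration of what dropna(axis=1) does, here is a minimal sketch on a small made-up DataFrame (toy data, not from melb_data.csv):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1, 2, 3],
                    'b': [np.nan, 5, 6],   # contains a missing value
                    'c': [7, 8, 9]})
print(toy.dropna(axis=1))  # column 'b' is dropped, only 'a' and 'c' remain
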

If you have a training dataset and a test dataset and want to drop the same columns from both DataFrames (so the feature sets stay aligned):

col_with_missing = [col for col in original_data.columns
                    if original_data[col].isnull().any()]  # columns that contain at least one missing value
reduced_original_data = original_data.drop(col_with_missing, axis=1)  # note: drop, not dropna
reduced_test_data = test_data.drop(col_with_missing, axis=1)

2. A better option: impute the missing values (fill in estimates)

By default the following fills in the column mean:

from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()  # defaults: missing_values=np.nan, strategy='mean'; for sparse matrices missing values can be encoded e.g. as missing_values=-1; other strategies include strategy='median' and strategy='most_frequent'
data_with_imputed_values = my_imputer.fit_transform(original_data)
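
Note that fit_transform returns a NumPy array, so the column names are lost. A minimal sketch of keeping them (this assumes original_data contains only numeric columns, as in the example further below):

import pandas as pd
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy='median')  # or the default 'mean', or 'most_frequent'
data_with_imputed_values = pd.DataFrame(my_imputer.fit_transform(original_data),
                                        columns=original_data.columns)
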

3. An extension to imputation

Imputation fills in a reasonable value, but rows that originally had missing values may be systematically different from the rest of the data, and the imputed value may be above or below the true one. Adding a boolean column that records which values were imputed lets the model pick up on that pattern, which can sometimes improve results.

# make copy to avoid changing original data (when Imputing)
new_data = original_data.copy()

# make new columns indicating what will be imputed
cols_with_missing = [col for col in new_data.columns
                     if new_data[col].isnull().any()]
for col in cols_with_missing:
    new_data[col + '_was_missing'] = new_data[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_columns = new_data.columns  # includes the new *_was_missing columns, so original_data.columns would no longer match
new_data = pd.DataFrame(my_imputer.fit_transform(new_data))
new_data.columns = imputed_columns
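
To make the effect of the extra columns concrete, here is a small made-up example (toy data, purely illustrative):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, 6.0]})
toy['a_was_missing'] = toy['a'].isnull()

imputed = pd.DataFrame(SimpleImputer().fit_transform(toy), columns=toy.columns)
print(imputed)
# the missing 'a' is filled with the column mean (2.0),
# and 'a_was_missing' is 1.0 for that row, 0.0 elsewhere
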

Example

import pandas as pd

melb_data = pd.read_csv(r'G:\kaggle\melb_data.csv')

# target
y = melb_data.Price

# drop the Price column from the features
melb_predictors = melb_data.drop(['Price'], axis=1)
# keep only the numeric features (drop object-dtype columns)
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])
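
For reference, the object-dtype columns that select_dtypes(exclude=['object']) removes can be listed directly (these are the text columns such as Suburb and Address):

# columns dropped because they are not numeric
dropped_cols = [col for col in melb_predictors.columns
                if melb_predictors[col].dtype == 'object']
print(dropped_cols)
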

Compare the model score under the different ways of handling missing values

# Approach 1: drop the columns that have missing values

# find the columns with missing values
col_with_missing = [col for col in melb_numeric_predictors
                    if melb_numeric_predictors[col].isnull().any()]
# drop them
reduced_melb_numeric_predictors = melb_numeric_predictors.drop(col_with_missing, axis=1)

# Approach 2: impute the missing values with strategy='mean'
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
melb_numeric_predictors_with_imputed_values = my_imputer.fit_transform(melb_numeric_predictors)
# The two approaches produce different training data,
# so define a function that fits a model on a given training set and returns its score.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

def score(X, y):
    #split
    X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=0)
    #model
    melb_model= RandomForestRegressor()
    #fit
    clf= melb_model.fit(X_train, y_train)
    #score
    score= clf.score(X_test, y_test)
    
    return score
#test
score_drop_approach= score(reduced_melb_numeric_predictors, y)
score_impute_values= score(melb_numeric_predictors_with_imputed_values, y)
print("drop approach:",score_drop_approach)
print('impute approach:',score_impute_values)
drop approach: 0.7251907026905651
impute approach: 0.74245443764218

From this experiment we can see that, on this dataset, imputation gives a better score than simply dropping the columns with missing values.
