数据清洗之 缺失值处理

缺失值处理

  • 缺失值首先需要根据实际情况定义
  • 可以采取直接删除法
  • 有时候需要使用替换法或者插值法
  • 常用的替换法有均值替换、前向、后向替换和常数替换
import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据预处理'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('MotorcycleData.csv', encoding='gbk', na_values='Na')
def f(x):
    if '$' in str(x):
        x = str(x).strip('$')
        x = str(x).replace(',', '')
    else:
        x = str(x).replace(',', '')
    return float(x)
df['Price'] = df['Price'].apply(f)
df['Mileage'] = df['Mileage'].apply(f)
# 计算缺失比例
df.apply(lambda x: sum(x.isnull())/len(x), axis=0)
Condition         0.000000
Condition_Desc    0.778994
Price             0.000000
Location          0.000267
Model_Year        0.000534
Mileage           0.003470
Exterior_Color    0.095422
Make              0.000534
Warranty          0.318297
Model             0.016415
Sub_Model         0.676231
Type              0.197785
Vehicle_Title     0.964233
OBO               0.008808
Feedback_Perc     0.117710
Watch_Count       0.530629
N_Reviews         0.000801
Seller_Status     0.083411
Vehicle_Tile      0.007207
Auction           0.002269
Buy_Now           0.031630
Bid_Count         0.707727
dtype: float64
df.head(3)
Condition Condition_Desc Price Location Model_Year Mileage Exterior_Color Make Warranty Model ... Vehicle_Title OBO Feedback_Perc Watch_Count N_Reviews Seller_Status Vehicle_Tile Auction Buy_Now Bid_Count
0 Used mint!!! very low miles 11412.0 McHenry, Illinois, United States 2013.0 16000.0 Black Harley-Davidson Unspecified Touring ... NaN FALSE 8.1 NaN 2427 Private Seller Clear True FALSE 28.0
1 Used Perfect condition 17200.0 Fort Recovery, Ohio, United States 2016.0 60.0 Black Harley-Davidson Vehicle has an existing warranty Touring ... NaN FALSE 100 17 657 Private Seller Clear True TRUE 0.0
2 Used NaN 3872.0 Chicago, Illinois, United States 1970.0 25763.0 Silver/Blue BMW Vehicle does NOT have an existing warranty R-Series ... NaN FALSE 100 NaN 136 NaN Clear True FALSE 26.0

3 rows × 22 columns

# how = 'all', 只有当前行都是缺失值才删除
# how = 'any', 只要当前行有一个缺失值就删除
df.dropna(how = 'any', axis=0)
Condition Condition_Desc Price Location Model_Year Mileage Exterior_Color Make Warranty Model ... Vehicle_Title OBO Feedback_Perc Watch_Count N_Reviews Seller_Status Vehicle_Tile Auction Buy_Now Bid_Count

0 rows × 22 columns

# subset 根据指定字段判断
# df.dropna(how='any', subset=['Condition', 'Price', 'Mileage'])
# 缺失值使用0填补
df.fillna(0).head(5)
Condition Condition_Desc Price Location Model_Year Mileage Exterior_Color Make Warranty Model ... Vehicle_Title OBO Feedback_Perc Watch_Count N_Reviews Seller_Status Vehicle_Tile Auction Buy_Now Bid_Count
0 Used mint!!! very low miles 11412.0 McHenry, Illinois, United States 2013.0 16000.0 Black Harley-Davidson Unspecified Touring ... 0 FALSE 8.1 0 2427 Private Seller Clear True FALSE 28.0
1 Used Perfect condition 17200.0 Fort Recovery, Ohio, United States 2016.0 60.0 Black Harley-Davidson Vehicle has an existing warranty Touring ... 0 FALSE 100 17 657 Private Seller Clear True TRUE 0.0
2 Used 0 3872.0 Chicago, Illinois, United States 1970.0 25763.0 Silver/Blue BMW Vehicle does NOT have an existing warranty R-Series ... 0 FALSE 100 0 136 0 Clear True FALSE 26.0
3 Used CLEAN TITLE READY TO RIDE HOME 6575.0 Green Bay, Wisconsin, United States 2009.0 33142.0 Red Harley-Davidson 0 Touring ... 0 FALSE 100 0 2920 Dealer Clear True FALSE 11.0
4 Used 0 10000.0 West Bend, Wisconsin, United States 2012.0 17800.0 Blue Harley-Davidson NO WARRANTY Touring ... 0 FALSE 100 13 271 OWNER Clear True TRUE 0.0

5 rows × 22 columns

# 针对一个变量进行缺失值判断,使用其均值进行填补
df.Mileage.fillna(df.Mileage.mean()).head(5)
0    16000.0
1       60.0
2    25763.0
3    33142.0
4    17800.0
Name: Mileage, dtype: float64
df.columns
Index(['Condition', 'Condition_Desc', 'Price', 'Location', 'Model_Year',
       'Mileage', 'Exterior_Color', 'Make', 'Warranty', 'Model', 'Sub_Model',
       'Type', 'Vehicle_Title', 'OBO', 'Feedback_Perc', 'Watch_Count',
       'N_Reviews', 'Seller_Status', 'Vehicle_Tile', 'Auction', 'Buy_Now',
       'Bid_Count'],
      dtype='object')
df[df['Exterior_Color'].isnull()].head(5)
Condition Condition_Desc Price Location Model_Year Mileage Exterior_Color Make Warranty Model ... Vehicle_Title OBO Feedback_Perc Watch_Count N_Reviews Seller_Status Vehicle_Tile Auction Buy_Now Bid_Count
14 Used NaN 5500.0 Davenport, Iowa, United States 2008.0 22102.0 NaN Harley-Davidson Vehicle does NOT have an existing warranty Touring ... NaN FALSE 9.3 NaN 244 Private Seller Clear True FALSE 16.0
35 Used NaN 7700.0 Roselle, Illinois, United States 2007.0 10893.0 NaN Harley-Davidson NaN Other ... NaN FALSE 100 NaN 236 NaN Clear False TRUE NaN
41 Used NaN 6800.0 Hampshire, Illinois, United States 2003.0 55782.0 NaN Harley-Davidson Vehicle does NOT have an existing warranty Softail ... NaN TRUE 100 2< 298 Private Seller Clear False TRUE NaN
55 Used NaN 29500.0 Parma, Michigan, United States 1950.0 8471.0 NaN Harley-Davidson NaN Other ... NaN TRUE 100 24 216 NaN Clear False TRUE NaN
72 Used NaN 6500.0 Bourbonnais, Illinois, United States 1986.0 55300.0 NaN Harley-Davidson NaN Touring ... NaN TRUE 100 2< 1 Private Seller Clear False TRUE NaN

5 rows × 22 columns

# 求众数
df['Exterior_Color'].mode()[0]
'Black'
# 缺失用众数填补
df['Exterior_Color'].fillna(df['Exterior_Color'].mode()[0]).head(5)
0          Black
1          Black
2    Silver/Blue
3            Red
4           Blue
Name: Exterior_Color, dtype: object
df['Mileage'].median()
7083.0
# 对不同变量使用不同数据填补
# 不加inplace=True,不会对原数据生效
df.fillna(value={'Exterior_Color': df['Exterior_Color'].mode()[0], 
                'Mileage': df['Mileage'].median(),}).head(5)
Condition Condition_Desc Price Location Model_Year Mileage Exterior_Color Make Warranty Model ... Vehicle_Title OBO Feedback_Perc Watch_Count N_Reviews Seller_Status Vehicle_Tile Auction Buy_Now Bid_Count
0 Used mint!!! very low miles 11412.0 McHenry, Illinois, United States 2013.0 16000.0 Black Harley-Davidson Unspecified Touring ... NaN FALSE 8.1 NaN 2427 Private Seller Clear True FALSE 28.0
1 Used Perfect condition 17200.0 Fort Recovery, Ohio, United States 2016.0 60.0 Black Harley-Davidson Vehicle has an existing warranty Touring ... NaN FALSE 100 17 657 Private Seller Clear True TRUE 0.0
2 Used NaN 3872.0 Chicago, Illinois, United States 1970.0 25763.0 Silver/Blue BMW Vehicle does NOT have an existing warranty R-Series ... NaN FALSE 100 NaN 136 NaN Clear True FALSE 26.0
3 Used CLEAN TITLE READY TO RIDE HOME 6575.0 Green Bay, Wisconsin, United States 2009.0 33142.0 Red Harley-Davidson NaN Touring ... NaN FALSE 100 NaN 2920 Dealer Clear True FALSE 11.0
4 Used NaN 10000.0 West Bend, Wisconsin, United States 2012.0 17800.0 Blue Harley-Davidson NO WARRANTY Touring ... NaN FALSE 100 13 271 OWNER Clear True TRUE 0.0

5 rows × 22 columns

# 前向填补
df['Exterior_Color'].fillna(method='ffill').tail(10)
7483      Purple
7484      Purple
7485       Black
7486       Black
7487        Gray
7488       Black
7489       Black
7490         Red
7491    TWO TONE
7492        Gray
Name: Exterior_Color, dtype: object
# 后向填补
df['Exterior_Color'].fillna(method='bfill').tail(10)
7483      Purple
7484       Black
7485       Black
7486       Black
7487        Gray
7488       Black
7489       Black
7490         Red
7491    TWO TONE
7492        Gray
Name: Exterior_Color, dtype: object
发布了266 篇原创文章 · 获赞 335 · 访问量 4万+

猜你喜欢

转载自blog.csdn.net/qq_29339467/article/details/105611038