pandas缺失值填充时遇到的问题

打比赛时遇到了一个问题:填充缺失值的时候,如果使用固定值、均值之类的方式,都没有问题。
但是我想用前一个有效值向后填充(method='pad'):

sh_car1.fillna(method='pad',axis=0,inplace=True)

但是每次执行都会报错:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-191-1252788aaf79> in <module>
----> 1 sh_car1.fillna(method='pad',axis=0,inplace=True)

C:\Anaconda3\lib\site-packages\pandas\core\frame.py in fillna(self, value, method, axis, inplace, limit, downcast, **kwargs)
   4242             limit=limit,
   4243             downcast=downcast,
-> 4244             **kwargs
   4245         )
   4246 

C:\Anaconda3\lib\site-packages\pandas\core\generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
   6235                 inplace=inplace,
   6236                 coerce=True,
-> 6237                 downcast=downcast,
   6238             )
   6239         else:

C:\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in interpolate(self, **kwargs)
    567 
    568     def interpolate(self, **kwargs):
--> 569         return self.apply("interpolate", **kwargs)
    570 
    571     def shift(self, **kwargs):

C:\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
    436                     kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
    437 
--> 438             applied = getattr(b, f)(**kwargs)
    439             result_blocks = _extend_blocks(applied, result_blocks)
    440 

C:\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in interpolate(self, method, axis, index, values, inplace, limit, limit_direction, limit_area, fill_value, coerce, downcast, **kwargs)
   1172                 fill_value=fill_value,
   1173                 coerce=coerce,
-> 1174                 downcast=downcast,
   1175             )
   1176         # validate the interp method

C:\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _interpolate_with_fill(self, method, axis, inplace, limit, fill_value, coerce, downcast)
   1226             limit=limit,
   1227             fill_value=fill_value,
-> 1228             dtype=self.dtype,
   1229         )
   1230         values = self._try_coerce_result(values)

C:\Anaconda3\lib\site-packages\pandas\core\missing.py in interpolate_2d(values, method, axis, limit, fill_value, dtype)
    481     method = clean_fill_method(method)
    482     if method == "pad":
--> 483         values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
    484     else:
    485         values = transf(

C:\Anaconda3\lib\site-packages\pandas\core\missing.py in pad_2d(values, limit, mask, dtype)
    546 
    547     if np.all(values.shape):
--> 548         algos.pad_2d_inplace(values, mask, limit=limit)
    549     else:
    550         # for test coverage

pandas\_libs\algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()

TypeError: No matching signature found

经过千辛万苦终于找到了问题的根源。

原来,我在加载数据的时候使用了一个压缩内存的函数:

# Reduce memory usage.
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Each column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is inspected; its minimum and maximum decide the narrowest
    integer (or floating point) dtype whose representable range contains
    them. Columns are only ever narrowed, never widened, and columns of any
    other dtype are left untouched.

    Args:
        df: The DataFrame to shrink. It is modified in place.
        verbose: When true, print the name of every float column that is
            examined and a final memory usage summary. When false, nothing
            is printed.
        use_float16: When true (the default), float columns whose range fits
            are narrowed all the way down to ``np.float16``. Only the value
            range is checked, so precision can be lost. float16 columns can
            also make ``fillna(method='pad')`` fail with
            ``TypeError: No matching signature found``; pass ``False`` to
            stop at ``np.float32``.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)
    if not use_float16:
        float_candidates = float_candidates[1:]

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits = int_candidates, np.iinfo
        else:
            if verbose:
                print('column name :', col)
            candidates, limits = float_candidates, np.finfo
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive comparison: a value equal to the dtype's own min/max
            # is still representable in that dtype.
            if c_min >= bounds.min and c_max <= bounds.max:
                # Cast only when it actually shrinks the column, so a column
                # is never widened (which would increase memory usage).
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break
        # If nothing fits (e.g. an all-NaN column, whose min/max are NaN, or
        # a column containing inf) the column keeps its current dtype.
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a zero starting size to avoid dividing by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

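这个函数会把部分浮点列转换成 np.float16 类型。
pandas 虽然可以保存 float16 类型的列,但它底层用来填充的 Cython 算法(报错信息里的 algos.pad_2d_inplace)并没有为 float16 提供对应的实现,
所以才会出现 "No matching signature found" 这个错误。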

而 np.float32 类型的列填充时没有任何问题。

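所以只要列是 float16 类型,填充的时候就会报错。解决办法是在填充之前先把这些列转换成 float32,例如 df[col] = df[col].astype(np.float32),或者在压缩内存时不要使用 float16。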

猜你喜欢

转载自www.cnblogs.com/duoba/p/12521274.html