数据清洗之 数据增加和删除

数据增加和删除

  • 在数据中,直接添加列
  • 使用df.insert方法在数据中添加一列
  • drop(labels, axis, inplace)方法(删除)
    • labels表示要删除的行标签或列标签,axis表示作用轴,inplace表示是否直接修改原数据(默认为False,即返回删除后的新对象,原数据不变;inplace=True时才对原数据生效)
    • axis=0表示按行删除,axis=1表示按列删除
  • 使用del语句直接删除其中一列
import pandas as pd
import os
import numpy as np
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
df['购买量'] = np.where(df['buy_mount']>3, '高', '低')
df.head(5)
user_id auction_id cat_id cat1 property buy_mount day 购买量
0 786295544 41098319944 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919 低
1 532110457 17916191097 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011 低
2 249013725 21896936223 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011 低
3 917056007 12515996043 50018831 50014815 21458:15841995;21956:3494076;27000458:59723383... 2 20141023 低
4 444069173 20487688075 50013636 50008168 21458:30992;13658074:3323064;1628665:3233941;1... 1 20141103 低
# 将第二列(auction_id)移到第一列:先把该列保存到变量中并用del删除,再用insert插入到首位
auction_id = df['auction_id']
del df['auction_id']
df.head(5)
user_id cat_id cat1 property buy_mount day 购买量
0 786295544 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919 低
1 532110457 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011 低
2 249013725 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011 低
3 917056007 50018831 50014815 21458:15841995;21956:3494076;27000458:59723383... 2 20141023 低
4 444069173 50013636 50008168 21458:30992;13658074:3323064;1628665:3233941;1... 1 20141103 低
# 第一个参数:插入位置
# 第二个参数:标签名称
# 第三个参数:数据
df.insert(0, 'auction_id_new', auction_id)
df.head(5)
auction_id_new user_id cat_id cat1 property buy_mount day 购买量
0 41098319944 786295544 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919 低
1 17916191097 532110457 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011 低
2 21896936223 249013725 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011 低
3 12515996043 917056007 50018831 50014815 21458:15841995;21956:3494076;27000458:59723383... 2 20141023 低
4 20487688075 444069173 50013636 50008168 21458:30992;13658074:3323064;1628665:3233941;1... 1 20141103 低
# 删除两列数据
df.drop(labels=['auction_id_new', '购买量'], axis=1).head(5)
user_id cat_id cat1 property buy_mount day
0 786295544 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919
1 532110457 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011
2 249013725 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011
3 917056007 50018831 50014815 21458:15841995;21956:3494076;27000458:59723383... 2 20141023
4 444069173 50013636 50008168 21458:30992;13658074:3323064;1628665:3233941;1... 1 20141103
# 再次查看df,发现刚才删除的两列仍然存在
# 因为drop默认inplace=False,只返回删除后的新对象,并没有对原数据生效
df.head(5)
auction_id_new user_id cat_id cat1 property buy_mount day 购买量
0 41098319944 786295544 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919 低
1 17916191097 532110457 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011 低
2 21896936223 249013725 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011 低
3 12515996043 917056007 50018831 50014815 21458:15841995;21956:3494076;27000458:59723383... 2 20141023 低
4 20487688075 444069173 50013636 50008168 21458:30992;13658074:3323064;1628665:3233941;1... 1 20141103 低
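# 设置inplace=True后,删除操作直接作用于原数据
# 注意:下面的KeyError应是该单元格被重复执行所致——第一次执行已成功删除这两列,
# 再次执行时这两列已不存在,因此报"not found in axis";如需避免报错,可传入errors='ignore'
df.drop(labels=['auction_id_new', '购买量'], axis=1, inplace=True)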
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-30-acf2a75acaf3> in <module>
----> 1 df.drop(labels=['auction_id_new', '购买量'], axis=1, inplace=True)


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   3938                                            index=index, columns=columns,
   3939                                            level=level, inplace=inplace,
-> 3940                                            errors=errors)
   3941 
   3942     @rewrite_axis_style_signature('mapper', [('copy', True),


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
   3778         for axis, labels in axes.items():
   3779             if labels is not None:
-> 3780                 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   3781 
   3782         if inplace:


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in _drop_axis(self, labels, axis, level, errors)
   3810                 new_axis = axis.drop(labels, level=level, errors=errors)
   3811             else:
-> 3812                 new_axis = axis.drop(labels, errors=errors)
   3813             result = self.reindex(**{axis_name: new_axis})
   3814 


D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in drop(self, labels, errors)
   4963             if errors != 'ignore':
   4964                 raise KeyError(
-> 4965                     '{} not found in axis'.format(labels[mask]))
   4966             indexer = indexer[~mask]
   4967         return self.delete(indexer)


KeyError: "['auction_id_new' '购买量'] not found in axis"
df.head(5)
user_id cat_id cat1 property buy_mount day
0 786295544 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919
1 532110457 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011
2 249013725 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011
3 917056007 50018831 50014815 21458:15841995;21956:3494076;27000458:59723383... 2 20141023
4 444069173 50013636 50008168 21458:30992;13658074:3323064;1628665:3233941;1... 1 20141103
# 删除行标签(索引)为3和4的两行数据
df.drop(labels=[3,4], axis=0, inplace=True)
df.head(5)
user_id cat_id cat1 property buy_mount day
0 786295544 50014866 50022520 21458:86755362;13023209:3593274;10984217:21985... 2 20140919
1 532110457 50011993 28 21458:11399317;1628862:3251296;21475:137325;16... 1 20131011
2 249013725 50012461 50014815 21458:30992;1628665:92012;1628665:3233938;1628... 1 20131011
5 152298847 121394024 50008168 21458:3408353;13023209:727117752;22009:2741771... 1 20141103
6 513441334 50010557 50008168 25935:21991;1628665:29784;22019:34731;22019:20... 1 20121212
# 继续删除行标签为0、1、2的数据(range(0,3)不包含3)
df.drop(labels=range(0,3), axis=0, inplace=True)
df.head(5)
user_id cat_id cat1 property buy_mount day
5 152298847 121394024 50008168 21458:3408353;13023209:727117752;22009:2741771... 1 20141103
6 513441334 50010557 50008168 25935:21991;1628665:29784;22019:34731;22019:20... 1 20121212
7 297411659 50010542 50008168 21458:60020529;25935:31381;1633959:27247291;16... 1 20121212
8 82830661 50013874 28 21458:11580;21475:137325 1 20121101
9 475046636 203527 28 22724:40168;22729:40278;21458:21817;2770200:24... 1 20121101
发布了248 篇原创文章 · 获赞 293 · 访问量 4万+

猜你喜欢

转载自blog.csdn.net/qq_29339467/article/details/105491640
今日推荐