数据修改和查找
- 在数据中,可以使用rename修改列名称或者行索引名称
- 使用loc方法修改数据
- 使用loc方法查找符合条件的数据
- 条件与条件之间用&或者|连接,分别代表 ‘且’ 和 ‘或’(每个条件需要用括号括起来)
- 使用between和isin选择满足条件的行
import pandas as pd
import numpy as np
import os
# Report the current working directory (the notebook echoes the returned path).
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据表处理'
# Make the data folder the working directory so the CSV files can be opened
# by bare file name.
# NOTE(review): hard-coded absolute Windows path — this only runs as-is on a
# machine with the same folder layout.
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
# Load the table with every column parsed as str (dtype=str).
df1 = pd.read_csv('sam_tianchi_mum_baby.csv', encoding='utf-8', dtype=str)
# Preview the first five rows.
df1.head(5)
|
user_id |
birthday |
gender |
0 |
2757 |
20130311 |
1 |
1 |
415971 |
20121111 |
0 |
2 |
1372572 |
20120130 |
1 |
3 |
10339332 |
20110910 |
0 |
4 |
10642245 |
20130213 |
0 |
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 3 columns):
user_id 953 non-null object
birthday 953 non-null object
gender 953 non-null object
dtypes: object(3)
memory usage: 22.4+ KB
# Recode the gender column from its string codes to readable labels.
# The codes are compared as strings because the column holds str values.
gender_labels = {'0': '女性', '1': '男性', '2': '未知'}
for code, label in gender_labels.items():
    # Boolean row mask + column label: only the matching cells are overwritten.
    df1.loc[df1['gender'] == code, 'gender'] = label
# Preview the first five rows to check the recoding.
df1.head(5)
|
user_id |
birthday |
gender |
0 |
2757 |
20130311 |
男性 |
1 |
415971 |
20121111 |
女性 |
2 |
1372572 |
20120130 |
男性 |
3 |
10339332 |
20110910 |
女性 |
4 |
10642245 |
20130213 |
女性 |
# Give the columns new header names. inplace=True modifies df1 directly
# instead of returning a renamed copy.
column_labels = {'user_id': '用户ID', 'birthday': '出生日期', 'gender': '性别'}
df1.rename(columns=column_labels, inplace=True)
# Preview the first five rows under the new headers.
df1.head(5)
|
用户ID |
出生日期 |
性别 |
0 |
2757 |
20130311 |
男性 |
1 |
415971 |
20121111 |
女性 |
2 |
1372572 |
20120130 |
男性 |
3 |
10339332 |
20110910 |
女性 |
4 |
10642245 |
20130213 |
女性 |
# Relabel two row-index entries (3 -> 333, 4 -> 444); labels not listed in the
# mapping are left as they are.
index_labels = {3: 333, 4: 444}
df1.rename(index=index_labels, inplace=True)
# Preview the first five rows to see the new index labels.
df1.head(5)
|
用户ID |
出生日期 |
性别 |
0 |
2757 |
20130311 |
男性 |
1 |
415971 |
20121111 |
女性 |
2 |
1372572 |
20120130 |
男性 |
333 |
10339332 |
20110910 |
女性 |
444 |
10642245 |
20130213 |
女性 |
# iloc slices by position, so the first five rows are returned regardless of
# their index labels (two of which are now 333 and 444).
df1.iloc[:5]
|
用户ID |
出生日期 |
性别 |
0 |
2757 |
20130311 |
男性 |
1 |
415971 |
20121111 |
女性 |
2 |
1372572 |
20120130 |
男性 |
333 |
10339332 |
20110910 |
女性 |
444 |
10642245 |
20130213 |
女性 |
# Rebuild the default 0..n-1 index; drop=True discards the old labels instead
# of keeping them as a new column.
df1.reset_index(drop=True, inplace=True)
# Preview the first five rows to confirm the index is sequential again.
df1.head(5)
|
用户ID |
出生日期 |
性别 |
0 |
2757 |
20130311 |
男性 |
1 |
415971 |
20121111 |
女性 |
2 |
1372572 |
20120130 |
男性 |
3 |
10339332 |
20110910 |
女性 |
4 |
10642245 |
20130213 |
女性 |
# Load the trade history; only user_id is forced to str, the remaining columns
# keep whatever dtype pandas infers.
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
# Preview the first two rows.
df.head(2)
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
0 |
786295544 |
41098319944 |
50014866 |
50022520 |
21458:86755362;13023209:3593274;10984217:21985... |
2 |
20140919 |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
# Boolean indexing: keep rows whose buy_mount exceeds 10, then show the first
# five matches.
over_ten = df['buy_mount'] > 10
df[over_ten][:5]
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
49 |
103125167 |
18426669796 |
50018438 |
50014815 |
21458:46896;1628665:3233941;1628665:3233942;21... |
16 |
20140220 |
65 |
605724983 |
19747694834 |
50006520 |
50014815 |
21458:30992 |
12 |
20141017 |
89 |
277279277 |
18024521052 |
211122 |
38 |
21458:33516;33480:3238774;2653417:7353464;3359... |
12 |
20130513 |
247 |
392530596 |
17001611735 |
50011993 |
28 |
122218042:50276;21475:135183931;1628861:45151;... |
101 |
20140301 |
409 |
1968453717 |
12567034563 |
122616024 |
28 |
135925585:42825;138052423:142000990;18822961:2... |
14 |
20150127 |
# ~ inverts the boolean mask: keep rows whose buy_mount is NOT greater than 10,
# then show the first five matches.
over_ten = df['buy_mount'] > 10
df[~over_ten][:5]
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
0 |
786295544 |
41098319944 |
50014866 |
50022520 |
21458:86755362;13023209:3593274;10984217:21985... |
2 |
20140919 |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
2 |
249013725 |
21896936223 |
50012461 |
50014815 |
21458:30992;1628665:92012;1628665:3233938;1628... |
1 |
20131011 |
3 |
917056007 |
12515996043 |
50018831 |
50014815 |
21458:15841995;21956:3494076;27000458:59723383... |
2 |
20141023 |
4 |
444069173 |
20487688075 |
50013636 |
50008168 |
21458:30992;13658074:3323064;1628665:3233941;1... |
1 |
20141103 |
# Combine two conditions with & (logical AND); a row must satisfy both.
# NOTE(review): day looks like a YYYYMMDD integer, so a plain numeric
# comparison orders dates correctly — confirm against the data source.
over_ten = df['buy_mount'] > 10
after_20140101 = df['day'] > 20140101
df[over_ten & after_20140101][:5]
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
49 |
103125167 |
18426669796 |
50018438 |
50014815 |
21458:46896;1628665:3233941;1628665:3233942;21... |
16 |
20140220 |
65 |
605724983 |
19747694834 |
50006520 |
50014815 |
21458:30992 |
12 |
20141017 |
247 |
392530596 |
17001611735 |
50011993 |
28 |
122218042:50276;21475:135183931;1628861:45151;... |
101 |
20140301 |
409 |
1968453717 |
12567034563 |
122616024 |
28 |
135925585:42825;138052423:142000990;18822961:2... |
14 |
20150127 |
462 |
1802549062 |
17383345857 |
50006520 |
50014815 |
22277:6262384;21458:30992;1628665:61550;162866... |
11 |
20141120 |
# Keep rows with 4 <= buy_mount <= 10 (both endpoints included), then show the
# first five matches.
# inclusive is given as the string 'both' rather than a bool: the boolean form
# was deprecated in pandas 1.3 and is rejected by pandas 2.x.
df[df['buy_mount'].between(4, 10, inclusive='both')][:5]
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
22 |
469517728 |
8232924597 |
211122 |
38 |
21458:21782;36786:42781029;13023102:6999219;22... |
6 |
20140502 |
52 |
55544814 |
4917672059 |
50015727 |
50014815 |
21458:4540492;1633959:58840623;7107736:3227806... |
4 |
20131106 |
117 |
296448405 |
18524578446 |
50016030 |
50008168 |
21458:247918101;1628665:29782;1628665:29784;16... |
7 |
20131202 |
134 |
97481514 |
41161316434 |
211122 |
38 |
6940834:29865;21458:4331527;1804977:606613769;... |
6 |
20141126 |
148 |
662134541 |
7594318922 |
211122 |
38 |
21458:21776;36780:15333590;2675455:19653564;69... |
6 |
20131211 |
# List the dtype of every column.
df.dtypes
user_id object
auction_id int64
cat_id int64
cat1 int64
property object
buy_mount int64
day int64
dtype: object
# isin keeps rows whose cat1 equals any value in the list; show the first
# five matches.
wanted_cat1 = [38, 28]
in_wanted_cat1 = df['cat1'].isin(wanted_cat1)
df[in_wanted_cat1][:5]
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
8 |
82830661 |
19948600790 |
50013874 |
28 |
21458:11580;21475:137325 |
1 |
20121101 |
9 |
475046636 |
10368360710 |
203527 |
28 |
22724:40168;22729:40278;21458:21817;2770200:24... |
1 |
20121101 |
10 |
734147966 |
15307958346 |
50018202 |
38 |
21458:3270827;7361532:28710594;7397093:7536994... |
2 |
20121101 |
13 |
377550424 |
15771663914 |
50015841 |
28 |
1628665:3233941;1628665:3233942;3914866:11580;... |
1 |
20121123 |