Python pandas 处理数据
flyfish
import pandas as pd
import time
# Load the same CSV twice: ``test0`` is the full table in memory, ``test1`` is
# a lazy reader that yields 5-row chunks.  Both parse ``o_date`` as datetimes
# (the column is selected by name in both calls so they cannot drift apart).
test0 = pd.read_csv("D:\\1.csv", parse_dates=['o_date'])
test1 = pd.read_csv("D:\\1.csv", chunksize=5, parse_dates=['o_date'])

# Number of non-null values in every column, per user.
print(test0.groupby('user_id').count())

# Derive the date parts with the vectorized ``.dt`` accessor instead of a
# Python-level ``apply`` call per row.
dates = test0['o_date']
test0['year'] = dates.dt.year  # add new columns
test0['month'] = dates.dt.month
# Date rendered as a zero-padded ``YYYYMMDD`` string; missing dates stay
# missing instead of turning into the text 'nannannan'.
test0['timetoint'] = dates.dt.strftime('%Y%m%d')
# Running month number: 2016 keeps month 1-12, every other year gets
# month + 12.
# NOTE(review): this is only a true running count if the data spans
# 2016-2017 exactly -- confirm before feeding other years.
test0['total_month'] = test0['month'].where(test0['year'] == 2016,
                                            test0['month'] + 12)
test0.to_csv("D:\\2.csv")
# Disabled example, kept as a bare string literal so it is never executed.
# It sketches how to iterate over the chunked reader ``test1`` and write each
# chunk to its own CSV file named 10.csv, 11.csv, ...  The loop body inside
# the string has no indentation, so it would have to be re-indented before
# it could be enabled.
'''
i=10
for piece in test1:
outname = str(i) + '.csv'
piece.to_csv(outname)
i += 1
'''
# Demonstration of row selection and of the four join types, using two small
# frames that share the composite key (key1, key2).
left = pd.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
print("left:\n", left)
# Positional slice: rows 2 and 3.
print("left[2:4]\n", left.iloc[2:4])
# Two equivalent ways of filtering on a column value.
print("left key1=='K0'\n", left.query("key1=='K0'"))
print("left left[(left.key1=='K0')\n", left.loc[left['key1'] == 'K0'])

right = pd.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
print("right:\n", right)

join_keys = ['key1', 'key2']

# Inner join: keep only the key pairs present in both tables.
result0 = left.merge(right, how='inner', on=join_keys)
print("result0:\n", result0)

# Left join: keep every row of the left table.
result1 = left.merge(right, how='left', on=join_keys)
print("result1:\n", result1)

# Right join: keep every row of the right table.
result2 = left.merge(right, how='right', on=join_keys)
print("result2:\n", result2)

# Outer join: keep every row of both tables.
result3 = left.merge(right, how='outer', on=join_keys)
print("result3:\n", result3)
left:
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
left[2:4]
A B key1 key2
2 A2 B2 K1 K0
3 A3 B3 K2 K1
left key1=='K0'
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
left left[(left.key1=='K0')
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
right:
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
result0:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
result1:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
result2:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
result3:
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
关于时间的处理
import pandas as pd
# Read the user-action table, parsing ``a_date`` as datetimes.
action = pd.read_csv('./data/user_action.csv', parse_dates=['a_date'])

# Index by date so rows can be selected with partial date strings.
loaddata = action.set_index('a_date')

# April 2017 only.  Row selection by a date string has to go through
# ``.loc``: plain ``loaddata['2017-4']`` is treated as a column lookup and
# raises KeyError on pandas >= 2.0.
AprilData = loaddata.loc['2017-04']
AprilData.to_csv('./data/S_Action_Data_April.csv', index=True)

# March through April 2017 (both months included).
# NOTE(review): label slicing on a DatetimeIndex expects the index to be
# sorted -- confirm the CSV is in date order.
MarchAndAprilData = loaddata.loc['2017-03':'2017-04']
MarchAndAprilData.to_csv('./data/S_Action_Data_March_And_April.csv', index=True)

# March 2017, an alternative approach: add explicit year and month columns
# (easy to filter on) and remove the original datetime column.
action['a_month'] = action['a_date'].dt.month
action['a_year'] = action['a_date'].dt.year
# ``drop`` removes rows by default; ``columns=`` targets a column instead.
action = action.drop(columns='a_date')
result = action.query('a_month == 3 and a_year==2017')
result.to_csv('./data/S_Action_Data_March.csv', index=False)