Python pandas 处理数据

Python pandas 处理数据

flyfish

import pandas as pd
import time
# Load the order table with `o_date` parsed as datetimes. The column is named
# explicitly so both reads agree and do not depend on column position.
# NOTE(review): the original selected the column by position (index 3);
# presumably that is `o_date`, since the code below needs it parsed -- confirm.
test0 = pd.read_csv("D:\\1.csv", parse_dates=['o_date'])

# Lazy reader that yields the same file in 5-row DataFrames. It is only
# consumed by the disabled chunk-export example at the end of this section.
test1 = pd.read_csv("D:\\1.csv", chunksize=5, parse_dates=['o_date'])

# Per user: number of non-null values in every other column.
print(test0.groupby('user_id').count())

# Add derived columns using the vectorized `.dt` accessor instead of a
# Python-level lambda per row.
order_date = test0['o_date']
test0['year'] = order_date.dt.year
test0['month'] = order_date.dt.month
# Date rendered as a zero-padded 'YYYYMMDD' string.
test0['timetoint'] = order_date.dt.strftime('%Y%m%d')


def func(x):
    """Return the month of *x*, shifted by 12 unless the year is 2016.

    Dates in 2016 map to 1..12; dates in any other year map to 13..24.
    NOTE(review): presumably the data only spans 2016-2017, so this acts as
    a running month counter -- confirm.
    """
    if x.year == 2016:
        return x.month
    return x.month + 12


test0['total_month'] = test0['o_date'].apply(func)

# Write the enriched table (the DataFrame index is written as the first column).
test0.to_csv("D:\\2.csv")

# Disabled example: write each 5-row chunk to its own file (10.csv, 11.csv, ...).
# i = 10
# for piece in test1:
#     outname = str(i) + '.csv'
#     piece.to_csv(outname)
#     i += 1


# Two small demo tables sharing the key columns `key1` and `key2`.
left = pd.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
print("left:\n", left)

# Row selection three ways: positional slice, query string, boolean mask.
print("left[2:4]\n", left.iloc[2:4])
print("left key1=='K0'\n", left.query("key1=='K0'"))
key1_is_k0 = left['key1'] == 'K0'
print("left left[(left.key1=='K0')\n", left.loc[key1_is_k0])

right = pd.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
print("right:\n", right)

# Inner join (the default): keep only key pairs present in both tables.
result0 = left.merge(right, on=['key1', 'key2'])
print("result0:\n", result0)

# Left join: keep every row of the left table; unmatched C/D become NaN.
result1 = left.merge(right, how='left', on=['key1', 'key2'])
print("result1:\n", result1)

# Right join: keep every row of the right table; unmatched A/B become NaN.
result2 = left.merge(right, how='right', on=['key1', 'key2'])
print("result2:\n", result2)

# Outer join: keep all rows from both tables.
result3 = left.merge(right, how='outer', on=['key1', 'key2'])
print("result3:\n", result3)
left:
     A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
left[2:4]
     A   B key1 key2
2  A2  B2   K1   K0
3  A3  B3   K2   K1
left key1=='K0'
     A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
left left[(left.key1=='K0')
     A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
right:
     C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
result0:
     A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
result1:
     A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
result2:
      A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3
result3:
      A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3

关于时间的处理

import pandas as pd

#读取行为表找出3月份的数据
# Load the user-action table with `a_date` parsed as datetimes.
action = pd.read_csv('./data/user_action.csv', parse_dates=['a_date'])

# Method 1: index the rows by date and select with partial date strings.
loaddata = action.set_index('a_date')

# April 2017. Partial-string row selection has to go through `.loc`:
# `frame['2017-4']` is a column lookup in pandas 2.x and raises KeyError.
AprilData = loaddata.loc['2017-4']
AprilData.to_csv('./data/S_Action_Data_April.csv', index=True)

# March and April 2017 (both end labels are included in a label slice).
# NOTE(review): label slicing on a DatetimeIndex presumably needs the index
# to be in date order -- confirm the input file is sorted by `a_date`.
MarchAndAprilData = loaddata.loc['2017-3':'2017-4']
MarchAndAprilData.to_csv('./data/S_Action_Data_March_And_April.csv', index=True)

# Method 2: March 2017 via explicit year/month columns.
# Add the year and month as columns (vectorized `.dt` accessor), then drop
# the original timestamp column.
action['a_month'] = action['a_date'].dt.month
action['a_year'] = action['a_date'].dt.year

# drop() removes rows by default; axis=1 makes it remove a column.
action.drop('a_date', axis=1, inplace=True)
result = action.query('a_month == 3 and a_year==2017')

result.to_csv('./data/S_Action_Data_March.csv', index=False)

猜你喜欢

转载自blog.csdn.net/flyfish1986/article/details/80283161