1、读取数据:txt 文件有时没有列名,需要用 names 参数指定;读取时用 parse_dates 把 dataframe 中的时间列转化为时间类型
# Directory that holds the raw data files.
# NOTE(review): `dir` shadows the Python builtin of the same name; kept
# unchanged here because other code may reference it.
dir = '../data/'

# Column names are supplied through `names=` rather than read from the file.
app_launch = pd.read_table(
    dir + 'app_launch_log.txt',
    engine='python',
    names=['user_id', 'day'],
)

# `parse_dates` converts the comment_create_tm column to datetime on load.
comment_score = pd.read_csv(
    '../data/jdata_user_comment_score.csv',
    parse_dates=['comment_create_tm'],
)
2、查看dataframe信息
df.info()、df.describe()、df['col'].unique()
3、去除重复元素,比如user_id
### data1: train on days 3->23, test on days 24->30
### Build the offline test set: collect the users who created a video or
### performed an action after day 23, then de-duplicate them by user_id.
test_vedcre = video_create.loc[video_create.create_day > 23, ['user_id']]
test_action = user_activity.loc[user_activity.action_day > 23, ['user_id']]
off_test = (
    pd.concat([test_vedcre, test_action], axis=0, ignore_index=True)
    .drop_duplicates(['user_id'])
    .reset_index(drop=True)
)
off_test
4、评价函数:对用户集合、商品集合计算F1时使用(需要先去重,并把预测结果转成dataframe才行)
### Offline evaluation function
def evaluate(prediction, result):
    """Print and return precision, recall and F1 of a prediction set.

    Both arguments are DataFrames; rows are matched with ``pd.merge`` on
    the columns the two frames have in common. Callers should de-duplicate
    both frames first, otherwise the merge can count a match more than once.

    Args:
        prediction: DataFrame of predicted rows.
        result: DataFrame of ground-truth rows.

    Returns:
        Tuple ``(precision, recall, F1)``. Any ratio whose denominator is
        zero (empty input, or no overlap for F1) is reported as ``0.0``.
    """
    n_pred = len(prediction)
    n_true = len(result)
    print('Prediction set size: %d' % n_pred)
    print('Result set size: %d' % n_true)

    # Compare against the `result` argument (the original referenced an
    # unrelated global named `test`). Skip the merge when either side is
    # empty: there can be no matches.
    if n_pred == 0 or n_true == 0:
        n_hit = 0
    else:
        n_hit = len(pd.merge(prediction, result))

    precision = n_hit / n_pred if n_pred else 0.0
    recall = n_hit / n_true if n_true else 0.0
    denom = precision + recall
    F1 = 2 * precision * recall / denom if denom else 0.0

    print('P : %2f' % precision)
    print('R : %2f' % recall)
    print('F1: %2f' % F1)
    return precision, recall, F1
5、对dataframe单独一列的值操作
def day_to_weekday(day):
    """Return ``day`` modulo 7.

    Works on anything that supports ``%`` (a plain int or a pandas Series).
    Multiples of 7 map to 0.
    """
    return day % 7
# Derive the weekday column from the day column (vectorised over the Series).
df['weekday'] = day_to_weekday(df['day'])
# Shift a weekday of 0 up to 7; every other value is left as-is.
df['weekday'] = df['weekday'].apply(lambda w: w if w != 0 else w + 7)
df
6、对dataframe某一列进行排序
df.sort_values('col')
7、对每个user求对应的day的最大
# Sample launch log: one row per (user_id, launch_day) event.
d = {
    'user_id': [1,1,1,2,1,1,1,2,2,2,5,6,7,8,8,9,9,9,10,10,10,1,1,1],
    'launch_day': [3,3,4,5,13,21,10,12,12,13,13,7,8,8,4,5,6,7,5,4,9,6,7,8],
}
df = pd.DataFrame(data=d)
# Drop repeated (user, day) pairs, then order the rows by user.
df = df.drop_duplicates(['user_id', 'launch_day']).sort_values('user_id').reset_index(drop=True)
df
# Latest launch day per user. The built-in max aggregation replaces the
# hand-rolled apply(lambda x: max(list(x))) and yields the same two columns.
t = df.groupby('user_id')['launch_day'].max().reset_index()
t
8、重命名列名
# Map each old column name to its new name; rename() returns a new frame.
df = df.rename(
    columns={
        'old_col_1': 'new_col_1',
    },
)
9、距离上次操作天数、连续第几天操作
# Sample launch log for two users.
d = {
    'user_id': [2,2,2,1,1,1,1,1,1,1],
    'launch_day': [5,12,13,3,3,4,5,5,6,9],
}
# Order rows by user and then by day so that diff() compares consecutive
# launches of the same user.
df = (
    pd.DataFrame(data=d)
    .sort_values(['user_id', 'launch_day'])
    .reset_index(drop=True)
)
# Days since the same user's previous launch; each user's first row has no
# predecessor and is filled with 0.
df['launch_day_gap'] = (
    df.groupby('user_id')['launch_day']
    .diff()
    .fillna(0)
)
# Start the streak column as a copy of the gap column.
df['continues_launch_day'] = df['launch_day_gap']
def continues_launch_day(day_gap):
    """Turn a Series of day gaps into a running consecutive-day counter.

    Walking the values in order:
      * a gap of 0 keeps the current counter,
      * a gap of 1 increments the counter,
      * any other gap resets the counter to 0.

    Args:
        day_gap: pandas Series of gaps between successive launch days.

    Returns:
        A new Series with the same index, dtype and name as ``day_gap``
        holding the counter value for each row. The input is left
        untouched (the original version overwrote it in place).
    """
    streak = 0
    streaks = []
    # Iterate over values rather than index labels so that duplicate or
    # non-integer index labels cannot break the lookup.
    for gap in day_gap:
        if gap == 1:
            streak += 1
        elif gap != 0:
            streak = 0
        streaks.append(streak)
    return pd.Series(
        streaks,
        index=day_gap.index,
        dtype=day_gap.dtype,
        name=day_gap.name,
    )
# Compute the consecutive-day counter separately for each user.
# group_keys=False keeps the result on df's original row index, so it can be
# assigned straight back as a column; without it, recent pandas versions
# prepend user_id to the index of the apply() result.
df['continues_launch_day'] = (
    df.groupby('user_id', group_keys=False)['launch_day_gap']
    .apply(continues_launch_day)
)
df