# Embedding of the original (raw) field features.
# Author's Zhihu article: https://zhuanlan.zhihu.com/p/42089584 (follows welcome)
#encoding=utf-8
import gc
import os
import sys
import time

import numpy as np
import pandas as pd
# Multi-valued user-feature columns. Each cell holds integer ids separated by
# single spaces (they are split and converted with int() in tran()).
sparse_col_name = [
    'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
    'interest4', 'interest5', 'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
]
# Disabled alternative length table, kept for reference:
# sparse_col_max_length = [537, 920, 47, 32, 10, 10, 86, 5, 5, 5, 5, 5, 5]
# Per-column target length: a positive value is used as-is, a negative value
# means "compute the longest list found in the data".
sparse_col_true_max_length = [33, 33, 33, -1, 10, 10, -1, -1, -1, -1, -1, -1, -1]
# Hard upper bound applied to whichever length was selected above.
max_length_limit = 33

# The only command-line argument is the 1-based position of the column to
# process. Validate it up front: 0 or a negative number would otherwise be
# accepted silently through negative list indexing and pick the wrong column,
# and a too-large number would only fail after the input files were loaded.
if len(sys.argv) < 2:
    raise SystemExit(
        "usage: python %s <index>  (index in 1..%d)"
        % (sys.argv[0], len(sparse_col_name))
    )
index = int(sys.argv[1])
if not 1 <= index <= len(sparse_col_name):
    raise SystemExit(
        "index must be between 1 and %d, got %d" % (len(sparse_col_name), index)
    )

# Start of the wall-clock timer used by the progress messages.
t1 = time.time()
def tran(x, _max_length):
    """Turn a whitespace-separated id string into a fixed-length int array.

    Args:
        x: String of integers separated by whitespace, e.g. ``"12 7 93"``.
            Repeated, leading or trailing whitespace is tolerated.
        _max_length: Required length of the result (non-negative integer).

    Returns:
        A ``numpy.ndarray`` with exactly ``_max_length`` entries: the parsed
        integers in order, right-padded with ``0`` when there are too few and
        cut off at the end when there are too many.

    Raises:
        ValueError: If a token is not an integer literal or ``_max_length``
            is negative.
    """
    # Accept numpy integer lengths (e.g. the result of Series.max()).
    target = int(_max_length)
    if target < 0:
        raise ValueError("_max_length must be non-negative, got %r" % (_max_length,))
    # split() without a separator skips empty tokens, so stray spaces do not
    # reach int() as ''.
    values = [int(token) for token in x.split()][:target]
    values.extend([0] * (target - len(values)))
    return np.array(values)
#--------------------------------------------------------------------------------
# Load the merged user-feature table, then keep only the user id and the one
# feature column selected on the command line so the full table can be freed.
_feature_col = sparse_col_name[index - 1]
aa = pd.read_csv('userFeature.csv')
print('aa shape', aa.shape)
print('index:', index, "col name:", _feature_col, "time", time.time() - t1)
user_feature = aa[["uid", _feature_col]]
del aa
gc.collect()
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
# Training rows.
train = pd.read_csv("train.csv")

# Rows to predict: both test files stacked, test1 first.
predict = pd.concat([pd.read_csv(path) for path in ('test1.csv', 'test2.csv')])
gc.collect()

print("read prepared!")

# Recode training labels from -1 to 0; prediction rows are tagged with -1.
_negative_rows = train['label'] == -1
train.loc[_negative_rows, 'label'] = 0
predict['label'] = -1

# Stack train and prediction rows, then attach the selected user feature by uid.
# (A merge with ad features on 'aid' is disabled in this script.)
data = pd.concat([train, predict])
data = data.merge(user_feature, on='uid', how='left')
print("merge over!")

# Rows without a feature value get the placeholder id list '0 0'.
_feature_col = sparse_col_name[index - 1]
data[_feature_col] = data[_feature_col].fillna('0 0')
#--------------------------------------------------------------------------------
#_max_length=sparse_col_max_length[index-1]
# Choose the list length for the selected column: a positive configured value
# is taken directly, a negative one triggers a scan for the longest list.
_feature_col = sparse_col_name[index - 1]
_configured_length = sparse_col_true_max_length[index - 1]
if _configured_length > 0:
    _max_length = _configured_length
elif _configured_length < 0:
    print('begin to compute the max length', time.time() - t1)
    _max_length = data[_feature_col].apply(lambda ids: len(ids.split(" "))).max()
    gc.collect()

print('maxlength:', _max_length, "time", time.time() - t1)
# Never go beyond the global cap.
if max_length_limit < _max_length:
    _max_length = max_length_limit
    print('the maxlength exceeds the maximum limit', _max_length, "time", time.time() - t1)

# Replace every id string by its padded/truncated integer array.
data[_feature_col] = data[_feature_col].apply(lambda ids: tran(ids, _max_length))

# Report the largest id that occurs anywhere in the column.
_max_value = data[_feature_col].apply(lambda arr: arr.max()).max()
print("max_value:", _max_value)
gc.collect()
#--------------------------------------------------------------------------------
# Spread the fixed-length id arrays over scalar columns col1..colN, drop the
# array column, and write the result to ./SDD_data.
print('ori data shape:', data.shape, "time", time.time() - t1)
_feature_col = sparse_col_name[index - 1]
for i in range(_max_length):
    print('i', i, time.time() - t1)
    # Bind the position as a default argument so the lambda does not depend
    # on the loop variable's later values.
    data["col" + str(i + 1)] = data[_feature_col].apply(lambda arr, pos=i: arr[pos])
print('new data shape1:', data.shape, "time", time.time() - t1)

data.drop(_feature_col, axis=1, inplace=True)
print('new data shape2:', data.shape, "time", time.time() - t1)

_output_path = "./SDD_data/final_sdd_embedding_feature_mix_chusai_%s.csv" % _feature_col
# Create the output directory if needed; otherwise to_csv fails at the very
# end, after all the processing above has already been done.
os.makedirs(os.path.dirname(_output_path), exist_ok=True)
data.to_csv(_output_path, header=True, index=False)