Oppo 持续更新中

数据集来自天池竞赛
以下是目前的数据处理代码,后续会每天更新完善。

import numpy as np
import pandas as pd
# Load the train / test / validation files (tab-separated, no header row).
train_file = 'H:/TianChiOppoRound1/oppo_round1_train_20180929.txt'
test_file = 'H:/TianChiOppoRound1/oppo_round1_test_A_20180929.txt'
vali_file = 'H:/TianChiOppoRound1/oppo_round1_vali_20180929.txt'

# Train and validation files are read with a trailing 'label' column;
# the test file is read with the same columns minus 'label'.
_labelled_columns = ['prefix', 'query_prediction', 'title', 'tag', 'label']
_unlabelled_columns = _labelled_columns[:-1]

train_df = pd.read_csv(
    train_file,
    sep='\t',
    header=None,
    names=_labelled_columns,
    low_memory=False,
)
test_df = pd.read_csv(
    test_file,
    sep='\t',
    header=None,
    names=_unlabelled_columns,
    low_memory=False,
)
vali_df = pd.read_csv(
    vali_file,
    sep='\t',
    header=None,
    names=_labelled_columns,
    low_memory=False,
)

# Expand the raw query_prediction text into a list of entries.
def split_query_prediction(text):
    """Split a raw ``query_prediction`` string into its comma-separated entries.

    Every ``{`` and ``}`` character is removed, the remainder is split on
    commas, and each piece is stripped of surrounding whitespace. Blank
    pieces are discarded so an empty payload produces an empty list.

    Args:
        text: Raw field value. Values that ``pd.isna`` reports as missing
            (``None``, ``NaN``) are accepted.

    Returns:
        A list of non-empty entry strings; ``[]`` for missing input or for
        input with no entries, such as ``'{}'`` or ``''``.
    """
    if pd.isna(text):
        return []
    unbraced = text.replace('{', '').replace('}', '')
    # ''.split(',') yields [''], so without this filter an empty "{}" would be
    # counted as one prediction; a trailing comma would add a phantom entry too.
    # NOTE(review): a comma inside a key or value would still split that entry
    # in two -- confirm the data never contains one.
    pieces = (piece.strip() for piece in unbraced.split(','))
    return [piece for piece in pieces if piece]

# Derive the per-row list of prediction entries and how many entries each row has.
_train_pred_lists = train_df['query_prediction'].apply(split_query_prediction)
train_df['pred_list'] = _train_pred_lists
train_df['pred_len'] = _train_pred_lists.apply(len)

# Remove the raw text column from the frame, keeping it as a standalone Series.
train_query_prediction = train_df.pop('query_prediction')

# Optional export / debugging helpers, left disabled:
# train_df.to_csv('H:/TianChiOppoRound1/train.csv')
# print(np.shape(train_df))
# print(train_df.columns.values)

持续更新中……

猜你喜欢

转载自 https://blog.csdn.net/weixin_40924580/article/details/83021110