# My baseline

import pandas as pd
def cut_down(filepath):
    """Load a CSV file and return its numeric columns downcast to smaller dtypes.

    Only ``int64`` and ``float64`` columns are kept; columns of any other
    dtype (for example strings) are dropped. Missing values in the float
    columns are replaced with 0 before downcasting. A summary of the data
    before and after the conversion is printed to stdout.

    Args:
        filepath: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A new ``pandas.DataFrame`` containing the downcast integer columns
        followed by the downcast float columns.
    """
    train_data = pd.read_csv(filepath)
    train_data.info()
    print("=" * 10)

    # fillna() returns a new object, so its result must be kept. Only the
    # float columns are filled: int64 columns cannot hold NaN, and columns of
    # other dtypes are dropped anyway.
    gl_float = train_data.select_dtypes(include=['float64']).fillna(0)
    converted_float = gl_float.apply(pd.to_numeric, downcast='float')

    gl_int = train_data.select_dtypes(include=['int64'])
    converted_int = gl_int.apply(pd.to_numeric, downcast='integer')

    # Integer columns first, then float columns (same order as before).
    temp = pd.concat([converted_int, converted_float], axis=1)

    # info() prints by itself and returns None, so it is not wrapped in print().
    temp.info()
    print("=" * 10)
    return temp
import pandas as pd
import numpy as np
import downcast_demo
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


# Input CSV files for the training and the test set.
train_file = 'tap_fun_train.csv'
test_file = 'tap_fun_test.csv'

# Load each file through downcast_demo.cut_down, announcing each load first.
print("changing...")
train_data = downcast_demo.cut_down(train_file)

print("changing...")
test_data = downcast_demo.cut_down(test_file)

def feature(data, feature_columns, label=None):
    """Extract the requested feature matrix and, optionally, the label vector.

    Args:
        data: ``pandas.DataFrame`` holding the feature columns (and the label
            column when ``label`` is given).
        feature_columns: List of column names to use as features.
        label: Optional name of the label column.

    Returns:
        ``X`` alone when ``label`` is None, otherwise the tuple ``(X, y)``.
        ``X`` is a 2-D NumPy array of the feature columns and ``y`` is a 1-D
        NumPy array with one entry per row of ``data``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    X = data[feature_columns].to_numpy()
    if label is None:
        return X

    y = data[label].to_numpy()
    # Flatten to shape (n_rows,) so a single-column selection also works.
    y = np.array(y).reshape(len(y))
    return X, y

def trainandTest(X_train_data, y_train_data, test_x):
    """Train a LightGBM regressor and predict the target for ``test_x``.

    The training data is split 80/20; the model is fitted on the 80% part and
    the 20% hold-out part is used both for early stopping and for a quick
    offline check whose predictions are printed next to the true values.

    Args:
        X_train_data: Feature matrix of the full training set.
        y_train_data: Target values aligned with ``X_train_data``.
        test_x: Feature matrix to predict for.

    Returns:
        The model's predictions for ``test_x``.
    """
    import lightgbm as lgb

    # Offline validation split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_data, y_train_data, test_size=0.2, random_state=21)

    # The target is a continuous value scored with RMSE, so the regression
    # objective is used (the previous 'binary' objective is meant for 0/1
    # labels).
    model = lgb.LGBMRegressor(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=2000, objective='regression',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=2018, n_jobs=-1
    )

    print("model fiting...")
    # Fit on the split passed in through the arguments, not on module-level
    # globals, and monitor the hold-out set so early stopping reacts to
    # generalisation error. The callback form of early stopping replaces the
    # `early_stopping_rounds` fit keyword, which newer LightGBM releases no
    # longer accept.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    # Offline check: show hold-out predictions next to the true values.
    pre = model.predict(X_test)
    print("pre:")
    print(pre)
    print(y_test)

    test_pre = model.predict(test_x)
    return test_pre

if __name__ == "__main__":
    # Columns of the input data that are fed to the model as features.
    colums_list = [
        'pvp_battle_count', 'pvp_lanch_count', 'pvp_win_count',
        'pve_battle_count', 'pve_lanch_count', 'pve_win_count',
        'pay_count', 'pay_price', 'avg_online_minutes',
    ]

    # Training features and target.
    train_x, train_y = feature(train_data, colums_list, 'prediction_pay_price')
    train_x = np.array(train_x)
    print('train_x, train_y')
    print(train_x.shape, '\n', train_y.shape)

    # Test features (no label available).
    test_x = np.array(feature(test_data, colums_list))
    print('test_x')
    print(test_x.shape, '\n')

    # Fit the model and predict for the test set.
    print("traing")
    pre = trainandTest(train_x, train_y, test_x)

    # Write the submission file: one prediction per user_id.
    print("wring..")
    sub = pd.DataFrame({
        'user_id': test_data['user_id'],
        'prediction_pay_price': pre,
    })
    sub.to_csv("sublgb.csv", index=False)


# (Web page residue: "You may also like")

# Reposted from blog.csdn.net/qq_39622065/article/details/81413417
# my