CatBoost code templates (bookmark this if you tend to forget)

Code source
Data source
import pandas as pd
import numpy as np
from itertools import combinations
from catboost import CatBoostClassifier
# Baseline approach: load the data, do minimal cleaning, train one model.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Set aside the training target and the test identifiers before those
# columns are removed from the feature frames.
labels = train_df.target
test_id = test_df.ID

# Leave only feature columns in both frames.
train_df.drop(columns=['ID', 'target'], inplace=True)
test_df.drop(columns=['ID'], inplace=True)

# Replace missing values with the sentinel -9999 in both frames.
for frame in (train_df, test_df):
    frame.fillna(-9999, inplace=True)

# Positional indices of every column with fewer than 30000 distinct values;
# these are the columns declared to CatBoost as categorical.
cat_features_ids = [
    position
    for position, distinct_count in enumerate(train_df.nunique())
    if distinct_count < 30000
]

# Train the model:
model_params = dict(learning_rate=0.1, iterations=1000, random_seed=0, logging_level='Silent')
clf = CatBoostClassifier(**model_params)
clf.fit(train_df, labels, cat_features=cat_features_ids)

# Make submission: column 1 of predict_proba is used as the predicted
# probability for each test row.
prediction = clf.predict_proba(test_df)[:, 1]
submission = pd.DataFrame({'ID': test_id, 'PredictedProb': prediction})
submission.to_csv('submission_base.csv', index=False)
# Improved approach
# Now, by adding a few more steps of data manipulation and feature engineering,
# we have achieved 11th place on the leaderboard.
selected_features = [
    'v10', 'v12', 'v14', 'v21', 'v22', 'v24', 'v30', 'v31', 'v34', 'v38', 'v40', 'v47', 'v50',
    'v52', 'v56', 'v62', 'v66', 'v72', 'v75', 'v79', 'v91', 'v112', 'v113', 'v114', 'v129'
]
# Drop the features that were not selected. Explicit copies are taken so that
# the interaction columns assigned further down are written to independent
# frames rather than to slices (avoids pandas' SettingWithCopyWarning).
train_df = train_df[selected_features].copy()
test_df = test_df[selected_features].copy()
# Update the list of categorical features: positional indices of the selected
# columns that have fewer than 30000 distinct values.
cat_features_ids = np.where(train_df.apply(pd.Series.nunique) < 30000)[0].tolist()
# Columns stored with object dtype. The builtin `object` is compared against
# because the `np.object` alias was removed in NumPy 1.24.
char_features = list(train_df.columns[train_df.dtypes == object])
char_features_without_v22 = [col for col in char_features if col != 'v22']
# Column groups to concatenate into interaction features: every pair of
# object-dtype columns, followed by every triple made of 'v22' plus a pair of
# the remaining object-dtype columns. Built as a real list because `map`
# returns an iterator on Python 3 and cannot be added to a list.
cmbs = list(combinations(char_features, 2))
cmbs += [('v22',) + pair for pair in combinations(char_features_without_v22, 2)]
def concat_columns(df, columns):
    """Join several columns of ``df`` row-wise into one string Series.

    Each listed column is converted to ``str`` and followed by a single
    space, so every resulting value ends with a trailing space.

    Args:
        df: DataFrame holding the columns to join.
        columns: Non-empty sequence of column names, joined in order.

    Returns:
        A Series of the concatenated strings; ``df`` itself is not modified.
    """
    def spaced(name):
        # One column rendered as text, with the separator appended.
        return df[name].astype(str) + ' '

    joined = spaced(columns[0])
    for name in columns[1:]:
        joined += spaced(name)
    return joined
# Add one interaction feature per column group in `cmbs`. The new column is
# named by joining the source column names and is built the same way for the
# train and test frames.
for cols in cmbs:
    feature_name = "".join(cols)
    train_df[feature_name] = concat_columns(train_df, cols)
    test_df[feature_name] = concat_columns(test_df, cols)

# The engineered columns sit after the originally selected ones; register
# all of their positions as categorical features.
cat_features_ids.extend(range(len(selected_features), train_df.shape[1]))

# Train the model:
model_params = dict(learning_rate=0.1, iterations=1000, random_seed=0, logging_level='Silent')
clf = CatBoostClassifier(**model_params)
clf.fit(train_df, labels, cat_features=cat_features_ids)

# Make submission:
prediction = clf.predict_proba(test_df)[:, 1]
submission = pd.DataFrame({'ID': test_id, 'PredictedProb': prediction})
submission.to_csv('submission_improved.csv', index=False)
# Bagging
# Finally, by averaging the predictions of several models trained with
# different seeds we reduce the variance and reach 9th place on the leaderboard.
predictions = []
for seed in range(10):
    # Same hyper-parameters for every model; only the random seed varies.
    clf = CatBoostClassifier(
        learning_rate=0.1,
        iterations=1000,
        random_seed=seed,
        logging_level='Silent',
    )
    clf.fit(train_df, labels, cat_features=cat_features_ids)
    seed_prediction = clf.predict_proba(test_df)[:, 1]
    predictions.append(seed_prediction)

# Make submission: element-wise mean of the per-seed predictions.
prediction = np.mean(predictions, axis=0)
submission = pd.DataFrame({'ID': test_id, 'PredictedProb': prediction})
submission.to_csv('submission_improved_bagged.csv', index=False)
Suddenly every Taolin
Published 47 original articles · won praise 4 · Views 2259
Private letter concerns
CatBoost code templates (bookmark this if you tend to forget)

Guess you like