import numpy as np
import matplotlib.pyplot as plt
# Generate a synthetic dataset for classification.
# NOTE: make_classification is imported from the public `sklearn.datasets`
# package; the private `sklearn.datasets.samples_generator` module is no
# longer available in current scikit-learn releases.
from sklearn.datasets import make_classification

X, labels = make_classification(
    n_samples=2000,
    n_features=20,
    n_redundant=0,
    n_informative=4,
    n_classes=3,
    random_state=1,
    n_clusters_per_class=2,
)

# Shift every feature by uniform noise in [0, 2), drawn from a fixed seed so
# that the run is reproducible.
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)

# Optional visualisation of the first two features, one colour per class:
# unique_labels = set(labels)
# colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
# for k, col in zip(unique_labels, colors):
#     x_k = X[labels == k]
#     plt.plot(x_k[:, 0], x_k[:, 1], 'o', markerfacecolor=col,
#              markeredgecolor="k", markersize=14)
# plt.title('data by make_classification()')
# plt.show()
print(X.shape)
###############
# lightgbm
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
# Glue features and labels into one table, then split it 80/20 in row order.
print('Load data...')
labels = labels.reshape(len(labels), -1)  # column vector, so it can sit next to X
print(labels.shape)
X_y = np.hstack((X, labels))
train_num = int(0.8 * len(X))

df_train = pd.DataFrame(X_y[:train_num])
df_test = pd.DataFrame(X_y[train_num:])
print(df_test.head(10))

# Column 20 holds the label; columns 0 through 19 hold the features
# (.loc slicing is inclusive of the end label).
X_train, y_train = df_train.loc[:, :19], df_train[20]
X_test, y_test = df_test.loc[:, :19], df_test[20]

# Wrap both splits as LightGBM datasets; the evaluation set refers back to
# the training set.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
# Number of leaves per tree. It is also the width of the one-hot slot that
# each tree gets in the leaf-index feature transformation further down.
num_leaf = 64

# LightGBM training parameters.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    # The labels come from make_classification(n_classes=3), so a binary
    # objective / binary_logloss does not match the data. Train a multiclass
    # model instead; num_class is derived from the labels, which are expected
    # to be 0 .. num_class - 1.
    # NOTE: a multiclass model grows one tree per class in every boosting
    # round, so predict(..., pred_leaf=True) returns num_class times as many
    # columns as a binary model would.
    'objective': 'multiclass',
    'num_class': int(y_train.max()) + 1,
    'metric': {'multi_logloss'},
    'num_leaves': num_leaf,
    # Same value as the num_boost_round passed to lgb.train().
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}
print('Start training...')
# Fit the booster; the training set is also the set metrics are reported on.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=100,
    valid_sets=lgb_train,
)

print('Save model...')
# Write the trained booster to a text file in the working directory.
gbm.save_model('model.txt')
print('Start predicting...')
# For each training row, the index of the leaf it lands in for every tree.
y_pred = gbm.predict(X_train, pred_leaf=True)

print('Writing transformed training data')
# One-hot encode the leaf indices: every tree owns a contiguous slot of
# `num_leaf` columns, and the column of the leaf that was hit is set to 1.
n_rows, n_trees = len(y_pred), len(y_pred[0])
transformed_training_matrix = np.zeros([n_rows, n_trees * num_leaf],
                                       dtype=np.int64)  # N x (num_trees * num_leaf)
tree_offsets = np.arange(n_trees) * num_leaf  # first column of each tree's slot
for encoded_row, leaves in zip(transformed_training_matrix, y_pred):
    # `encoded_row` is a view into the matrix, so this writes in place.
    encoded_row[tree_offsets + np.array(leaves)] += 1

print('X_train leaf', transformed_training_matrix.shape)
print(transformed_training_matrix[:10])
# For each held-out row, the index of the leaf it lands in for every tree.
y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
# One-hot encode the leaf indices exactly like the training data: every tree
# owns a contiguous slot of `num_leaf` columns.
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
tree_offsets = np.arange(len(y_pred[0])) * num_leaf  # first column of each tree's slot
for i, leaves in enumerate(y_pred):
    temp = tree_offsets + np.array(leaves)
    if i == 0:
        # Debug output for the first row only. The previous version used
        # `break` here, which left every other test row all-zero.
        print('temp', temp)
        print('y_pred', np.array(leaves))
    transformed_testing_matrix[i][temp] += 1
print('testing leaf shape', transformed_testing_matrix.shape)
# Logistic regression on top of the one-hot leaf features.
lm = LogisticRegression(penalty='l2', C=0.05)
lm.fit(transformed_training_matrix, y_train)
# Class probabilities, one column per entry of lm.classes_.
y_pred_test = lm.predict_proba(transformed_testing_matrix)
print('y_pred_test by LR', y_pred_test)

# Average cross entropy of the true class. The previous formula assumed
# labels in {-1, +1} and only looked at column 1 of the probabilities, which
# does not hold for labels 0/1/2 with one probability column per class.
# NOTE: the value is the mean negative log-likelihood; it is not divided by
# the entropy of the label distribution.
y_true = np.asarray(y_test)
if not np.isin(y_true, lm.classes_).all():
    raise ValueError('y_test contains labels that are absent from y_train')
# lm.classes_ is sorted, so searchsorted maps each label to its column.
class_columns = np.searchsorted(lm.classes_, y_true)
true_class_proba = y_pred_test[np.arange(len(y_true)), class_columns]
# Guard against log(0) for a true class predicted with zero probability.
true_class_proba = np.clip(true_class_proba, 1e-15, 1.0)
NE = -np.mean(np.log(true_class_proba))
print("Normalized Cross Entropy " + str(NE))
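# LightGBM + LR code implementation
# Reposted from blog.csdn.net/rosefun96/article/details/104053012
# (Scraped page navigation: "You may also like", "Today's recommendations", "Weekly ranking")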