LightGBM + LR Code Implementation
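
This walkthrough implements the GBDT+LR feature-transformation scheme popularized by Facebook's 2014 CTR paper: train a LightGBM model, record which leaf each sample lands in for every tree, one-hot encode those leaf indices, and train a logistic regression on the resulting binary features.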

import numpy as np
import matplotlib.pyplot as plt

# Generate a synthetic dataset for classification.
# Note: sklearn.datasets.samples_generator has been removed from scikit-learn;
# import make_classification from sklearn.datasets directly.
from sklearn.datasets import make_classification

# n_classes=2 so the labels are 0/1, matching the binary LightGBM objective
# and the logistic regression / cross-entropy evaluation below
X, labels = make_classification(n_samples=2000, n_features=20, n_redundant=0,
								n_informative=4, n_classes=2,
								random_state=1, n_clusters_per_class=2)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)  # add uniform noise to blur the class boundaries

# Optional visualization of the first two features:
# unique_labels = set(labels)
# colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
# for k, col in zip(unique_labels, colors):
#     x_k = X[labels == k]
#     plt.plot(x_k[:, 0], x_k[:, 1], 'o', markerfacecolor=col,
#              markeredgecolor='k', markersize=14)
# plt.title('data by make_classification()')
# plt.show()

print(X.shape)

###############
# lightgbm 

import lightgbm as lgb
import pandas as pd

from sklearn.linear_model import LogisticRegression

print('Load data...')
labels = labels.reshape(len(labels), -1)   # make labels a column vector
print(labels.shape)
X_y = np.concatenate([X, labels], axis=1)  # last column holds the label

train_num = int(len(X) * 0.8)  # 80/20 train/test split
df_train = pd.DataFrame(data=X_y[:train_num, :])
df_test = pd.DataFrame(X_y[train_num:, :])

print(df_test.head(10))

y_train = df_train[20]  # training labels (column 20 is the label)
y_test = df_test[20]    # testing labels

X_train = df_train.loc[:, :19]  # training features (columns 0..19)
X_test = df_test.loc[:, :19]    # testing features


# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
	'boosting_type': 'gbdt',
	'objective': 'binary',
	'metric': {'binary_logloss'},
	'num_leaves': 64,
	'learning_rate': 0.01,
	'feature_fraction': 0.9,
	'bagging_fraction': 0.8,
	'bagging_freq': 5,
	'verbose': 0
}
# the number of trees is set via num_boost_round in lgb.train below;
# passing 'num_trees' here as well would duplicate that setting

# number of leaves per tree; it must equal 'num_leaves' above and is used
# below to one-hot encode the leaf index each sample falls into
num_leaf = 64

print('Start training...')
# train, monitoring both the training set and the held-out evaluation set
gbm = lgb.train(params,
				lgb_train,
				num_boost_round=100,
				valid_sets=[lgb_train, lgb_eval])

print('Save model...')
# save model to file
gbm.save_model('model.txt')
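
As an aside, the saved model can be reloaded later without retraining; a minimal sketch using LightGBM's Booster constructor:

# Optional: reload the saved model from the file written above.
bst = lgb.Booster(model_file='model.txt')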

print('Start predicting...')
# with pred_leaf=True, predict returns an (n_samples, n_trees) matrix
# holding the index of the leaf each sample falls into, per tree
y_pred = gbm.predict(X_train, pred_leaf=True)

print('Writing transformed training data')
# one-hot encode the leaf indices: each sample becomes a binary vector of
# length n_trees * num_leaf with exactly one 1 per tree
transformed_training_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf],
									   dtype=np.int64)  # N x (num_trees * num_leaf)
for i in range(0, len(y_pred)):
	temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
	transformed_training_matrix[i][temp] += 1

print('X_train leaf', transformed_training_matrix.shape)
print(transformed_training_matrix[:10])
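
The loop above is easy to follow but slow and memory-hungry for large N. As a sketch of an alternative (assuming every leaf index lies in [0, num_leaf)), the same encoding can be built as a sparse matrix with scikit-learn's OneHotEncoder, which LogisticRegression accepts directly:

from sklearn.preprocessing import OneHotEncoder

# one categorical column per tree, each with num_leaf possible leaf indices
leaf_encoder = OneHotEncoder(categories=[list(range(num_leaf))] * y_pred.shape[1])
transformed_training_sparse = leaf_encoder.fit_transform(y_pred)  # sparse, N x (n_trees * num_leaf)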

y_pred = gbm.predict(X_test, pred_leaf=True)
print('Writing transformed testing data')
# apply the same one-hot leaf encoding to the whole test set
transformed_testing_matrix = np.zeros([len(y_pred), len(y_pred[0]) * num_leaf], dtype=np.int64)
for i in range(0, len(y_pred)):
	temp = np.arange(len(y_pred[0])) * num_leaf + np.array(y_pred[i])
	transformed_testing_matrix[i][temp] += 1

print('testing leaf shape', transformed_testing_matrix.shape)

lm = LogisticRegression(penalty='l2', C=0.05)  # logistic regression on the leaf features
lm.fit(transformed_training_matrix, y_train)   # fit on the one-hot leaf encoding
y_pred_test = lm.predict_proba(transformed_testing_matrix)  # probability of each class

print('y_pred_test by LR', y_pred_test)

# Cross entropy on the test set. The {-1, +1} form from the GBDT+LR paper,
# -1/N * sum((1+y)/2 * log(p) + (1-y)/2 * log(1-p)), reduces to the usual
# binary cross entropy when the labels are 0/1, as they are here:
NE = (-1) / len(y_pred_test) * sum(y_test * np.log(y_pred_test[:, 1]) +
								   (1 - y_test) * np.log(1 - y_pred_test[:, 1]))
print("Normalized Cross Entropy " + str(NE))
Reposted from blog.csdn.net/rosefun96/article/details/104053012