Python Sklearn库常用函数(1)

版权声明:该文章系个人原创,转载请注明出处,谢谢! https://blog.csdn.net/Chaos_Happy/article/details/87973270

目录

  • 数据预处理
  • 线性回归
  • 模型验证
  • 分类
  • SVM
  • 聚类
  • 神经网络

数据预处理

from sklearn import preprocessing

# Mean removal: center each feature to zero mean and unit variance.
preprocessing.scale(data)

# Scaling: rescale each feature to a fixed range (here [0, 1]).
# Fix: renamed the misspelled 'data_sclar' to 'data_scaler'.
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the data and transform it in one step.
data_scaler.fit_transform(data)

# Normalization: scale each sample (row) to unit L1 norm.
data_normalized = preprocessing.normalize(data, norm='l1')

# Binarization: turn numeric feature vectors into boolean vectors
# (values above the threshold become 1, the rest 0).
data_binarization = preprocessing.Binarizer(threshold=1.4).transform(data)

# One-hot encoding. Categories are learned from the fitted data, so the
# values passed to transform() must have appeared during fit() —
# the original example transformed unseen values, which raises an error.
encoder = preprocessing.OneHotEncoder()
encoder.fit([[1, 2, 3], [2, 3, 4], [5, 6, 4]])
encoder_vector = encoder.transform([[2, 3, 4], [5, 6, 3], [1, 2, 4]])

# Split the dataset into training and test sets (25% held out).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=5)

线性回归器

import numpy as np
from sklearn import linear_model

# Create and fit an ordinary least-squares linear regression model.
# Fix: the original called fit(X_train, ...) with a capital X_train that
# is never defined (train_test_split above produces x_train), and then
# predicted on 'x'; the plot below pairs x_train with y_pre, so predict
# on x_train for consistency.
linear_regressor = linear_model.LinearRegression()
linear_regressor.fit(x_train, y_train)
y_pre = linear_regressor.predict(x_train)

matplotlib线性回归图

#jupyter notebook中,需要加 %matplotlib inline

import matplotlib.pyplot as plt

# In a Jupyter notebook, also run: %matplotlib inline
plt.figure()
# Fix: "color-'green'" was a syntax error; the keyword argument is color=.
plt.scatter(x_train, y_train, color='green')
plt.plot(x_train, y_pre, color='black', linewidth=4)
plt.title('Train data')
plt.show()

模型的保存及加载

#保存模型

# Save the model. Fixes: cPickle exists only in Python 2 — use the stdlib
# pickle module — and pickle streams are binary, so the file must be
# opened in 'wb'/'rb' mode, not 'w'/'r'.
import pickle

output_model_file = 'saved_model.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

# Load the model back.
# NOTE: only unpickle files you trust — pickle.load can execute code.
with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)

创建岭回归

#岭回归:线性回归引入正则化项作为阈值来消除异常值的影响。

# Ridge regression: linear regression with an L2 penalty that damps the
# influence of outliers. alpha controls model complexity: near 0 it
# behaves like ordinary least squares; increase it to make the model
# less sensitive to outliers. fit() returns the estimator, so the call
# chains onto the constructor.
ridge_regressor = linear_model.Ridge(
    alpha=0.01,
    fit_intercept=True,
    max_iter=1000,
).fit(x_train, y_train)

创建多项式回归器

from sklearn.preprocessing import PolynomialFeatures

# Expand the inputs with all polynomial terms up to degree 3.
# Fix: the original instantiated the misspelled 'PolynamialFeatures'.
polynomial = PolynomialFeatures(degree=3)
x_train_transformed = polynomial.fit_transform(x_train)

使用AdaBoost回归预测

# Fix: the original misspelled both class names ('DecissionTreeRegressor',
# 'AdaBoostRegreesor') and then used the correctly spelled
# AdaBoostRegressor that was never imported.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.utils import shuffle

# shuffle() returns the arrays in a consistent random order.
x, y = shuffle(data, target, random_state=7)

# Fit a plain decision-tree regressor.
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(x_train, y_train)

# Boost the same base tree with AdaBoost.
ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 n_estimators=400, random_state=7)
ab_regressor.fit(x_train, y_train)

随机森林回归器(random forest regressor)

from sklearn.ensemble import RandomForestRegressor

# Random forest regressor.
# Fix: min_samples_split must be an int >= 2 (or a float fraction);
# min_samples_split=1 raises an error in sklearn.
rf_regressor = RandomForestRegressor(n_estimators=1000, max_depth=10, min_samples_split=2)

模型的验证

回归模型验证参数

import sklearn.metrics as sm

# MAE: mean absolute error.

sm.mean_absolute_error(y_test, y_test_pred)

# MSE: mean squared error.

sm.mean_squared_error(y_test, y_test_pred)

# MedAE: median absolute error.
# (The original comment mislabeled this as a "median mean squared error".)

sm.median_absolute_error(y_test, y_test_pred)

# Explained variance score.

sm.explained_variance_score(y_test,y_test_pred)

# R^2 (coefficient of determination).

sm.r2_score(y_test, y_test_pred)

# In general, aim for the lowest MSE and the highest explained variance.

#通常尽量保证均方误差最低,解释方差分最高

分类器用交叉验证检验模型准确性

#num_validations 分类的个数
# num_validations: number of cross-validation folds.
from sklearn import model_selection

# Accuracy.
accuracy = model_selection.cross_val_score(classifier_gaussiannb, x, y,
                                           scoring='accuracy', cv=num_validations)

# Weighted F1 score.
# Fix: the scorer name is 'f1_weighted'; 'f1_weight' is not a valid
# scoring string and raises a ValueError.
F1 = model_selection.cross_val_score(classifier_gaussiannb, x, y,
                                     scoring='f1_weighted', cv=num_validations)

混淆矩阵(confusion matrix)

| Confusion matrix | Predicted class 0 | Predicted class 1 | Predicted class 2 |
| ---------------- | ----------------- | ----------------- | ----------------- |
| True class 0     | 45                | 4                 | 3                 |
| True class 1     | 11                | 56                | 2                 |
| True class 2     | 5                 | 6                 | 49                |
from sklearn.metrics import confusion_matrix

提取性能报告

from sklearn.metrics import classification_report

分类器

逻辑回归分类器

from sklearn import linear_model

# Logistic regression classifier.
# Fix: the inverse-regularization-strength parameter is capital C;
# lowercase c is not a LogisticRegression parameter and raises a TypeError.
classifier = linear_model.LogisticRegression(solver='liblinear', C=100)
classifier.fit(x, y)

朴素贝叶斯分类器

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier. fit() returns the fitted estimator
# itself, so construction and training chain into one statement.
classifier_gaussiannb = GaussianNB().fit(x, y)

SVM

from sklearn.svm import SVC

线性分类器

# Initialise an SVM classifier with a linear kernel; the kernel is passed
# directly as a keyword argument instead of unpacking a params dict.
classifier = SVC(kernel='linear')
classifier.fit(x, y)

非线性分类器

# Initialise an SVM object with a non-linear kernel.
params={'kernel':'poly','degree':3} # polynomial kernel of degree 3
params={'kernel':'rbf'} # radial basis function kernel
# NOTE: the second assignment overrides the first, so only the RBF kernel
# is actually used here; comment one line out to pick the other kernel.
classifier=SVC(**params)
classifier.fit(x,y)

聚类

K-Means

# num_clusters: the desired number of clusters.
from sklearn.cluster import KMeans

# k-means++ seeding; the best of 10 random initialisations is kept.
# (Renamed 'Kmeans' to a lowercase name so it no longer mimics the class.)
kmeans_model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
kmeans_model.fit(data)

猜你喜欢

转载自blog.csdn.net/Chaos_Happy/article/details/87973270