版权声明:该文章系个人原创,转载请注明出处,谢谢! https://blog.csdn.net/Chaos_Happy/article/details/87973270
目录
- 数据预处理
- 线性回归
- 模型验证
- 分类
- SVM
- 聚类
- 神经网络
数据预处理
from sklearn import preprocessing

# Mean removal: standardize to zero mean / unit variance.
# Fix: scale() returns a new array and does NOT modify `data` in place,
# so the original discarded its result.
data_scaled = preprocessing.scale(data)

# Scaling: map each feature into the [0, 1] range.
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_minmax = data_scaler.fit_transform(data)

# Normalization: rescale each sample so its L1 norm is 1.
data_normalized = preprocessing.normalize(data, norm='l1')

# Binarization: turn a numeric feature vector into a boolean vector.
data_binarization = preprocessing.Binarizer(threshold=1.4).transform(data)

# One-hot encoding.
encoder = preprocessing.OneHotEncoder()
encoder.fit([[1, 2, 3], [2, 3, 4], [5, 6, 4]])
# NOTE(review): modern OneHotEncoder raises on categories not seen during
# fit (e.g. 3 in the first column below) unless handle_unknown='ignore'
# is passed — this demo relied on old-sklearn behavior; confirm the version.
encoder_vector = encoder.transform([[3, 4, 5], [6, 4, 7], [2, 1, 4]])

# Split the dataset into training and test sets.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=5)
线性回归器
import numpy as np
from sklearn import linear_model

# Create and train a linear regression model.
linear_regressor = linear_model.LinearRegression()
# Fix: the original fit on undefined `X_train`; the train/test split in this
# article produces lowercase `x_train`.
linear_regressor.fit(x_train, y_train)
y_pre = linear_regressor.predict(x)
matplotlib线性回归图
# In a Jupyter notebook, add: %matplotlib inline
import matplotlib.pyplot as plt

plt.figure()
# Fix: original had a syntax error here (color-'green').
plt.scatter(x_train, y_train, color='green')
plt.plot(x_train, y_pre, color='black', linewidth=4)
plt.title('Train data')
plt.show()
模型的保存及加载
# Save the model.
# Fix: cPickle is Python 2 only — use the stdlib pickle module — and pickle
# files must be opened in BINARY mode ('wb'/'rb'), not text mode.
import pickle

output_model_file = 'saved_model.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

# Load the model.
with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)
创建岭回归
# Ridge regression: linear regression with a regularization term that
# damps the influence of outliers.
ridge_regressor=linear_model.Ridge(alpha=0.01, fit_intercept=True,max_iter=1000)
# alpha controls the regressor's complexity: as it approaches 0 this reduces
# to ordinary least squares; set a larger alpha to be less sensitive to outliers.
ridge_regressor.fit(x_train,y_train)
创建多项式回归器
from sklearn.preprocessing import PolynomialFeatures

# Fix: the class is PolynomialFeatures — the original instantiated a
# misspelled PolynamialFeatures, which raises NameError.
polynomial = PolynomialFeatures(degree=3)
x_train_transformed = polynomial.fit_transform(x_train)
使用AdaBoost回归预测
# Fix: the class names are DecisionTreeRegressor and AdaBoostRegressor —
# the original imports were misspelled and would raise ImportError.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.utils import shuffle

# shuffle() returns the inputs permuted in unison (same row order for both).
x, y = shuffle(data, target, random_state=7)

# Fit a plain decision-tree regression model.
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(x_train, y_train)

# Decision-tree regression model boosted with AdaBoost:
ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 n_estimators=400, random_state=7)
ab_regressor.fit(x_train, y_train)
随机森林回归器(random forest regressor)
from sklearn.ensemble import RandomForestRegressor

# Fix: min_samples_split must be an int >= 2 (or a float in (0, 1]);
# the original value 1 raises ValueError in scikit-learn.
rf_regressor = RandomForestRegressor(n_estimators=1000, max_depth=10,
                                     min_samples_split=2)
模型的验证
回归模型验证参数
import sklearn.metrics as sm
# MAE: mean absolute error
sm.mean_absolute_error(y_test, y_test_pred)
# MSE: mean squared error
sm.mean_squared_error(y_test, y_test_pred)
# Median absolute error (the original comment mislabeled this as a squared error)
sm.median_absolute_error(y_test, y_test_pred)
# Explained variance score
sm.explained_variance_score(y_test,y_test_pred)
# R2 (coefficient of determination)
sm.r2_score(y_test, y_test_pred)
# Usually aim for the lowest MSE and the highest explained variance score.
分类器用交叉验证检验模型准确性
# num_validations: the number of cross-validation folds.
from sklearn import model_selection

# Accuracy
accuracy = model_selection.cross_val_score(
    classifier_gaussiannb, x, y, scoring='accuracy', cv=num_validations)
# Weighted F1 — fix: the scorer name is 'f1_weighted'; the original
# 'f1_weight' is not a valid scoring string and raises ValueError.
F1 = model_selection.cross_val_score(
    classifier_gaussiannb, x, y, scoring='f1_weighted', cv=num_validations)
混淆矩阵(confusion matrix)
Confusion matrix | Predicted class 0 | Predicted class 1 | Predicted class 2 |
---|---|---|---|
True class 0 | 45 | 4 | 3 |
True class 1 | 11 | 56 | 2 |
True class 2 | 5 | 6 | 49 |
from sklearn.metrics import confusion_matrix
提取性能报告
from sklearn.metrics import classification_report
分类器
逻辑回归分类器
from sklearn import linear_model

# Fix: the inverse-regularization parameter is uppercase C; passing
# lowercase c=100 raises TypeError (unexpected keyword argument).
classifier = linear_model.LogisticRegression(solver='liblinear', C=100)
classifier.fit(x, y)
朴素贝叶斯分类器
# Gaussian naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB
classifier_gaussiannb=GaussianNB()
classifier_gaussiannb.fit(x,y)
SVM
from sklearn.svm import SVC
线性分类器
# Build the SVM object with a linear kernel.
params = dict(kernel='linear')
classifier = SVC(**params)
classifier.fit(x, y)
非线性分类器
# Build the SVM object with a non-linear kernel (two alternatives shown;
# the second assignment overrides the first, so rbf is what gets used).
params = dict(kernel='poly', degree=3)  # polynomial kernel
params = dict(kernel='rbf')             # radial basis function kernel
classifier = SVC(**params)
classifier.fit(x, y)
聚类
K-Means
# num_clusters: the number of clusters to form.
from sklearn.cluster import KMeans
# k-means++ seeding; n_init=10 restarts and keeps the best run.
# NOTE(review): PEP 8 would name this variable lowercase `kmeans`.
Kmeans=KMeans(init='k-means++', n_clusters=num_clusters,n_init=10)
Kmeans.fit(data)