KNN 的详细链接:https://blog.csdn.net/fanzonghao/article/details/86411102
决策树的详细链接:https://blog.csdn.net/fanzonghao/article/details/85246720
SVM:寻找最优的间隔
等式约束的最优解
不等式约束的最优解:利用 KKT 条件
SVM案例,用于水果数据集分类:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.patches as mpatches
from matplotlib.colors import ListedColormap
def plot_class_regions_for_classifier(clf, X, y, X_test=None, y_test=None, title=None,
                                      target_names=None, plot_decision_regions=True):
    """
    Visualize the classification result of a fitted classifier.

    Only usable for data with exactly two features: the first feature is
    drawn on the x axis and the second on the y axis.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict`` (used for the decision regions) and
        ``score`` (used for the title when a test set is given).
    X : array indexable as ``X[:, 0]`` / ``X[:, 1]``
        Points drawn as circles.
    y : array of class labels for ``X``
        The palette is sized with ``max(y) + 1``, so labels are expected to
        be non-negative integers starting at 0.
    X_test, y_test : optional
        Extra points drawn as triangles. When given, the train/test scores
        are appended to the title.
    title : str, optional
        Figure title. May be omitted even when a test set is passed.
    target_names : sequence of str, optional
        Legend labels; entry ``i`` is shown with the colour of class ``i``.
        At most four names are supported because the palette has four colours.
    plot_decision_regions : bool
        Whether to shade the predicted class regions behind the points.
    """
    # int() keeps the palette slice valid even if the labels are stored as floats.
    num_classes = int(np.amax(y)) + 1
    color_list_light = ['#FFFFAA', '#EFEFEF', '#AAFFAA', '#AAAAFF']
    color_list_bold = ['#EEEE00', '#000000', '#00CC00', '#0000CC']
    cmap_light = ListedColormap(color_list_light[0:num_classes])
    cmap_bold = ListedColormap(color_list_bold[0:num_classes])

    h = 0.03  # grid step used to sample the decision regions
    k = 0.5   # margin added around the data before building the grid
    x_plot_adjust = 0.1
    y_plot_adjust = 0.1
    plot_symbol_size = 50

    x_min = X[:, 0].min()
    x_max = X[:, 0].max()
    y_min = X[:, 1].min()
    y_max = X[:, 1].max()

    plt.figure()
    if plot_decision_regions:
        # The grid prediction is only needed for the shaded regions, so it
        # is skipped entirely when they are switched off.
        x2, y2 = np.meshgrid(np.arange(x_min - k, x_max + k, h),
                             np.arange(y_min - k, y_max + k, h))
        P = clf.predict(np.c_[x2.ravel(), y2.ravel()])
        P = P.reshape(x2.shape)
        plt.contourf(x2, y2, P, cmap=cmap_light, alpha=0.8)

    # Fixing vmin/vmax pins class i to colour i for every scatter call, so
    # a subset that lacks some classes is still coloured like the legend.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=plot_symbol_size,
                edgecolor='black', vmin=0, vmax=num_classes - 1)
    plt.xlim(x_min - x_plot_adjust, x_max + x_plot_adjust)
    plt.ylim(y_min - y_plot_adjust, y_max + y_plot_adjust)

    if X_test is not None:
        plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold, s=plot_symbol_size,
                    marker='^', edgecolor='black', vmin=0, vmax=num_classes - 1)
        train_score = clf.score(X, y)
        test_score = clf.score(X_test, y_test)
        score_line = "Train score = {:.2f}, Test score = {:.2f}".format(train_score, test_score)
        # title defaults to None; concatenating onto None would raise TypeError.
        title = score_line if title is None else title + "\n" + score_line

    if target_names is not None:
        legend_handles = [
            mpatches.Patch(color=color_list_bold[i], label=name)
            for i, name in enumerate(target_names)
        ]
        plt.legend(loc=0, handles=legend_handles)

    if title is not None:
        plt.title(title)
    plt.show()
# Load the fruit data set.
fruits_df = pd.read_table('fruit_data_with_colors.txt')
X = fruits_df[['width', 'height']]
y = fruits_df['fruit_label'].copy()
# Make the task binary: every label that is not apple (label 1) becomes 0.
y[y != 1] = 0
# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=0)
print(y_test.shape)
# Compare several values of the regularisation parameter C.
c_values = [0.0001, 1, 100]
for c_value in c_values:
    # Build the model.
    svm_model = SVC(C=c_value, kernel='rbf')
    # Train on the bare ndarray: the plotting helper below predicts and
    # scores on ndarrays, and a model fitted on a DataFrame would make
    # scikit-learn warn about missing feature names on each of those calls.
    svm_model.fit(X_train.values, y_train)
    # Evaluate on the held-out set.
    y_pred = svm_model.predict(X_test.values)
    acc = accuracy_score(y_test, y_pred)
    print('C={},准确率:{:.3f}'.format(c_value, acc))
    # Visualise the decision regions together with the test points.
    plot_class_regions_for_classifier(svm_model, X_test.values, y_test.values, title='C={}'.format(c_value))
二维高斯分布
将 kernel 替换成 'linear':
import warnings
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier,RandomForestClassifier,ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
# Silence all library warnings for the rest of the script.
warnings.filterwarnings('ignore')

# Two noisy concentric circles; draw one scatter series per class label.
X, y = make_circles(n_samples=300, noise=0.15, factor=0.5, random_state=233)
class0_mask = y == 0
class1_mask = y == 1
plt.scatter(X[class0_mask, 0], X[class0_mask, 1])
plt.scatter(X[class1_mask, 0], X[class1_mask, 1])
# plt.show()

# NOTE(review): no random_state is passed here, so the split (and every
# score below that depends on it) presumably differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
print('X_train.shape=', X_train.shape)
print('X_test.shape=', X_test.shape)
print(y_test)

print('===========knn==============')
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
knn_acc = knn_clf.score(X_test, y_test)
print('knn accuracy={}'.format(knn_acc))
print('\n')

print('===========logistic regression==============')
log_clf = LogisticRegression().fit(X_train, y_train)
log_acc = log_clf.score(X_test, y_test)
print('logistic regression accuracy={}'.format(log_acc))
print('\n')

print('===========SVM==============')
svm_clf = SVC().fit(X_train, y_train)
svm_acc = svm_clf.score(X_test, y_test)
print('SVM accuracy={}'.format(svm_acc))
print('\n')

print('===========Decison tree==============')
dt_clf = DecisionTreeClassifier().fit(X_train, y_train)
dt_acc = dt_clf.score(X_test, y_test)
print('Decison tree accuracy={}'.format(dt_acc))
print('\n')

print('===========ensemble classfier==============')
# Fresh, unfitted copies of the four models above are combined by hard
# voting, i.e. the predicted class is decided by strict majority rule.
base_estimators = [
    ('knn', KNeighborsClassifier()),
    ('logistic', LogisticRegression()),
    ('SVM', SVC()),
    ('decision tree', DecisionTreeClassifier()),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')
voting_clf.fit(X_train, y_train)
voting_acc = voting_clf.score(X_test, y_test)
print('voting classfier accuracy={}'.format(voting_acc))
print('\n')

print('===========random forest==============')
rf_clf = RandomForestClassifier(
    n_estimators=500,  # 500 trees
    max_depth=6,       # depth limit of each tree
    bootstrap=True,    # sample with replacement
    oob_score=True,    # validate on the samples that were not drawn
)
# oob_score is True, so the forest is fitted on the whole data set directly.
rf_clf.fit(X, y)
rf_acc = rf_clf.oob_score_
print('rf accuracy={}'.format(rf_acc))
print('\n')

print('===========extreme random tree==============')
ex_clf = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=6,
    bootstrap=True,
    oob_score=True,
)
# Same out-of-bag evaluation as the random forest: fit on all of X, y.
ex_clf.fit(X, y)
ex_acc = ex_clf.oob_score_
print('extreme random treeaccuracy={}'.format(ex_acc))
print('\n')

print('===========Adaboost classifier==============')
# NOTE(review): the base learner is a DecisionTreeClassifier with no depth
# limit — confirm that boosting such a tree is intended.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    learning_rate=0.3,
)
ada_clf.fit(X_train, y_train)
ada_acc = ada_clf.score(X_test, y_test)
print('Adaboost accuracy={}'.format(ada_acc))
print('\n')
随机森林算法的高明之处之一就是利用随机性,使得模型更鲁棒。假如森林中有 N 棵树,那么就有放回地随机抽取 N 个训练数据子集,对 N 棵树分别进行训练,最后通过对每棵树的预测结果进行投票来得出随机森林的预测结果。
因为随机森林的主要构件是决策树,所以随机森林的超参数很多与决策树相同。除此之外,有 2 个比较重要的超参数值得注意:一个是 bootstrap,取 True 或 False,表示在为每棵树构造训练子集时是否采用有放回取样;另一个是 oob_score。采用有放回取样时,每棵树平均约有 36.8%(≈1/e)的样本没有被抽到(即袋外样本,out-of-bag),所以当 oob_score 取 True 时,就不必再将数据集划分为训练集和测试集了,可以直接用这些未被抽到的数据来验证模型的准确率。
由上述结果可以看出,Extremely Randomized Trees(极端随机树)算法精度最高。与随机森林一样,它在建树时也会对样本的特征进行随机抽取(即每次划分只考虑部分特征而不是全部特征);不同之处在于,它连划分阈值也是随机选取的,而不是在候选特征上搜索最优阈值。换句话说,对于特征集 X,随机森林在行(样本)和列(特征)上随机,Extremely Randomized Trees 则在此基础上对划分点也进行随机。
而 Boosting 是一族将弱学习器提升为强学习器的算法。这族算法的工作机制类似:首先根据初始训练集训练出一个基学习器,然后根据基学习器的表现调整样本分布,使得让基学习器犯错的样本在训练下一个学习器时得到更大的权重,从而提高下一个学习器在上一个学习器分错的样本上的表现;然而该学习器仍会犯错,于是将该步骤反复进行,直到达到某个指标。