机器学习基线模型更新中...

机器学习

K-近邻(鸢尾花种类预测)

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 1. Load the iris dataset.
iris = load_iris()

# 2. Basic data handling.
# x_train / x_test are the training / test feature values;
# y_train / y_test are the training / test target values.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=22
)

# 3. Feature engineering: standardization.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. Machine learning (model training).
# A fixed configuration would look like
# KNeighborsClassifier(n_neighbors=9, algorithm='auto'); here the number of
# neighbours is chosen by grid search instead.
#
# Notes on the KNeighborsClassifier parameters:
# n_neighbors:
#   int, optional (default = 5); number of neighbours used by default for
#   neighbour queries.
# algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}
#   Fast k-nearest-neighbour search algorithm. The default 'auto' lets the
#   estimator pick a suitable search algorithm itself; ball_tree, kd_tree or
#   brute can also be requested explicitly.
#   - brute is brute-force search, i.e. a linear scan; it is very
#     time-consuming when the training set is large.
#   - kd_tree builds a kd-tree, a binary tree structure that stores the data
#     for fast retrieval. The tree is built by splitting on the median, each
#     node is a hyper-rectangle, and it is efficient when the number of
#     dimensions is below 20.
#   - ball tree was invented to overcome the kd-tree's breakdown in high
#     dimensions; it partitions the sample space by a centroid C and a
#     radius r, and each node is a hypersphere.
estimator = KNeighborsClassifier()
param_dict = {"n_neighbors": [1, 3, 5]}
# GridSearchCV must be imported explicitly (see the imports above); the
# original snippet used it without importing it.
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5. Model evaluation.
# Method 1: compare the predicted values with the true values.
y_predict = estimator.predict(x_test)
print("预测结果为:\n", y_predict)
print("比对真实值和预测值：\n", y_predict == y_test)
# Method 2: compute the accuracy directly.
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

线性回归(波士顿房价预测)

正规方程

def linear_model1():
    """
    Linear regression: normal equation.

    Loads the Boston housing data, splits it into training and test sets,
    standardizes the features, fits a ``LinearRegression`` estimator and
    prints the predictions, coefficients, intercept and mean squared error
    for the test split.

    :return: None
    """
    # Imported locally so the function does not depend on imports made by
    # other snippets in this file.
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so this
    # function needs an older scikit-learn release -- confirm the version.
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    # 1. Load the data.
    data = load_boston()

    # 2. Split the dataset.
    x_train, x_test, y_train, y_test = train_test_split(
        data.data, data.target, random_state=22
    )

    # 3. Feature engineering: standardization.
    # Fit the scaler on the training split only; the test split must be
    # scaled with the training mean/std, not with its own statistics.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 4. Machine learning: linear regression (normal equation).
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)

    # 5. Model evaluation.
    # 5.1 Predictions, coefficients and intercept.
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 Evaluation: mean squared error.
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

    return None

梯度下降

def linear_model2():
    """
    Linear regression: gradient descent.

    Loads the Boston housing data, splits it into training and test sets,
    standardizes the features, fits an ``SGDRegressor`` estimator and prints
    the predictions, coefficients, intercept and mean squared error for the
    test split.

    :return: None
    """
    # Imported locally so the function does not depend on imports made by
    # other snippets in this file.
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so this
    # function needs an older scikit-learn release -- confirm the version.
    from sklearn.datasets import load_boston
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    # 1. Load the data.
    data = load_boston()

    # 2. Split the dataset.
    x_train, x_test, y_train, y_test = train_test_split(
        data.data, data.target, random_state=22
    )

    # 3. Feature engineering: standardization.
    # Fit the scaler on the training split only; the test split must be
    # scaled with the training mean/std, not with its own statistics.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 4. Machine learning: linear regression (gradient descent).
    estimator = SGDRegressor(max_iter=1000)
    estimator.fit(x_train, y_train)

    # 5. Model evaluation.
    # 5.1 Predictions, coefficients and intercept.
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 Evaluation: mean squared error.
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

    return None

逻辑回归(癌症分类预测)

import ssl

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# SECURITY NOTE(review): this replaces the default HTTPS context factory with
# the unverified one, i.e. certificate verification is switched off for the
# download below. Kept as in the original; prefer fixing the local CA store or
# loading a local copy of the data file.
ssl._create_default_https_context = ssl._create_unverified_context

# 1. Load the data.
names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
         'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
         'Normal Nucleoli', 'Mitoses', 'Class']

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                   names=names)

# 2. Basic data handling.
# 2.1 Missing values: the code treats "?" as the missing-value marker, turns
# it into NaN and drops the affected rows.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna()
# 2.2 Select the feature values (columns 1..9) and the target value.
x = data.iloc[:, 1:10]
y = data["Class"]
# 2.3 Split the data.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3. Feature engineering (standardization): fit on the training split only.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. Machine learning (logistic regression).
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

# 5. Model evaluation.
# The results are printed explicitly: a bare expression is silently discarded
# when this code runs as a script.
y_predict = estimator.predict(x_test)
print("预测结果为:\n", y_predict)
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

决策树(泰坦尼克号乘客生存预测)

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

if __name__ == '__main__':

    # Load the dataset.
    titan = pd.read_csv("./data/train.csv")
    print(titan.describe())

    # Select the feature values and the target value.
    x = titan[["Pclass", "Age", "Sex"]].copy()
    y = titan["Survived"]

    # Missing ages are filled with the mean age before training.
    # Assign the result back instead of calling fillna(inplace=True) on
    # x['Age']: the chained in-place form does not update x under pandas
    # Copy-on-Write.
    x["Age"] = x["Age"].fillna(titan["Age"].mean())

    # Split the dataset.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=22, test_size=0.2
    )

    # One-hot encode the categorical features via DictVectorizer, which takes
    # a list of per-row dicts.
    x_train = x_train.to_dict(orient="records")
    x_test = x_test.to_dict(orient="records")

    # Fit the vectorizer on the training rows only and reuse it for the test
    # rows, so both matrices share the same column layout.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # Model training.
    estimator = DecisionTreeClassifier(max_depth=15)
    estimator.fit(x_train, y_train)

    # Model evaluation.
    accuracy = estimator.score(x_test, y_test)
    print("accuracy:", accuracy)

集成学习(随机森林)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

"""
Survival   是否生存	0 = No, 1 = Yes
pclass	   船票等级	1 = 1st, 2 = 2nd, 3 = 3rd
Sex        性别
Age	       年龄
sibsp	   乘客在船上兄弟姐妹的数量
parch	   乘客在船上父母和孩子的数量
ticket	   船票编号
fare	   船票价格
cabin	   客舱号码
embarked   登船港口
"""


if __name__ == '__main__':

    # 1. Load the dataset.
    titan = pd.read_csv("./data/train.csv")

    # 2. Select the feature values and the target value.
    x = titan[["Pclass", "Age", "Sex"]].copy()
    y = titan["Survived"]

    # 3. Handle missing values: fill missing ages with the mean age.
    # Assign the result back instead of calling fillna(inplace=True) on
    # x['Age']: the chained in-place form does not update x under pandas
    # Copy-on-Write.
    x["Age"] = x["Age"].fillna(titan["Age"].mean())

    # 4. Split the dataset.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=22, test_size=0.2
    )

    # 5. One-hot encode the categorical features.
    x_train = x_train.to_dict(orient="records")
    x_test = x_test.to_dict(orient="records")

    # Fit the vectorizer on the training rows only and reuse it for the test
    # rows, so both matrices share the same column layout.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 6. Random forest + grid search.
    estimator = RandomForestClassifier()
    param = {
        "n_estimators": [100, 120, 300],
        "max_depth": [3, 7, 11],
    }
    grid_search = GridSearchCV(estimator, param_grid=param, cv=3)
    grid_search.fit(x_train, y_train)

    # 7. Model evaluation.
    accuracy = grid_search.score(x_test, y_test)
    print("accuracy:", accuracy)

聚类算法

sklearn.cluster.KMeans(n_clusters=8)

参数:
- n_clusters:开始的聚类中心数量
  - 整型，缺省值=8，生成的聚类数，即产生的质心（centroids）数。
方法:
- estimator.fit(x)
- estimator.predict(x)
- estimator.fit_predict(x)
  - 计算聚类中心并预测每个样本属于哪个类别,相当于先调用fit(x),然后再调用predict(x)

XGBoost

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# 1. Load the data.
titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

# 2. Basic data handling.
# 2.1 Select the feature values and the target value.
# .copy() gives x its own data, so filling values below is not a write into a
# slice of titan.
x = titan[["pclass", "age", "sex"]].copy()
y = titan["survived"]
# 2.2 Missing values: fill missing ages with the mean age.
# Assign the result back instead of calling fillna(inplace=True) on x['age']:
# the chained in-place form does not update x under pandas Copy-on-Write.
x["age"] = x["age"].fillna(x["age"].mean())
# 2.3 Split the dataset.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3. Feature engineering (dict feature extraction).
# The features contain categorical symbols, so they are one-hot encoded with
# DictVectorizer. It takes a list of per-row dicts, e.g.
# [{"pclass": "1st", "age": 29.00, "sex": "female"}, {}],
# which is what to_dict(orient="records") produces.
# Fit on the training rows only and reuse the fitted vectorizer for the test
# rows, so both matrices share the same column layout.
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train.to_dict(orient="records"))
x_test = transfer.transform(x_test.to_dict(orient="records"))

# 4. XGBoost model training and evaluation.
# Initial training with default parameters; the score is printed because a
# bare expression is silently discarded when this code runs as a script.
xg = XGBClassifier()
xg.fit(x_train, y_train)
print(xg.score(x_test, y_test))

# Tune max_depth.
# NOTE(review): range(10) starts at 0, and XGBoost documents max_depth=0 as
# "no limit" rather than a depth-0 tree -- confirm this is the intended range.
depth_range = range(10)
score = []
for i in depth_range:
    xg = XGBClassifier(eta=1, gamma=0, max_depth=i)
    xg.fit(x_train, y_train)
    s = xg.score(x_test, y_test)
    print(s)
    score.append(s)

# Visualize the results: test accuracy against max_depth.
plt.plot(depth_range, score)

plt.show()

机器学习基线模型更新中...

机器学习

K-近邻(鸢尾花种类预测)

线性回归(波士顿房价预测)

正规方程

梯度下降

逻辑回归(癌症分类预测)

决策树(泰坦尼克号乘客生存预测)

集成学习(随机森林)

聚类算法

XGBoost

猜你喜欢