Machine learning baseline models (this collection is still being updated...)

machine learning

K-nearest neighbor (iris flower species prediction)

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# GridSearchCV is used in step 4 but is missing from this snippet's import
# list; import it here so the script does not fail with a NameError.
from sklearn.model_selection import GridSearchCV

# 1. Load the dataset
iris = load_iris()

# 2. Basic data handling
# x_train, x_test, y_train, y_test are the training features, test features,
# training targets and test targets respectively.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=22
)

# 3. Feature engineering: standardization
# The scaler is fitted on the training split only and then reused on the test
# split, so both splits are scaled with the same statistics.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. Machine learning (model training)
# A fixed configuration would look like:
#   KNeighborsClassifier(n_neighbors=9, algorithm='auto')
#
# n_neighbors:
#   int, optional (default = 5); the number of neighbours used by default for
#   k-neighbours queries.
# algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}
#   The fast k-nearest-neighbour search algorithm. The default 'auto' lets the
#   estimator pick a suitable search algorithm; the user may also choose
#   ball_tree, kd_tree or brute explicitly.
#   - brute is brute-force search, i.e. a linear scan; it is very slow when the
#     training set is large.
#   - kd_tree builds a kd-tree, a binary tree that stores the data for fast
#     retrieval. The tree is built by splitting on the median, each node is a
#     hyper-rectangle, and it is efficient when the dimensionality is below 20.
#   - ball tree was invented to overcome the kd-tree's breakdown in high
#     dimensions; it partitions the sample space by a centroid C and a radius
#     r, and each node is a hypersphere.
estimator = KNeighborsClassifier()
# Candidate values of k that the grid search tries with 3-fold cross-validation.
param_dict = {"n_neighbors": [1, 3, 5]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5. Model evaluation
# Method 1: compare the predictions with the true labels
y_predict = estimator.predict(x_test)
print("预测结果为:\n", y_predict)
print("比对真实值和预测值：\n", y_predict == y_test)
# Method 2: compute the accuracy directly
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

Linear Regression (Boston House Price Forecast)

normal equation

def linear_model1():
    """
    Linear regression solved with the normal equation (LinearRegression).

    Loads the Boston housing data, standardizes the features, fits the model
    and prints the test-set predictions, the learned coefficients and
    intercept, and the mean squared error on the test split.

    :return: None
    """
    # These names are not imported anywhere in the surrounding snippet, so
    # they are imported locally to keep the function self-contained.
    # NOTE: load_boston was removed in scikit-learn 1.2; running this function
    # requires an older scikit-learn release or a replacement dataset loader.
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error

    # 1. Load the data
    data = load_boston()

    # 2. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3. Feature engineering: standardization
    # Fit the scaler on the training split only; the test split must be scaled
    # with the training mean/variance, not with its own statistics.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 4. Machine learning: linear regression (normal equation)
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)

    # 5. Model evaluation
    # 5.1 Predictions and fitted parameters
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 Metric: mean squared error
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

    return None

gradient descent

def linear_model2():
    """
    Linear regression fitted with stochastic gradient descent (SGDRegressor).

    Loads the Boston housing data, standardizes the features, fits the model
    and prints the test-set predictions, the learned coefficients and
    intercept, and the mean squared error on the test split.

    :return: None
    """
    # These names are not imported anywhere in the surrounding snippet, so
    # they are imported locally to keep the function self-contained.
    # NOTE: load_boston was removed in scikit-learn 1.2; running this function
    # requires an older scikit-learn release or a replacement dataset loader.
    from sklearn.datasets import load_boston
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_squared_error

    # 1. Load the data
    data = load_boston()

    # 2. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3. Feature engineering: standardization
    # Fit the scaler on the training split only; the test split must be scaled
    # with the training mean/variance, not with its own statistics.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 4. Machine learning: linear regression (gradient descent)
    estimator = SGDRegressor(max_iter=1000)
    estimator.fit(x_train, y_train)

    # 5. Model evaluation
    # 5.1 Predictions and fitted parameters
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 Metric: mean squared error
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

    return None

Logistic regression (cancer classification prediction)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import ssl

# SECURITY NOTE(review): this replaces the default HTTPS context with an
# unverified one, which turns off certificate verification for downloads that
# rely on that default context. It is kept so the snippet's behaviour is
# unchanged, but prefer fixing the local certificate store.
ssl._create_default_https_context = ssl._create_unverified_context

# 1. Load the data
# The raw file has no header row, so the column names are supplied explicitly.
names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
         'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
         'Normal Nucleoli', 'Mitoses', 'Class']

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                   names=names)
# Notebook-style preview; the result is discarded when run as a script.
data.head()

# 2. Basic data handling
# 2.1 Missing values: "?" is replaced by NaN and those rows are then dropped.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna()
# 2.2 Select features and target
# Columns 1..9 are the features; column 0 (the sample code number) is skipped
# and the last column, "Class", is the target.
x = data.iloc[:, 1:10]
x.head()
y = data["Class"]
y.head()
# 2.3 Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
# 3. Feature engineering (standardization)
# The scaler is fitted on the training split and reused on the test split.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4. Machine learning (logistic regression)
estimator = LogisticRegression()
estimator.fit(x_train, y_train)
# 5. Model evaluation
y_predict = estimator.predict(x_test)
# Notebook-style expressions: the predictions and the accuracy are evaluated
# but not printed when this runs as a script.
y_predict
estimator.score(x_test, y_test)

Decision tree (Titanic passenger survival prediction)

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

if __name__ == '__main__':

    # Load the dataset
    titan = pd.read_csv("./data/train.csv")
    # Alternative source:
    # titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    print(titan.describe())

    # Select features and target
    x = titan[["Pclass", "Age", "Sex"]].copy()
    y = titan["Survived"]

    # The missing ages are filled in before training, using the mean age.
    # Assign the result back instead of using a chained inplace fillna, which
    # does not reliably update `x` under pandas Copy-on-Write.
    x["Age"] = x["Age"].fillna(value=titan["Age"].mean())

    # Split into training and test sets
    x_train, x_test, y_train, y_test = \
        train_test_split(x, y, random_state=22, test_size=0.2)

    # Categorical features are one-hot encoded for the decision tree;
    # DictVectorizer does this from a list of per-row dicts.
    x_train = x_train.to_dict(orient="records")
    x_test = x_test.to_dict(orient="records")

    # Fit the vectorizer on the training rows only and reuse it for the test
    # rows, so both matrices share the same feature columns in the same order.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # Model training
    estimator = DecisionTreeClassifier(max_depth=15)
    estimator.fit(x_train, y_train)

    # Model evaluation
    y_pre = estimator.predict(x_test)
    accuracy = estimator.score(x_test, y_test)
    print("accuracy:", accuracy)

Ensemble learning (random forest)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

"""
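Survival   whether the passenger survived: 0 = No, 1 = Yes
pclass     ticket class: 1 = 1st, 2 = 2nd, 3 = 3rd
Sex        sex
Age        age
sibsp      number of siblings the passenger has aboard
parch      number of parents and children the passenger has aboard
ticket     ticket number
fare       ticket price
cabin      cabin number
embarked   port of embarkation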
"""


if __name__ == '__main__':

    # 1. Load the dataset
    titan = pd.read_csv("./data/train.csv")

    # 2. Select features and target
    x = titan[["Pclass", "Age", "Sex"]].copy()
    y = titan["Survived"]

    # 3. Handle missing values: fill missing ages with the mean age.
    # Assign the result back instead of using a chained inplace fillna, which
    # does not reliably update `x` under pandas Copy-on-Write.
    x["Age"] = x["Age"].fillna(value=titan["Age"].mean())
    # Notebook-style preview; the result is discarded when run as a script.
    x.head()

    # 4. Split into training and test sets
    x_train, x_test, y_train, y_test = \
        train_test_split(x, y, random_state=22, test_size=0.2)

    # 5. One-hot encode the categorical features
    x_train = x_train.to_dict(orient="records")
    x_test = x_test.to_dict(orient="records")

    # Fit the vectorizer on the training rows only and reuse it for the test
    # rows, so both matrices share the same feature columns in the same order.
    transfer = DictVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 6. Random forest + grid search
    estimator = RandomForestClassifier()
    # Every combination of these values is evaluated with 3-fold cross-validation.
    param = {"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]}
    grid_search = GridSearchCV(estimator, param_grid=param, cv=3)
    grid_search.fit(x_train, y_train)

    # 7. Model evaluation
    accuracy = grid_search.score(x_test, y_test)
    print("accuracy:", accuracy)

Clustering Algorithm

sklearn.cluster.KMeans(n_clusters=8)

Parameters:
- n_clusters: the number of clusters to form
  - Integer, default = 8. This is the number of clusters generated, which is also the number of centroids.
Methods:
- estimator.fit(x)
- estimator.predict(x)
- estimator.fit_predict(x)
  - Computes the cluster centers and predicts which cluster each sample belongs to; equivalent to calling fit(x) first and then predict(x).

XGBoost

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
# 1. Load the data
titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

# 2. Basic data handling
# 2.1 Select features and target
# .copy() makes `x` an independent frame, so filling values below does not
# operate on a slice of `titan` (avoids SettingWithCopy problems).
x = titan[["pclass", "age", "sex"]].copy()
y = titan["survived"]
# 2.2 Missing values: fill missing ages with the mean age.
# Assign the result back instead of using a chained inplace fillna, which does
# not reliably update `x` under pandas Copy-on-Write.
x["age"] = x["age"].fillna(x["age"].mean())
# 2.3 Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3. Feature engineering (dict feature extraction)
# The features contain categorical symbols, so they are one-hot encoded with
# DictVectorizer. It takes a list of per-row dicts, which is what
# DataFrame.to_dict(orient="records") produces, e.g.
# [{"pclass": "1st", "age": 29.00, "sex": "female"}, {}]
# The vectorizer is fitted on the training rows only and reused for the test
# rows, so both matrices share the same feature columns in the same order.
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train.to_dict(orient="records"))
x_test = transfer.transform(x_test.to_dict(orient="records"))

# 4. XGBoost training and evaluation
# Initial model with default parameters
from xgboost import XGBClassifier
xg = XGBClassifier()
xg.fit(x_train, y_train)
# Notebook-style expression: the accuracy is evaluated but not printed when
# this runs as a script.
xg.score(x_test, y_test)
# Tune max_depth
# NOTE(review): the range starts at 0; confirm how the installed XGBoost
# version treats max_depth=0 before relying on the first data point.
depth_range = range(10)
score = []
for i in depth_range:
    xg = XGBClassifier(eta=1, gamma=0, max_depth=i)
    xg.fit(x_train, y_train)
    s = xg.score(x_test, y_test)
    print(s)
    score.append(s)
# Visualize test accuracy against max_depth
import matplotlib.pyplot as plt

plt.plot(depth_range, score)

plt.show()

Machine learning baseline models (this collection is still being updated...)

machine learning

K-nearest neighbor (iris flower species prediction)

Linear Regression (Boston House Price Forecast)

normal equation

gradient descent

Logistic regression (cancer classification prediction)

Decision tree (Titanic passenger survival prediction)

Ensemble learning (random forest)

Clustering Algorithm

XGBoost

Guess you like