机器学习第二天

0. 机器学习简单流程

在这里插入图片描述

文字版机器学习流程


  1. !!!明确原始数据做什么

  2. 数据的基本处理:用 pandas(pd)处理数据(缺失值处理、合并表等)

  3. 特征工程:特征进行处理

  4. 找到合适的算法去预测
    模型:算法+数据

  5. 模型的评估,判定效果
    -如果不成功
    1—>换算法,调参数
    2—>特征工程

  6. 上线使用


目录

1.KNN算法

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def knn_iris():
    """Classify the iris dataset with a 1-nearest-neighbour model.

    Loads the iris data, splits it in half (``test_size=0.5``) with a fixed
    ``random_state``, standardises the features, fits a
    ``KNeighborsClassifier(n_neighbors=1)`` and prints the split shapes, the
    predictions, an element-wise comparison against the true labels, and
    the accuracy on the held-out half.

    Returns:
        None. All results are printed.
    """
    # 1. Load the dataset.
    dataset = load_iris()

    # 2. Split it: test_size=0.5 keeps half of the samples for evaluation.
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.5,
        random_state=2,
    )
    for split in (features_train, features_test):
        print(split.shape)

    # 3. Feature engineering: standardise. The scaler is fitted on the
    #    training half only and then reused unchanged on the test half.
    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    # 4. KNN estimator.
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(features_train, labels_train)

    # 5. Evaluation.
    # Approach one: compare predictions with the true labels directly.
    predicted = model.predict(features_test)
    print("y_predict:\n", predicted)
    print("直接对比真实值和预测值:\n", labels_test == predicted)

    # Approach two: accuracy, computed from the test features and labels.
    accuracy = model.score(features_test, labels_test)
    print("准确度为:\n", accuracy)
    return None

# Run the KNN iris demo only when this file is executed as a script.
if __name__ == '__main__':
    knn_iris()
(75, 4)
(75, 4)
y_predict:
 [0 0 2 0 0 2 0 2 2 0 0 0 0 0 1 1 0 1 2 1 1 1 2 1 1 0 0 2 0 2 2 0 1 2 1 0 2
 1 1 2 1 1 2 1 0 1 0 1 0 0 0 1 2 2 0 2 2 2 1 0 0 2 1 1 2 2 1 0 1 0 2 1 1 0
 1]
直接对比真实值和预测值:
 [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True False  True  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True]
准确度为:
 0.96

2.网格搜索和交叉验证

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV        # 交叉验证和网格搜索

def knn_iriscv():
    """Classify iris with KNN, tuning ``n_neighbors`` by grid search.

    Same data preparation as a plain KNN run (half/half split with a fixed
    ``random_state``, then standardisation), but the classifier is wrapped
    in ``GridSearchCV`` with 10-fold cross-validation over
    ``n_neighbors`` in 1..7. Prints the test accuracy, the best parameters,
    the best cross-validation score and the best estimator.

    Returns:
        None. All results are printed.
    """
    # 1. Load the dataset.
    dataset = load_iris()

    # 2. Split the dataset.
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.5,
        random_state=2,
    )

    # 3. Feature engineering: standardise (fit on the training half only).
    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    # 4. KNN estimator wrapped in grid search + cross-validation.
    candidate_params = {"n_neighbors": [1, 2, 3, 4, 5, 6, 7]}
    search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid=candidate_params,
        cv=10,  # 10-fold cross-validation
    )
    search.fit(features_train, labels_train)

    # 5. Evaluation: scoring the fitted search uses the best parameters found.
    accuracy = search.score(features_test, labels_test)

    # best_score_ is the best result on the validation folds, not the test
    # set. The full per-candidate table is available as search.cv_results_.
    report = (
        ("准确度为:\n", accuracy),
        ("最佳参数:\n", search.best_params_),
        ("最佳结果:\n", search.best_score_),
        ("最佳估计器:\n", search.best_estimator_),
    )
    for heading, value in report:
        print(heading, value)
    return None

# Run the grid-search KNN demo only when this file is executed as a script.
if __name__ == '__main__':
    knn_iriscv()
准确度为:
 0.96
最佳参数:
 {'n_neighbors': 1}
最佳结果:
 0.9466666666666667
最佳估计器:
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

3. KNN 实战

Facebook 签到位置预测实战

4.朴素贝叶斯

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import datasets
import numpy as np
def nb_news():
    """Classify the 20 newsgroups posts with multinomial naive Bayes.

    Fetches the ``"train"`` subset of 20 newsgroups, splits it with
    ``train_test_split`` (default proportions, no fixed ``random_state``,
    so the printed score can differ between runs), extracts TF-IDF
    features, fits ``MultinomialNB`` and prints the accuracy on the
    held-out part.

    Returns:
        None. The accuracy is printed.
    """
    # 1) Fetch the data.
    corpus = fetch_20newsgroups(subset="train")
    # Handy when exploring the data:
    #   print(len(corpus.data))   # number of posts (presumably a bit over
    #                             # ten thousand; TODO confirm)
    #   print(corpus.data[0])     # look at the first post

    # 2) Split the dataset.
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        corpus.data, corpus.target
    )

    # 3) Feature engineering: TF-IDF text feature extraction. The
    #    vocabulary is learned from the training documents only.
    vectorizer = TfidfVectorizer()
    docs_train = vectorizer.fit_transform(docs_train)
    docs_test = vectorizer.transform(docs_test)

    # 4) Naive Bayes estimator.
    classifier = MultinomialNB()
    classifier.fit(docs_train, labels_train)

    # 5) Evaluation: accuracy on the held-out documents.
    accuracy = classifier.score(docs_test, labels_test)
    print("准确率为", accuracy)
    return None

# Run the naive Bayes demo only when this file is executed as a script, so
# importing the module does not trigger the dataset download and training.
if __name__ == '__main__':
    nb_news()
准确率为 0.8416401555319901

5. 决策树

from sklearn.tree import DecisionTreeClassifier,export_graphviz # 可视化决策树
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

def decision_iris():
    """Classify the iris dataset with a decision tree and export the tree.

    Loads iris, splits it with a fixed ``random_state``, fits a
    ``DecisionTreeClassifier`` with default settings, prints the accuracy
    on the held-out part and writes the fitted tree to ``./tree.doc`` via
    ``export_graphviz``.

    Returns:
        None. The accuracy is printed and the tree is written to disk.
    """
    # 1) Load the dataset.
    dataset = load_iris()

    # 2) Split the dataset.
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data,
        dataset.target,
        random_state=22,
    )

    # No distance computation is involved here, so standardising the
    # features is optional and is skipped.
    # 3) Decision tree estimator. Pass criterion="entropy" to split by
    #    information gain instead of the default criterion.
    tree_model = DecisionTreeClassifier()
    tree_model.fit(features_train, labels_train)

    # 4) Evaluation: accuracy on the held-out part.
    accuracy = tree_model.score(features_test, labels_test)
    print("决策树的准确率为:", accuracy)

    # Visualise the decision tree.
    # NOTE(review): export_graphviz emits Graphviz DOT text; the ".doc"
    # extension looks like a slip for ".dot" -- confirm before renaming.
    export_graphviz(
        tree_model,
        out_file="./tree.doc",
        feature_names=dataset.feature_names,
    )
    return None

# Run the decision tree demo only when this file is executed as a script, so
# importing the module does not train a model or write the tree file.
if __name__ == '__main__':
    decision_iris()
决策树的准确率为: 0.9210526315789473
发布了31 篇原创文章 · 获赞 13 · 访问量 9893

猜你喜欢

转载自blog.csdn.net/qq_43497702/article/details/100069523