Machine Learning, Day 2

0. A simple machine learning workflow

(Figure: overview of the machine learning workflow)

Text version of the machine learning workflow:


  1. Understand the raw data and be clear about what the task is

  2. Basic data processing: use pandas to handle the data (missing values, joining tables); a short pandas sketch follows this list

  3. Feature engineering: process the features

  4. Find a suitable algorithm to make predictions
    model = algorithm + data

  5. Evaluate the model and decide whether the result is good enough
    - if not:
    1 -> switch algorithms or tune parameters
    2 -> revisit feature engineering

  6. Put the model into production
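
As a minimal sketch of step 2 (added for this post, not part of the original notes), the snippet below uses pandas to fill a missing value and join two tables; the column names and numbers are made up purely for illustration.

import pandas as pd
import numpy as np

# Two made-up tables: user information and their scores
users = pd.DataFrame({"user_id": [1, 2, 3], "age": [25, np.nan, 31]})
scores = pd.DataFrame({"user_id": [1, 2, 3], "score": [88, 92, np.nan]})

# Handle missing values: fill age with the column mean, drop rows missing a score
users["age"] = users["age"].fillna(users["age"].mean())
scores = scores.dropna(subset=["score"])

# Join the two tables on user_id
merged = pd.merge(users, scores, on="user_id", how="inner")
print(merged)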



1. KNN algorithm

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def knn_iris():
    """
    Classify the iris dataset with the KNN algorithm
    """
    # 1. Load the dataset
    iris = load_iris()
    # 2. Split the dataset: test_size=0.5 means a 50/50 train/test split
    x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.5,random_state=2)
    print(x_train.shape)
    print(x_test.shape)
    # 3. Feature engineering: standardization
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4. KNN estimator
    estimator = KNeighborsClassifier(n_neighbors=1)
    estimator.fit(x_train,y_train)
    # 5. Model evaluation
    # Method 1: compare the true and predicted values directly
    y_predict = estimator.predict(x_test)
    print("y_predict:\n",y_predict)
    print("Direct comparison of true and predicted values:\n",y_test == y_predict)
    # Method 2: compute the accuracy
    # input -----> x and y
    score = estimator.score(x_test,y_test)
    print("Accuracy:\n",score)
    return None

if __name__ == '__main__':
    knn_iris()
(75, 4)
(75, 4)
y_predict:
 [0 0 2 0 0 2 0 2 2 0 0 0 0 0 1 1 0 1 2 1 1 1 2 1 1 0 0 2 0 2 2 0 1 2 1 0 2
 1 1 2 1 1 2 1 0 1 0 1 0 0 0 1 2 2 0 2 2 2 1 0 0 2 1 1 2 2 1 0 1 0 2 1 1 0
 1]
Direct comparison of true and predicted values:
 [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True False  True  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True]
Accuracy:
 0.96
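
To make the standardization step above concrete, here is a small sketch added for this post (toy numbers): StandardScaler learns the mean and standard deviation from the training data only via fit_transform, and reuses those same statistics on the test data via transform, which is why fit is never called on x_test.

import numpy as np
from sklearn.preprocessing import StandardScaler

train = np.array([[1.0], [2.0], [3.0]])   # toy training feature
test = np.array([[4.0]])                  # toy test feature

scaler = StandardScaler()
print(scaler.fit_transform(train))   # scaled with the training mean and std
print(scaler.transform(test))        # the training statistics are reused here
print(scaler.mean_, scaler.scale_)   # mean_=[2.], scale_=std of the training column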

2. Grid search and cross-validation

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV        # grid search and cross-validation

def knn_iriscv():
    """
    Classify the iris dataset with KNN, adding grid search and cross-validation
    """
    # 1. Load the dataset
    iris = load_iris()
    # 2. Split the dataset
    x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.5,random_state=2)
    # 3. Feature engineering: standardization
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # 4. KNN estimator
    estimator = KNeighborsClassifier()
    # Add grid search and cross-validation
    # Prepare the parameter grid
    param_dict = {"n_neighbors":[1,2,3,4,5,6,7]}

    estimator = GridSearchCV(estimator,param_grid=param_dict,cv=10) # cv=10 means 10-fold cross-validation
    estimator.fit(x_train,y_train)

    # 5. Model evaluation: compute the accuracy
    score = estimator.score(x_test,y_test)
    # GridSearchCV automatically refits with the best parameters before predicting

    # ------------------------------------------------
    print("Accuracy:\n",score)
    # Best parameters: best_params_
    print("Best parameters:\n",estimator.best_params_)
    # Best score: best_score_       !!! this is the best result on the validation folds
    print("Best score:\n",estimator.best_score_)
    # Best estimator: best_estimator_
    print("Best estimator:\n",estimator.best_estimator_)
    # Cross-validation results: estimator.cv_results_
    # print("Cross-validation results:\n",estimator.cv_results_)
    # ------------------------------------------------
    return None

if __name__ == '__main__':
    knn_iriscv()
Accuracy:
 0.96
Best parameters:
 {'n_neighbors': 1}
Best score:
 0.9466666666666667
Best estimator:
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
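
The commented-out cv_results_ print above dumps a large dictionary. As a more readable alternative, here is a minimal sketch written for this post (it refits a small grid on the raw iris data without standardization, so its numbers will not match the run above) that pairs each candidate parameter setting with its mean validation score:

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={"n_neighbors": [1, 3, 5, 7]}, cv=10)
grid.fit(iris.data, iris.target)

# cv_results_ is a dict of arrays; zip the candidates with their mean scores
for params, mean in zip(grid.cv_results_["params"],
                        grid.cv_results_["mean_test_score"]):
    print(params, round(mean, 3))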

3. KNN in practice

Face check-in case study

4. Naive Bayes

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import datasets
import numpy as np
def nb_news():
    """
    Classify newsgroup posts with naive Bayes
    """
    # 1) Load the data
    news = fetch_20newsgroups(subset="train")
    """
    print(len(news.data))       # more than 10,000 documents in total
    print(news.data[0])         # look at the first document
    """
    # 2) Split the dataset
    x_train,x_test,y_train,y_test = train_test_split(news.data, news.target)
    # 3) Feature engineering: text feature extraction with tf-idf
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4) Naive Bayes estimator
    estimator = MultinomialNB()
    estimator.fit(x_train,y_train)
    # 5) Model evaluation
    # Compute the accuracy
    score = estimator.score(x_test,y_test)
    print("Accuracy:",score)
    return None

nb_news()
Accuracy: 0.8416401555319901
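
To see what the tf-idf step feeds into MultinomialNB, here is a toy sketch added for this post; the four sentences and their labels are made up for illustration only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ["the game went to overtime",        # sports
        "the team won the final match",     # sports
        "the gpu renders the image fast",   # tech
        "install the driver for the gpu"]   # tech
labels = [0, 0, 1, 1]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)    # sparse matrix: one row per document
print(X.shape)                 # (4, number of distinct words)

clf = MultinomialNB().fit(X, labels)
# Classify a new sentence by transforming it with the same vectorizer
print(clf.predict(vec.transform(["the match went to the final"])))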

5. Decision Tree

from sklearn.tree import DecisionTreeClassifier,export_graphviz # export a decision tree for visualization
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

def decision_iris():
    """
    Classify the iris dataset with a decision tree
    """
    # 1) Load the dataset
    iris = load_iris()

    # 2) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target,random_state=22)

    # !! No distances are computed here, so standardizing the data is optional
    # 3) Decision tree estimator
    estimator = DecisionTreeClassifier() #(criterion="entropy") # use information gain
    estimator.fit(x_train,y_train)

    # 4) Model evaluation: compute the accuracy
    score = estimator.score(x_test,y_test)
    print("Decision tree accuracy:",score)
    # Export the decision tree in Graphviz DOT format
    export_graphviz(estimator,out_file="./tree.dot",feature_names=iris.feature_names)
    return None

decision_iris()
Decision tree accuracy: 0.9210526315789473
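
export_graphviz only writes a DOT text file; rendering it as an image needs a separate tool. Here is a small sketch added for this post, assuming the optional graphviz Python package (and the Graphviz binaries it wraps) is installed:

import graphviz

# Read the DOT file written by export_graphviz above and render it to tree.png
with open("./tree.dot") as f:
    dot_source = f.read()
graphviz.Source(dot_source).render("tree", format="png", cleanup=True)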

Source: blog.csdn.net/qq_43497702/article/details/100069523