Port wine assessment

数据集下载
项目说明:本项目用到了机器学习中PCA降维、Kmeans聚类、多元线性回归、随机森林、Adaboost等几种方法,理解概念是基础,数学推导很重要,这里我用到了sklearn机器学习包,对经过PCA降维处理和特征工程后的数据做了训练和预测。
任务1:根据酿酒葡萄的理化指标和葡萄酒的质量对这些酿酒葡萄进行分级?
分析:分别对附件二中的红葡萄和白葡萄,根据属性进行聚类(即使用我们刚才在任务二中读取出来的数据),聚类方法采用 K-means 和 DBSCAN。(因为这个数据的属性比较多,所以在做聚类之前先进行一次降维,采用 PCA。降维之后的信息损失小于10%即可,即所选取的主特征值之和与总的特征值之和的比值大于0.9即可。)

任务2:分析酿酒葡萄与葡萄酒的理化指标之间的联系?
分析:通过多元回归的方法,建立酿酒葡萄的各个属性(经过PCA降维后的属性)与对应葡萄酒的各个理化指标之间的模型,然后用回归的残差来衡量酿酒葡萄与葡萄酒的理化指标之间的联系。

任务3:通过酿酒葡萄的属性和葡萄酒的评分,建立多元回归的预测模型?
分析:通过多元线性回归,随机森林,adaboost分别对pca之后的酿酒葡萄属性与酿酒葡萄对应的葡萄酒的评分之间建立预测模型。(即仅通过酿酒葡萄的属性来预测葡萄酒的评分。)

代码:

import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, median_absolute_error

def prepare_data(data_dir=r'F:\B 项目总结\葡萄酒_项目'):
    """Load the sample-attribute and sample-score workbooks.

    Args:
        data_dir: Directory containing ``样品属性.xls`` and ``样品评分.xls``.
            Defaults to the location the script was originally written for.

    Returns:
        A tuple ``(yb, yb_wine, score_r, score_w, score_total)`` where

        * ``yb`` is the first sheet of the attribute workbook, indexed by
          its ``样品编号`` column (the column itself is removed);
        * ``yb_wine`` is the second sheet of the attribute workbook, returned
          as read (it still contains the ``样品编号`` column);
        * ``score_r`` / ``score_w`` are the first / second sheet of the score
          workbook (presumably red and white wine scores - confirm against
          the workbook);
        * ``score_total`` is ``score_r`` and ``score_w`` stacked row-wise and
          indexed by ``样品编号`` (the column itself is removed).
    """
    import os

    attr_path = os.path.join(data_dir, '样品属性.xls')
    score_path = os.path.join(data_dir, '样品评分.xls')

    # ``sheet_name`` is the current keyword; the old ``sheetname`` spelling
    # was removed from pandas and raises TypeError on modern versions.
    yb = pd.read_excel(attr_path)
    score_r = pd.read_excel(score_path)
    score_w = pd.read_excel(score_path, sheet_name=1)
    yb_wine = pd.read_excel(attr_path, sheet_name=1)
    score_total = pd.concat([score_r, score_w], ignore_index=True)

    # Promote the sample number column to the index.
    yb.index = yb['样品编号'].tolist()
    del yb['样品编号']
    score_total.index = score_total['样品编号'].tolist()
    del score_total['样品编号']
    return yb, yb_wine, score_r, score_w, score_total


def task_one(yb, score_total):
    """Task one: cluster the grape samples by attributes and wine score.

    The scores and grape attributes are joined on ``yb``'s index, min-max
    scaled, reduced with PCA and then clustered with both KMeans and DBSCAN.
    The cluster labels (and the KMeans score) are printed; nothing is
    returned.

    Args:
        yb: Grape attribute table indexed by sample number.
        score_total: Score table indexed by sample number.
    """
    # ``join_axes`` was removed from ``pd.concat`` in pandas 1.0; aligning
    # the scores to ``yb.index`` first gives the same row set and order.
    x = pd.concat([score_total.reindex(yb.index), yb], axis=1)
    train_x = preprocessing.minmax_scale(x)

    # A float ``n_components`` keeps just enough principal components to
    # explain that fraction (here 90%) of the variance.
    pca = PCA(n_components=0.9)
    chengfen = pca.fit_transform(train_x)

    # KMeans clustering on the reduced features.
    km = KMeans(n_clusters=4, max_iter=2000)
    km.fit(chengfen)
    fenlei1 = km.predict(chengfen)
    print('KMeans聚类:', fenlei1)
    print(km.score(chengfen))

    # DBSCAN clustering on the same reduced features.
    dbscan = DBSCAN(eps=1, min_samples=2)
    dbscan.fit(chengfen)
    fenlei2 = dbscan.labels_
    print('DBSCAN聚类:', fenlei2)


def task_two(yb, yb_wine, score_total):
    """Task two: relate grape attributes to the wine's physicochemical indices.

    The grape attributes (with the ``评分`` column dropped) are min-max scaled
    and reduced to 13 principal components. For every column of the scaled
    wine table a separate linear regression on those components is fitted and
    its ``score`` (R^2 on the training data) is printed.

    Args:
        yb: Grape attribute table indexed by sample number.
        yb_wine: Wine attribute table containing a ``样品编号`` column. It is
            not modified.
        score_total: Score table indexed by sample number, containing a
            ``评分`` column.

    Returns:
        The PCA-reduced grape features, one row per entry of ``yb.index``.
    """
    # ``join_axes`` was removed from ``pd.concat`` in pandas 1.0; aligning
    # the scores to ``yb.index`` first gives the same row set and order.
    x = pd.concat([score_total.reindex(yb.index), yb], axis=1)

    # The score is not a grape attribute, so it is excluded here.
    del x['评分']
    train_grape = preprocessing.minmax_scale(x)

    # ``set_index`` returns a new frame, so the caller's ``yb_wine`` is left
    # untouched and this function can safely be called more than once.
    wine = yb_wine.set_index('样品编号').fillna(0)
    train_wine = preprocessing.minmax_scale(wine)

    pca = PCA(n_components=13)
    train_g = pca.fit_transform(train_grape)

    # Report how well the grape components explain each wine index.
    for i in range(train_wine.shape[1]):
        lr = LinearRegression()
        lr.fit(train_g, train_wine[:, i])
        # The original never substituted ``%d``; format the index explicitly.
        print('The correlation of %d-th feature:' % i,
              lr.score(train_g, train_wine[:, i]))
    return train_g


def task_three(train_g, score_total):
    """Task three: build regression models predicting wine score from grapes.

    Fits a multiple linear regression, a random forest regressor and an
    AdaBoost regressor on the PCA-reduced grape features and prints each
    model's ``score`` on the training data. Nothing is returned.

    Args:
        train_g: PCA-reduced grape features, one row per sample.
        score_total: Score table; min-max scaled and used as the target.
            NOTE(review): rows are assumed to be in the same order as
            ``train_g`` - confirm against the caller.
    """
    train_score = preprocessing.minmax_scale(score_total)
    lr = LinearRegression()
    lr.fit(train_g, train_score)
    print('多元线性回归:', lr.score(train_g, train_score))

    # The ensemble regressors expect a 1-D target. ``ravel`` flattens the
    # scaled scores for any sample count instead of the hard-coded 55.
    target = train_score.ravel()

    rfr = RandomForestRegressor()
    rfr.fit(train_g, target)
    print('随机森林回归reshape_y:', rfr.score(train_g, target))

    abr = AdaBoostRegressor()
    abr.fit(train_g, target)
    print('Adaboost回归reshape_y:', abr.score(train_g, target))

if __name__ == '__main__':
    # Load every table once, then run the three tasks in order.
    yb, yb_wine, score_r, score_w, score_total = prepare_data()

    # Task one: clustering of the grape samples.
    task_one(yb, score_total)

    # Task two yields the PCA-reduced grape features that task three reuses.
    train_g = task_two(yb, yb_wine, score_total)

    # Task three: regression models for the wine score.
    task_three(train_g, score_total)

猜你喜欢

转载自blog.csdn.net/zztingfeng/article/details/80709017
今日推荐