There are exercise datasets for a total of 5 people: A, B, C, D, and E. Each person's data has several rows and 41 columns, i.e. 41 feature values. The features at each time step include the person's body temperature, acceleration, the magnetic field of the person's environment, and so on. Based on these feature values, the person's motion state at that time (running, cycling, or squatting) can be predicted.
The data of these five people are stored in featurePaths, featurePaths = ['A/A.feature','B/B.feature','C/C.feature','D/D.feature','E/E.feature']
The movement status of these five people is stored in labelPaths, labelPaths = ['A/A.label','B/B.label','C/C.label','D/D.label','E/E.label']
#监督学习--分类算法---KNN——决策树——朴素贝叶斯
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer #导入预处理模块Imputer
from sklearn.model_selection import train_test_split #导入自动生成训练集和测试集的模块train_teast_split
from sklearn.metrics import classification_report #导入预测结果评估模块classification——report
#导入三个分类器模块
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
def load_datasets(feature_paths, label_paths):
    """Read feature and label files, impute missing features, and merge.

    Parameters
    ----------
    feature_paths : list of str
        Paths to comma-separated feature files: 41 columns, no header,
        '?' marks a missing value.
    label_paths : list of str
        Paths to single-column label files, no header.

    Returns
    -------
    feature : np.ndarray of shape (n_samples, 41)
        All feature rows stacked in file order, missing values replaced
        by the per-column mean of the file they came from.
    label : np.ndarray of shape (n_samples,)
        All labels flattened to a 1-D vector, in file order.
    """
    feature_parts = []
    for path in feature_paths:
        # '?' marks missing values; the files carry no header row.
        df = pd.read_csv(path, sep=',', na_values='?', header=None)
        # Column-wise mean imputation.  This replaces
        # sklearn.preprocessing.Imputer, which was removed in
        # scikit-learn 0.22 (the top-of-file import of it is now unused
        # and can be dropped).
        df = df.fillna(df.mean())
        feature_parts.append(df.to_numpy())

    label_parts = []
    for path in label_paths:
        # Single-column label file, no header.
        df = pd.read_csv(path, header=None)
        label_parts.append(df.to_numpy())

    # Concatenate once at the end instead of re-allocating inside the
    # loop; fall back to empty arrays when no paths were given.
    feature = (np.concatenate(feature_parts)
               if feature_parts else np.empty((0, 41)))
    label = (np.concatenate(label_parts)
             if label_parts else np.empty((0, 1)))
    # Flatten labels to the 1-D shape the classifiers expect.
    return feature, np.ravel(label)
if __name__ == '__main__':
    featurePaths = ['A/A.feature', 'B/B.feature', 'C/C.feature',
                    'D/D.feature', 'E/E.feature']
    labelPaths = ['A/A.label', 'B/B.label', 'C/C.label',
                  'D/D.label', 'E/E.label']

    # Subjects A-D form the training set; subject E is held out as the
    # test set so the evaluation measures generalization to a new person.
    x_train, y_train = load_datasets(featurePaths[:4], labelPaths[:4])
    x_test, y_test = load_datasets(featurePaths[4:], labelPaths[4:])

    # Shuffle the training data.  The original code abused
    # train_test_split(..., test_size=0.0) for this, which raises
    # ValueError on modern scikit-learn (test_size must be positive);
    # a random permutation expresses the intent directly.
    perm = np.random.permutation(len(x_train))
    x_train, y_train = x_train[perm], y_train[perm]

    # Train each classifier, then print its precision/recall/F1 report
    # on the held-out subject.
    classifiers = [
        ('knn', KNeighborsClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('Bayes', GaussianNB()),
    ]
    answers = []
    for name, clf in classifiers:
        print('Start training %s' % name)
        clf.fit(x_train, y_train)
        print('Training done')
        answers.append((name, clf.predict(x_test)))
        print('Prediction done')

    for name, answer in answers:
        print('\n\nThe classification report for %s:' % name)
        print(classification_report(y_test, answer))