"""PCA on the Iris dataset, step by step.

Loads the data, plots per-feature class histograms, standardizes the
features, eigendecomposes the covariance matrix, reports the explained
variance, and projects the data onto the two leading principal components.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

# ---- Load data ----
iris = load_iris()
data = iris["data"]        # (150, 4) feature matrix
labels = iris["target"]    # integer class labels 0 / 1 / 2

df = pd.DataFrame(data, columns=["sepal_len", "sepal_width", "petal_len", "petal_width"])

# Map integer targets to readable species names.
# FIX: the original used the long-removed DataFrame.ix indexer (gone since
# pandas 1.0) and a label_dict keyed 1..3 while targets are 0..2; a single
# .map with correctly keyed names replaces both.
label_dict = {0: "Iris-Setosa",
              1: "Iris-Versicolor",
              2: "Iris-Virginica"}
df["species"] = pd.Series(labels).map(label_dict).values

X = data
y = df["species"].values

feature_dict = {0: "sepal length(cm)",
                1: "sepal width(cm)",
                2: "petal length(cm)",
                3: "petal width(cm)"}

# ---- Per-feature histograms, one subplot per feature, one color per class ----
plt.figure(figsize=(14, 8))
for i in range(4):
    plt.subplot(2, 2, i + 1)
    for lab in label_dict.values():
        plt.hist(X[y == lab, i], bins=20, alpha=0.8, label=lab)
    plt.xlabel(feature_dict[i])
    plt.legend(loc="upper right", fancybox=True, fontsize=8)
plt.show()

# ---- Standardize (zero mean, unit variance) so features are comparable ----
X_std = StandardScaler().fit_transform(X)

# Sample covariance matrix of the standardized data (ddof=1, matches np.cov).
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot(X_std - mean_vec) / (X_std.shape[0] - 1)

# Eigendecomposition: eigenvectors are the principal axes,
# eigenvalues their variances along those axes.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# Pair each eigenvalue magnitude with its eigenvector; sort largest first.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda p: p[0], reverse=True)

print("Eigenvalues in descending order:")
for val, _vec in eig_pairs:
    print(val)
print("==========================================")

# ---- Explained variance (percent per component) and its cumulative sum ----
tot = sum(eig_vals)
vars_exp = [(v / tot) * 100 for v in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(vars_exp)
print(vars_exp)      # e.g. [72.77, 23.03, 3.68, 0.52]
print(cum_var_exp)   # e.g. [72.77, 95.80, 99.48, 100.]

plt.figure(figsize=(10, 8))
plt.bar(range(4), vars_exp, alpha=0.5, align="center", label="individual explained variance ")
plt.step(range(4), cum_var_exp, where="mid", label="cumulative explained variance ")
plt.xlabel("Principal components")
plt.ylabel("Explained variance ratio")
plt.legend(loc="best")
plt.tight_layout()
plt.show()

# ---- Projection matrix from the two leading eigenvectors (4 -> 2 dims) ----
matrix_w = np.hstack((eig_pairs[0][1].reshape(4, 1),
                      eig_pairs[1][1].reshape(4, 1)))
print("matrix_w", matrix_w)

Y = X_std.dot(matrix_w)  # data projected onto the first two principal components
print("Y", Y)

# ---- Compare: raw first-two-features scatter vs. PCA-projected scatter ----
plt.figure(figsize=(6, 4))
for lab, col in zip(label_dict.values(), ("blue", "red", "green")):
    plt.scatter(X[y == lab, 0], X[y == lab, 1], label=lab, c=col)
plt.xlabel("sepal_len")
plt.ylabel("sepal_wid")
plt.legend(loc="best")
plt.tight_layout()
plt.show()

plt.figure(figsize=(6, 4))
for lab, col in zip(label_dict.values(), ("blue", "red", "green")):
    plt.scatter(Y[y == lab, 0], Y[y == lab, 1], label=lab, c=col)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(loc="lower center")
plt.tight_layout()
plt.show()