Table of contents
1. Data preprocessing
1.1 Exploration of basic data information
1.2 Data Visualization
1.3 Outlier processing
2. Data eigenvalue extraction
2.1 Data Standardization
2.2 PCA to extract eigenvalues
3. Data clustering to identify types of medicinal materials
3.1 Elbow diagram to determine K value
3.2 Determine the K value from the contour coefficient map
3.3 Data clustering
3.4 Visualization of clustering results
4. Study the characteristics and differences of different kinds of medicinal materials
4.1 Mean curves of spectral data of different kinds of medicinal materials
4.2 Standard deviation curves of spectral data of different kinds of medicinal materials
4.3 Calculate the spectral information divergence SID of each type of Chinese herbal medicine spectrum
1. Data preprocessing
1.1 Exploration of basic data information
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Configure matplotlib, load the raw spectra and inspect their basic properties.
plt.rcParams['font.sans-serif'] = 'SimHei' # enable display of Chinese text in figures
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_excel('附件1.xlsx',index_col = 0) # index_col selects the column used as the index
data.head()
data.shape # dimensions of the data
# (425, 3348)
data.info() # basic information about the data
data.isnull().any().any() # check for missing values
# False
Exploration shows that the data contain no missing values.
1.2 Data Visualization
# Data visualization
# Plot helper: func_2 applies it to the data one row at a time.
# The arguments passed to plt.plot are redacted in this copy of the source.
def func_1(x):
plt.plot(#####)
# Draw all rows of `data` on a single 8x6 inch, 300 dpi figure by applying
# func_1 to each row (axis = 1), then label the axes, enable the grid and show it.
# NOTE(review): indentation was lost in this copy; the lines up to plt.show()
# presumably form the function body and the final line is the top-level call.
def func_2(data):
fontsize = 5
plt.figure(figsize=(8, 6), dpi = 300)
# (the next line is redacted in this copy of the source)
########
plt.yticks(fontsize = fontsize)
plt.xlabel('波数(cm^-1)')
plt.ylabel('吸光度(AU)')
plt.grid(True) # enable grid lines
data.agg(lambda x: func_1(x), axis = 1)
plt.show()
func_2(data)
The visualization shows three spectra that clearly deviate from the rest; they may be outliers or may form a separate class.
1.3 Outlier processing
Apply the three-sigma (3σ) rule to check whether the data contain outliers, output the indices of the outliers, and delete them in preparation for the subsequent identification of the types of medicinal materials.
# Outlier test using the 3-sigma rule
def func_3(x):
    """Flag values lying more than three standard deviations from the mean.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Values to test. Any object providing ``mean()``, ``std()`` and
        element-wise comparison works; for a DataFrame the statistics are
        taken per column.

    Returns
    -------
    Same type as ``x``
        Boolean mask that is True where a value is strictly below
        ``mean - 3*std`` or strictly above ``mean + 3*std``.
    """
    # Compute each statistic once instead of once per bound.
    center = x.mean()
    spread = 3 * x.std()
    return (x < center - spread) | (x > center + spread)
# Run the 3-sigma test (func_3) through DataFrame.agg; ycz holds the boolean flags.
ycz = data.agg(lambda x:func_3(x))
# Index labels of the rows selected as outliers (the selection expression is
# redacted in this copy of the source).
ycz_index = data[(*******)].index
ycz_index
# Int64Index([64, 136, 201], dtype='int64', name='No')
# Drop rows in place (the labels passed to drop are redacted in this copy),
# re-plot the remaining data and save it for the next section.
data.drop(****,axis=0,inplace = True)
func_2(data)
data.to_excel('data_ycl.xlsx')
2. Data eigenvalue extraction
Features are extracted from the data in preparation for the subsequent identification of the types of medicinal materials.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Configure matplotlib and reload the cleaned data written by the previous section.
plt.rcParams['font.sans-serif'] = 'SimHei' # enable display of Chinese text in figures
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_excel('data_ycl.xlsx', index_col = 0)
2.1 Data Standardization
# 0-1 (min-max) standardization, applied column by column.
# The DataFrame methods are called with an explicit axis instead of
# np.max / np.min: NumPy forwards axis=None to pandas, and since pandas 2.0
# that reduces over both axes and returns one scalar, which would silently
# turn the per-column scaling into a global one.
arr_max = data.max(axis=0)
arr_min = data.min(axis=0)
# NOTE: a column whose minimum equals its maximum yields NaN (0/0) here,
# exactly as in the original expression.
data_bzh = (data - arr_min) / (arr_max - arr_min)
data_bzh
2.2 PCA to extract eigenvalues
Extraction of eigenvalues by principal component analysis
# Fit a PCA with all components on the standardized data and plot the
# cumulative contribution (explained variance) of the leading components.
pca = PCA()
pca.fit(data_bzh)
pca.explained_####### # contribution ratio
lg = np.cumsum(#####) #cumulative contribution ratio
# Seven hard-coded values that are plotted below.
# NOTE(review): presumably copied from the first seven entries of lg — confirm.
a = [0.59843097, 0.88309499, 0.93970633, 0.97403493, 0.9853352 ,0.98891337, 0.99174341]
plt.figure(figsize=(8, 6), dpi = 300)
# Only y-values are passed, so the x-axis is the 0-based position in `a`.
plt.plot(a)
plt.title('前七个主成分累计贡献率')
plt.xlabel('主成分')
plt.ylabel('累计贡献率')
plt.grid(True)
plt.savefig('前七个主成分累计贡献率.png')
plt.show()
Determining the number of principal components from the scree plot
# Refit the PCA with three components and project the standardized data onto them.
pca = PCA(3) # keep the principal components whose cumulative contribution exceeds 90% (3 components)
pca.fit(data_bzh)
data_jw = pca.transform(data_bzh)
data_jw
3. Data clustering to identify types of medicinal materials
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import Counter
from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
# Configure matplotlib and load the dimension-reduced data.
plt.rcParams['font.sans-serif'] = 'SimHei' # enable display of Chinese text in figures
plt.rcParams['axes.unicode_minus'] = False
# NOTE(review): 'data_jw.xlsx' is not written anywhere in the visible code;
# presumably data_jw was exported in a cell that is not shown — confirm.
data = pd.read_excel('data_jw.xlsx',index_col = 0)
3.1 Elbow diagram to determine K value
# Elbow plot: collect one SSE value per candidate k and plot them against k.
# The loop range, the model construction and the fit/append arguments are
# redacted in this copy of the source.
SSE = []
for k in ####:
############
km.fit(####)
SSE.append(#####)
# x-axis values for the plot: 1 through 10.
X = range(1, 11)
plt.figure(figsize=(8, 6), dpi = 300)
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('肘部图')
plt.plot(X, SSE, 'o-')
plt.grid(True)
plt.savefig('肘部图.png')
plt.show()
Determine the k value using the elbow plot and the silhouette coefficient plot
3.2 Determine the K value from the contour coefficient map
# Silhouette plot: collect one score per candidate k and plot them against k.
# The loop range, the model fitting and the scoring call are redacted in this
# copy of the source.
scores = []
for k in #####:
######
score = metrics.######
scores.#######
# x-axis values for the plot: 3 through 10.
X = range(3, 11)
plt.figure(figsize=(8, 6), dpi = 300)
plt.xlabel('k')
plt.ylabel('轮廓系数')
plt.title('轮廓系数图')
plt.plot(X, scores, 'o-')
plt.grid(True)
plt.savefig('轮廓系数图.png')
plt.show()
Based on the elbow plot and the silhouette coefficient plot, the k value is finally determined to be 3.
3.3 Data clustering
# Cluster the data into three groups and list the cluster assigned to each sample.
# NOTE(review): no random_state is passed, so cluster numbering and centres may
# differ between runs — confirm whether later hard-coded values rely on one run.
km = KMeans(n_clusters = 3)
km.fit(data)
print(Counter(km.labels_)) # print how many samples fall in each cluster
print(km.cluster_centers_) # cluster centres
data_1 = data.reset_index() # turn the index into a column
# Pair each sample's 'NO' value with its cluster label, side by side.
r = pd.concat([data_1['NO'], pd.Series(km.labels_)], axis = 1)
r.columns = ['NO', '聚类类别']
print(r)
3.4 Visualization of clustering results
# 3-D scatter plot of the three clusters and their centres.
#
# The samples are split with the labels of the fitted model (km.labels_)
# rather than a '类别' column: no such column is added to `data` in the visible
# code, so indexing it would raise KeyError.
labels = km.labels_
data_lei0 = data[labels == 0]
data_lei1 = data[labels == 1]
data_lei2 = data[labels == 2]

# Coordinates of each cluster's samples along the three data columns 0, 1, 2.
x0, y0, z0 = data_lei0[0], data_lei0[1], data_lei0[2]
x1, y1, z1 = data_lei1[0], data_lei1[1], data_lei1[2]
x2, y2, z2 = data_lei2[0], data_lei2[1], data_lei2[2]

# Centre coordinates come from the fitted model instead of numbers copied from
# one particular run; KMeans is not seeded, so copied values can go stale.
centers = km.cluster_centers_
x3, y3, z3 = centers[:, 0], centers[:, 1], centers[:, 2]

plt.figure(figsize=(8, 6), dpi = 300)
colors=['k', 'b', 'y', 'r']
ax = plt.subplot(111, projection='3d')
ax.plot(x0, y0, z0, 'o', color=colors[0], label='第一类')
ax.plot(x1, y1, z1, 'o', color=colors[1], label='第二类')
ax.plot(x2, y2, z2, 'o', color=colors[2], label='第三类')
ax.plot(x3, y3, z3, '*', color=colors[3], label='中心点')
plt.legend(loc='upper left', numpoints=1, ncol=3, fontsize=8, bbox_to_anchor=(0, 0))
plt.title('聚类效果图')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.savefig('聚类效果图.png')
plt.show()
This concludes the identification of the types of medicinal materials
4. Study the characteristics and differences of different kinds of medicinal materials
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Configure matplotlib, load the clustered data and split it by class label.
plt.rcParams['font.sans-serif'] = 'SimHei' # enable display of Chinese text in figures
plt.rcParams['axes.unicode_minus'] = False
# The second column of the sheet (index_col = 1) is used as the index.
data = pd.read_excel('聚类.xlsx',index_col = 1)
# One sub-frame per value of the '类别' column.
data_1 = data[data['类别']=='第一类']
data_2 = data[data['类别']=='第二类']
data_3 = data[data['类别']=='第三类']
According to a preliminary analysis of the mean curves and standard deviation curves of spectral data of different kinds of medicinal materials, it can be seen that there are some differences in characteristics such as kurtosis and peak number.
4.1 Mean curves of spectral data of different kinds of medicinal materials
# Mean spectrum of each class, drawn on one figure.
#
# DataFrame.mean is called directly (column-wise, numeric columns only)
# instead of np.mean(frame): NumPy forwards axis=None to pandas, which since
# pandas 2.0 reduces over both axes to a single scalar and no longer silently
# skips the string column '类别'.
plt.figure(figsize=(8, 6), dpi = 300)
plt.xticks(range(652, 4000, 500))
for subset, colour, name in (
    (data_1, 'r', '第一类'),
    (data_2, 'b', '第二类'),
    (data_3, 'k', '第三类'),
):
    plt.plot(subset.mean(axis=0, numeric_only=True), c=colour, label=name)
plt.grid(True)
plt.legend()
plt.xlabel('波数(cm^-1)')
plt.ylabel('吸光度(AU)')
plt.title('不同种类药材光谱数据均值曲线')
plt.savefig('不同种类药材光谱数据均值曲线.png')
plt.show()
4.2 Standard deviation curves of spectral data of different kinds of medicinal materials
# Standard-deviation spectrum of each class, drawn on one figure.
#
# DataFrame.std is called directly (column-wise, numeric columns only) instead
# of np.std(frame), whose forwarded axis=None is deprecated for DataFrame.std
# in pandas 2.x and which fails on the string column '类别'.
# ddof=0 keeps the population standard deviation that np.std requested.
plt.figure(figsize=(8, 6), dpi = 300)
plt.xticks(range(652, 4000, 500))
for subset, colour, name in (
    (data_1, 'r', '第一类'),
    (data_2, 'b', '第二类'),
    (data_3, 'k', '第三类'),
):
    plt.plot(subset.std(axis=0, ddof=0, numeric_only=True), c=colour, label=name)
plt.grid(True)
plt.legend()
plt.xlabel('波数(cm^-1)')
plt.ylabel('吸光度(AU)')
plt.title('不同种类药材光谱数据标准差曲线')
plt.savefig('不同种类药材光谱数据标准差曲线.png')
plt.show()
4.3 Calculate the spectral information divergence SID of each type of Chinese herbal medicine spectrum
Introduce an index 'spectral information divergence SID' to further explore the characteristics and differences of different types of medicinal materials
Spectral information divergence (SID) measures the similarity between two spectra (originally, between two pixels of a hyperspectral image) by treating each spectrum as a probability distribution. Unlike the Euclidean distance, it takes the variability of the spectrum itself into account, so it can give a better evaluation of spectral data.
# Mean spectrum of each class as a one-column DataFrame.
# DataFrame.mean is called directly (column-wise, numeric columns only):
# np.mean(frame) forwards axis=None, which since pandas 2.0 collapses the whole
# frame to one scalar and no longer skips the string column '类别'.
df_1 = pd.DataFrame(data_1.mean(axis=0, numeric_only=True))
df_2 = pd.DataFrame(data_2.mean(axis=0, numeric_only=True))
df_3 = pd.DataFrame(data_3.mean(axis=0, numeric_only=True))
# Column labels used when the mean spectra are reshaped for the SID calls.
index_0 = range(652, 4000)
def SID(x, y):
p = np.zeros_like(x, dtype=np.float)
q = np.zeros_like(y, dtype=np.float)
Sid = 0
for i in range(len(x)):
p[i] = x[i]/np.sum(x)
##############
for j in range(len(x)):
#############
return Sid
# Each call below transposes two class-mean frames into single-row arrays
# labelled with index_0 and passes their values to SID; the comment after each
# call is the recorded output.
# Spectral information divergence (SID) between class 1 and class 2
SID((pd.DataFrame(df_1.values.T, columns = index_0)).values, (pd.DataFrame(df_2.values.T, columns = index_0)).values)
# 0.024393900155562476
# Spectral information divergence (SID) between class 1 and class 3
SID((pd.DataFrame(df_1.values.T, columns = index_0)).values, (pd.DataFrame(df_3.values.T, columns = index_0)).values)
# 0.06295196780155943
# Spectral information divergence (SID) between class 2 and class 3
SID((pd.DataFrame(df_2.values.T, columns = index_0)).values, (pd.DataFrame(df_3.values.T, columns = index_0)).values)
# 0.1474926576547535