import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Synthetic demo data: 20 uniform random draws per condition. Conditions 2 and
# 3 are rescaled by 0.9 and 1.1 so the three columns have different ranges.
df = pd.DataFrame(
    {
        'Condition 1': np.random.rand(20),
        'Condition 2': 0.9 * np.random.rand(20),
        'Condition 3': 1.1 * np.random.rand(20),
    }
)
# Bare expression: in a notebook cell this displays the frame.
df
|
Condition 1 |
Condition 2 |
Condition 3 |
0 |
0.150388 |
0.319698 |
0.307660 |
1 |
0.969830 |
0.613011 |
0.695216 |
2 |
0.660890 |
0.552131 |
0.229432 |
3 |
0.574232 |
0.679883 |
0.738781 |
4 |
0.527174 |
0.578460 |
0.981132 |
5 |
0.952754 |
0.388025 |
0.935823 |
6 |
0.077330 |
0.331501 |
0.663525 |
7 |
0.288425 |
0.755113 |
0.829731 |
8 |
0.398153 |
0.668251 |
0.674626 |
9 |
0.687752 |
0.540433 |
0.971847 |
10 |
0.470583 |
0.352360 |
0.249517 |
11 |
0.643588 |
0.240827 |
0.640346 |
12 |
0.278763 |
0.012188 |
0.506313 |
13 |
0.486791 |
0.538330 |
0.005713 |
14 |
0.661333 |
0.101712 |
0.868087 |
15 |
0.420160 |
0.640365 |
0.388247 |
16 |
0.932169 |
0.580433 |
0.594378 |
17 |
0.956558 |
0.878580 |
0.458417 |
18 |
0.637018 |
0.058973 |
0.338527 |
19 |
0.950942 |
0.647577 |
0.687604 |
# One 10x8 inch figure holding a single axes for the bar chart.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
# stacked: whether the bars are stacked; False draws the columns of each row
# side by side instead of on top of each other.
df.plot(kind='bar', ax=ax, stacked=False)
<matplotlib.axes._subplots.AxesSubplot at 0xa263898>
from matplotlib.ticker import FuncFormatter
# Divide every row by its own total so the columns of each row sum to 1.
row_totals = df.sum(axis=1)
df_ratio = df.div(row_totals, axis=0)

# Stacked bars of the row-wise shares on a fresh default-sized figure.
fig, ax = plt.subplots()
df_ratio.plot(kind='bar', ax=ax, stacked=True)


def _as_percent(y, _):
    """Format tick value *y* (a fraction) as a whole-number percentage."""
    return '{:.0%}'.format(y)


# Show the y axis ticks as percentages instead of raw fractions.
ax.yaxis.set_major_formatter(FuncFormatter(_as_percent))
# Location of the CSV file to download.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv'
# Cells containing "?" are parsed as NaN.
df = pd.read_csv(url, na_values=["?"])
# Preview the first five rows (displayed when run as a notebook cell).
df.head(5)
|
Age |
Number of sexual partners |
First sexual intercourse |
Num of pregnancies |
Smokes |
Smokes (years) |
Smokes (packs/year) |
Hormonal Contraceptives |
Hormonal Contraceptives (years) |
IUD |
... |
STDs: Time since first diagnosis |
STDs: Time since last diagnosis |
Dx:Cancer |
Dx:CIN |
Dx:HPV |
Dx |
Hinselmann |
Schiller |
Citology |
Biopsy |
0 |
18 |
4.0 |
15.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
NaN |
NaN |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
15 |
1.0 |
14.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
NaN |
NaN |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
34 |
1.0 |
NaN |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
NaN |
NaN |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
3 |
52 |
5.0 |
16.0 |
4.0 |
1.0 |
37.0 |
37.0 |
1.0 |
3.0 |
0.0 |
... |
NaN |
NaN |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
4 |
46 |
3.0 |
21.0 |
4.0 |
0.0 |
0.0 |
0.0 |
1.0 |
15.0 |
0.0 |
... |
NaN |
NaN |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
5 rows × 36 columns
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22.
# SimpleImputer (added in 0.20) is its replacement; fall back to the old
# class only on installations that predate it.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

# Fill missing values: every NaN is replaced by the mean of its column.
# fit_transform returns a plain array, so the original column and index
# labels are passed back in when rebuilding the DataFrame.
impute = pd.DataFrame(
    SimpleImputer(strategy="mean").fit_transform(df),
    columns=df.columns,
    index=df.index,
)
# Preview the first five rows (displayed when run as a notebook cell).
impute.head()
|
Age |
Number of sexual partners |
First sexual intercourse |
Num of pregnancies |
Smokes |
Smokes (years) |
Smokes (packs/year) |
Hormonal Contraceptives |
Hormonal Contraceptives (years) |
IUD |
... |
STDs: Time since first diagnosis |
STDs: Time since last diagnosis |
Dx:Cancer |
Dx:CIN |
Dx:HPV |
Dx |
Hinselmann |
Schiller |
Citology |
Biopsy |
0 |
18.0 |
4.0 |
15.0000 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
6.140845 |
5.816901 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
15.0 |
1.0 |
14.0000 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
6.140845 |
5.816901 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
34.0 |
1.0 |
16.9953 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
... |
6.140845 |
5.816901 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
3 |
52.0 |
5.0 |
16.0000 |
4.0 |
1.0 |
37.0 |
37.0 |
1.0 |
3.0 |
0.0 |
... |
6.140845 |
5.816901 |
1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4 |
46.0 |
3.0 |
21.0000 |
4.0 |
0.0 |
0.0 |
0.0 |
1.0 |
15.0 |
0.0 |
... |
6.140845 |
5.816901 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
5 rows × 36 columns
%matplotlib notebook
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
# Use every column except 'Dx:Cancer' as PCA input; 'Dx:Cancer' itself is
# kept aside and only used to colour the points.
features = impute.drop('Dx:Cancer', axis=1)
y = impute["Dx:Cancer"]

# Project the features onto their first three principal components.
pca = PCA(n_components=3)
X_r = pca.fit_transform(features)
print("Explained variance:\nPC1 {:.2%}\nPC2 {:.2%}\nPC3 {:.2%}"
      .format(*pca.explained_variance_ratio_[:3]))

fig = plt.figure()
# Create the 3-D axes through the figure. Constructing Axes3D(fig) directly
# no longer attaches the axes to the figure in current Matplotlib releases
# (deprecated since 3.4), which leaves an empty canvas.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=y, cmap=plt.cm.coolwarm)

# Label the axes
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')