knn_2
start
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
# Load the table from the local CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./adults.txt')
df.head(n=5)
@id | age | workclass | final_weight | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | salary |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
根据各项条件(X)判断薪资(y)
# Target: the 'salary' column.  Features: every column except the last one.
y = df.loc[:, 'salary']
X = df.iloc[:, 0:-1]
清洗-去除一些属性
# Columns removed before modelling:
#   final_weight               - meaning unclear to the author ("final weight"?)
#   education                  - close in meaning to education_num; the numeric one is kept
#   capital_gain, capital_loss - capital gains / capital losses, ignored
X2 = X.drop(columns=['final_weight', 'education', 'capital_gain', 'capital_loss'])
X2.info()
- <class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 10 columns):
age 32561 non-null int64 ########## age : 32561项 不存在空项 int64类型
workclass 32561 non-null object
education_num 32561 non-null int64
marital_status 32561 non-null object
occupation 32561 non-null object
relationship 32561 non-null object
race 32561 non-null object
sex 32561 non-null object
hours_per_week 32561 non-null int64
native_country 32561 non-null object
dtypes: int64(3), object(7)
memory usage: 2.5+ MB
量化属性值
# Integer-encode the categorical columns: each distinct value is replaced by
# the position of its first appearance in that column (0, 1, 2, ...).
cols = ['marital_status', 'occupation',
        'relationship', 'race', 'sex', 'native_country', 'workclass']
for col in cols:
    # pd.factorize numbers values in order of first appearance, which is the
    # same code as the index into X2[col].unique(), but it is computed in one
    # vectorised pass instead of scanning the unique-value array once per row.
    # `uq` holds the distinct values, so uq[code] recovers the original label.
    # NOTE: a missing value would be coded as -1 here; the info() output for
    # this table reports no nulls in these columns.
    codes, uq = pd.factorize(X2[col])
    X2[col] = codes
X2.info()
- <class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 10 columns):
age 32561 non-null int64
workclass 32561 non-null int64
education_num 32561 non-null int64
marital_status 32561 non-null int64
occupation 32561 non-null int64
relationship 32561 non-null int64
race 32561 non-null int64
sex 32561 non-null int64
hours_per_week 32561 non-null int64
native_country 32561 non-null int64
dtypes: int64(10)
memory usage: 2.5 MB
%%time
from sklearn.model_selection import train_test_split
s_ = 0
knn = KNeighborsClassifier()
for i in range(20):
x_train,x_test,y_train,y_test = train_test_split(X2,y,test_size = 0.2)
knn.fit(x_train,y_train)
s_ += knn.score(x_test,y_test)/20
# s_ # 0.7958007062797482 # Wall time: 9.58 s
独热编码量化属性值
- pandas中的独热量化方法
# Rebuild the pruned feature table from X, then one-hot encode it with pandas.
X3 = X.drop(columns=['final_weight', 'education', 'capital_gain', 'capital_loss'])
# get_dummies only converts the non-numeric columns.
X4 = pd.get_dummies(data=X3)
X4.shape  # recorded: (32561, 89)
%%time
s_ = 0
knn = KNeighborsClassifier(5)
for i in range(5):
x_train,x_test,y_train,y_test = train_test_split(X4,y,test_size = 0.2)
knn.fit(x_train,y_train)
s_ += knn.score(x_test,y_test)/5
# s_ # 0.8109933978197452
- sklearn预处理preprocessing中也提供了相关方法
from sklearn.preprocessing import OneHotEncoder  # scikit-learn also ships a one-hot encoder

one_hot_encode = OneHotEncoder()
# Applied to the whole of X3, so every column is one-hot encoded.
X_o_h = one_hot_encode.fit(X3).transform(X3)
X_o_h是稀疏矩阵,大大降低了内存的消耗
%%time
s_ = 0
knn = KNeighborsClassifier(5)
for i in range(5):
x_train,x_test,y_train,y_test = train_test_split(X_o_h,y,test_size = 0.2)
knn.fit(x_train,y_train)
s_ += knn.score(x_test,y_test)/5
# s_ # 0.8117918010133578 # Wall time: 35.1 s 耗时长