python_数据_knn_分类_2

knn_2

start

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
# Read the comma-separated records from the local text file into a DataFrame,
# then preview the first rows to check the column layout.
df = pd.read_csv(filepath_or_buffer='./adults.txt')
df.head()
@id age workclass final_weight education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country salary
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

根据各项条件(X)判断薪资(y)

# Target vector: the 'salary' column.
y = df['salary']
# Feature matrix: every column except the last one.
X = df.iloc[:, :df.shape[1] - 1]

清洗-去除一些属性

# Columns removed before modelling:
#   final_weight               - meaning unclear ("final weight"?)
#   education                  - similar in meaning to education_num; the
#                                numeric version is the one kept
#   capital_gain, capital_loss - capital gain / capital loss, ignored here
X2 = X.drop(
    columns=['final_weight', 'education', 'capital_gain', 'capital_loss'],
)
# Summarise the remaining columns (non-null counts and dtypes).
X2.info()
  • <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 32561 entries, 0 to 32560
    Data columns (total 10 columns):
    age 32561 non-null int64 ########## age : 32561项 不存在空项 int64类型
    workclass 32561 non-null object
    education_num 32561 non-null int64
    marital_status 32561 non-null object
    occupation 32561 non-null object
    relationship 32561 non-null object
    race 32561 non-null object
    sex 32561 non-null object
    hours_per_week 32561 non-null int64
    native_country 32561 non-null object
    dtypes: int64(3), object(7)
    memory usage: 2.5+ MB

量化属性值

# String-valued columns that are replaced by integer codes so that the
# classifier below can be fitted on purely numeric features.
cols = ['marital_status', 'occupation',
        'relationship', 'race', 'sex', 'native_country', 'workclass']
for col in cols:
    # pd.factorize numbers each distinct value by the position of its first
    # appearance, i.e. the same code as its index in X2[col].unique().
    # It does this in one pass, instead of scanning the unique-value array
    # with np.argwhere once per row as the previous per-element lookup did.
    # NOTE: a missing value is coded as -1 here, whereas the earlier lookup
    # would have raised IndexError; the preceding X2.info() output reports
    # no nulls, so the two agree on this data.
    X2[col] = pd.factorize(X2[col])[0]
# Confirm that every column is now an integer dtype.
X2.info()
  • <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 32561 entries, 0 to 32560
    Data columns (total 10 columns):
    age 32561 non-null int64
    workclass 32561 non-null int64
    education_num 32561 non-null int64
    marital_status 32561 non-null int64
    occupation 32561 non-null int64
    relationship 32561 non-null int64
    race 32561 non-null int64
    sex 32561 non-null int64
    hours_per_week 32561 non-null int64
    native_country 32561 non-null int64
    dtypes: int64(10)
    memory usage: 2.5 MB
%%time
from sklearn.model_selection import train_test_split

# Estimate accuracy on the integer-coded features: repeat a random 80/20
# train/test split n_rounds times and accumulate the mean test score in s_.
n_rounds = 20
knn = KNeighborsClassifier()
s_ = 0
for round_idx in range(n_rounds):
    x_train, x_test, y_train, y_test = train_test_split(
        X2, y, test_size=0.2)
    knn.fit(x_train, y_train)
    accuracy = knn.score(x_test, y_test)
    s_ += accuracy / n_rounds
# Value recorded by the original author: s_ = 0.7958007062797482
# (wall time: 9.58 s)

独热编码量化属性值

  • pandas中的独热量化方法
# Start again from the raw features, dropping the same four columns as before.
X3 = X.drop(
    columns=['final_weight', 'education', 'capital_gain', 'capital_loss'],
)
# One-hot encode with pandas: only the non-numeric columns are expanded.
X4 = pd.get_dummies(data=X3)
# Shape recorded by the original author: (32561, 89)
X4.shape
%%time   
# Estimate accuracy on the pandas one-hot features: repeat a random 80/20
# train/test split n_rounds times and accumulate the mean test score in s_.
n_rounds = 5
knn = KNeighborsClassifier(n_neighbors=5)
s_ = 0
for round_idx in range(n_rounds):
    x_train, x_test, y_train, y_test = train_test_split(
        X4, y, test_size=0.2)
    knn.fit(x_train, y_train)
    accuracy = knn.score(x_test, y_test)
    s_ += accuracy / n_rounds
# Value recorded by the original author: s_ = 0.8109933978197452
  • sklearn预处理preprocessing中也提供了相关方法
# scikit-learn ships its own one-hot encoder as well.
from sklearn.preprocessing import OneHotEncoder

one_hot_encode = OneHotEncoder()
# Learn the categories of each column of X3, then encode; unlike
# pd.get_dummies, every column is one-hot converted here.
one_hot_encode.fit(X3)
X_o_h = one_hot_encode.transform(X3)

X_o_h是稀疏矩阵,大大降低了内存的消耗

%%time
# Estimate accuracy on the sklearn one-hot matrix: repeat a random 80/20
# train/test split n_rounds times and accumulate the mean test score in s_.
n_rounds = 5
knn = KNeighborsClassifier(n_neighbors=5)
s_ = 0
for round_idx in range(n_rounds):
    x_train, x_test, y_train, y_test = train_test_split(
        X_o_h, y, test_size=0.2)
    knn.fit(x_train, y_train)
    accuracy = knn.score(x_test, y_test)
    s_ += accuracy / n_rounds
# Value recorded by the original author: s_ = 0.8117918010133578
# (wall time: 35.1 s -- noticeably slower)

猜你喜欢

转载自blog.csdn.net/sinat_39045958/article/details/86584813