Disclaimer: this content is not original; the code comes from Baosir.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

seeds = pd.read_csv('data/seeds.csv', sep='\t', header=None)
seeds.head()
       0      1       2      3      4      5      6     7
0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  Kama
1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  Kama
2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  Kama
3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  Kama
4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  Kama
seeds[7].value_counts()

Kama        70
Rosa        70
Canadian    70
Name: 7, dtype: int64

seeds[7].value_counts().plot(kind='bar')

<AxesSubplot:>
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = seeds.iloc[:, :7].copy()
X.shape

(210, 7)

X
         0      1       2      3      4      5      6
0    15.26  14.84  0.8710  5.763  3.312  2.221  5.220
1    14.88  14.57  0.8811  5.554  3.333  1.018  4.956
2    14.29  14.09  0.9050  5.291  3.337  2.699  4.825
3    13.84  13.94  0.8955  5.324  3.379  2.259  4.805
4    16.14  14.99  0.9034  5.658  3.562  1.355  5.175
..     ...    ...     ...    ...    ...    ...    ...
205  12.19  13.20  0.8783  5.137  2.981  3.631  4.870
206  11.23  12.88  0.8511  5.140  2.795  4.325  5.003
207  13.20  13.66  0.8883  5.236  3.232  8.315  5.056
208  11.84  13.21  0.8521  5.175  2.836  3.598  5.044
209  12.30  13.34  0.8684  5.243  2.974  5.637  5.063

210 rows × 7 columns
y = seeds.iloc[:, -1].copy()
y.shape

(210,)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
def knn_score(k, X, y):
    """Return mean test and train accuracy of a k-NN classifier over repeated splits."""
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = []
    train_scores = []
    for i in range(100):
        # note: random_state is fixed, so all 100 iterations use the identical split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        knn.fit(X_train, y_train)
        scores.append(knn.score(X_test, y_test))
        train_scores.append(knn.score(X_train, y_train))
    return np.array(scores).mean(), np.array(train_scores).mean()

result_dict = {}
k_list = [1, 3, 5, 7, 9, 11]
for k in k_list:
    score, train_score = knn_score(k, X, y)
    result_dict[k] = [score, train_score]
result_dict
{1: [0.9047619047619047, 1.0],
3: [0.9047619047619047, 0.9642857142857139],
5: [0.8571428571428572, 0.9285714285714287],
7: [0.8571428571428572, 0.9345238095238096],
9: [0.8809523809523812, 0.9226190476190478],
11: [0.8809523809523812, 0.9226190476190478]}
pd.DataFrame(result_dict).T

           0         1
1   0.904762  1.000000
3   0.904762  0.964286
5   0.857143  0.928571
7   0.857143  0.934524
9   0.880952  0.922619
11  0.880952  0.922619
result = pd.DataFrame(result_dict).T.copy()
result.columns = ['Test', 'Train']
result

        Test     Train
1   0.904762  1.000000
3   0.904762  0.964286
5   0.857143  0.928571
7   0.857143  0.934524
9   0.880952  0.922619
11  0.880952  0.922619
result.plot()
plt.xticks(k_list)
plt.show()
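As a side note (a sketch, not part of the original notebook): because train_test_split above uses a fixed random_state, the 100 repetitions inside knn_score all evaluate the same split. A more stable estimate per k could be obtained with scikit-learn's cross_val_score, which averages accuracy over genuinely different folds; this reuses X, y and k_list as defined above.

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# mean 5-fold cross-validated accuracy for each candidate k
cv_scores = {
    k: cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
    for k in k_list
}
cv_scores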
Advanced Edition
X.shape

(210, 7)

X.describe().T
   count       mean       std      min       25%       50%        75%      max
0  210.0  14.847524  2.909699  10.5900  12.27000  14.35500  17.305000  21.1800
1  210.0  14.559286  1.305959  12.4100  13.45000  14.32000  15.715000  17.2500
2  210.0   0.870999  0.023629   0.8081   0.85690   0.87345   0.887775   0.9183
3  210.0   5.628533  0.443063   4.8990   5.26225   5.52350   5.979750   6.6750
4  210.0   3.258605  0.377714   2.6300   2.94400   3.23700   3.561750   4.0330
5  210.0   3.700201  1.503557   0.7651   2.56150   3.59900   4.768750   8.4560
6  210.0   5.408071  0.491480   4.5190   5.04500   5.22300   5.877000   6.5500
def standard_X(X):
    """Standardize each column to zero mean and unit variance, returning a copy."""
    X_copy = X.copy()
    for col_name in X_copy.columns:
        col_data = X_copy[[col_name]]
        stand_data = StandardScaler().fit_transform(col_data.values)
        X_copy[col_name] = stand_data
    return X_copy

standard_X(X).describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
   count          mean       std       min        1%       25%       50%       75%       99%       max
0  210.0 -5.392512e-17  1.002389 -1.466714 -1.397504 -0.887955 -0.169674  0.846599  2.072913  2.181534
1  210.0  9.146123e-17  1.002389 -1.649686 -1.474607 -0.851433 -0.183664  0.887069  2.023505  2.065260
2  210.0  1.322091e-15  1.002389 -2.668236 -2.588824 -0.598079  0.103993  0.711677  1.678118  2.006586
3  210.0 -2.182910e-15  1.002389 -1.650501 -1.464372 -0.828682 -0.237628  0.794595  2.154459  2.367533
4  210.0 -2.030122e-16  1.002389 -1.668209 -1.634930 -0.834907 -0.057335  0.804496  1.936725  2.055112
5  210.0 -3.679596e-16  1.002389 -1.956769 -1.857934 -0.759148 -0.067469  0.712379  2.519905  3.170590
6  210.0 -1.337554e-16  1.002389 -1.813288 -1.633810 -0.740495 -0.377459  0.956394  2.130797  2.328998
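Note that the per-column loop in standard_X is equivalent to fitting a single StandardScaler on the whole feature matrix, since StandardScaler standardizes each column independently anyway; a minimal equivalent sketch (stand_X_alt is just an illustrative name):

from sklearn.preprocessing import StandardScaler

# standardize every column at once; same result as standard_X(X)
stand_X_alt = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)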
Viewing the data distribution
Describing the standardized data and looking at the 99th percentile shows that columns 2 and 5 differ noticeably from the other columns: their extreme values sit much further from the bulk of the data, suggesting long tails or outliers.
stand_X = standard_X(X)
for col_name in stand_X.columns:
    sns.distplot(stand_X[col_name])
    plt.title(col_name)
    plt.show()
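Note: sns.distplot is deprecated in recent seaborn releases. Roughly the same plots can be drawn with histplot (a sketch, assuming seaborn >= 0.11):

for col_name in stand_X.columns:
    sns.histplot(stand_X[col_name], kde=True)  # histogram with a KDE overlay, similar to distplot
    plt.title(col_name)
    plt.show()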
Binning
For example, given the incomes 10, 3000, 5000, and 10000000, using 5000 as the split point maps each value to a low-income or high-income category. This reduces the differences between extreme values in the data.
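A minimal sketch of that income example (the values, the 5000 threshold, and the variable name income are just the illustration above, not part of the seeds data):

income = pd.Series([10, 3000, 5000, 10000000])
# two bins split at 5000: (-inf, 5000] -> 'low', (5000, +inf) -> 'high'
binned = pd.cut(income, bins=[-float('inf'), 5000, float('inf')], labels=['low', 'high'])
binned.tolist()  # ['low', 'low', 'low', 'high']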
X[0] = pd.cut(X[0], bins=5, labels=[0, 1, 2, 3, 4])
X[0]

0      2
1      2
2      1
3      1
4      2
      ..
205    0
206    0
207    1
208    0
209    0
Name: 0, Length: 210, dtype: category
Categories (5, int64): [0 < 1 < 2 < 3 < 4]
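pd.cut with an integer bins argument produces equal-width intervals over the column's range. To see which value ranges the five bins correspond to, retbins=True also returns the computed edges; a small sketch using the original raw column seeds[0] (since X[0] has already been overwritten above):

# equal-width binning of the raw column; edges is an array of the 6 bin boundaries
codes, edges = pd.cut(seeds[0], bins=5, labels=[0, 1, 2, 3, 4], retbins=True)
edges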
sns.countplot(X[0])

C:\Anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='0', ylabel='count'>
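The FutureWarning above can be avoided by passing the column as a keyword argument, which is what newer seaborn versions expect:

sns.countplot(x=X[0])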
for col_name in X.columns:
    X[col_name] = pd.cut(X[col_name], bins=5, labels=[0, 1, 2, 3, 4])
X
     0  1  2  3  4  5  6
0    2  2  2  2  2  0  1
1    2  2  3  1  2  0  1
2    1  1  4  1  2  1  0
3    1  1  3  1  2  0  0
4    2  2  4  2  3  0  1
..  .. .. .. .. .. .. ..
205  0  0  3  0  1  1  0
206  0  0  1  0  0  2  1
207  1  1  3  0  2  4  1
208  0  0  1  0  0  1  1
209  0  0  2  0  1  3  1

210 rows × 7 columns
knn = KNeighborsClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
knn.fit(X_train, y_train)

KNeighborsClassifier()

knn.score(X_train, y_train)

0.9166666666666666

knn.score(X_test, y_test)

0.9523809523809523
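As a possible follow-up (a sketch, not part of the original notebook): instead of binning, scaling can be combined with KNN in a scikit-learn Pipeline so the scaler is fit only on the training split; this reuses the raw features from seeds, and n_neighbors=5 is just the default choice.

from sklearn.pipeline import make_pipeline

# raw, un-binned features and labels as loaded earlier
X_raw = seeds.iloc[:, :7]
y_raw = seeds.iloc[:, -1]
X_tr, X_te, y_tr, y_te = train_test_split(X_raw, y_raw, test_size=0.2, random_state=1)

# the scaler is fit on the training fold only, so no information leaks into the test score
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipe.fit(X_tr, y_tr)
pipe.score(X_te, y_te)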