Machine Learning in Practice: Abalone Regression Analysis (A Comparison of Regression Methods)

# Compare several regression models on the abalone dataset:
# predict ring count (an age proxy) from physical measurements.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# '%matplotlib inline' is IPython magic, not valid Python syntax in a
# plain .py file; keep it commented so the script runs outside a notebook.
# %matplotlib inline

# Tab-separated file with no header row: column 0 is sex (encoded
# -1/0/1 per the preview below), columns 1-7 are measurements,
# column 8 is the ring count (target).
data = pd.read_csv('./data/abalone.txt',
                   header=None,
                   sep='\t')
data.head()
0 1 2 3 4 5 6 7 8
0 1 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 -1 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 1 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 0 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
# Dtype / null-count overview: 4177 rows, 9 columns, no missing values.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4177 non-null   int64  
 1   1       4177 non-null   float64
 2   2       4177 non-null   float64
 3   3       4177 non-null   float64
 4   4       4177 non-null   float64
 5   5       4177 non-null   float64
 6   6       4177 non-null   float64
 7   7       4177 non-null   float64
 8   8       4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB
# Summary statistics including 1% / 99% percentiles (to eyeball outliers),
# transposed so each row is one column of the dataset.
data.describe([0.01,0.99]).T
count mean std min 1% 50% 99% max
0 4177.0 0.052909 0.822240 -1.0000 -1.00000 0.0000 1.00000 1.0000
1 4177.0 0.523992 0.120093 0.0750 0.19500 0.5450 0.73500 0.8150
2 4177.0 0.407881 0.099240 0.0550 0.14000 0.4250 0.58000 0.6500
3 4177.0 0.139516 0.041827 0.0000 0.04500 0.1400 0.22000 1.1300
4 4177.0 0.828742 0.490389 0.0020 0.03576 0.7995 2.14442 2.8255
5 4177.0 0.359367 0.221963 0.0010 0.01350 0.3360 0.99778 1.4880
6 4177.0 0.180594 0.109614 0.0005 0.00788 0.1710 0.47610 0.7600
7 4177.0 0.238831 0.139203 0.0015 0.01038 0.2340 0.62000 1.0050
8 4177.0 9.933684 3.224169 1.0000 4.00000 9.0000 20.00000 29.0000
# Distribution of the target (ring counts, last column) -- the output
# below shows it concentrated around 8-11 rings with long tails.
data.iloc[:,-1].value_counts()
9     689
10    634
8     568
11    487
7     391
12    267
6     259
13    203
14    126
5     115
15    103
16     67
17     58
4      57
18     42
19     32
20     26
3      15
21     14
23      9
22      6
27      2
24      2
1       1
26      1
29      1
2       1
25      1
Name: 8, dtype: int64
# Features = every column except the last; target = last column (rings).
# .copy() so later transforms don't mutate the original frame.
X = data.iloc[:,:-1].copy()
y = data.iloc[:,-1].copy()
# Candidate regressors to compare (Lasso/Ridge imported but unused below).
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
C:\Anaconda\lib\site-packages\xgboost\compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean / unit variance, keeping the
# original column labels.
# NOTE(review): the scaler is fit on the FULL dataset before the
# train/test split below, which leaks test-set statistics into training;
# for a strict evaluation, fit the scaler on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
s_X = pd.DataFrame(scaled_values, columns=X.columns)
s_X
0 1 2 3 4 5 6 7
0 1.151980 -0.574558 -0.432149 -1.064424 -0.641898 -0.607685 -0.726212 -0.638217
1 1.151980 -1.448986 -1.439929 -1.183978 -1.230277 -1.170910 -1.205221 -1.212987
2 -1.280690 0.050033 0.122130 -0.107991 -0.309469 -0.463500 -0.356690 -0.207139
3 1.151980 -0.699476 -0.432149 -0.347099 -0.637819 -0.648238 -0.607600 -0.602294
4 -0.064355 -1.615544 -1.540707 -1.423087 -1.272086 -1.215968 -1.287337 -1.320757
... ... ... ... ... ... ... ... ...
4172 -1.280690 0.341509 0.424464 0.609334 0.118813 0.047908 0.532900 0.073062
4173 1.151980 0.549706 0.323686 -0.107991 0.279929 0.358808 0.309362 0.155685
4174 1.151980 0.632985 0.676409 1.565767 0.708212 0.748559 0.975413 0.496955
4175 -1.280690 0.841182 0.777187 0.250672 0.541998 0.773341 0.733627 0.410739
4176 1.151980 1.549052 1.482634 1.326659 2.283681 2.640993 1.787449 1.840481

4177 rows × 8 columns

# 80/20 train/test split; fixed seed (19) makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(s_X,y,test_size = 0.2,random_state=19)
# Random forest with 500 trees. random_state pins the bootstrap and
# feature sampling so the reported score is reproducible -- the original
# left it unset, so its R^2 varied from run to run.
RF = RandomForestRegressor(n_estimators = 500, random_state=19)
RF.fit(X_train,y_train)
# R^2 on the held-out test set (~0.50 in the recorded run).
RF.score(X_test,y_test)
0.5040997932157716
# Ordinary least-squares baseline.
lr = LinearRegression().fit(X_train, y_train)
# R^2 on the held-out test set.
lr.score(X_test, y_test)
0.46853356891462006
# k-nearest-neighbours regressor with default settings.
knn = KNeighborsRegressor()
knn = knn.fit(X_train, y_train)
# R^2 on the held-out test set.
knn.score(X_test, y_test)
0.4416450520111813
# Single unconstrained decision tree. random_state makes tie-breaking
# among equally good splits deterministic (unset in the original, so the
# score varied between runs). The very low R^2 recorded (~0.05) shows a
# lone fully-grown tree badly overfits this data compared to the ensembles.
dt = DecisionTreeRegressor(random_state=19)
dt.fit(X_train,y_train)
# R^2 on the held-out test set.
dt.score(X_test,y_test)
0.054795460519916794
# AdaBoost over depth-3 trees. random_state pins the boosting's sample
# reweighting draws so the score is reproducible (unset in the original).
# NOTE(review): 'base_estimator' was renamed 'estimator' in
# scikit-learn 1.2; keep the old keyword here since this environment's
# sklearn accepts it without a deprecation warning.
ada = AdaBoostRegressor(
    base_estimator = DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    learning_rate=1.0,
    random_state=19
)
ada.fit(X_train,y_train)
# R^2 on the held-out test set.
ada.score(X_test,y_test)
0.2854128443586734
# Gradient-boosted trees with default hyper-parameters.
gbdt = GradientBoostingRegressor()
gbdt = gbdt.fit(X_train, y_train)
# R^2 on the held-out test set.
gbdt.score(X_test, y_test)
0.49482544308232923
# XGBoost with default hyper-parameters.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# R^2 on the held-out test set.
xgb.score(X_test, y_test)
C:\Anaconda\lib\site-packages\xgboost\data.py:208: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index





0.44712420076610004

Source: blog.csdn.net/qq_33489955/article/details/124313112