逻辑回归
import pandas as pd
import numpy as np
# Read the fruit measurements workbook; the "ID" column becomes the row index.
data = pd.read_excel('fruit_data.xlsx', index_col='ID')
# Preview the first rows of the loaded table.
data.head()
mass
width
height
color_score
fruit_name
ID
1
192
8.4
7.3
0.55
apple
2
180
8.0
6.8
0.59
apple
3
176
7.4
7.2
0.60
apple
4
178
7.1
7.8
0.92
apple
5
172
7.4
7.0
0.89
apple
# Keep only the fully populated (labelled) rows for training. dropna() hands
# back a derived frame, so take an explicit copy before adding a column;
# assigning into the un-copied result is what makes pandas emit
# SettingWithCopyWarning.
train_data = data.dropna().copy()
# Binary target: 1 for apples, 0 for every other fruit. A vectorised
# comparison replaces the per-row lambda; int64 matches the dtype the lambda
# produced.
train_data['category'] = (train_data['fruit_name'] == 'apple').astype('int64')
train_data.head()
R:\Anaconda\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mass
width
height
color_score
fruit_name
category
ID
1
192
8.4
7.3
0.55
apple
1
2
180
8.0
6.8
0.59
apple
1
3
176
7.4
7.2
0.60
apple
1
4
178
7.1
7.8
0.92
apple
1
5
172
7.4
7.0
0.89
apple
1
# Rows whose fruit_name is missing are the unlabelled samples to be predicted.
test_data = data[data['fruit_name'].isna()]
test_data
mass
width
height
color_score
fruit_name
ID
39
158
7.1
7.6
0.72
NaN
40
190
7.5
7.9
0.77
NaN
41
189
7.6
7.7
0.77
NaN
42
160
7.9
6.9
0.65
NaN
方法一:sklearn.linear_model.LogisticRegression
from sklearn. linear_model import LogisticRegression
# Target: the 0/1 'category' flag; features: every column except the last two.
y = train_data.loc[:, 'category']
X = train_data.iloc[:, :-2]
# Fit a logistic regression using scikit-learn's default settings.
LR = LogisticRegression()
LR.fit(X, y)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
# Show the intercept term learned by the scikit-learn model.
print(LR.intercept_)
[4.54213181]
# Show the learned coefficients, one per feature column of X.
print(LR.coef_)
[[-0.01125145 0.97166531 -1.314372 0.20036824]]
# Drop the last column of the unlabelled rows to obtain their feature matrix.
test = test_data.iloc[:, :-1]
# Hard class labels first, then the per-class probabilities, for each row.
print(LR.predict(test))
print(LR.predict_proba(test))
[0 0 0 1]
[[0.54530945 0.45469055]
[0.63120971 0.36879029]
[0.54143416 0.45856584]
[0.18555923 0.81444077]]
# Mean accuracy on the same data the model was fitted on (not a held-out score).
LR.score(X, y)
0.7105263157894737
方法二:statsmodels(结果与SPSS一致)
import statsmodels. api as sm
# Prepend a constant column so the Logit model can estimate an intercept.
X1 = sm.add_constant(X)
# Build the logit model on the same features and target, then fit it.
lr = sm.Logit(y, X1)
result = lr.fit()
# Coefficient table with standard errors, z statistics and p-values.
result.summary()
Optimization terminated successfully.
Current function value: 0.449106
Iterations 7
Logit Regression Results
Dep. Variable:
category
No. Observations:
38
Model:
Logit
Df Residuals:
33
Method:
MLE
Df Model:
4
Date:
Tue, 12 May 2020
Pseudo R-squ.:
0.3521
Time:
12:30:14
Log-Likelihood:
-17.066
converged:
True
LL-Null:
-26.340
Covariance Type:
nonrobust
LLR p-value:
0.0009644
coef
std err
z
P>|z|
[0.025
0.975]
const
-7.2016
14.503
-0.497
0.620
-35.627
21.224
mass
-0.0238
0.024
-0.982
0.326
-0.071
0.024
width
4.3068
1.844
2.335
0.020
0.692
7.922
height
-3.7497
1.641
-2.286
0.022
-6.965
-0.534
color_score
9.8913
5.746
1.722
0.085
-1.370
21.152
# Score the unlabelled rows with the fitted logit model. has_constant='add'
# forces the intercept column to be inserted: with the default ('skip'),
# add_constant returns the frame unchanged whenever some feature column
# already happens to be constant (e.g. a one-row test set), and predict()
# then fails because the design matrix is one column short.
result.predict(sm.add_constant(test, has_constant='add'))
ID
39 0.147665
40 0.194533
41 0.446099
42 0.972809
dtype: float64
线性判别分析
from sklearn. discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis on the same binary apple / not-apple problem.
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
# fit() returns the estimator itself, so X_r is simply another name for lda.
X_r = lda
X_r.coef_
array([[-0.03206332, 4.57480239, -2.87678633, 10.50469726]])
# Mean accuracy of the LDA classifier on its own training data.
X_r.score(X, y)
0.7631578947368421
# Predicted 0/1 labels for the unlabelled fruit rows.
X_r.predict(test)
array([0, 0, 0, 1], dtype=int64)
# Predicted 0/1 labels for the training rows, for comparison with y.
X_r.predict(X)
array([1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)
多分类
# Read the multi-class fruit workbook; no index column is set, so the default
# integer index is used.
data2 = pd.read_excel('mul_fruit.xlsx')
# Preview the first rows of the loaded table.
data2.head()
ID
mass
width
height
color_score
fruit_name
kind
0
1
192
8.4
7.3
0.55
apple
1.0
1
2
180
8.0
6.8
0.59
apple
1.0
2
3
176
7.4
7.2
0.60
apple
1.0
3
4
178
7.1
7.8
0.92
apple
1.0
4
5
172
7.4
7.0
0.89
apple
1.0
# Labelled rows (no missing values) train the multi-class model.
train_data2 = data2.dropna()
# Unlabelled rows: keep only columns 1-4, the same positions used for X below.
test2 = data2.loc[data2['fruit_name'].isna()].iloc[:, 1:5]
# Fruit names in order of first appearance; used later as plot legend labels.
target_names = train_data2['fruit_name'].unique()
X = train_data2.iloc[:, [1, 2, 3, 4]]
y = train_data2['kind']
# Fit the dedicated two-component model. The original created lda2 but then
# refitted `lda`, which left lda2 unused and overwrote the binary model that
# X_r still refers to.
lda2 = LinearDiscriminantAnalysis(n_components=2)
X_r2 = lda2.fit(X, y)
# Mean accuracy on the training data.
X_r2.score(X, y)
0.8305084745762712
# Predicted 'kind' codes for the unlabelled rows of the multi-class data.
X_r2.predict(test2)
array([3., 3., 3., 1., 2., 4., 1., 3.])
import matplotlib. pyplot as plt
# Project the training features onto the discriminant axes with the
# two-component model (lda2) rather than refitting the binary `lda` estimator.
# NOTE: X_r2 is rebound to the projected array here, as in the original, so
# after this cell it no longer refers to a fitted estimator.
X_rr = X_r2 = lda2.fit(X, y).transform(X)
plt.figure()
colors = ['navy', 'turquoise', 'darkorange', 'blue']
lw = 2
# NOTE(review): this pairs kind codes 1..4 with target_names by position, so
# it assumes the fruit names first appear in kind order - confirm against the
# data.
for color, i, target_name in zip(colors, [1, 2, 3, 4], target_names):
    # Convert the pandas boolean Series to a plain array before indexing the
    # NumPy projection.
    mask = (y == i).to_numpy()
    plt.scatter(X_rr[mask, 0], X_rr[mask, 1], color=color, alpha=.8, lw=lw,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of FRUITS dataset')
Text(0.5, 1.0, 'LDA of FRUITS dataset')
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)