from sklearn. model_selection import train_test_split
from sklearn. linear_model import LinearRegression
from sklearn. preprocessing import StandardScaler
import numpy as np
import matplotlib as mpl
import matplotlib. pyplot as plt
import pandas as pd
from pandas import DataFrame
import time
# Plot styling: use the SimHei font for sans-serif text and turn off the
# Unicode minus glyph on the axes.
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})

# Load the semicolon-separated household power consumption sample.
path1 = 'datas/household_power_consumption_1000.txt'
df = pd.read_csv(
    path1,
    sep=';',
    low_memory=False,
)
# Preview the first rows of the frame.
df.head()
Date
Time
Global_active_power
Global_reactive_power
Voltage
Global_intensity
Sub_metering_1
Sub_metering_2
Sub_metering_3
0
16/12/2006
17:24:00
4.216
0.418
234.84
18.4
0.0
1.0
17.0
1
16/12/2006
17:25:00
5.360
0.436
233.63
23.0
0.0
1.0
16.0
2
16/12/2006
17:26:00
5.374
0.498
233.29
23.0
0.0
2.0
17.0
3
16/12/2006
17:27:00
5.388
0.502
233.74
23.0
0.0
1.0
17.0
4
16/12/2006
17:28:00
3.666
0.528
235.68
15.8
0.0
1.0
17.0
# Print the frame's index range, per-column dtypes and non-null counts, and memory usage.
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date 1000 non-null object
Time 1000 non-null object
Global_active_power 1000 non-null float64
Global_reactive_power 1000 non-null float64
Voltage 1000 non-null float64
Global_intensity 1000 non-null float64
Sub_metering_1 1000 non-null float64
Sub_metering_2 1000 non-null float64
Sub_metering_3 1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB
# The raw file marks missing readings with '?'; turn those into NaN so that
# pandas recognises them as missing values.
new_df = df.replace(to_replace='?', value=np.nan)
# Discard every row that has at least one missing value.
datas = new_df.dropna(axis='index', how='any')
# Summary statistics, transposed so that each column becomes one row.
datas.describe().transpose()
count
mean
std
min
25%
50%
75%
max
Global_active_power
1000.0
2.418772
1.239979
0.206
1.806
2.414
3.308
7.706
Global_reactive_power
1000.0
0.089232
0.088088
0.000
0.000
0.072
0.126
0.528
Voltage
1000.0
240.035790
4.084420
230.980
236.940
240.650
243.295
249.370
Global_intensity
1000.0
10.351000
5.122214
0.800
8.400
10.000
14.000
33.200
Sub_metering_1
1000.0
0.000000
0.000000
0.000
0.000
0.000
0.000
0.000
Sub_metering_2
1000.0
2.749000
8.104053
0.000
0.000
0.000
1.000
38.000
Sub_metering_3
1000.0
5.756000
8.066941
0.000
0.000
0.000
17.000
19.000
# Print the column summary of the raw frame again.
# NOTE(review): this inspects `df`, not the cleaned `datas` frame — confirm which one was intended.
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date 1000 non-null object
Time 1000 non-null object
Global_active_power 1000 non-null float64
Global_reactive_power 1000 non-null float64
Voltage 1000 non-null float64
Global_intensity 1000 non-null float64
Sub_metering_1 1000 non-null float64
Sub_metering_2 1000 non-null float64
Sub_metering_3 1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB
def date_format(dt):
    """Split a (date, time) pair of strings into numeric calendar fields.

    Args:
        dt: An iterable of two strings, the date as ``dd/mm/YYYY`` followed
            by the time as ``HH:MM:SS`` (for example one row holding the
            ``Date`` and ``Time`` columns).

    Returns:
        A 6-tuple ``(year, month, day, hour, minute, second)`` of ints.

    Raises:
        ValueError: If the joined text does not match ``%d/%m/%Y %H:%M:%S``.
    """
    # `time` is imported once at the top of the file; re-importing it on every
    # call (this runs once per row) is unnecessary.
    parsed = time.strptime(' '.join(dt), '%d/%m/%Y %H:%M:%S')
    # The first six struct_time fields are year, month, day, hour, minute and
    # second, in that order.
    return tuple(parsed[:6])
# Features: the first two columns (date and time text) of each row, expanded
# by date_format into six numeric columns.
X = datas.iloc[:, :2].apply(
    lambda row: pd.Series(date_format(row)),
    axis=1,
)
# Target: the active power reading.
Y = datas['Global_active_power']
# Show the first two feature rows.
X.head(2)
0
1
2
3
4
5
0
2006
12
16
17
24
0
1
2006
12
16
17
25
0
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# Report the shape of each part, one per line.
print(X_train.shape, X_test.shape, Y_train.shape, sep='\n')
(800, 6)
(200, 6)
(800,)
# Summary statistics of the training features, transposed so each feature column is one row.
X_train. describe( ) . T
count
mean
std
min
25%
50%
75%
max
0
800.0
2006.00000
0.000000
2006.0
2006.0
2006.0
2006.0
2006.0
1
800.0
12.00000
0.000000
12.0
12.0
12.0
12.0
12.0
2
800.0
16.59875
0.490458
16.0
16.0
17.0
17.0
17.0
3
800.0
10.75500
8.068386
0.0
4.0
8.0
19.0
23.0
4
800.0
29.72375
17.266517
0.0
15.0
30.0
45.0
59.0
5
800.0
0.00000
0.000000
0.0
0.0
0.0
0.0
0.0
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
# Inspect the scaled training features, one row per feature.
pd.DataFrame(X_train).describe().transpose()
count
mean
std
min
25%
50%
75%
max
0
800.0
0.000000e+00
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
1
800.0
0.000000e+00
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
2
800.0
2.196299e-15
1.000626
-1.221561
-1.221561
0.818625
0.818625
0.818625
3
800.0
-8.604228e-17
1.000626
-1.333814
-0.837742
-0.341670
1.022529
1.518601
4
800.0
3.691492e-17
1.000626
-1.722545
-0.853268
0.016009
0.885286
1.696611
5
800.0
0.000000e+00
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
# Fit an ordinary linear regression on the scaled time features and predict
# the held-out rows.
lr = LinearRegression().fit(X_train, Y_train)
y_predict = lr.predict(X_test)

# LinearRegression.score returns the R^2 coefficient of determination.
print("训练R2:", lr.score(X_train, Y_train))
print("测试R2:", lr.score(X_test, Y_test))

# Root mean squared error of the test predictions.
mse = np.average((y_predict - Y_test) ** 2)
rmse = np.sqrt(mse)
print("rmse:", rmse)
训练R2: 0.24409311805909026
测试R2: 0.12551628513735846
rmse: 1.164092345973625
# Persist the fitted scaler and regressor, reload them from disk, and predict
# for one hand-written timestamp.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone `joblib` package and fall back to the bundled
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(ss, "data_ss.model")
joblib.dump(lr, "data_lr.model")

# Reload both objects so the prediction below exercises the saved artifacts.
ss = joblib.load("data_ss.model")
lr = joblib.load("data_lr.model")

# One sample as [year, month, day, hour, minute, second] — the same field
# order that date_format returns.
data1 = [[2006, 12, 17, 12, 25, 0]]
data1 = ss.transform(data1)
print(data1)
lr.predict(data1)
[[ 0. 0. 0.81862454 0.15440249 -0.27374978 0. ]]
array([1.16996393])
# Plot the actual (red) and predicted (green) values for every test sample.
t = np.arange(len(X_test))
plt.figure(facecolor='w')
plt.plot(t, Y_test, 'r-', linewidth=2, label='真实值')
plt.plot(t, y_predict, 'g-', linewidth=2, label='预测值')
plt.legend(loc='upper left')
plt.title("线性回归预测时间和功率之间的关系", fontsize=20)
# Pass the flag positionally: the `b` keyword was renamed to `visible` in
# Matplotlib 3.5 and later removed, so `grid(b=True)` fails on current
# releases, while the positional form works on old and new versions alike.
plt.grid(True)
plt.show()
![png](output_18_0.png)
# Second model: predict the current (column 5, Global_intensity) from the
# active and reactive power readings (columns 2 and 3).
X = datas.iloc[:, 2:4]
Y2 = datas.iloc[:, 5]

# Same 80/20 split and seed as the first model.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X, Y2, test_size=0.2, random_state=0
)

# Fit the scaler on the training rows only, then apply it to both parts.
scaler2 = StandardScaler()
X2_train = scaler2.fit_transform(X2_train)
X2_test = scaler2.transform(X2_test)

lr2 = LinearRegression()
lr2.fit(X2_train, Y2_train)
Y2_predict = lr2.predict(X2_test)

# The printed score is the R^2 value returned by LinearRegression.score.
print("电流预测准确率: ", lr2.score(X2_test, Y2_test))
print("电流参数:", lr2.coef_)

# Plot the actual (red) and predicted (green) current for every test sample.
t = np.arange(len(X2_test))
plt.figure(facecolor='w')
plt.plot(t, Y2_test, 'r-', linewidth=2, label=u'真实值')
plt.plot(t, Y2_predict, 'g-', linewidth=2, label=u'预测值')
plt.legend(loc='lower right')
plt.title(u"线性回归预测功率与电流之间的关系", fontsize=20)
# Pass the flag positionally: the `b` keyword was renamed to `visible` in
# Matplotlib 3.5 and later removed, so `grid(b=True)` fails on current
# releases, while the positional form works on old and new versions alike.
plt.grid(True)
plt.show()
电流预测准确率: 0.9920420609708968
电流参数: [5.07744316 0.07191391]
![png](output_19_1.png)