While studying "Python Data Analysis and Mining Practice" I found that some of the book's code could not be run as printed, which cost me many detours. I am sharing versions of the code that run directly, in the hope of saving others the same detours.
Section 10-1
# Section 10-1: split water-heater usage records into events by time gaps.
import pandas as pd

INPUT_FILE = '../10.2/water_heater.xls'
OUTPUT_FILE = '../10.2/dividsequence.xls'


def divide_events(data, threshold=pd.Timedelta(minutes=4),
                  time_col=u'occurrence time', flow_col=u'water flow'):
    """Number consecutive usage events separated by gaps larger than *threshold*.

    Parameters
    ----------
    data : DataFrame with a datetime column *time_col* and a numeric
        column *flow_col* (assumed sorted by time — TODO confirm with data source).
    threshold : pd.Timedelta; a gap between consecutive records larger than
        this starts a new event (default 4 minutes, the book's threshold).
    time_col, flow_col : column names, parameterized so other data sets
        with the same structure can reuse the function.

    Returns
    -------
    A copy of the non-zero-flow rows with an added u'time number' column
    numbering events from 1.
    """
    data = data[data[flow_col] > 0].copy()      # keep records whose water flow is not 0
    is_new_event = data[time_col].diff() > threshold  # boolean: True where a new event starts
    # cumulative sum of the booleans numbers the events; +1 makes them 1-based
    data[u'time number'] = is_new_event.cumsum() + 1
    return data


if __name__ == '__main__':
    data = pd.read_excel(INPUT_FILE)
    # conversion time format
    data[u'occurrence time'] = pd.to_datetime(data[u'occurrence time'],
                                              format='%Y%m%d%H%M%S')
    divide_events(data).to_excel(OUTPUT_FILE)
Section 10-2
# Section 10-2: threshold optimization — pick the event-splitting time gap
# at which the event count stops changing quickly (flat-slope heuristic).
import pandas as pd
import numpy as np

INPUT_FILE = '../10.2/water_heater.xls'


def optimize_threshold(data, time_col=u'occurrence time',
                       expert_threshold=pd.Timedelta(minutes=5),
                       fallback=pd.Timedelta(minutes=4),
                       window=4, step=0.25):
    """Return the optimized event-splitting threshold for *data*.

    Parameters
    ----------
    data : DataFrame whose *time_col* is a sorted datetime column of
        non-zero-flow records.
    expert_threshold : upper bound from domain experts; if the optimized
        value exceeds it, *fallback* (4 minutes) is returned instead.
    window : number of trailing points averaged for the slope indicator.
    step : candidate-threshold spacing in minutes (candidates run 1..9).

    Returns
    -------
    pd.Timedelta — the chosen threshold.
    """
    def event_num(ts):
        # splitting at gaps > ts yields (number of gaps) + 1 events
        return (data[time_col].diff() > ts).sum() + 1

    candidates = [pd.Timedelta(minutes=i) for i in np.arange(1, 9, step)]
    h = pd.DataFrame(candidates, columns=[u'threshold'])
    h[u'event number'] = h[u'threshold'].apply(event_num)
    # slope: change in event count between two adjacent candidate thresholds
    h[u'slope'] = h[u'event number'].diff() / step
    # slope indicator: mean absolute slope of the preceding `window` points
    h[u'slope indicator'] = h[u'slope'].abs().rolling(window).mean()
    # .idxmin() returns the index of the minimum; step back `window` points
    # to the start of the flattest stretch
    ts = h[u'threshold'][h[u'slope indicator'].idxmin() - window]
    if ts > expert_threshold:
        ts = fallback
    return ts


if __name__ == '__main__':
    data = pd.read_excel(INPUT_FILE)
    # conversion time format
    data[u'occurrence time'] = pd.to_datetime(data[u'occurrence time'],
                                              format='%Y%m%d%H%M%S')
    data = data[data[u'water flow'] > 0]  # keep records whose water flow is not 0
    print(optimize_threshold(data))
Section 10-3
# Section 10-3: train a neural network to recognize bathing events.
import pandas as pd

TRAIN_FILE = '../10.2/train_neural_network_data.xls'
TEST_FILE = '../10.2/test_neural_network_data.xls'
TEST_OUTPUT_FILE = '../10.2/test_output_data.xls'
MODEL_FILE = '../10.2/net.model'


def main():
    """Train the 11-17-10-1 MLP from the book and write test predictions.

    Reads features from columns 5:16 and the label from column 4 of the
    train/test spreadsheets, fits a binary classifier, saves the weights,
    and writes the test rows with a prediction column to TEST_OUTPUT_FILE.
    """
    # keras is imported lazily so merely importing this module stays cheap
    # and does not require TensorFlow to be installed.
    from keras.models import Sequential
    from keras.layers import Dense, Activation

    data_train = pd.read_excel(TRAIN_FILE)
    data_test = pd.read_excel(TEST_FILE)
    # .as_matrix() was removed in pandas 1.0 — use .values instead.
    x_train = data_train.iloc[:, 5:16].values  # sample features
    y_train = data_train.iloc[:, 4].values     # label column
    x_test = data_test.iloc[:, 5:16].values
    y_test = data_test.iloc[:, 4].values       # NOTE(review): unused by the book's script; kept for reference

    model = Sequential()  # build the model
    model.add(Dense(input_dim=11, units=17))
    model.add(Activation('relu'))
    model.add(Dense(input_dim=17, units=10))
    model.add(Activation('relu'))
    model.add(Dense(input_dim=10, units=1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.fit(x_train, y_train, epochs=100, batch_size=1)
    model.save_weights(MODEL_FILE)

    # predict_classes() was removed in TF >= 2.6; threshold the sigmoid
    # output at 0.5 to recover the same 0/1 class labels.
    classes = (model.predict(x_test) > 0.5).astype('int32')
    r = pd.DataFrame(classes, columns=[u'预测结果'])
    pd.concat([data_test.iloc[:, :5], r], axis=1).to_excel(TEST_OUTPUT_FILE)


if __name__ == '__main__':
    main()