There are three main ways to preprocess the data:
1. interval scaling
Read data, process data, store data
# Interval (min-max) scaling: read the CSV, rescale it into [0, 10],
# plot a window of rows before and after, and store the result.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
plt.rcParams['axes.unicode_minus'] = False  # Used to display the negative sign normally

# Same input file as the standardization and logarithm examples.
Filename = 'Hits per Second_T20m_130.csv'
data_f = pd.read_csv(Filename)  # two-dimensional dataframe format

# Plot rows 200..599 of the raw data for visual comparison.
plt.plot(data_f[200:600])
plt.title('Before data preprocessing')
plt.show()

print('***2. Data normalization, mapped to the interval [min, max]:')
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))
# fit_transform returns a NumPy array, not a DataFrame.
data_mi_ma = min_max_scaler.fit_transform(data_f)

# Plot the same row window of the scaled data (the original referenced an
# undefined name here, which raised NameError).
plt.plot(data_mi_ma[200:600])
plt.title('After data preprocessing')
plt.show()

print(type(data_mi_ma))
# Wrap the array in a DataFrame so that to_csv is available.
data_ = pd.DataFrame(data_mi_ma)
print(type(data_))
# index=False drops the index column; header=False drops the column names.
data_.to_csv("afterpre.csv", index=False, header=False)
Note: The scikit-learn preprocessing functions return a NumPy array, which has no to_csv method, so when re-saving the processed data as a CSV file you need to convert the data into dataframe format first.
2. standardization
# Standardization: read the CSV, standardize it with preprocessing.scale,
# plot a window of rows before and after, and store the result.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
plt.rcParams['axes.unicode_minus'] = False  # Used to display the negative sign normally

Filename = 'Hits per Second_T20m_130.csv'
data_f = pd.read_csv(Filename)  # two-dimensional dataframe format

# Plot rows 200..599 of the raw data for visual comparison.
plt.plot(data_f[200:600])
plt.title('Before data processing')
plt.show()

# preprocessing.scale returns a NumPy array, not a DataFrame.
data_sta = preprocessing.scale(data_f)

# Plot the same row window of the standardized data.
plt.plot(data_sta[200:600])
plt.title('After data processing')
plt.show()

print(type(data_sta))
# Wrap the standardized array in a DataFrame so that to_csv is available.
# (The original wrapped data_mi_ma, which is not defined in this script.)
data_ = pd.DataFrame(data_sta)
print(type(data_))
# index=False drops the index column; header=False drops the column names.
data_.to_csv("afterpre.csv", index=False, header=False)
3. logarithm
# Logarithm transform: read the CSV, take the natural log of every value,
# plot a window of rows before and after, and store the result.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
plt.rcParams['axes.unicode_minus'] = False  # Used to display the negative sign normally

Filename = 'Hits per Second_T20m_130.csv'
data_f = pd.read_csv(Filename)  # two-dimensional dataframe format

# Plot rows 200..599 of the raw data for visual comparison.
plt.plot(data_f[200:600])
plt.title('Before data preprocessing')
plt.show()

# np.log applied to a DataFrame returns a DataFrame, so no conversion is
# needed before saving.
# NOTE(review): np.log gives -inf for zeros and NaN for negative values;
# confirm the input contains only positive numbers.
data_log = np.log(data_f)

# Plot the same row window of the transformed data (the original title
# here was mistakenly 'Before data preprocessing').
plt.plot(data_log[200:600])
plt.title('After data preprocessing')
plt.show()

# index=False drops the index column; header=False drops the column names.
data_log.to_csv("afterpre.csv", index=False, header=False)