Implementing three data preprocessing methods in Python

There are three main ways to preprocess the data:

1. Interval scaling (min-max normalization)

Read data, process data, store data

import pandas as pd

import numpy as np

from sklearn import preprocessing

import matplotlib.pyplot as plt

 

# Interval scaling: read the raw series, rescale every column to the
# interval [0, 10] with MinMaxScaler, plot before/after, and save the result.

# Configure matplotlib so that Chinese labels and the minus sign render.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that contains Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False  # display the negative sign normally

# Read the data as a two-dimensional DataFrame.
Filename = 'Hits per Second_T20m_130.csv'
data_f = pd.read_csv(Filename)

# Plot rows 200-599 of the raw data for a visual baseline.
plt.plot(data_f[200:600])
plt.title('Before data preprocessing')
plt.show()

print('***2. Data normalization, mapped to the interval [min, max]:')
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))
# fit_transform returns a NumPy ndarray, not a DataFrame.
data_mi_ma = min_max_scaler.fit_transform(data_f)

# Plot the same row window of the scaled data.
plt.plot(data_mi_ma[200:600])
plt.title('After data preprocessing')
plt.show()

# An ndarray has no to_csv method, so wrap it in a DataFrame before saving.
print(type(data_mi_ma))
data_ = pd.DataFrame(data_mi_ma)
print(type(data_))

# index=False drops the index column; header=False drops the column names.
data_.to_csv("afterpre.csv", index=False, header=False)

 

Note: The scaler returns a NumPy array, which has no to_csv method, so before saving the processed data as a CSV file you need to convert it into a DataFrame first.

2. Standardization

# Standardization: read the raw series, standardize every column with
# sklearn.preprocessing.scale, plot before/after, and save the result.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Configure matplotlib so that Chinese labels and the minus sign render.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that contains Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False  # display the negative sign normally

# Read the data as a two-dimensional DataFrame.
Filename = 'Hits per Second_T20m_130.csv'
data_f = pd.read_csv(Filename)

# Plot rows 200-599 of the raw data for a visual baseline.
plt.plot(data_f[200:600])
plt.title('Before data processing')
plt.show()

# scale() returns a NumPy ndarray, not a DataFrame.
data_sta = preprocessing.scale(data_f)

# Plot the same row window of the standardized data.
plt.plot(data_sta[200:600])
plt.title('After data processing')
plt.show()

# An ndarray has no to_csv method, so wrap the standardized data in a
# DataFrame before saving.
print(type(data_sta))
data_ = pd.DataFrame(data_sta)
print(type(data_))

# index=False drops the index column; header=False drops the column names.
data_.to_csv("afterpre.csv", index=False, header=False)

3. Logarithmic transformation

 

# Logarithmic transformation: read the raw series, apply the natural log to
# every value, plot before/after, and save the result.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Configure matplotlib so that Chinese labels and the minus sign render.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that contains Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False  # display the negative sign normally

# Read the data as a two-dimensional DataFrame.
Filename = 'Hits per Second_T20m_130.csv'
data_f = pd.read_csv(Filename)

# Plot rows 200-599 of the raw data for a visual baseline.
plt.plot(data_f[200:600])
plt.title('Before data preprocessing')
plt.show()

# NOTE(review): np.log yields -inf for zeros and NaN for negative values;
# confirm the input is strictly positive.
data_log = np.log(data_f)

# Plot the same row window of the log-transformed data.
plt.plot(data_log[200:600])
plt.title('After data preprocessing')
plt.show()

# data_log is still a DataFrame here, so it can be written directly.
# index=False drops the index column; header=False drops the column names.
data_log.to_csv("afterpre.csv", index=False, header=False)

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324588046&siteId=291194637