[Übersicht über künstliche Intelligenz] Ausreißer herausfiltern und durch NaN ersetzen, alle Zeitstempel erzeugen (Werte zunächst NaN), NaN durch den Mittelwert der fünf vorhergehenden Werte ersetzen, die Dateien zusammenführen und auf drei Dezimalstellen runden.

[Übersicht über künstliche Intelligenz] Ausreißer herausfiltern und durch NaN ersetzen, alle Zeitstempel erzeugen (Werte zunächst NaN), NaN durch den Mittelwert der fünf vorhergehenden Werte ersetzen, die Dateien zusammenführen und auf drei Dezimalstellen runden.


1. Ausreißer herausfiltern und durch NaN ersetzen

import os
import numpy as np
import pandas as pd

MISSING = 'NAN'  # text marker written to the CSV for a missing or rejected reading


def cells_to_float(cells):
    """Convert raw CSV cells to a float64 array.

    Cells equal to the 'NAN' marker, as well as real NaN/None cells, become
    np.nan. Every other cell is passed through float(), so non-numeric text
    raises ValueError instead of being silently dropped.
    """
    numbers = []
    for cell in cells:
        if isinstance(cell, str) and cell == MISSING:
            numbers.append(np.nan)
        elif pd.isna(cell):
            numbers.append(np.nan)
        else:
            numbers.append(float(cell))
    return np.array(numbers, dtype=np.float64)


def flag_outliers(values, z_limit=3, window=5, jump_limit=5, eps=0.0001):
    """Return a boolean mask marking the outliers of one column.

    A present (non-NaN) value is flagged when either
      * its z-score against the column's present values exceeds z_limit, or
      * its relative deviation from the mean of the `window` values directly
        before it exceeds jump_limit.

    The second test is skipped for the first `window` rows and whenever one
    of the preceding values is missing. Both tests always look at the
    original readings, never at values rejected earlier in the same pass.

    values: 1-D sequence of floats, np.nan for missing readings.
    Raises ValueError if window is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be at least 1')

    values = np.asarray(values, dtype=np.float64)
    flagged = np.zeros(values.shape[0], dtype=bool)
    present = ~np.isnan(values)
    if not present.any():
        return flagged

    # Global test: z-score over the readings that are actually present.
    mean = values[present].mean()
    std = values[present].std()
    if std > 0:  # a constant column has no z-score outliers
        flagged[present] = np.abs((values[present] - mean) / std) > z_limit

    # Local test: jump relative to the mean of the preceding `window` rows.
    # Starting at row == window includes the first row that has a full window.
    for row in range(window, values.shape[0]):
        if not present[row]:
            continue
        previous = values[row - window:row]
        if np.isnan(previous).any():
            continue
        baseline = previous.mean()
        denominator = baseline + eps  # eps keeps a zero baseline from dividing by zero
        if denominator == 0:
            continue
        if abs((values[row] - baseline) / denominator) > jump_limit:
            flagged[row] = True
    return flagged


def clean_outlier_file(src_path, dst_path):
    """Read one CSV, replace missing cells and outliers by 'NAN', write it to dst_path.

    The first CSV column is used as the index; every other column is
    filtered independently with flag_outliers().
    """
    df = pd.read_csv(src_path, header=0, index_col=0)
    # Work on an object-typed copy so the text marker can be stored next to
    # numbers without relying on pandas upcasting a float column.
    cells = df.to_numpy(dtype=object, copy=True)
    for col in range(cells.shape[1]):
        values = cells_to_float(cells[:, col])
        rejected = np.isnan(values) | flag_outliers(values)
        cells[rejected, col] = MISSING
    pd.DataFrame(cells, index=df.index, columns=df.columns).to_csv(dst_path)


def clean_outlier_folder(src_folder, dst_folder):
    """Run clean_outlier_file() on every .csv in src_folder, saving under the same name in dst_folder."""
    os.makedirs(dst_folder, exist_ok=True)
    for filename in os.listdir(src_folder):
        if filename.endswith('.csv'):
            clean_outlier_file(os.path.join(src_folder, filename),
                               os.path.join(dst_folder, filename))


if __name__ == '__main__':
    dataset_folder = './datasets_init/'
    clean_outlier_folder(dataset_folder, './datasets/')

2. Alle Zeitstempel erzeugen (Werte zunächst NaN)

import numpy as np
import pandas as pd
# Measurement columns of the sensor files, in output order.
SENSOR_COLUMNS = [
    'Temp_C',
    'SpCond_mS',
    'Cond_mS',
    'Sal',
    'DO_percent',
    'DO_ppm',
    'pH',
    'pH_mV',
    'Turb_NTU',
    'Chl_ppb',
    'Chl_RFU',
    'PE_uL',
    'PE_RFU',
]


def build_half_hour_timestamps(years):
    """Return every half-hour timestamp of the given years as text.

    years: iterable of years (str or int), e.g. ['2017'].
    Each entry has the form '2017-01-27 17:00:00'. Month lengths, including
    February in leap years, come from the calendar rather than fixed lists.
    """
    stamps = []
    for year in years:
        slots = pd.date_range(start=str(year) + '-01-01 00:00:00',
                              end=str(year) + '-12-31 23:30:00',
                              freq='30min')
        stamps.extend(slots.strftime('%Y-%m-%d %H:%M:%S').tolist())
    return stamps


def build_placeholder_frame(timestamps):
    """Return a frame with a TTIMESTAMP column and every sensor column set to 'NAN'.

    The placeholder length follows the number of timestamps, so the frame
    stays consistent for any set of years.
    """
    placeholder = ['NAN'] * len(timestamps)  # initial value of every sensor column
    columns = {'TTIMESTAMP': timestamps}
    for name in SENSOR_COLUMNS:
        columns[name] = placeholder
    return pd.DataFrame(data=columns)


if __name__ == '__main__':
    years = ['2017']
    TTIMESTAMP = build_half_hour_timestamps(years)  # full timestamp grid
    df = build_placeholder_frame(TTIMESTAMP)
    df.to_csv('05-17.csv', index=False)

3. NaN durch den Mittelwert der fünf vorhergehenden Werte ersetzen

import os
import numpy as np
import pandas as pd

MISSING = 'NAN'  # text marker used in the CSV files for a missing reading


def cells_to_float(cells):
    """Convert raw CSV cells to a float64 array.

    Cells equal to the 'NAN' marker, as well as real NaN/None cells, become
    np.nan. Every other cell is passed through float(), so numeric text such
    as '12.5' is accepted and non-numeric text raises ValueError.
    """
    numbers = []
    for cell in cells:
        if isinstance(cell, str) and cell == MISSING:
            numbers.append(np.nan)
        elif pd.isna(cell):
            numbers.append(np.nan)
        else:
            numbers.append(float(cell))
    return np.array(numbers, dtype=np.float64)


def fill_gaps_with_recent_mean(values, window=5):
    """Return a copy of `values` with each NaN replaced by the mean of the values before it.

    Gaps are filled from top to bottom using up to `window` directly
    preceding rows, so a value filled earlier takes part in the mean for the
    next gap and long runs of missing data are still filled. Near the top of
    the column only the rows that exist are used; indices never wrap around
    to the end of the column. A gap with no usable predecessor (for example
    a missing first row) stays NaN.

    values: 1-D sequence of floats, np.nan for missing readings.
    """
    filled = np.array(values, dtype=np.float64)  # copy; the caller's data is untouched
    for row in np.flatnonzero(np.isnan(filled)):
        recent = filled[max(0, row - window):row]
        recent = recent[~np.isnan(recent)]
        if recent.size:
            filled[row] = recent.mean()
    return filled


def fill_gaps_in_file(path, window=5):
    """Fill the missing cells of every data column of one CSV and overwrite the file.

    The first CSV column is used as the index. Only the missing cells are
    rewritten; all other cells keep the value they were read with.
    """
    df = pd.read_csv(path, header=0, index_col=0)
    cells = df.to_numpy(dtype=object, copy=True)
    for col in range(cells.shape[1]):  # every column present, not a fixed count
        numeric = cells_to_float(cells[:, col])
        gaps = np.isnan(numeric)
        if gaps.any():
            cells[gaps, col] = fill_gaps_with_recent_mean(numeric, window)[gaps]
    pd.DataFrame(cells, index=df.index, columns=df.columns).to_csv(path)


def fill_gaps_in_folder(folder, window=5):
    """Run fill_gaps_in_file() on every .csv file in `folder`."""
    for filename in os.listdir(folder):
        if filename.endswith('.csv'):
            fill_gaps_in_file(os.path.join(folder, filename), window)


if __name__ == '__main__':
    dataset_folder = './datasets/'
    fill_gaps_in_folder(dataset_folder)

4. Dateien zusammenführen und auf drei Dezimalstellen runden

# Concatenate the per-period files, round every value to three decimals and
# save the result under a name like '../SeaDataset/05/data_05-17-19.csv'.
# Everything happens in memory, so no intermediate file is written.
# merge_and_round() takes a list of parts, so it can be called in a loop to
# process several stations in one run.

import os
import numpy as np
import pandas as pd


def merge_and_round(part_paths, out_path, decimals=3):
    """Stack several CSV files vertically, round the data and write one CSV.

    part_paths: paths of the CSV files, in the order they are appended.
        The first column of each file is used as the index.
    out_path: destination file; its folder is created if it is missing.
    decimals: number of decimal places kept for every data column.

    Returns the rounded DataFrame that was written.
    Raises ValueError if part_paths is empty or a data column contains
    non-numeric text.
    """
    frames = [pd.read_csv(path, header=0, index_col=0) for path in part_paths]
    merged = pd.concat(frames, axis=0, join='outer', ignore_index=False)

    # Round on a float array; this fails loudly if text is left in the data.
    values = np.around(merged.to_numpy(dtype=np.float64), decimals)
    rounded = pd.DataFrame(data=values, index=merged.index, columns=merged.columns)

    folder = os.path.dirname(out_path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    rounded.to_csv(out_path)  # the index is the first column of the source files
    return rounded


if __name__ == '__main__':
    # Merge the files and keep three decimals
    merge_and_round(
        ['../SeaDataset_Init/03/03-17.csv', '../SeaDataset_Init/03/03-18-19.csv'],
        '../SeaDataset/03/data_03-17-19.csv',
    )

    # Final check that the saved file has the expected shape
    df = pd.read_csv('../SeaDataset/03/data_03-17-19.csv', header=0, index_col=0)
    print(df.iloc[0, 0])        # a decimal number, the temperature
    print(type(df.iloc[0, 0]))  # should be a numpy float
    print(df)                   # check the layout and the number of rows

Guess you like

Origin blog.csdn.net/qq_44928822/article/details/131904112