Batch-reading CSV files with pandas to extract the values at a fixed row and column

Batch-reading CSV files with pandas to extract the values at a fixed row and column


Problem Description

Training a deep learning model generates many CSV tables, and processing them by hand is too tedious.
insert image description here

Solution

Processing the files with code is more convenient.

import os
import re

import pandas as pd
from pandas import DataFrame
def flatten(a):
    """Yield the items of ``a`` in order, descending into nested lists.

    Only ``list`` instances are expanded; any other item (including tuples
    and strings) is yielded unchanged.
    """
    for item in a:
        # A nested list contributes its own flattened items; anything else
        # contributes just itself.
        expanded = flatten(item) if isinstance(item, list) else (item,)
        yield from expanded

def convert_to_int(lists):
    """Return a copy of ``lists`` with every non-list element passed to ``int``.

    Nested lists are converted recursively, so the nesting of the input is
    kept in the result.
    """
    converted = []
    for element in lists:
        if isinstance(element, list):
            converted.append(convert_to_int(element))
        else:
            converted.append(int(element))
    return converted

def _name_has_tag(name, tag):
    """Return True if ``tag`` occurs in ``name`` and is not followed by a digit.

    A plain substring test would let "CNN1" match "CNN10" and "td510" match
    "td5100"; the negative lookahead rejects those longer numbers.
    """
    return re.search(re.escape(tag) + r"(?!\d)", name) is not None


def _metric_from_row(frame, row, pathname):
    """Return the first number after the label in the first cell of ``row``.

    The cell is expected to hold text of the form "label:value". ``pathname``
    is only used to make the error messages identify the offending file.

    Raises:
        ValueError: if the table has no such row, the cell has no ":"
            separator, or the text after the separator is not a number.
    """
    if row >= len(frame):
        raise ValueError(
            "{}: expected at least {} rows, found {}".format(
                pathname, row + 1, len(frame)))
    _, separator, remainder = str(frame.iloc[row, 0]).partition(":")
    if not separator:
        raise ValueError(
            "{}: row {} has no 'label:value' cell".format(pathname, row))
    # Only the first value after the label is reported.
    return float(remainder.split(":")[0])


def walk_dir(dir, topdown=True):
    """Print the accuracy and loss values found in the CSV files under ``dir``.

    For every directory visited by ``os.walk`` the four result lists are
    started afresh, filled from that directory's files and printed, so one
    group of four lines is printed per directory.

    A file is used when its name ends with ".csv" and contains both a layer
    tag "CNN<l>" (l = 1..10) and a data-size tag "td<d*510>" (d = 1..10).
    Tags are matched on a digit boundary, so "CNN10_td5100.csv" is recorded
    only as layer 10 / size 10 and not also as layer 1 or size 1.

    The file is read without a header. Zero-based rows 4, 5, 6 and 7 of the
    first column must hold "label:value" text for train accuracy, test
    accuracy, train loss and test loss. Each result entry is
    ``[l, d, value]``.

    Args:
        dir: Root directory to search.
        topdown: Passed through to ``os.walk``.

    Raises:
        ValueError: if a matching file does not have the expected layout.
    """
    for root, dirs, files in os.walk(dir, topdown):
        ACC_train = []
        ACC_test = []
        LOSS_train = []
        LOSS_test = []
        for name in files:
            # Only CSV files are of interest.
            if not name.endswith('.csv'):
                continue

            # Work out which layer / data-size tags the file name carries.
            layers = [layer for layer in range(1, 11)
                      if _name_has_tag(name, "CNN" + str(layer))]
            sizes = [size for size in range(1, 11)
                     if _name_has_tag(name, "td" + str(size * 510))]
            if not layers or not sizes:
                continue

            # Read the file once, however many tag pairs matched.
            pathname = os.path.join(root, name).replace("\\", "/")
            frame = pd.read_csv(pathname, header=None)
            trainacc = _metric_from_row(frame, 4, pathname)
            testacc = _metric_from_row(frame, 5, pathname)
            trainloss = _metric_from_row(frame, 6, pathname)
            testloss = _metric_from_row(frame, 7, pathname)

            # Store the values under every (layer, size) pair that matched.
            for layer in layers:
                for size in sizes:
                    ACC_train.append([layer, size, trainacc])
                    ACC_test.append([layer, size, testacc])
                    LOSS_train.append([layer, size, trainloss])
                    LOSS_test.append([layer, size, testloss])

        print("ACC_train:",ACC_train)
        print("ACC_test:", ACC_test)
        print("LOSS_train:", LOSS_train)
        print("LOSS_test:", LOSS_test)


# Root folder handed to walk_dir; it and all of its sub-folders are searched.
dirname = "P:/Research/originaldata"
walk_dir(dirname)

The code is fairly rudimentary, but it is much faster than processing the files by hand.
The result is as follows:
insert image description here

Guess you like

Origin blog.csdn.net/qq_38703529/article/details/123030977