Pandasでcsvファイルを一括で読み込み、特定の行と列の値を取得する

Pandasでcsvファイルを一括で読み込み、特定の行と列の値を取得する


問題の説明

深層学習モデルのトレーニング中に、多くのcsvテーブルが生成されますが、これは手作業で処理するには面倒です。
ここに画像の説明を挿入

解決

やはり、コードで処理する方が便利です。

import pandas as pd
import os

from pandas import DataFrame
def flatten(a):
    """Lazily yield the leaf items of a nested list in depth-first order.

    Only ``list`` instances are descended into; every other object
    (including tuples and strings) is yielded unchanged.
    """
    for item in a:
        if isinstance(item, list):
            # Recurse into the sub-list and forward whatever it produces.
            yield from flatten(item)
            continue
        yield item

def convert_to_int(lists):
    """Return a copy of a nested list with every non-list element passed through ``int``.

    The nesting structure is preserved: sub-lists are converted recursively.
    """
    converted = []
    for element in lists:
        if isinstance(element, list):
            converted.append(convert_to_int(element))
        else:
            converted.append(int(element))
    return converted

def _name_has_tag(name, tag):
    """Return True if ``tag`` occurs in ``name`` and is not directly followed by a digit.

    A plain substring test would let "CNN1" match "CNN10" and "td510" match
    "td5100"; the negative look-ahead rejects those longer numbers.
    """
    import re  # function-scope import: the file's import block does not provide ``re``

    return re.search(re.escape(tag) + r"(?!\d)", name) is not None


def _first_metric_value(df, row):
    """Return the first number found after the label in row ``row``, column "epoch1".

    The cell is split on ":", the first piece (the label) is discarded and
    the remaining pieces are converted to ``float``; the first one is returned.
    """
    # Keep the row as a one-row frame and drop the columns that are empty in it.
    cells = df.iloc[row:row + 1].dropna(axis=1, how='all')
    # Split on ":"; expand=True turns the pieces into separate columns.
    pieces = cells["epoch1"].str.split(":", expand=True).fillna("")
    # Column 0 holds the text before the first ":" and is not a number.
    tokens = pieces.drop([0], axis=1).values
    numbers = [float(token) for line in tokens for token in line]
    return numbers[0]


def walk_dir(dir, topdown=True):
    """Collect train/test accuracy and loss values from the CSV files under ``dir``.

    Every directory visited by ``os.walk`` is processed on its own. A file is
    used when its name ends with ".csv" and contains both a layer tag
    "CNN<l>" (l = 1..10) and a data-size tag "td<d*510>" (d = 1..10). For each
    such file one value is read from each of rows 4-7 (0-based) of column
    "epoch1" and stored as ``[l, d, value]``. After each directory the four
    collected lists are printed.

    Args:
        dir: Root directory to walk.
        topdown: Passed through to ``os.walk``.

    Returns:
        None. The results are only printed.
    """
    for root, dirs, files in os.walk(dir, topdown):
        acc_train = []
        acc_test = []
        loss_train = []
        loss_test = []
        for name in files:
            # Only CSV files are of interest.
            if not name.endswith('.csv'):
                continue
            for l in range(1, 11):
                # Exact tag match, so that "CNN1" is not found inside "CNN10".
                if not _name_has_tag(name, "CNN" + str(l)):
                    continue
                for d in range(1, 11):
                    # Same for the data size: "td510" must not match "td5100".
                    if not _name_has_tag(name, "td" + str(d * 510)):
                        continue
                    # Full path of the file, with forward slashes on every platform.
                    pathname = os.path.join(root, name).replace("\\", "/")
                    df = pd.read_csv(pathname, header=None)
                    # Name the columns epoch1..epochN for however many columns the
                    # file has (a fixed list of 100 names fails on any other width).
                    df.columns = ["epoch" + str(i) for i in range(1, df.shape[1] + 1)]

                    # Rows 4, 5, 6 and 7 are read as train accuracy, test accuracy,
                    # train loss and test loss respectively.
                    acc_train.append([l, d, _first_metric_value(df, 4)])
                    acc_test.append([l, d, _first_metric_value(df, 5)])
                    loss_train.append([l, d, _first_metric_value(df, 6)])
                    loss_test.append([l, d, _first_metric_value(df, 7)])

        print("ACC_train:", acc_train)
        print("ACC_test:", acc_test)
        print("LOSS_train:", loss_train)
        print("LOSS_test:", loss_test)


# Root folder that is handed to walk_dir for scanning.
dirname = "P:/Research/originaldata"
walk_dir(dir=dirname)

私のコードはまだ比較的粗削りですが、手作業で処理するよりは高速です。
結果は次のとおりです。
ここに画像の説明を挿入

おすすめ

転載: blog.csdn.net/qq_38703529/article/details/123030977