Batch-reading CSV files with pandas to extract the value at a fixed row and column
Problem Description
During training of a deep learning model, many CSV tables are generated, which are too troublesome to process manually.
Solution
Processing them with code is much more convenient.
import pandas as pd
import os
from pandas import DataFrame
def flatten(a):
    """Yield the leaf elements of an arbitrarily nested list, depth-first.

    Non-list elements are yielded as-is; nested lists are recursed into.
    """
    for item in a:
        if isinstance(item, list):
            # Recurse into sub-lists and re-yield their leaves.
            yield from flatten(item)
        else:
            yield item
def convert_to_int(lists):
    """Recursively convert every leaf of a nested list to ``int``.

    The nesting structure is preserved; only non-list leaves are converted.
    """
    converted = []
    for element in lists:
        if isinstance(element, list):
            converted.append(convert_to_int(element))
        else:
            converted.append(int(element))
    return converted
def walk_dir(dir, topdown=True):
    """Walk *dir* recursively and harvest metrics from matching CSV files.

    A file is processed when its name ends in ``.csv`` and contains both a
    layer tag ``CNN<l>`` (l = 1..10) and a training-data-size tag
    ``td<d*510>`` (d = 1..10).  Rows 4-7 of each file are expected to hold
    ``label:value`` strings for train accuracy, test accuracy, train loss
    and test loss respectively (each file has 100 "epoch" columns).

    Returns a 4-tuple ``(ACC_train, ACC_test, LOSS_train, LOSS_test)``,
    each a list of ``[l, d, value]`` triples, aggregated over all visited
    directories.  The lists are also printed for interactive use.

    Fixes vs. the original script:
    - Tags are scanned longest-first, so ``CNN10``/``td5100`` are no longer
      mis-matched (and double-processed) via their substrings
      ``CNN1``/``td510``.
    - The accumulator lists are no longer re-initialized per directory,
      which silently discarded results from earlier directories.
    """
    ACC_train = []
    ACC_test = []
    LOSS_train = []
    LOSS_test = []
    # Column names epoch1..epoch100 are identical for every file; build once.
    columns = ["epoch" + str(i) for i in range(1, 101)]

    def extract_value(df, row):
        # One metric row holds a single "label:value" cell plus NaN padding.
        # Keep only populated columns, split off the label, return the floats.
        sliced = df.iloc[row:row + 1].dropna(axis=1, how='all')
        parts = sliced["epoch1"].str.split(":", expand=True).fillna("")
        cells = parts.drop([0], axis=1).values
        return [float(x) for record in cells for x in record]

    for root, dirs, files in os.walk(dir, topdown):
        for name in files:
            # Only CSV files are of interest.
            if not name.endswith('.csv'):
                continue
            # Scan tags longest-first so e.g. "CNN10" is not matched as "CNN1".
            l = next((i for i in range(10, 0, -1) if "CNN" + str(i) in name), None)
            d = next((j for j in range(10, 0, -1) if "td" + str(j * 510) in name), None)
            if l is None or d is None:
                continue
            # Normalize the path separator for Windows paths.
            pathname = os.path.join(root, name).replace("\\", "/")
            df = DataFrame(pd.read_csv(pathname, header=None))
            df.columns = columns
            # Rows 4-7: train acc, test acc, train loss, test loss.
            trainacc = extract_value(df, 4)
            testacc = extract_value(df, 5)
            trainloss = extract_value(df, 6)
            testloss = extract_value(df, 7)
            # Record [layer index, data-size index, first metric value].
            ACC_train.append([l, d, trainacc[0]])
            ACC_test.append([l, d, testacc[0]])
            LOSS_train.append([l, d, trainloss[0]])
            LOSS_test.append([l, d, testloss[0]])
    print("ACC_train:", ACC_train)
    print("ACC_test:", ACC_test)
    print("LOSS_train:", LOSS_train)
    print("LOSS_test:", LOSS_test)
    return ACC_train, ACC_test, LOSS_train, LOSS_test
if __name__ == "__main__":
    # Entry point: process all CSVs under the (machine-specific) data root.
    # NOTE(review): hard-coded Windows path — adjust for your environment.
    dirname = "P:/Research/originaldata"
    walk_dir(dirname)
The code is still fairly simple, but it is much faster than processing the files by hand.
The result is as follows: