Deep learning study notes [1]: reading the file names in a folder and the contents of each txt file, and saving them as a CSV

Method 1: use os.listdir() to extract all the file names in the folder, pick out the names that end in .txt, and open each of those txt files with `with open(...)`.

import pandas as pd
import numpy as np
from  pandas import DataFrame as df
import os
import math
import keras
# Part 1: generate the train and val CSV files so that the image data can be
# looked up from them later.
root = r'C:\Users\liulu\Desktop\huaweicloud\file\train_data\train_data'
# os.listdir() returns a list with the names of the files and folders
# contained in the given directory.
files = os.listdir(root)
files

for file in files:  # entries look like 'img_1.jpg', 'img_1.txt', ...
    # Each txt file holds a single line of the form "img_101.jpg, 1": the part
    # before the comma is the image name, the part after it is the class label.
    print(file)
    if file.endswith('.txt'):
        # os.path.join() connects two or more path components:
        #   1. a separator is inserted between components where needed;
        #   2. if a component is an absolute path, every component before it
        #      is discarded;
        #   3. if the last component is empty, the result ends with a separator.
        # File modes: 'r+' is read/write and fails if the file does not exist;
        # 'w+' is read/write and creates the file if it does not exist.
        # Plain 'r' is used here because the file is only read.
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:  # root\file
            line = f.readline()  # only the first line carries the name and label
        # Split on the comma only, so "name, 1" and "name,1" both parse.
        filename, _, raw_label = line.strip().partition(',')
        filename = filename.strip()
        try:
            label = int(raw_label)  # int() ignores surrounding whitespace
        except ValueError:
            print('%s contains a malformed label line' % file)
            continue
        print(filename)
        print(label)

# Collect the image names and the labels into two separate lists.
Filename = []  # image names
Label = []  # integer class labels
for file in files:
    # Only the label files are parsed; endswith() avoids matching names that
    # merely contain ".txt" somewhere in the middle.
    if not file.endswith('.txt'):
        continue
    # The file is only read, so plain 'r' is enough (no write permission needed).
    with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
        line = f.readline()  # each label file holds a single line
    # Split on the comma only, so "name, 1" and "name,1" both parse.
    name, _, raw_label = line.strip().partition(',')
    try:
        label = int(raw_label)  # int() ignores surrounding whitespace
    except ValueError:
        print('%s contains a malformed label line' % file)
        continue
    Filename.append(name.strip())
    Label.append(label)
# print(Filename, Label)
print(type(Filename), type(Label), len(Filename), len(Label))
df_ = df({'FileName': Filename, 'Label': Label})  # one row per label file
df_
# Shuffle the rows. frac is the fraction of rows to sample (0.8 would draw
# 80% of them); 1.0 keeps every row, just in random order.
df_ = df_.sample(frac=1.0)
df_.shape
# Split into training and validation sets: first 80% (rounded up) vs the rest.
split_at = math.ceil(df_.shape[0] * 0.8)
print(df_.shape[0] * 0.8)
print(split_at)  # rounded up
df_train = df_.iloc[:split_at, :]
df_val = df_.iloc[split_at:, :]
print(df_train.shape, df_val.shape)
df_train.head()
# Write both sets as CSV into the current working directory, without the
# row index column.
df_train.to_csv('train.csv', index=False)
df_val.to_csv('val.csv', index=False)

Method 2: use glob, which can match the txt files directly

glob is a file-handling module that ships with Python. You can use it to find files that match a pattern, much like file search on Windows. It supports three wildcards: `*`, `?` and `[]`. `*` matches zero or more characters, `?` matches exactly one character, and `[]` matches one character from the given range, e.g. `[0-9]` matches a digit. Its main function is described below.

The main function of the glob module is glob(), which returns a list of all file paths that match. It takes one argument, a string specifying the path pattern to match (the path may be absolute or relative). The result only contains files in the directory being matched; files inside its subfolders are not included.

import glob
glob.glob(r'c:*.txt')

This returns the txt files matched on the C: drive (subfolders are not searched).

import os
import math
import codecs
import random
import numpy as np
from glob import glob
from PIL import Image
import pandas as pd
train_data_dir = r'C:\Users\liulu\Desktop\huaweicloud\test'
# Collect every txt label file directly inside train_data_dir.
label_files = glob(os.path.join(train_data_dir, '*.txt'))
random.shuffle(label_files)
img_paths = []  # image names read from the label files
labels = []  # label strings, in the same order as img_paths
train_df = pd.DataFrame()
valid_df = pd.DataFrame()
for file_path in label_files:
    with codecs.open(file_path, 'r', 'utf-8') as f:
        line = f.readline()  # the first line holds "<image name>, <label>"
    # Split on the comma and trim each part, so "name, 1" and "name,1" both parse.
    line_split = [part.strip() for part in line.strip().split(',')]
    if len(line_split) != 2:
        print('%s contain error lable' % os.path.basename(file_path))
        continue
    print(line_split)
    img_name, label = line_split
    img_paths.append(img_name)
    labels.append(label)

# Hold out the first `valid_count` images for validation and train on the rest.
# A single split index keeps the two sets disjoint.
# NOTE(review): the validation slice used to run up to index 500 while the
# training slice started at index 50, so the two sets shared items. 50 is kept
# here because it leaves the training set unchanged; confirm the intended size.
valid_count = 50
validation_img_paths = img_paths[:valid_count]
train_img_paths = img_paths[valid_count:]
# Re-seed before each shuffle so the ordering is reproducible between runs.
random.seed(200)
random.shuffle(train_img_paths)
random.seed(200)
random.shuffle(validation_img_paths)
print(train_img_paths)
Released five original articles · won praise 0 · Views 10

Guess you like

Origin blog.csdn.net/liuluTL/article/details/104936989