Tensorflow study notes-get training data set and test data set

Before training the neural network model, you need to obtain the training data set and the test data set. The method of obtaining the data set (get_data_train_test) introduced in this article includes the following steps:
1 In the dataset folder, the images of each category are placed in a folder named after that category;
2 Obtain all image paths and classifications;
3 Convert the classification to dictionary format;
4 Shuffle all image paths;
5 Divide all image paths into training part and test part;
6 Obtain x part
6.1 Obtain Image;
6.2 Image size adjustment;
6.3 Image dimensionality reduction;
6.4 Image pixel value inversion;
6.5 Image pixel value normalization;
7 Obtaining the y part
7.1 Obtaining the category name of the image;
7.2 Finding the id corresponding to the category name;
7.3 Building the one-hot label vector with a list comprehension;

import os
import random
import math
import sys
import cv2
import numpy as np
from PIL import Image

#数据集路径
DATASET_TRAIN_TEST_DIR = 'D:/word/data_train_test'
DATASET_TEST_DIR = 'D:/word/data_test'
#随机种子
RANDOM_SEED = 0
#验证集数量
NUM_TEST = 20
#分类数量
NUM_CLASS = 10

#获取所有文件以及分类
def get_filenames_and_classes(dataset_dir):
	#数据目录
	directories = []
	#分类名称
	class_names = []
	for filename in os.listdir(dataset_dir):
		#合并文件路径
		path = os.path.join(dataset_dir, filename)
		#判断该路径是否为目录
		if os.path.isdir(path):
			#加入数据目录
			directories.append(path)
			#加入类别名称
			class_names.append(filename)

	photo_filenames = []
	#循环每个分类的文件夹
	for directory in directories:
		for filename in os.listdir(directory):
			path = os.path.join(directory, filename)
			#把图片加入图片列表
			photo_filenames.append(path)

	return photo_filenames, class_names

def get_xs(filenames):
	xs = []
	for i in range(len(filenames)):
		image = Image.open(filenames[i]).convert('L')
		blank = Image.new('L',[28,28],(255))
		max_length = np.max(image.size)
		w = int(image.size[0]*28/max_length)
		h = int(image.size[1]*28/max_length)
		#图像尺寸不超过28*28
		image = image.resize((w,h), Image.NEAREST)
		#图像尺寸调整为28*28
		blank.paste(image, ((28-w)//2, (28-h)//2))
		#图像尺寸调整为1*784
		x = blank.resize((1,784))
		#图像转换为数组
		x = np.array(x)
		#图像降维,如[[1],[2],[3]]变为[1,2,3]
		x = x.squeeze()
		#图像像素值取反
		x = np.full(784, 255) - x
		#图像像素值归一化
		max = np.max(x)
		x = x / np.full(784, max)
		#获取多幅图像数据
		xs.append(x)
	return xs

def get_ys(filenames, class_names_to_ids):
	ys = []
	for i in range(len(filenames)):
		#获得图片的类别名称
		class_name = os.path.basename(os.path.dirname(filenames[i]))
		#找到类别名称对应的id
		class_id = class_names_to_ids[class_name]
		#列表推到
		y=[1 if id==class_id else 0 for id in range(NUM_CLASS)]
		ys.append(y)
	return ys

def get_data_train_test():
	#获得所有图片路径以及分类
	photo_filenames, class_names = get_filenames_and_classes(DATASET_TRAIN_TEST_DIR)

	#把分类转为字典格式,类似于{'A':0, 'B':1, 'C':2}
	class_names_to_ids = dict(zip(class_names, range(len(class_names))))

	#把数据切分为训练集和测试集
	random.seed(RANDOM_SEED)
	random.shuffle(photo_filenames)
	training_filenames = photo_filenames[NUM_TEST:]
	testing_filenames = photo_filenames[:NUM_TEST]
	train_xs = get_xs(training_filenames)
	train_ys = get_ys(training_filenames, class_names_to_ids)
	test_xs = get_xs(testing_filenames)
	test_ys = get_ys(testing_filenames, class_names_to_ids)

	return train_xs, train_ys, test_xs, test_ys

def get_data_test():
	filenames = []
	for filename in os.listdir(DATASET_TEST_DIR):
		#合并文件路径
		path = os.path.join(DATASET_TEST_DIR, filename)
		filenames.append(path)
	xs = get_xs(filenames)
	return xs

Guess you like

Origin blog.csdn.net/wxsy024680/article/details/115063016