Make a tensorflow dataset

1. First, prepare your own image dataset. I use five categories from CIFAR-10: bird, car, cat, deer, and plane. The images of each category are placed in their own folder, e.g.:


Next, generate the tfrecord files from the dataset. The generated files contain serialized protobuf messages (a binary format that speeds up file transfer and processing). The code is as follows:

<

import math
import os
import random
import sys

import tensorflow as tf

# Number of images held out for the test split (the rest go to training).
_NUM_TEST = 500
# Seed for the shuffle that decides which images land in which split.
_RANDOM_SEED = 0
# Number of tfrecord shards written for each split.
_NUM_SHARDS = 5
# Dataset path: a directory containing one sub-directory of images per class.
DATASET_DIR = "D:/Tensorflow/slim/images/"
# Label file name (absolute path of the id:class_name listing).
LABELS_FILENAME = "D:/Tensorflow/slim/images/labels.txt"

#Build the path + name of one tfrecord shard file
def _get_dataset_filename(dataset_dir, split_name, shard_id):
    """Return the full path of shard ``shard_id`` for split ``split_name``.

    The file name has the form ``image_<split>_<shard>-of-<total>.tfrecord``,
    where the shard index and the total (``_NUM_SHARDS``) are zero-padded to
    five digits, and it is joined onto ``dataset_dir``.
    """
    name_fields = (split_name, shard_id, _NUM_SHARDS)
    shard_basename = 'image_%s_%05d-of-%05d.tfrecord' % name_fields
    return os.path.join(dataset_dir, shard_basename)

#Determine whether the tfrecord files exist
def _dataset_exists(dataset_dir):
    """Return True only if every shard of both splits is already on disk.

    Args:
        dataset_dir: directory in which the tfrecord shards are stored.

    Returns:
        False as soon as one expected shard file is missing, True otherwise.
    """
    for split_name in ['train', 'test']:
        for shard_id in range(_NUM_SHARDS):
            #Define the path + name of the tfrecord file
            output_filename = _get_dataset_filename(dataset_dir, split_name, shard_id)
            # The check has to run once per shard; testing only after the
            # inner loop would look at the last shard of each split alone.
            if not tf.gfile.Exists(output_filename):
                return False
    return True

#Get all files and categories
def _get_filenames_and_classes(dataset_dir):
    """Collect every image path and every class name under ``dataset_dir``.

    Each sub-directory of ``dataset_dir`` is treated as one class whose name
    is the directory name; every entry inside it is taken as an image file.
    Plain files directly inside ``dataset_dir`` are ignored.

    Args:
        dataset_dir: root directory holding one sub-directory per class.

    Returns:
        A tuple ``(photo_filenames, class_names)``: the list of image paths
        and the list of class names, both in sorted directory order.
    """
    #Data directories
    directories = []
    #Classification names
    class_names = []
    # os.listdir returns entries in arbitrary order; sorting keeps the class
    # order (and therefore the label ids derived from it) reproducible.
    for filename in sorted(os.listdir(dataset_dir)):
        #Merge file path
        path = os.path.join(dataset_dir, filename)
        #Determine whether the path is a directory
        if os.path.isdir(path):
            #Add data directory
            directories.append(path)
            #Add category name
            class_names.append(filename)

    photo_filenames = []
    #Loop over the folder of each category
    for directory in directories:
        for filename in sorted(os.listdir(directory)):
            #Add the picture to the picture list
            photo_filenames.append(os.path.join(directory, filename))

    return photo_filenames, class_names

def int64_feature(values):
    """Wrap ``values`` in a ``tf.train.Feature`` backed by an ``Int64List``.

    A value that is not already a tuple or list is first placed in a
    one-element list.
    """
    as_sequence = values if isinstance(values, (tuple, list)) else [values]
    int_list = tf.train.Int64List(value=as_sequence)
    return tf.train.Feature(int64_list=int_list)

def bytes_feature(values):
    """Wrap ``values`` as the single entry of a ``BytesList`` feature."""
    payload = tf.train.BytesList(value=[values])
    return tf.train.Feature(bytes_list=payload)

def image_to_tfexample(image_data, image_format, class_id):
    """Assemble a ``tf.train.Example`` describing one image.

    The example carries three features: the image content under
    ``image/encoded``, its format under ``image/format`` and the class id
    under ``image/class/label``.
    """
    feature_map = {
      'image/encoded': bytes_feature(image_data),
      'image/format': bytes_feature(image_format),
      'image/class/label': int64_feature(class_id),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

def write_label_file(labels_to_class_names, dataset_dir, filename=LABELS_FILENAME):
    """Write one ``<label id>:<class name>`` line per entry of the mapping.

    The output path is ``filename`` joined onto ``dataset_dir``; entries are
    written in the iteration order of ``labels_to_class_names``.
    """
    target_path = os.path.join(dataset_dir, filename)
    with tf.gfile.Open(target_path, 'w') as label_file:
        for label_id in labels_to_class_names:
            label_file.write('%d:%s\n' % (label_id, labels_to_class_names[label_id]))

#Convert the data to TFRecord format
def _convert_dataset(split_name, filenames, class_names_to_ids, dataset_dir):
    """Write ``filenames`` into ``_NUM_SHARDS`` tfrecord files for one split.

    Args:
        split_name: either 'train' or 'test'.
        filenames: list of image paths; the name of each file's parent
            directory is used as its class name.
        class_names_to_ids: dict mapping class name to integer label id.
        dataset_dir: directory in which the tfrecord shards are created.

    Raises:
        AssertionError: if ``split_name`` is not 'train' or 'test'.
        KeyError: if a file's parent directory name is not a key of
            ``class_names_to_ids``.

    Files that raise IOError while being read are reported and skipped.
    """
    assert split_name in ['train', 'test']
    num_files = len(filenames)
    #Calculate how much data each data block has. Rounding up guarantees the
    #shards together cover every file even when the count is not a multiple
    #of _NUM_SHARDS.
    num_per_shard = int(math.ceil(num_files / float(_NUM_SHARDS)))
    for shard_id in range(_NUM_SHARDS):
        #Define the path + name of the tfrecord file
        output_filename = _get_dataset_filename(dataset_dir, split_name, shard_id)
        with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer:
            #The starting position of each data block
            start_ndx = shard_id * num_per_shard
            #The last position of each data block
            end_ndx = min((shard_id + 1) * num_per_shard, num_files)
            for i in range(start_ndx, end_ndx):
                try:
                    sys.stdout.write('\r>> Converting image %d/%d shard %d' % (i+1, num_files, shard_id))
                    sys.stdout.flush()
                    #Read the picture. Binary mode keeps the encoded bytes
                    #intact, and the with-block closes the handle.
                    with tf.gfile.FastGFile(filenames[i], 'rb') as image_file:
                        image_data = image_file.read()
                    #Get the category name of the picture
                    class_name = os.path.basename(os.path.dirname(filenames[i]))
                    #Find the id corresponding to the class name
                    class_id = class_names_to_ids[class_name]
                    #Generate tfrecord file
                    example = image_to_tfexample(image_data, b'jpg', class_id)
                    tfrecord_writer.write(example.SerializeToString())
                except IOError as e:
                    print("Could not read:", filenames[i])
                    print("Error:", e)
                    print("Skip it\n")

    sys.stdout.write('\n')
    sys.stdout.flush()


if __name__ == '__main__':
    #Determine whether the tfrecord file exists
    if _dataset_exists(DATASET_DIR):
        print('tfrecord file already exists')
    else:
        #Get all pictures and categories
        photo_filenames, class_names = _get_filenames_and_classes(DATASET_DIR)
        # Convert the classification to dictionary format, similar to {'house': 3, 'flower': 1, 'plane': 4, 'guitar': 2, 'animal': 0}
        class_names_to_ids = dict(zip(class_names, range(len(class_names))))

        #Split the data into training set and test set: after a seeded
        #shuffle the first _NUM_TEST files form the test set.
        random.seed(_RANDOM_SEED)
        random.shuffle(photo_filenames)
        training_filenames = photo_filenames[_NUM_TEST:]
        testing_filenames = photo_filenames[:_NUM_TEST]

        #Data conversion
        _convert_dataset('train', training_filenames, class_names_to_ids, DATASET_DIR)
        _convert_dataset('test', testing_filenames, class_names_to_ids, DATASET_DIR)

        #Output labels file
        labels_to_class_names = dict(zip(range(len(class_names)), class_names))
        write_label_file(labels_to_class_names, DATASET_DIR)

>

You can then run the above code to produce the tfrecord files; the result is as follows:

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324439908&siteId=291194637