A small example of producing a TFRecords dataset (multiple labels)


Producing the dataset

import os
import tensorflow as tf
import numpy as np

output_file = str(os.path.dirname(os.getcwd())) + "/deepcheml/dataset/train.tfrecords"
with tf.python_io.TFRecordWriter(output_file) as writer:
    # four samples, each with a 5-dim multi-hot label and a 6-dim feature vector
    labels = np.array([[1,0,0,1,0],[0,1,0,0,1],[0,0,0,0,1],[1,0,0,0,0]])
    features = np.array([[0,0,0,0,0,0],[1,1,1,1,1,2],[1,1,1,0,0,2],[0,0,0,0,1,9]])
    for i in range(4):
        label = labels[i]
        feature = features[i]
        example = tf.train.Example(features=tf.train.Features(feature={
            'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
            'feature': tf.train.Feature(int64_list=tf.train.Int64List(value=feature))
        }))
        writer.write(example.SerializeToString())
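To verify what was written, the records can be read back eagerly with tf.python_io.tf_record_iterator; a small sketch:

# read each serialized record back and parse it into a tf.train.Example
for serialized in tf.python_io.tf_record_iterator(output_file):
    example = tf.train.Example.FromString(serialized)
    print(example.features.feature['label'].int64_list.value)
    print(example.features.feature['feature'].int64_list.value)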

Reading the dataset

import os
import tensorflow as tf
import numpy as np

def read_tf(output_file):
    filename_queue = tf.train.string_input_producer([output_file])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    result = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.FixedLenFeature([], tf.int64),
                                           'feature': tf.FixedLenFeature([], tf.int64),
                                       })
    feature = result['feature']
    label = result['label']
    return feature, label

output_file = str(os.path.dirname(os.getcwd())) + "/deepcheml/dataset/train.tfrecords"
feature, label = read_tf(output_file)
imageBatch, labelBatch = tf.train.batch([feature, label], batch_size=2, capacity=3)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord=tf.train.Coordinator()
    threads= tf.train.start_queue_runners(sess=sess,coord=coord)
    print(1)
    images, labels = sess.run([imageBatch, labelBatch])
    print(images)
    print(labels)
    coord.request_stop()
    coord.join(threads)

Running this reports the following error:

1
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.InvalidArgumentError'>, Name: <unknown>, Key: label, Index: 0.  Number of int64 values != expected.  Values size: 5 but output shape: []
	 [[Node: ParseSingleExample_3/ParseExample/ParseExample = ParseExample[Ndense=2, Nsparse=0, Tdense=[DT_INT64, DT_INT64], dense_shapes=[[], []], sparse_types=[], _device="/job:localhost/replica:0/task:0/cpu:0"](ParseSingleExample_3/ExpandDims, ParseSingleExample_3/ParseExample/ParseExample/names, ParseSingleExample_3/ParseExample/ParseExample/dense_keys_0, ParseSingleExample_3/ParseExample/ParseExample/dense_keys_1, ParseSingleExample_3/ParseExample/Const, ParseSingleExample_3/ParseExample/Const_1)]]
---------------------------------------------------------------------------
OutOfRangeError                           Traceback (most recent call last)
/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1326     try:
-> 1327       return fn(*args)
   1328     except errors.OpError as e:

/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1305                                    feed_dict, fetch_list, target_list,
-> 1306                                    status, run_metadata)
   1307 

/opt/anaconda3/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
     87             try:
---> 88                 next(self.gen)
     89             except StopIteration:

/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
    465           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466           pywrap_tensorflow.TF_GetCode(status))
    467   finally:

OutOfRangeError: FIFOQueue '_55_batch_5/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
	 [[Node: batch_5 = QueueDequeueManyV2[component_types=[DT_INT64, DT_INT64], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch_5/fifo_queue, batch_5/n)]]

During handling of the above exception, another exception occurred:

OutOfRangeError                           Traceback (most recent call last)
<ipython-input-6-e2d123f65efb> in <module>()
     23     threads= tf.train.start_queue_runners(sess=sess,coord=coord)
     24     print(1)
---> 25     images, labels = sess.run([imageBatch, labelBatch])
     26     print(images)
     27     print(labels)

The fix is to specify the exact lengths:

    result = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.FixedLenFeature([5], tf.int64),
                                           'feature': tf.FixedLenFeature([6], tf.int64),
                                       })
Run again:

1
[[0 0 0 0 0 0]
 [1 1 1 1 1 2]]
[[1 0 0 1 0]
 [0 1 0 0 1]]
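For comparison, the same fixed-length parsing can also be written with the Dataset API instead of input queues; a sketch (tf.data.TFRecordDataset is available from TF 1.4 on; this post itself uses tf.contrib.data later):

def parse_fn(serialized):
    # fixed-length parsing, same shapes as above
    parsed = tf.parse_single_example(serialized, features={
        'label': tf.FixedLenFeature([5], tf.int64),
        'feature': tf.FixedLenFeature([6], tf.int64),
    })
    return parsed['feature'], parsed['label']

dataset = tf.data.TFRecordDataset([output_file]).map(parse_fn).batch(2)
feature_batch, label_batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    f, l = sess.run([feature_batch, label_batch])   # no queue runners needed
    print(f)
    print(l)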

What if the lengths themselves can vary?

Producing the dataset (same procedure as above):

import os
import tensorflow as tf
import numpy as np

train_TFfile = str(os.path.dirname(os.getcwd())) + "/deepcheml/dataset/hh.tfrecords"
writer = tf.python_io.TFRecordWriter(train_TFfile)
# labels and features now have a different length for each sample
labels = [[1,2,3],[3,4],[5,2,6],[6,4,9],[9]]
features = [[2,5],[3],[5,8],[1,4],[5,9]]
for i in range(5):
    label = labels[i]
    print(label)
    feature = features[i]
    example = tf.train.Example(
        features=tf.train.Features(
            feature={'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
                     'feature': tf.train.Feature(int64_list=tf.train.Int64List(value=feature))}))
    writer.write(example.SerializeToString())
writer.close()

Reading the dataset:

The main change is:

tf.VarLenFeature(tf.int64)
import os
import tensorflow as tf
import numpy as np

def read_tf(output_file):
    filename_queue = tf.train.string_input_producer([output_file])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    result = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.VarLenFeature(tf.int64),
                                           'feature': tf.VarLenFeature(tf.int64),
                                       })
    feature = result['feature']
    label = result['label']
    return feature, label

output_file = str(os.path.dirname(os.getcwd())) + "/deepcheml/dataset/hh.tfrecords"
Feature, Label = read_tf(output_file)
Feature_batch, Label_batch = tf.train.batch([Feature, Label], batch_size=2,capacity=3)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord=tf.train.Coordinator()
    threads= tf.train.start_queue_runners(sess=sess,coord=coord)
    
    for i in range(3):
        feature_batch, label_batch = sess.run([Feature_batch, Label_batch])
        print(feature_batch)
        print(label_batch)
        print('---------')
    coord.request_stop()
    coord.join(threads)

Output:

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0]]), values=array([2, 5, 3]), dense_shape=array([2, 2]))
SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 1]]), values=array([1, 2, 3, 3, 4]), dense_shape=array([2, 3]))
---------
SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]]), values=array([5, 8, 1, 4]), dense_shape=array([2, 2]))
SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 1],
       [1, 2]]), values=array([5, 2, 6, 6, 4, 9]), dense_shape=array([2, 3]))
---------
SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]]), values=array([5, 9, 2, 5]), dense_shape=array([2, 2]))
SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [1, 1],
       [1, 2]]), values=array([9, 1, 2, 3]), dense_shape=array([2, 3]))
---------
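If dense, padded tensors are needed downstream, the batched SparseTensor can be converted at graph level with tf.sparse_tensor_to_dense; a sketch reusing Feature_batch from above:

# pad each row with the default value up to the longest row in the batch
Dense_feature_batch = tf.sparse_tensor_to_dense(Feature_batch, default_value=0)
# the first batch above would then come out as
# [[2 5]
#  [3 0]]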

When is the variable-length form most useful? Typically when we use a sparse representation to save space. Consider the more general situation where each sample carries multiple labels.

Suppose we have three samples and eight categories, with dense labels:

[0,1,0,0,0,0,0,0]

[1,0,0,0,1,0,0,0]

[0,0,1,1,0,1,0,0]

With a sparse representation these become:

[1]

[0,4]

[2,3,5]

So how do we recover the original dense labels from this? We can use tf.sparse_to_dense. A minimal standalone sketch of the idea follows, and then a full example.
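The sketch feeds the (sample, class) coordinates of the three sparse labels above directly to tf.sparse_to_dense:

import tensorflow as tf

# (sample index, class index) pairs for the labels [1], [0,4], [2,3,5]
indices = [[0, 1], [1, 0], [1, 4], [2, 2], [2, 3], [2, 5]]
dense = tf.sparse_to_dense(indices, [3, 8], 1.0, 0.0)
with tf.Session() as sess:
    print(sess.run(dense))   # reproduces the three dense label rows above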

Producing the dataset

# extract the training-set fingerprints and labels
import os 
import tensorflow as tf 
import numpy as np
train_TFfile = str(os.path.dirname(os.getcwd()))+"/deepcheml/dataset/kk.tfrecords"
writer = tf.python_io.TFRecordWriter(train_TFfile)
labels = [[1],[0,4],[2,3,5]]
for i in range(3):
    label = labels[i]
    example = tf.train.Example(
        features = tf.train.Features(
            feature = {'label':tf.train.Feature(int64_list = tf.train.Int64List(value = label))}))
    writer.write(example.SerializeToString())
writer.close()

Reading the dataset:

import os
import tensorflow as tf
import numpy as np

def read_tf(output_file):
    filename_queue = tf.train.string_input_producer([output_file])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    result = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.VarLenFeature(tf.int64),
                                       })
    label = result['label']
    return label

output_file = str(os.path.dirname(os.getcwd())) + "/deepcheml/dataset/kk.tfrecords"
Label = read_tf(output_file)
Label_batch = tf.train.batch([Label], batch_size=3, capacity=3)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    label_batch = sess.run(Label_batch)   # a SparseTensorValue
    print(label_batch)
    print('---------')
    label_x = tf.expand_dims(label_batch.indices[:, 0], 1)   # sample index of each value
    label_y = tf.expand_dims(label_batch.values, 1)          # class index of each value
    label_index = tf.concat([label_x, label_y], 1).eval()    # (sample, class) coordinates
    feature = tf.sparse_to_dense(label_index, [3, 8], 1.0, 0.0, validate_indices=False)
    print(sess.run(feature))
    coord.request_stop()
    coord.join(threads)

Output:

SparseTensorValue(indices=array([[0, 0],
       [1, 0],
       [1, 1],
       [2, 0],
       [2, 1],
       [2, 2]]), values=array([1, 0, 4, 2, 3, 5]), dense_shape=array([3, 3]))
---------
[[0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 1. 0. 1. 0. 0.]]

A brief explanation:

From the printed label_batch above you can see that a SparseTensorValue is made up of indices, values, and dense_shape.

label_x is the first column of label_batch.indices, i.e. 0,1,1,2,2,2.

label_y is label_batch.values, i.e. 1,0,4,2,3,5.

label_index is the resulting list of coordinates: the positions in the original label matrix that should be set to 1.

Pay special attention to validate_indices: it must be set to False, because otherwise the call will raise an error whenever the indices are not in increasing order.
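To see why, a quick sketch: with the default validate_indices=True, tf.sparse_to_dense insists on row-major increasing indices and raises InvalidArgumentError otherwise, which happens whenever the label values within one sample are unsorted:

# (1, 4) appears before (1, 0): not in row-major increasing order
bad_indices = [[0, 1], [1, 4], [1, 0]]
ok = tf.sparse_to_dense(bad_indices, [2, 8], 1.0, 0.0, validate_indices=False)   # fine
bad = tf.sparse_to_dense(bad_indices, [2, 8], 1.0, 0.0)   # raises InvalidArgumentError when run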


For actual neural-network training, we can use TFRecords together with the Dataset API from the start, so the data does not have to be fed in from outside:

For example:

Producing the dataset (again a multi-label example):

# extract the training-set fingerprints and labels
import os 
import tensorflow as tf 
import numpy as np
train_TFfile = str(os.path.dirname(os.getcwd()))+"/deepcheml/dataset/kk.tfrecords"
writer = tf.python_io.TFRecordWriter(train_TFfile)
labels = [[1,3],[0,4,1],[7,9],[3,2,1,5],[7,5,9,2,1,4]]
for i in range(5):
    label = labels[i]
    example = tf.train.Example(
        features = tf.train.Features(
            feature = {'label':tf.train.Feature(int64_list = tf.train.Int64List(value = label))}))
    writer.write(example.SerializeToString())
writer.close()
import os
import tensorflow as tf
import numpy as np

def read_tf(example_proto):
    result = tf.parse_single_example(example_proto,
                                       features={
                                           'label': tf.VarLenFeature(tf.int64),
                                       })
    # densify the variable-length label list, e.g. [1, 3]
    result['label'] = tf.sparse_tensor_to_dense(result['label'])
    # build (0, class) coordinates for a single-row label matrix
    y = tf.cast(tf.expand_dims(result['label'], 1), tf.int32)
    x = tf.expand_dims(tf.fill([tf.shape(result['label'])[0]], 0), 1)
    concated = tf.concat([x, y], 1)
    # scatter 1.0 at those coordinates to get a multi-hot vector of length 10
    result['label'] = tf.sparse_to_dense(concated, [1, 10], 1.0, 0.0, validate_indices=False)[0]
    return result

train_file = str(os.path.dirname(os.getcwd())) + "/deepcheml/dataset/kk.tfrecords"

num_epochs = 2
dataset = tf.contrib.data.TFRecordDataset([train_file])  # tf.data.TFRecordDataset in TF >= 1.4
new_dataset = dataset.map(read_tf)
epoch_dataset = new_dataset.repeat(num_epochs)
# shuffle_dataset = epoch_dataset.shuffle(buffer_size=100)

batch_dataset = epoch_dataset.batch(2)
iterator = batch_dataset.make_one_shot_iterator()
next_element = iterator.get_next()



with tf.Session() as sess:
    i = 1
    while True:
        try:
            label = sess.run(next_element['label'])
        # raises OutOfRangeError once the dataset has been fully consumed
        except tf.errors.OutOfRangeError:
            print("End of dataset")
            break
        else:
            # print the multi-hot label batch for each step
            print('==============example %s ==============' % i)
            print(label)
        i += 1

Output:

==============example 1 ==============
[[0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 1. 0. 0. 0. 0. 0.]]
==============example 2 ==============
[[0. 0. 0. 0. 0. 0. 0. 1. 0. 1.]
 [0. 1. 1. 1. 0. 1. 0. 0. 0. 0.]]
==============example 3 ==============
[[0. 1. 1. 0. 1. 1. 0. 1. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]]
==============example 4 ==============
[[1. 1. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 1.]]
==============example 5 ==============
[[0. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 1. 0. 1. 0. 1.]]
End of dataset
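As an aside, the coordinate construction inside read_tf can also be written more compactly with tf.one_hot plus a max-reduction; a minimal sketch, assuming 10 classes and at least one label per sample:

def to_multi_hot(sparse_label, num_classes=10):
    dense = tf.sparse_tensor_to_dense(sparse_label)              # e.g. [1, 3]
    one_hot = tf.one_hot(dense, num_classes, dtype=tf.float32)   # shape [n, 10]
    return tf.reduce_max(one_hot, axis=0)                        # multi-hot, shape [10]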

One thing to note when using this method is the parsing step, namely the statement

  result['label'] = tf.sparse_tensor_to_dense(result['label'])

Its role is illustrated by a figure from https://zhuanlan.zhihu.com/p/33223782 (not reproduced here). That post explains it very clearly and is recommended reading.

A question that remains open:

With the structure above, if during training we want to periodically check how the model performs on the test set, how do we feed the test set into the network?

We assume that the network is defined as follows:

def Net(x):
    .....
    return result

Following the pattern above, we can write directly:

prediction = Net(next_element['label'])

Now suppose we want to check the performance on the test set. How should this be done?

Simply call the network a second time?

prediction = Net(test)

But are the weights used in this second call to Net the same weights being trained above, or has a second network been created? Clearly it is a second network: training keeps updating the weights of the training-set network, while the test network's weights remain in their initialized state. The training weights and the test weights are two entirely separate sets that are never unified, so the test results never change. In this situation it is better to unify the two through a single feed.
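A minimal sketch of that feed-based approach (Net and test_batch stand in for your own model and data): build the graph once on a tf.placeholder, then feed it training batches pulled from the pipeline or test batches alike, so only one set of weights exists:

x = tf.placeholder(tf.float32, [None, 10])     # single entry point for train and test data
prediction = Net(x)                            # Net is constructed exactly once

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_batch = sess.run(next_element['label'])                  # batch from the pipeline
    train_pred = sess.run(prediction, feed_dict={x: train_batch})
    test_pred = sess.run(prediction, feed_dict={x: test_batch})    # same weights on test data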

Origin: blog.csdn.net/weixin_42001089/article/details/90236241