（五）Tensorflow的IO处理和大规模数据处理方式

1.同步io和异步io

同步类似于排队：前一个人的任务完成之后才轮到下一个；异步则是发起任务后不必原地等待结果，可以先去做别的事情，等任务完成时再处理结果。

2.并行和并发

并发是一种轮询（时间片轮转）机制，每个任务执行一会就换下一个线程执行，通常时间片很短，宏观上看起来像是同时进行

并行是真正的同时执行，多个任务在多个处理器核心上一块执行！

3.join的作用

对某个线程调用join后，调用join的线程会阻塞等待，直到被join的线程执行完毕才继续往下执行（其他线程仍可照常运行）！

java的一个例子：

4.生产者消费者模型

两个独立线程，通过缓冲区来解耦，实现了异步无需互相等待：生产者往缓冲区加入数据，直到缓冲区满就等待消费者取数据；消费者从缓冲区取数据，直到缓冲区为空就等待生产者写入数据！

数据IO的三种方式

1.数据直接嵌入graph（constant或者Variable），再由graph传入session中运行

2.我们最常用的placeholder占位符替代数据，运行时才会填入数据

尽管已经很好了，IO的输入输出会成为一个瓶颈！

3.Pipeline: XXXReader, Queue的流程

TensorFlow提供了一个队列机制，通过多线程将读取数据与计算数据分开。因为在处理海量数据集的训练时，无法把数据集一次全部载入到内存中，需要一边从硬盘中读取，一边进行训练，为了加快训练速度，我们可以采用多个线程读取数据，一个线程消耗数据。

Queue机制：

producer-consumer pattern(生产消费模式)
独立于主线程执行
异步IO: reader.read(queue) tf.train.batch()

tf.TextLineReader()：处理有换行符分割的文件

tf.WholeFileReader()：处理多个文件时，每次读入一个文件

tf.TFRecordReader()：从TFRecord文件中读取

def productor_custorm():
    """Print every row of the matched CSV files using a TF1 input queue.

    A filename queue (the producer) feeds a ``TextLineReader`` (the
    consumer). Each line is parsed as two string columns and printed as
    ``[example, label]`` until the single epoch is exhausted.

    Returns:
        None. Output goes to stdout.
    """
    # Local import, matching the other snippets in this file.
    import tensorflow as tf

    # Producer: a queue of file names matching a glob pattern (not a regex).
    # Raw string: '\d' and '\*' are not valid escape sequences.
    filenames = tf.train.match_filenames_once(r'.\data\*.csv')
    # num_epochs=1: each file name is enqueued exactly once.
    filename_queue = tf.train.string_input_producer(filenames, shuffle=False, num_epochs=1)

    # Consumer: read one text line per run from the queued files.
    reader = tf.TextLineReader()
    _, value = reader.read(filename_queue)
    # Two columns per line, both defaulting to the string 'null'.
    example, label = tf.decode_csv(value, record_defaults=[['null'], ['null']])

    # Local variables must be initialised here: the matched file names and
    # the epoch counter created by num_epochs are stored as local variables.
    init = tf.local_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        coord = tf.train.Coordinator()  # coordinates the queue-runner threads
        # Start the threads that fill the filename queue.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            # Loop until the queue signals exhaustion. A fixed iteration
            # count would be fragile because the number of rows is unknown.
            while not coord.should_stop():
                print(sess.run([example, label]))
        except tf.errors.OutOfRangeError:
            print('Epochs complete!')
        finally:
            coord.request_stop()
            # Wait for the queue-runner threads before leaving the session.
            coord.join(threads)

TensorFlow 统一的数据输入格式：TFRecord

def csv_to_TFRecord():
    """Convert ``train.csv`` into the TFRecord file ``csv_train.tfrecords``.

    The ``label`` column is split off from the frame. Every remaining row
    is stored as raw bytes under the feature key ``"image_raw"``, with its
    label stored as an int64 under ``"label"``.

    Returns:
        None. Writes ``csv_train.tfrecords`` in the working directory and
        prints the frame head and array shapes.
    """
    import tensorflow as tf
    import pandas as pd

    train_frame = pd.read_csv("train.csv")
    print(train_frame.head())
    # DataFrame.pop removes the column from the frame and returns it.
    train_labels_frame = train_frame.pop(item="label")
    # Convert both parts to NumPy arrays.
    train_values = train_frame.values
    train_labels = train_labels_frame.values
    print("values shape: ", train_values.shape)
    print("labels shape:", train_labels.shape)

    # The context manager closes the writer even if a row fails to serialise.
    with tf.python_io.TFRecordWriter("csv_train.tfrecords") as writer:
        for row, label in zip(train_values, train_labels):
            # tobytes() replaces the deprecated tostring() with identical output.
            # NOTE(review): the bytes keep the array's own dtype; any reader
            # must decode with that same dtype - confirm against the reader.
            image_raw = row.tobytes()
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_raw])),
                        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
                    }
                )
            )
            # Serialise the Example protobuf and append it to the file.
            writer.write(record=example.SerializeToString())

def picture_to_TFRecord():
    """Pack the matched PNG images into the TFRecord file ``png_train.tfrecords``.

    Each image is loaded with matplotlib. Its raw array bytes are stored
    under the feature key ``"image_raw"`` and its label under ``"label"``.

    Returns:
        None. Writes ``png_train.tfrecords`` in the working directory and
        prints each file name with its array shape.
    """
    import glob

    import matplotlib.image as mpimg
    import tensorflow as tf

    def get_label_from_filename(filename):
        # Placeholder: every image currently gets label 1.
        return 1

    # Collect the paths with a plain glob: match_filenames_once returns a
    # graph variable, which cannot be iterated in a Python for-loop.
    # Sorted so the record order is reproducible.
    filenames = sorted(glob.glob(r'.\data\*.png'))

    # The context manager closes the writer even if an image fails to load.
    with tf.python_io.TFRecordWriter('png_train.tfrecords') as writer:
        for filename in filenames:
            img = mpimg.imread(filename)
            print("{} shape is {}".format(filename, img.shape))
            # tobytes() replaces the deprecated tostring() with identical output.
            # NOTE(review): the bytes keep whatever dtype imread returned; a
            # reader must decode with that same dtype - confirm before use.
            img_raw = img.tobytes()
            label = get_label_from_filename(filename)
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        # Fixed: the original referenced an undefined name here.
                        "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_raw])),
                        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
                    }
                )
            )
            # Serialise the Example protobuf and append it to the file.
            writer.write(record=example.SerializeToString())

TFRecord数据的读取

def productor_custorm_tfrecord():
    """Print every record of ``csv_train.tfrecords`` using a TF1 input queue.

    Each record is parsed into the ``"image_raw"`` bytes feature and the
    ``"label"`` int64 feature, then printed as ``[images, labels]`` until
    the single epoch is exhausted.

    Returns:
        None. Output goes to stdout.
    """
    import tensorflow as tf

    # Producer: queue the file name once. Without num_epochs the producer
    # cycles forever and the OutOfRangeError branch below is never reached.
    filename_queue = tf.train.string_input_producer(["csv_train.tfrecords"], num_epochs=1)
    # Consumer: read one serialised record per run.
    reader = tf.TFRecordReader()
    _, serialized_record = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_record,
        features={
            # tf.FixedLenFeature yields a Tensor;
            # tf.VarLenFeature would yield a SparseTensor.
            "image_raw": tf.FixedLenFeature([], tf.string),
            "label": tf.FixedLenFeature([], tf.int64),
        })
    # Convert the raw feature values to the working dtypes.
    # NOTE(review): decoding as uint8 assumes the stored bytes came from a
    # uint8 array - confirm against the code that wrote the file.
    images = tf.decode_raw(features["image_raw"], tf.uint8)
    labels = tf.cast(features["label"], tf.int32)

    # num_epochs creates a local epoch counter that must be initialised.
    init = tf.local_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        coord = tf.train.Coordinator()  # coordinates the queue-runner threads
        # Start the threads that fill the filename queue.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            while not coord.should_stop():
                print(sess.run([images, labels]))
        except tf.errors.OutOfRangeError:
            print('Epochs complete!')
        finally:
            coord.request_stop()
            # Wait for the queue-runner threads before leaving the session.
            coord.join(threads)

TensorFlow 图像的一些预处理

def change_picture_style():
    """Decode a JPEG screenshot and re-encode it to ``changed_picture.jpeg``.

    Returns:
        None. Writes ``changed_picture.jpeg`` in the working directory.
    """
    import tensorflow as tf

    # Read the encoded bytes. The context manager closes the handle, which
    # the original one-liner left open.
    with tf.gfile.GFile("截屏_20180705_235831.jpg", "rb") as src:
        image_raw = src.read()

    # Decode to a pixel tensor, then encode again as JPEG.
    # (tf.image.decode_png would be the equivalent for PNG input.)
    img_data = tf.image.decode_jpeg(image_raw)
    encoded_image = tf.image.encode_jpeg(img_data)

    with tf.Session() as sess:
        # The original author noted the re-encoded file was about half the
        # size of the source with no visible distortion.
        with tf.gfile.GFile("changed_picture.jpeg", "wb") as f:
            f.write(sess.run(encoded_image))

def save_picture(image_data,name):
    """Encode an image tensor as JPEG and write it to the file ``name``.

    Args:
        image_data: Image tensor to save; it is cast to uint8 before encoding.
        name: Destination file path.

    Returns:
        None.
    """
    # Local import, matching the other snippets in this file.
    import tensorflow as tf

    # encode_jpeg needs uint8 input, while some operations (for example
    # standardisation) produce floating-point tensors.
    as_uint8 = tf.cast(image_data, tf.uint8)
    encoded = tf.image.encode_jpeg(as_uint8)
    with tf.Session() as sess:
        with tf.gfile.GFile(name, "wb") as f:
            f.write(sess.run(encoded))

def adjust_picture():
    """Apply a series of ``tf.image`` transformations to one JPEG and save each result.

    Covers resizing, cropping or padding, flipping, transposing, brightness,
    contrast and per-image standardisation. Every result is written through
    ``save_picture`` under a fixed file name.

    Returns:
        None. Writes the output JPEG files in the working directory.
    """
    # Local import, matching the other snippets in this file.
    import tensorflow as tf

    # Read the encoded bytes. The context manager closes the handle, which
    # the original one-liner left open.
    with tf.gfile.GFile("截屏_20180705_235831.jpg", "rb") as src:
        encoded = src.read()
    # Decoding yields a three-dimensional pixel tensor.
    img_data = tf.image.decode_jpeg(encoded)

    # Bring the picture to one common size.
    # method = {0: bilinear, 1: nearest neighbor, 2: bicubic, 3: area}
    resized_img = tf.image.resize_images(img_data, (300, 300), method=0)
    save_picture(resized_img,"resized1.jpeg")
    # Reach the target size by cropping or padding instead of scaling.
    croped_or_padded = tf.image.resize_image_with_crop_or_pad(img_data, 2000, 2000)
    save_picture(croped_or_padded, "resized2.jpeg")
    central_cropped = tf.image.central_crop(img_data, 0.5)
    save_picture(central_cropped, "resized3.jpeg")

    # Flips and transpose.
    flipped = tf.image.flip_up_down(img_data)
    save_picture(flipped, "flipped1.jpeg")
    flipped = tf.image.flip_left_right(img_data)
    save_picture(flipped, "flipped2.jpeg")
    transposed = tf.image.transpose_image(img_data)
    save_picture(transposed, "flipped3.jpeg")
    flipped = tf.image.random_flip_up_down(img_data)
    save_picture(flipped, "flipped4.jpeg")

    # Brightness: darker, brighter, then a random delta.
    adjusted = tf.image.adjust_brightness(img_data, -0.5)
    save_picture(adjusted, "adjusted1.jpeg")
    adjusted = tf.image.adjust_brightness(img_data, 0.5)
    save_picture(adjusted, "adjusted2.jpeg")
    adjusted = tf.image.random_brightness(img_data, 0.5)
    save_picture(adjusted, "adjusted3.jpeg")

    # Contrast: factor -5, factor 5, then a random factor between 1 and 10.
    adjusted = tf.image.adjust_contrast(img_data, -5)
    save_picture(adjusted, "adjusted4.jpeg")
    adjusted = tf.image.adjust_contrast(img_data, 5)
    save_picture(adjusted, "adjusted5.jpeg")
    adjusted = tf.image.random_contrast(img_data, 1, 10)
    save_picture(adjusted, "adjusted6.jpeg")

    # Standardise the image to mean 0 and standard deviation 1.
    adjusted = tf.image.per_image_standardization(img_data)
    save_picture(adjusted, "adjusted7.jpeg")