Feeding data to the graph with TFRecord: model training, saving with Saver, and exporting to a pb-format file

Most tutorials and demos on the web that train a TensorFlow model feed data through tf.placeholder, but with large text or image datasets placeholder feeding does not scale: both throughput and memory become bottlenecks. Most sample code also skips model saving entirely, let alone deployment. This post walks through the whole pipeline: converting raw data to TFRecord, feeding the TFRecord data to the model for training, saving the model, and then converting the saved checkpoint into a pb-format file. For deployment, as I discussed earlier, you can use TensorFlow's Java API or tensorflow-serving. This is just a brief introduction to get the ball rolling.


Raw data:

1	可惜 关门 了 关门 了 啥时候 再 开 家
0	没 去过 不知道 不知道 为什么 点评
0	还 不错 价格 搬 哈哈哈 哈哈哈 哈哈哈 哈哈哈 哈哈哈
1	已经 关门 了 已经 关门 了 已经 关门 了 已经 关门 了 已经 关门 了 已经 关门 了
1	口味 好 服务 也好 经常 来吃 可惜 现在 好像 倒闭 了
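
The scripts below assume a vocabulary file tensorflow/vocab.txt with one token per line, where index 0 is reserved for padding and a _UNK entry covers unknown words. The original build_vocab helper is not included in this post; a minimal sketch of how such a file could be produced from the raw data above is given here (build_vocab_file and min_count are illustrative names, not the author's code):

import codecs
from collections import Counter

def build_vocab_file(rawpath, vocabpath, min_count=1):
    # Count whitespace-separated tokens from the labelled raw file
    counter = Counter()
    with codecs.open(rawpath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                counter.update(parts[1].split(" "))
    with codecs.open(vocabpath, 'w', encoding='utf-8') as out:
        # Reserve index 0 for padding and add an _UNK entry for unseen tokens
        out.write("_PAD\n_UNK\n")
        for word, count in counter.most_common():
            if count >= min_count:
                out.write(word + "\n")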

Converting to TFRecords:

import tensorflow.contrib.keras as kr
import codecs
import tensorflow as tf
def writerecord(savepath, vocabpath, filename, maxlen=50):
    """Convert the raw labelled text file into a TFRecord file."""
    writer = tf.python_io.TFRecordWriter(savepath)
    _, word_to_id = _read_vocab(vocabpath)
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            try:
                label, content = line.strip().split('\t')
                content_list = content.split(" ")
                label = int(label)
                label_onehot = [0, 0]
                label_onehot[label] = 1

                # Map tokens to ids, falling back to the _UNK id for unknown words
                features = [word_to_id[x] if x in word_to_id else word_to_id["_UNK"] for x in content_list]

                # Pad or truncate to a fixed length of maxlen
                if len(features) >= maxlen:
                    x_pad = features[0:maxlen]
                else:
                    x_pad = features + [0] * (maxlen - len(features))
                example = tf.train.Example(
                    features=tf.train.Features(
                        feature={'input': tf.train.Feature(int64_list=tf.train.Int64List(value=x_pad)),
                                 'lable': tf.train.Feature(int64_list=tf.train.Int64List(value=label_onehot))}))
                serialized = example.SerializeToString()
                writer.write(serialized)
            except Exception as e:
                print(e)
    writer.close()
    print("finish")


def read_and_decode(filename_queue):
    # Create a reader to read serialized examples from the TFRecord file
    reader = tf.TFRecordReader()
    # Read one example from the queue
    _, serialized_example = reader.read(filename_queue)
    # Parse the example into fixed-length feature tensors
    features = tf.parse_single_example(serialized_example, features={
        'input': tf.FixedLenFeature([50], tf.int64),
        'lable': tf.FixedLenFeature([2], tf.int64)
    })

    x = tf.cast(features['input'],  tf.int32)
    y = tf.cast(features['lable'], tf.float32)

    return x,y

"""
inputs获批量的数据,其中这里num_epochs通常设置为None,
如果使用的会报错
"""

def inputs(file, batch_size, num_epochs):
    if not num_epochs:
        num_epochs = None
    # num_epochs is intentionally not passed to string_input_producer (see the note above)
    filename_queue = tf.train.string_input_producer([file])
    x_single, y_single = read_and_decode(filename_queue)
    x, y = tf.train.shuffle_batch([x_single, y_single], batch_size=batch_size, capacity=300, min_after_dequeue=10)

    return x, y
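
# If you do want the pipeline to stop after a fixed number of epochs, local
# variables must be initialized and tf.errors.OutOfRangeError caught in the
# training loop. A minimal sketch (illustrative, not the original code):
def inputs_with_epochs(file, batch_size, num_epochs):
    # num_epochs creates a local "epochs" counter variable inside the queue
    filename_queue = tf.train.string_input_producer([file], num_epochs=num_epochs)
    x_single, y_single = read_and_decode(filename_queue)
    return tf.train.shuffle_batch([x_single, y_single], batch_size=batch_size,
                                  capacity=300, min_after_dequeue=10)

# Usage sketch:
#   x, y = inputs_with_epochs("tensorflow/tf.records", batch_size=30, num_epochs=3)
#   sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
#   ... start the queue runners and stop the loop on tf.errors.OutOfRangeError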


def getvocablen(filename):
    words = list(map(lambda line: line.strip(), codecs.open(filename, 'r', encoding='utf-8').readlines()))
    return len(words)



def _read_vocab(filename):
    """Read the vocabulary list and build a word-to-id mapping."""
    words = list(map(lambda line: line.strip(), codecs.open(filename, 'r', encoding='utf-8').readlines()))
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def  file_to_ids_single(content,word_to_id,maxlen=50):
    contents=[]
    contents.append(list(content.lower()))
    data_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
    #print("data_id is :",data_id)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, maxlen=maxlen,padding='post',
                  truncating='post')

    return x_pad

if __name__=='__main__':
    """data_id is : [[266, 1548, 255]]"""
    words, word_to_id = _read_vocab("tensorflow/vocab.txt")
    print("len word_to_id:",len(word_to_id))
    result=file_to_ids_single("准备一个小时",word_to_id=word_to_id)
    print(result[0][49])
    print(result)
    #build_vocab(Path.baseabusepath)

    writerecord("tensorflow/tf.records","tensorflow/vocab.txt","tensorflow/cnn.txt")


    # x_train, y_train, words = preocess_file()
    # print(x_train.shape, y_train.shape)

Model class:

import tensorflow  as tf
class TextCNNMulFilterSize(object):
    def __init__( self, config,input_x,label,keep_prob):
        self.config=config
        self.input_x =input_x
        self.input_y = label
        self.keep_prob =keep_prob
        self.filter_sizes=list(map(int, self.config.multi_kernel_size.split(",")))
        self.cnn()

    def input_embedding(self):
        """Word embedding layer.
        The GPU device pinning is removed here: the model is deployed on CPU,
        and a graph pinned to a GPU would fail at serving time.
        """
        # with tf.device('/gpu:0'):
        embedding = tf.get_variable("embedding", [self.config.vocab_size, self.config.embedding_dim])
        _input = tf.nn.embedding_lookup(embedding, self.input_x)
        _input_expanded = tf.expand_dims(_input, -1)
        return _input_expanded
    def cnn(self):
        l2_loss = tf.constant(0.0)
        pooled_outputs = []
        embedding_inputs = self.input_embedding()
        print(embedding_inputs.get_shape())  # static shape of the embedded input
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, self.config.embedding_dim, 1, self.config.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[self.config.num_filters]), name="b")
                conv = tf.nn.conv2d( embedding_inputs, W,strides=[1, 1, 1, 1],padding="VALID",name="conv")

                tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(W), tf.GraphKeys.REGULARIZATION_LOSSES)
                tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(b), tf.GraphKeys.REGULARIZATION_LOSSES)

                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool( h,ksize=[1, self.config.seq_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],padding='VALID',name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total =(self.config.num_filters) * len(self.filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # Apply dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.keep_prob)
        with tf.name_scope("score"):
            W = tf.get_variable("W",
                shape=[num_filters_total, self.config.num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
            # l2_loss += tf.nn.l2_loss(W)
            # l2_loss += tf.nn.l2_loss(b)
            tf.losses.add_loss(self.config.l2_reg_lambda*tf.nn.l2_loss(W), tf.GraphKeys.REGULARIZATION_LOSSES)
            tf.losses.add_loss(self.config.l2_reg_lambda*tf.nn.l2_loss(b),tf.GraphKeys.REGULARIZATION_LOSSES)
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logits")
            self.pred_y=tf.nn.softmax(self.logits,name="pred_y")
            tf.add_to_collection('pred_network', self.pred_y)
            self.predictions = tf.argmax(self.pred_y, 1, name="predictions")
        with tf.name_scope("loss"):
            tf.losses.softmax_cross_entropy(logits=self.logits,onehot_labels=self.input_y)
            self.loss=tf.losses.get_total_loss()
            # losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            # self.loss = tf.reduce_mean(losses) + self.config.l2_reg_lambda *l2_loss
        with tf.name_scope("optimize"):
            # Optimizer
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.config.learning_rate)
            self.optim = optimizer.minimize(self.loss)
        with tf.name_scope("accuracy"):
            correct_pred = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_pred, "float"), name="accuracy")
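
The TCNNConfig class imported from configuration is not shown in this post. Based on the fields the model and training code reference, it might look roughly like the sketch below; the concrete values are illustrative guesses, not the original configuration.

class TCNNConfig(object):
    """Hyperparameters referenced by TextCNNMulFilterSize and the training script."""
    embedding_dim = 128          # dimension of the word embeddings
    seq_length = 50              # padded sequence length, matching writerecord's maxlen
    num_classes = 2              # binary classification (one-hot labels of length 2)
    num_filters = 128            # number of filters per kernel size
    multi_kernel_size = "2,3,4"  # comma-separated convolution kernel sizes
    is_multi_kernel_size = True  # use the multi-kernel-size model
    vocab_size = 5000            # overwritten at runtime from vocab.txt
    l2_reg_lambda = 0.01         # L2 regularization weight
    learning_rate = 1e-3         # Adam learning rate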

Model training and saving:

from  model import TextCNNMulFilterSize
from  configuration import TCNNConfig
from  data_utils import inputs,getvocablen
import time
import tensorflow as tf
import os
from  datetime  import timedelta
#basepath="/Users/shuubiasahi/Documents/python"
basepath="/home/zhoumeixu"
data_path=basepath+"/credit-tftextclassify-poi/tensorflow/tf.records"
vocapath=basepath+"/credit-tftextclassify-poi/tensorflow/vocab.txt"
modelpath=basepath+"/credit-tftextclassify-poi/tensorflow/"

print(modelpath, "starting POI classification model training")
def run_epoch():
    # Load the data
    print('Loading data...')
    vocablen=getvocablen(vocapath)
    x_train, y_train=inputs(data_path,batch_size=30,num_epochs=3)
    keep_prob=tf.constant(0.9,dtype=tf.float32)
    print('Using CNN model...')
    config = TCNNConfig()
    config.vocab_size = vocablen
    print("vocab_size is:",config.vocab_size)
    if config.is_multi_kernel_size:
        model = TextCNNMulFilterSize(config,x_train,y_train,keep_prob)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver=tf.train.Saver()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=session)
    start_time = time.time()
    for i in range(1000):
        _, loss_train, acc_train = session.run([model.optim, model.loss, model.acc])
        if i%25==0:
            end_time = time.time()
            time_dif = end_time - start_time
            time_dif = timedelta(seconds=int(round(time_dif)))

            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'\
                + '  Time: {3}'
            print(msg.format(i + 1, loss_train, acc_train,time_dif))
            start_time = time.time()
        if i%100==0:
            saver.save(session, "/tmp/model/model.ckpt", global_step=i)
    coord.request_stop()
    coord.join(threads)
    session.close()
if __name__ == '__main__':
     run_epoch()


Converting to a pb-format model for prediction:

from  model import TextCNNMulFilterSize
from  configuration import TCNNConfig
from  data_utils import inputs,getvocablen
import time
import tensorflow as tf
#basepath="/Users/shuubiasahi/Documents/python"
basepath="/home/zhoumeixu"
data_path=basepath+"/credit-tftextclassify-poi/tensorflow/tf.records"
vocapath=basepath+"/credit-tftextclassify-poi/tensorflow/vocab.txt"
modelpath=basepath+"/credit-tftextclassify-poi/tensorflow/"
def run_epoch():
    # Rebuild the graph with placeholders for export
    graph = tf.Graph()  # after 1.4.1 you can also use tf.get_default_graph()

    vocablen=getvocablen(vocapath)
    with graph.as_default():
        x_train= tf.placeholder(tf.int32, [None, None], name="input_x")
        y_train = tf.placeholder(tf.float32, [None, None], name="input_y")
        keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        config = TCNNConfig()
        config.vocab_size = vocablen
        print("vocab_size is:",config.vocab_size)
        model = TextCNNMulFilterSize(config,x_train,y_train,keep_prob)
        output=model.pred_y

        restore_saver = tf.train.Saver()
    with tf.Session(graph=graph) as  sess:
        sess.run(tf.global_variables_initializer())
        latest_ckpt = tf.train.latest_checkpoint("/tmp/model")
        print(latest_ckpt)
        print("keep_prob is:",sess.run(model.keep_prob,feed_dict={model.keep_prob:1.0}))
        restore_saver.restore(sess, latest_ckpt)
        output_graph_def = tf.graph_util.\
            convert_variables_to_constants(sess, sess.graph_def, [x_train.op.name, keep_prob.op.name, output.op.name])

        tf.train.write_graph(output_graph_def, '.', "./tensorflow/graph.model", as_text=False)
if __name__ == '__main__':
     run_epoch()
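
Before handing the pb file to the Java side, the frozen graph can be checked from Python by importing the GraphDef and feeding a padded id sequence. A minimal sketch (not part of the original post); the tensor names input_x:0, keep_prob:0 and score/pred_y:0 follow the graph built above:

import tensorflow as tf
from data_utils import _read_vocab, file_to_ids_single

def predict_with_pb(pb_path, text, vocabpath):
    # Load the frozen GraphDef exported by tf.train.write_graph above
    with tf.gfile.GFile(pb_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def, name="")
    _, word_to_id = _read_vocab(vocabpath)
    x_pad = file_to_ids_single(text, word_to_id)
    with tf.Session(graph=graph) as sess:
        return sess.run("score/pred_y:0",
                        feed_dict={"input_x:0": x_pad, "keep_prob:0": 1.0})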


Java API code:

int[][] arr = gettexttoidnews(text, map);
Tensor input = Tensor.create(arr);
Tensor result = sess.runner().feed("input_x", input).feed("keep_prob", keep_prob).fetch("score/pred_y").run()
        .get(0);

long[] rshape = result.shape();
int nlabels = (int) rshape[1];
int batchSize = (int) rshape[0];

float[][] logits = result.copyTo(new float[batchSize][nlabels]);

if (nlabels > 1 && batchSize > 0) {
    return logits[0][1];
}

The steps above are roughly the standard workflow on TensorFlow today, from training a model through to deployment. Contact me if you run into problems.


Reposted from blog.csdn.net/luoyexuge/article/details/79895675