tensorflow2.x中的量化感知训练以及tflite的x86端测评

tensorflow 2.x的模型训练结束后一般保存为.h5或save_model的模型格式(只能是fp32格式),如果模型想要部署在移动端,一般需要将模型转换为.tflite格式,这里又分为动态格式、fp32格式、fp16格式和int8格式的tflite【1】。一般而言,如果直接将fp32的h5转换为int8格式的tflite会存在精度下降,为了尽量减少转换过程中的精度下降,可选的方式是在转换前对模型进行量化感知训练。具体办法是在权重保持fp32的情况下,按照int8量化的需求进行训练,之后再正式进行int8量化,这样可以减少int8的量化掉点。具体int8的量化原理这里不讲,下面是量化感知代码,分别比较了量化感知下fp32的h5、量化感知下fp32的tflite、量化感知下int8的tflite、非量化感知下fp32的h5、非量化感知下fp32的tflite、非量化感知下int8的tflite这六种情况下的精度。

量化感知训练

# 实现h5文件转换为tflite文件,可以是fp32或int8,详解参见 https://zhuanlan.zhihu.com/p/165670135
import os
import time
import random
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import tensorflow_model_optimization as tfmot
from tensorflow.keras.optimizers import Adam

############################################################################################################
## 0. Parameter settings ###################################################################################
############################################################################################################

IMG_SIZE = (128, 128)
BATCH_SIZE = 32
q_epochs = 1              # number of quantization-aware fine-tuning epochs
learning_rate = 0.00001   # learning rate for quantization-aware fine-tuning
num_test = 10             # tflite inference on x86 CPU is slow, so cap the number of
                          # test batches (around 30 batches / ~1000 images is plenty)
train_dir = "./train"
test_dir = "./test"
weight_path = "./XXX.h5"  # FIX: original read `./"XXX.h5"`, which is a syntax error
output_path = "./tflite/"


############################################################################################################
## 1. Data loading and preprocessing #######################################################################
############################################################################################################
# Placeholders: replace with real tf.data.Dataset pipelines before running.
train_dataset = ""        # tf.data.Dataset
test_dataset = ""         # tf.data.Dataset


############################################################################################################
## 2. Quantization-aware training ##########################################################################
############################################################################################################

# Load the pre-trained fp32 model and record its baseline accuracy.
model = tf.keras.models.load_model(weight_path)
loss0, accuracy0 = model.evaluate(test_dataset)

# Quantization-aware training (QAT) prepares the model for int8 quantization;
# for fp16 deployment the analogue would be mixed-precision training.
# NOTE(review): the current tfmot quantize_model does not support nested models —
# if model0 chains model1 and model2, quantize_model(model0) raises; quantize
# each sub-model separately and re-assemble them afterwards.
quantize_model = tfmot.quantization.keras.quantize_model
q_aware_model = quantize_model(model)

# The wrapped model must be recompiled before it can be trained.
q_aware_model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
q_aware_model.summary()

# Fine-tune with fake-quantization nodes in place.
history_q = q_aware_model.fit(train_dataset, epochs=q_epochs)

# Accuracy of the quantization-aware model (weights are still fp32 here).
loss1, accuracy1 = q_aware_model.evaluate(test_dataset)

############################################################################################################
## 3. 转换为tflite文件 #######################################################################################
############################################################################################################

# Representative dataset for full-integer quantization: yields one-image float32
# batches so the converter can calibrate activation ranges.
def representative_dataset():
    for images, _ in train_dataset.take(32):
        # FIX: iterate over the actual batch length instead of range(BATCH_SIZE) —
        # the final batch may hold fewer images, and indexing past it would raise.
        for i in range(len(images)):
            image = np.expand_dims(images[i].numpy(), axis=0).astype(np.float32)
            yield [image]

# Evaluate a converted tflite model on the (module-level) test_dataset.
#
# Args:
#     tflite_model: serialized tflite flatbuffer (bytes) from a TFLiteConverter.
#     tfl_int8: True if the model has an int8 input/output interface (inputs are
#         quantized on the way in, outputs dequantized on the way out);
#         False for a float32 interface.
#
# Returns:
#     (accuracy, seconds_per_image) measured over at most `num_test` batches.
def evaluate_model(tflite_model, tfl_int8):
    # Build an interpreter around the in-memory model.
    interpreter = tf.lite.Interpreter(model_content=tflite_model)
    interpreter.allocate_tensors()
    input_details  = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_index  = input_details[0]["index"]
    output_index = output_details[0]["index"]
    # For a float model 'quantization' is (0.0, 0) and is never used below.
    scale_in, zero_point_in = input_details[0]['quantization']
    scale_out, zero_point_out = output_details[0]['quantization']

    # Iterate over the test set.
    prediction_labels = []
    test_labels = []
    num = min(num_test, len(test_dataset))
    n = 0
    # FIX: the original passed total=num*BATCH_SIZE, which overstates the total
    # when the last batch is smaller than BATCH_SIZE; with no fixed total, tqdm
    # simply counts the images actually processed.
    pbar = tqdm()
    pbar.set_description("Processing int8" if tfl_int8 else "Processing fp32")
    start = time.time()
    for test_images, labels in test_dataset.take(num):
        for i in range(len(test_images)):
            if tfl_int8:
                # Quantize: q = real / scale + zero_point.
                test_image = test_images[i] / scale_in + zero_point_in
                test_image = np.expand_dims(test_image.numpy(), axis=0).astype(np.int8)
            else:
                test_image = np.expand_dims(test_images[i].numpy(), axis=0).astype(np.float32)
            interpreter.set_tensor(input_index, test_image)
            interpreter.invoke()
            output = interpreter.get_tensor(output_index)
            if tfl_int8:
                # Dequantize: real = (q - zero_point) * scale.
                output = output.astype(np.float32)
                output = (output - zero_point_out) * scale_out
            digit = np.argmax(output[0])
            prediction_labels.append(digit)
            test_labels.append(labels[i].numpy())
            n += 1
            pbar.update(1)
    end = time.time()
    pbar.close()  # FIX: the progress bar was never closed
    prediction_labels = np.array(prediction_labels)
    test_labels = np.array(test_labels)
    accuracy = (prediction_labels == test_labels).mean()
    # FIX: guard against an empty dataset to avoid ZeroDivisionError.
    return accuracy, (end - start) / max(n, 1)

# -1 QAT model -> dynamic-range tflite (kept for reference) -----------------------------------------------
# converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)      # load model
# converter.optimizations = [tf.lite.Optimize.DEFAULT]                     # optimization config
# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]   # supported op set
# quantized_tflite_model = converter.convert()                             # convert

# 0 QAT model -> float32 tflite ---------------------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)      # load model
quantized_tflite_model = converter.convert()                             # convert

# Evaluate the fp32 tflite, then save it with the accuracy embedded in the file name.
accuracy2, time2 = evaluate_model(quantized_tflite_model, False)
# FIX: rstrip(".h5") strips any trailing '.', 'h', '5' characters (e.g. "epoch5.h5"
# -> "epoc"); os.path.splitext removes exactly the extension.
base_path = output_path + os.path.splitext(os.path.basename(weight_path))[0]
output_model = "{}_{:.2f}_fp32.tflite".format(base_path, accuracy2)
with open(output_model, 'wb') as f:
    f.write(quantized_tflite_model)

# 1 QAT model -> full-integer int8 tflite ------------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(q_aware_model)      # load model
converter.optimizations = [tf.lite.Optimize.DEFAULT]                     # optimization config
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]  # int8 builtins only
# Without the next two lines the converted model keeps a fp32 interface and
# quantizes internally; setting them makes the model's I/O itself int8.
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
converter.representative_dataset = representative_dataset                # calibration data for int8
converter.allow_custom_ops = False
converter.experimental_new_converter = True
converter.experimental_new_quantizer = True
quantized_tflite_model = converter.convert()

# Evaluate the int8 tflite, then save it with the accuracy embedded in the file name.
accuracy3, time3 = evaluate_model(quantized_tflite_model, True)
# FIX: rstrip(".h5") strips trailing '.', 'h', '5' characters rather than the
# suffix; os.path.splitext removes exactly the extension.
base_path = output_path + os.path.splitext(os.path.basename(weight_path))[0]
output_model = "{}_{:.2f}_int8.tflite".format(base_path, accuracy3)
with open(output_model, 'wb') as f:
    f.write(quantized_tflite_model)

# 2 Non-QAT model -> float32 tflite ------------------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Accuracy of the plain fp32 tflite (baseline without quantization-aware training).
accuracy4, time4 = evaluate_model(tflite_model, False)

# 3 Non-QAT model -> full-integer int8 tflite --------------------------------------------------------------
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# int8 I/O interface; omitting these two lines keeps fp32 in/out with internal int8.
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
converter.representative_dataset = representative_dataset  # calibration data for int8
converter.allow_custom_ops = False
converter.experimental_new_converter = True
converter.experimental_new_quantizer = True
tflite_model = converter.convert()

# Accuracy of the int8 tflite without quantization-aware training.
accuracy5, time5 = evaluate_model(tflite_model, True)

# Summary of all six measurements.
print(f"initial accuracy: {accuracy0:.2f}")
print(f"initial accuracy fp32: {accuracy4:.2f} speed: {time4:.3f}s")
print(f"initial accuracy int8: {accuracy5:.2f} speed: {time5:.3f}s")
print(f"quantized accuracy: {accuracy1:.2f}")
print(f"quantized accuracy fp32: {accuracy2:.2f} speed: {time2:.3f}s")
print(f"quantized accuracy int8: {accuracy3:.2f} speed: {time3:.3f}s")

tflite的x86端测评

在得到tflite后,往往需要在x86端进行测评,可是当前tflite在x86端只能运行在CPU上,所以速度很慢,而深度学习的数据集往往很大,如果只是测评一个acc就需要一天时间,这就太不合适了,上面代码针对int8的tflite的测评是只选取了一小部分测试集,下面代码通过多进程实现了针对整体数据集的快速测评,且完成了acc,recall,precision和混淆矩阵的计算。

# 计算tflite模型的acc和混淆矩阵,多进程多batch
import os
import cv2
import numpy as np
import tensorflow as tf
from multiprocessing import Process, Queue

# Letterbox-resize `image` to (target_length, target_length): scale to fit while
# keeping the aspect ratio, pad the border with `value`, then normalize.
#
# Args:
#     image: HxWx3 array (channel order irrelevant here; only the shape is used).
#     target_length: output side length in pixels.
#     value: fill value for the padded border.
#     method: 0 -> divide by 255 (range [0,1]); 1 -> /127.5 - 1 (range [-1,1]);
#             anything else returns the unnormalized padded image.
#
# Returns:
#     (target_length, target_length, 3) float32 array.
def image_preprocess(image, target_length, value=0.0, method=0):
    image = image.astype("float32")
    h, w, _ = image.shape                               # original size
    ih, iw = target_length, target_length               # target size
    scale = min(iw / w, ih / h)                         # aspect-preserving scale
    nw, nh = int(scale * w), int(scale * h)             # scaled size
    image_resized = cv2.resize(image, (nw, nh))
    # FIX: np.full defaults to float64; pin float32 so the output dtype matches
    # the float32 input instead of silently upcasting.
    image_paded = np.full(shape=[ih, iw, 3], fill_value=value, dtype=np.float32)
    dw, dh = (iw - nw) // 2, (ih - nh) // 2
    image_paded[dh:nh + dh, dw:nw + dw, :] = image_resized  # paste centered
    if method == 0:
        image_paded = image_paded / 255.                # normalize to [0, 1]
    elif method == 1:
        image_paded = image_paded / 127.5 - 1.0         # standardize to [-1, 1]
    return image_paded

# Worker-process entry point: run tflite inference over `image_list` in batches
# and push per-class statistics onto queue `q`.
#
# Args:
#     q: multiprocessing.Queue used to return [num_prs, num_rec, num_mat].
#     interpreter: tf.lite.Interpreter already resized to batch-sized input.
#     image_path list is assumed to use '/' separators with the class-name
#     directory as the parent of each file (see label extraction below).
#     label_name: ordered class names; index = class id.
#     batch_size: images per interpreter invocation.
#     tfl_int8: True if the model has an int8 input/output interface.
#     last_size_b: number of genuine images in the globally-last batch (0 when
#         the dataset divided evenly); padded duplicates beyond it are skipped.
#     num_core: total number of workers (padding only affects worker num_core-1).
#     n: this worker's index.
def fun(q, interpreter, image_list, label_name, batch_size, tfl_int8, last_size_b, num_core, n):
    # Unpack the interpreter's I/O details; for a float model 'quantization'
    # is (0.0, 0) and the int8 branches below are never taken.
    input_details  = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_index  = input_details[0]["index"]
    output_index = output_details[0]["index"]
    scale_in, zero_point_in = input_details[0]['quantization']
    scale_out, zero_point_out = output_details[0]['quantization']
    # Accumulators; the tiny epsilon keeps later precision/recall divisions
    # well-defined for classes that never occur.
    num_i = 0  # images completed so far (advances a whole batch at a time)
    num_j = 0  # position within the current batch, cycles 0..batch_size-1
    num_total = len(image_list)
    num_prs = np.array([0.000000001 for _ in range(len(label_name))])
    num_rec = np.array([0.000000001 for _ in range(len(label_name))])
    num_mat = np.array([[0] * len(label_name) for _ in range(len(label_name))])
    # Process the assigned paths one batch at a time.
    for image_path in image_list:
        if num_j == 0:
            # Start collecting a fresh batch.
            images_list = []
            images_label = []
        image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
        image = image_preprocess(image, 128, 0, 1)
        images_list.append(image)
        # Ground-truth label is the parent directory name of the image path.
        images_label.append(label_name.index(image_path.split("/")[-2]))
        if num_j == batch_size - 1:
            # Batch complete: run inference.
            images = np.array(images_list)
            if tfl_int8:
                # Quantize inputs: q = real / scale + zero_point.
                images = images / scale_in + zero_point_in
                images = images.astype(np.int8)
            else:
                images = images.astype(np.float32)
            interpreter.set_tensor(input_index, images)
            interpreter.invoke()
            output = interpreter.get_tensor(output_index)
            if tfl_int8:
                # Dequantize outputs: real = (q - zero_point) * scale.
                output = output.astype(np.float32)
                output = (output - zero_point_out) * scale_out
            # Only the last worker's final batch may contain padded duplicates;
            # score just the first last_size_b results in that case.
            if n == num_core-1 and num_i == num_total-batch_size and last_size_b != 0:
                num = last_size_b
            else:
                num = batch_size
            for k in range(num):
                digit = np.argmax(output[k])
                num_prs[int(digit)] += 1                 # predicted-class count
                num_rec[images_label[k]] += 1            # ground-truth-class count
                num_mat[images_label[k], int(digit)] += 1  # confusion matrix cell
            num_i += batch_size
            # Progress report every 4 batches.
            if num_i%(4*batch_size) == 0:
                print("Process %0.3d : finish %0.4d | total %0.4d | percentage %0.2f%%" % (n, num_i, num_total, num_i/num_total*100))
        # Cycle num_j through [0 .. batch_size-1].
        if num_j == batch_size - 1:
            num_j = 0
        else:
            num_j += 1
    # Hand the accumulated statistics back to the parent process.
    q.put([num_prs, num_rec, num_mat])


# Configuration --------------------------------------------------------------
image_size = (128, 128)
batch_size = 32
num_core = 16       # number of worker processes (CPU cores)
tfl_int8 = False    # True when evaluating an int8-quantized tflite
model_path = "./xxx.tflite"
path = "./data/cats_and_dogs_filtered/validation/"  # dataset root
label_name = ['cats', 'dogs']                       # must match the model's label order
# Global accumulators; the epsilon keeps later divisions well-defined.
num_prs = np.full(len(label_name), 0.000000001)
num_rec = np.full(len(label_name), 0.000000001)
num_mat = np.zeros((len(label_name), len(label_name)), dtype=int)
q = Queue()

# Load the tflite model and resize its input so one invoke handles a whole batch.
interpreter = tf.lite.Interpreter(model_path)
input_index = interpreter.get_input_details()[0]["index"]
interpreter.resize_tensor_input(input_index, [batch_size, image_size[0], image_size[1], 3], strict=True)
interpreter.allocate_tensors()

# Walk one sub-directory per class label and collect every image path.
image_list = []
for name in label_name:
    class_dir = path + name + "/"
    image_list.extend(class_dir + fname for fname in os.listdir(class_dir))
print("finish read image %d" % len(image_list))

# Split the work across processes as evenly as possible.
# Step 1: pad image_list (re-using images from its front) so its length is an
# exact multiple of batch_size; last_size_b remembers how many genuine images
# the final batch holds so workers can skip the padding when scoring.
last_size_b = len(image_list) % batch_size
if last_size_b != 0:
    image_list.extend(image_list[:batch_size - last_size_b])
# Step 2: every core gets `delta` batches; the first `last_size_c` cores get
# one extra batch. FIX: use integer arithmetic throughout — the original used
# float division (len/batch_size) and relied on exact float representation.
num_batches = len(image_list) // batch_size
delta = num_batches // num_core
last_size_c = num_batches % num_core
assert delta > 0
image_index = [0]  # image_index[i]:image_index[i+1] is worker i's slice
for i in range(num_core):
    step = delta + 1 if i < last_size_c else delta
    image_index.append(image_index[-1] + step * batch_size)

# Launch one worker per core; each gets its contiguous slice of image_list.
process_list = []
for i in range(num_core):
    p = Process(target=fun, args=(q, interpreter, image_list[image_index[i]:image_index[i+1]], label_name, batch_size, tfl_int8, last_size_b, num_core, i))
    p.start()
    process_list.append(p)

# FIX: drain the result queue BEFORE joining the workers. A child process does
# not exit until everything it put on the Queue has been flushed to the pipe,
# so calling join() first can deadlock when the queued arrays are large (see
# "Joining processes that use queues" in the multiprocessing documentation).
for i in range(num_core):
    result = q.get()
    num_prs += result[0]
    num_rec += result[1]
    num_mat += result[2]

# All results collected: the workers can now be reaped safely.
for p in process_list:
    p.join()

# Overall accuracy: confusion-matrix trace over the total count, truncated
# (not rounded) to two decimal places.
correct = sum(num_mat[i, i] for i in range(len(label_name)))
total_pred = sum(num_prs[i] for i in range(len(label_name)))
total_true = sum(num_rec[i] for i in range(len(label_name)))
# Prediction and ground-truth totals must agree (up to the epsilon seeds).
assert int(total_pred) == int(total_true)
accuracy = (correct / total_pred * 100 * 100 // 1) / 100
print("total accuracy: " + str(accuracy))

# Per-class precision: diagonal cell / predicted count, truncated to 2 dp.
result_prs = {}
for i, name in enumerate(label_name):
    p = float(num_mat[i, i]) / float(num_prs[i]) * 100
    result_prs[name] = (p * 100 // 1) / 100
print("every precision: " + str(result_prs))

# Per-class recall: diagonal cell / ground-truth count, truncated to 2 dp.
result_rec = {}
for i, name in enumerate(label_name):
    r = float(num_mat[i, i]) / float(num_rec[i]) * 100
    result_rec[name] = (r * 100 // 1) / 100
print("every recall:   " + str(result_rec))

# Confusion matrix: rows = ground truth, columns = predictions.
print(label_name)
print(num_mat)




0. TensorFlow Lite量化方法介绍

猜你喜欢

转载自blog.csdn.net/BIT_Legend/article/details/122266428