[Keras Model Quantization] Fake Quant (tf.quantization)

Fake Quant Overview

Why quantize models: Deep learning models are typically trained with floating-point data, but they can be quantized to integers during inference without any loss of performance (i.e. accuracy).

What gets quantized: Quantizing a model includes quantizing both the weights and the activation data (i.e. the layer inputs/outputs).

How to quantize: In this work, the floating-point weights/activations are quantized to the Qm.n fixed-point format, where m and n are fixed within a layer but can vary across network layers. (Qm.n uses 1 sign bit, m integer bits and n fractional bits, so m + n = 7 for an 8-bit value; e.g. Q2.5 has a resolution of 2^-5.)

Fake quant is called "fake" quantization because, although the weights/activations are quantized, it is not quantization in the strict sense: the variables are still stored as floating point rather than integers. Their values, however, have passed through a quantize operation followed by a dequantize operation. The reason for quantizing this way is that it is simple and convenient, and the model can later be ported to C and implemented in fixed point (see the Qm.n format).
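
To make the "quantize, then dequantize, but stay in float" idea concrete, here is a minimal sketch using the tf.quantization.fake_quant_with_min_max_args op from reference [3]; the tensor values and the symmetric range of roughly [-2, 2) are made up for illustration:

import tensorflow as tf

# Minimal illustration of fake quantization: the output dtype is still float32,
# but every value has been snapped to one of 2^8 levels inside [min, max]
# (values outside the range are clipped).
x = tf.constant([-1.3, -0.02, 0.4, 2.7], dtype=tf.float32)
y = tf.quantization.fake_quant_with_min_max_args(
        x, min=-2.0, max=2.0 - 2.0 / 128.0, num_bits=8)
print(y.dtype)    # float32
print(y.numpy())  # values rounded onto a grid with step 2/128; 2.7 is clipped to the max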

References

[1] Arm: ML-KWS-for-MCU

GitHub: https://github.com/ARM-software/ML-KWS-for-MCU

Quant guide: https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Quant_guide.md

[2] DeepXi

GitHub: https://github.com/anicolson/DeepXi

[3] tf.quantization

TensorFlow 2.3:

tf.quantization: https://tensorflow.google.cn/api_docs/python/tf/quantization

tf.quantization.fake_quant_with_min_max_args: https://tensorflow.google.cn/api_docs/python/tf/quantization/fake_quant_with_min_max_args

TensorFlow 1.15:

tf.quantization: https://tensorflow.google.cn/versions/r1.15/api_docs/python/tf/quantization

[4] Inspecting each layer's inputs/outputs in a Keras model: https://editor.csdn.net/md/?articleId=110677379

Note: everything below is implemented on TensorFlow 2.3.

Quantize weights

Quantizing weights is fairly simple, as the weights are fixed after the training and we know their min/max range. Using these ranges, the weights are quantized or discretized to 256 levels. Here is the code snippet for quantizing the weights and biases to 8-bit integers.

import numpy as np

# 'weight' is a NumPy array holding one layer's weights (or biases)
min_wt = weight.min()
max_wt = weight.max()

# find the number of integer bits needed to represent this range
int_bits = int(np.ceil(np.log2(max(abs(min_wt), abs(max_wt)))))
frac_bits = 7 - int_bits  # remaining bits are fractional bits (1 bit for the sign)

# floating-point weights are scaled and rounded to [-128, 127], which are used in
# the fixed-point operations on the actual hardware (i.e., microcontroller)
quant_weight = np.round(weight * (2 ** frac_bits))

# to quantify the impact of quantized weights, scale them back to the
# original range and run inference with these quantized weights
weight = quant_weight / (2 ** frac_bits)

Practical use:

Take the speech enhancement model DeepXi as an example (https://github.com/anicolson/DeepXi):

# tensorflow 2.3
import tensorflow as tf

def quant_weights_and_biases(deepxi):
    """Fake-quantize every model variable in place to 8-bit Qm.n precision."""
    model_variables = deepxi.model.variables

    for v in model_variables:
        min_value = tf.reduce_min(v)
        max_value = tf.reduce_max(v)
        # number of integer bits needed for this range; the rest are fractional bits
        int_bits = tf.cast(tf.math.ceil(tf.math.log(tf.math.maximum(tf.abs(min_value), tf.abs(max_value))) / tf.math.log(2.0)), dtype=tf.int32)
        dec_bits = tf.math.subtract(7, int_bits)
        # quantize (scale and round), then dequantize back to float32
        new_v = tf.round(tf.math.multiply(v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32)))
        new_v = tf.math.divide(new_v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32))
        v.assign(new_v)

    # test: compare model_variables and quant_model_variables
    quant_model_variables = deepxi.model.variables

    return deepxi

Full code, covering: 1) building the model structure, 2) loading the pretrained model parameters, 3) testing the model, and 4) fake quantization plus a comparison test.

## create model from args:
import os

from deepxi.args import get_args
import numpy as np

from deepxi.network.attention import MHANet, AttentionMaskV2
from deepxi.se_batch import Batch
from deepxi.model import DeepXi
from tensorflow import keras

# model test:
from tqdm import tqdm
from deepxi.utils import read_mat, save_wav
import tensorflow as tf

# class MHANETV2
from tensorflow.keras.layers import Activation, Add, \
	Conv1D, Layer, LayerNormalization, Masking, ReLU

def create_model():
    args = get_args()
    if args.causal:
        args.padding = "causal"
    else:
        args.padding = "same"
    args.model_path = args.model_path + '/' + args.ver  # model save path.
    if args.set_path != "set": args.data_path = args.data_path + '/' + args.set_path.rsplit('/', 1)[-1]  # data path.
    N_d = int(args.f_s * args.T_d * 0.001)  # window duration (samples).
    N_s = int(args.f_s * args.T_s * 0.001)  # window shift (samples).
    K = int(pow(2, np.ceil(np.log2(N_d))))  # number of DFT components.

    test_x, test_x_len, _, test_x_base_names = Batch(args.test_x_path)

    deepxi = DeepXi(
        N_d=N_d,
        N_s=N_s,
        K=K,
        sample_dir=args.data_path,
        train_s_list=None,
        train_d_list=None,
        **vars(args))

    keras.utils.plot_model(deepxi.model, args.ver + '-' + args.network_type + '.png', show_shapes=True)

    return deepxi


def load_variables_for_model(deepxi,
                             savedmodel_path = '../deepxi_data_model_out/saved_model/mhanet-1.0c/epoch-199/variables/variables'):
    # test
    model_variables = deepxi.model.variables

    # load variables
    deepxi.model.load_weights(savedmodel_path)

    # test
    pretrained_model_variables = deepxi.model.variables
    return deepxi

def fake_quant_weights_and_biases(deepxi):
    model_variables = deepxi.model.variables

    for v in model_variables:
        min_value = tf.reduce_min(v)
        max_value = tf.reduce_max(v)
        int_bits = tf.cast(tf.math.ceil(tf.math.log(tf.math.maximum(tf.abs(min_value), tf.abs(max_value))) / tf.math.log(2.0)), dtype=tf.int32)
        dec_bits = tf.math.subtract(7, int_bits)
        new_v = tf.round(tf.math.multiply(v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32)))
        new_v = tf.math.divide(new_v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32))
        v.assign(new_v)

    # test
    quant_model_variables = deepxi.model.variables

    return deepxi

def save_model(deepxi, savedmodel_path='./my_model'):
    deepxi.model.save(savedmodel_path)

def test_model(deepxi, denoise_path='./out'):
    args = get_args()

    out_type = args.out_type    #'y'
    gain = args.gain            #'mmse-lsa'
    e = args.max_epochs         # 200
    out_path = denoise_path + '/' + deepxi.ver + '/e' + str(e) + '/' + out_type + '/' + gain

    # mkdir
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    test_x, test_x_len, _, test_x_base_names = Batch(args.test_x_path)

    print("Processing observations...")
    inp_batch, supplementary_batch, n_frames = deepxi.observation_batch(test_x, test_x_len)

    print("Performing inference...")
    tgt_hat_batch = deepxi.model.predict(inp_batch, batch_size=1, verbose=1)

    print("Saving outputs...")
    batch_size = len(test_x_len)
    for i in tqdm(range(batch_size)):
        base_name = test_x_base_names[i]
        inp = inp_batch[i, :n_frames[i], :]
        tgt_hat = tgt_hat_batch[i, :n_frames[i], :]

        # if tf.is_tensor(supplementary_batch):
        supplementary = supplementary_batch[i, :n_frames[i], :]

        saved_data_path = args.saved_data_path
        if args.saved_data_path is not None:
            saved_data = read_mat(saved_data_path + '/' + base_name + '.mat')
            supplementary = (supplementary, saved_data)

        if out_type == 'y':
            y = deepxi.inp_tgt.enhanced_speech(inp, supplementary, tgt_hat, gain).numpy()
            # stack the normalized noisy input and the enhanced output into a two-channel wav
            x = tf.cast(test_x[i, :test_x_len[i]] / 32768, tf.float32).numpy()
            numsamples = np.min((len(x), len(y)))
            xy = tf.stack((x[:numsamples], y[:numsamples]), axis=-1)
            save_wav(out_path + '/' + base_name + '.wav', xy.numpy(), deepxi.inp_tgt.f_s)
        else:
            raise ValueError('Invalid output type.')


if __name__ == '__main__':

    # # Step 1: create and load pretrained model, and test
    deepxi = create_model()
    pretrained_deepxi = load_variables_for_model(deepxi)
    test_model(pretrained_deepxi, denoise_path='./out/pretrained_model')
    
    # # Step 2: fake-quantize the weights and test again
    # (fake_quant_weights_and_biases modifies deepxi.model in place, so deepxi,
    # pretrained_deepxi and quant_deepxi all refer to the same quantized model)
    quant_deepxi = fake_quant_weights_and_biases(deepxi)
    test_model(quant_deepxi, denoise_path='./out/quant_pretrained_model')

    print('done')

##############################################################################
## args:
# --ver
# mhanet-1.0c
# --network
# MHANetV2
# --d_model
# 256
# --n_blocks
# 5
# --n_heads
# 8
# --warmup_steps
# 40000
# --causal
# 1
# --outp_act
# Sigmoid
# --loss_fnc
# BinaryCrossentropy
# --max_epochs
# 200
# --resume_epoch
# 0
# --test_epoch
# 200
# --mbatch_size
# 4
# --inp_tgt_type
# MagXi
# --map_type
# DBNormalCDF
# --sample_size
# 1000
# --f_s
# 16000
# --T_d
# 32
# --T_s
# 16
# --min_snr
# -10
# --max_snr
# 20
# --snr_inter
# 1
# --out_type
# y
# --save_model
# 1
# --log_iter
# 0
# --eval_example
# 1
# --gain
# mmse-lsa
# --train
# 0
# --infer
# 1
# --test
# 0
# --gpu
# 0
# --set_path
# ../deepxi_dataset/deep_xi_training_set
# --data_path
# ../deepxi_data_model_out/data
# --test_x_path
# ../deepxi_dataset/deepxi_test_set/test_noisy_speech_100
# --test_s_path
# /home/user/tmp/t/Deepxi_data_model_out/test_clean_speech
# --test_d_path
# /home/user/tmp/t/Deepxi_data_model_out/test_noise
# --out_path
# ../deepxi_data_model_out/out
# --model_path
# ../deepxi_data_model_out/saved_model

Quantize activation data

Using a representative dataset: One approach for quantizing the activation data is to run inference on some representative input samples (or, ideally, the entire dataset) and find the min/max range of each layer's input/output. Using these ranges, the activation data can be quantized in the same way as the weights in the code snippet above.

Limitation of a finite dataset: Any outliers in the dataset may widen this range and hurt accuracy, so care must be taken with this approach.

Using fake_quant_with_min_max_args: Another approach is to insert the TensorFlow op fake_quant_with_min_max_args after every operation (convolution, addition, multiplication or concatenation) and find the optimal power-of-two min/max ranges that maximize the accuracy.

This same approach can also be used for quantizing the weights. Furthermore, this modified model with fake_quant_with_min_max_args Op and frozen min,max ranges can be used for retraining/fine-tuning, which may increase the accuracy as the network will adapt to quantization.
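
The guide does not spell out how to search for these power-of-two ranges. As an illustrative sketch only (not from the original post), the helper below picks, per tensor, the power-of-two range whose 8-bit fake quantization has the smallest mean-squared error; the guide itself suggests choosing the range that maximizes end-to-end accuracy, which requires running inference for each candidate. The name best_pow2_range is made up.

import tensorflow as tf

def best_pow2_range(x, k_min=-4, k_max=8, num_bits=8):
    # Try symmetric ranges [-2**k, 2**k) and keep the one whose fake-quantized
    # version of x has the lowest mean-squared error. MSE is only a proxy for
    # the accuracy-based search described in the text above.
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    best_k, best_err = k_min, float("inf")
    for k in range(k_min, k_max + 1):
        r = 2.0 ** k
        step = r / float(2 ** (num_bits - 1))
        y = tf.quantization.fake_quant_with_min_max_args(
            x, min=-r, max=r - step, num_bits=num_bits)
        err = tf.reduce_mean(tf.square(x - y)).numpy()
        if err < best_err:
            best_k, best_err = k, err
    return best_k, best_err

# e.g. best_pow2_range(tf.random.normal([1000])) returns the best k and its error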

Concretely, there are two steps. Step 1: over a given dataset, record the min/max statistics of the activations. Step 2: build the model with fake-quant ops created from those activation maxima, inserted just before each activation (e.g. right before a ReLU layer); the code below uses tf.quantization.fake_quant_with_min_max_vars for this.

An example follows.
Step 1: compute each activation's max value (the larger of |min| and |max| of the input feeding that activation layer), which the code below then rounds up to a power of two. (For inspecting each layer's inputs/outputs in a Keras model, see reference [4]: https://editor.csdn.net/md/?articleId=110677379.)

def generate_activation_max(deepxi, testing_path = '../deepxi_dataset/deepxi_test_set/test_noisy_speech_100'):
    from tensorflow.keras import backend as K
    from deepxi.se_batch import Batch
    from tensorflow import keras
    import numpy as np
    import tensorflow as tf

    inp = deepxi.model.input  # input
    inputs = [layer.input for layer in deepxi.model.layers if (isinstance(layer, keras.layers.ReLU) or isinstance(layer, keras.layers.Activation))]  # activation layer inputs
    outputs = [layer.output for layer in deepxi.model.layers if (isinstance(layer, keras.layers.ReLU) or isinstance(layer, keras.layers.Activation))]
    functors_inp = [K.function([inp], [input]) for input in inputs]
    functors_outp = [K.function([inp], [output]) for output in outputs]

    # Testing
    test_x, test_x_len, _, test_x_base_names = Batch(testing_path)
    print("Processing observations...")
    inp_batch, supplementary_batch, n_frames = deepxi.observation_batch(test_x, test_x_len)

    layer_ins = [func([inp_batch]) for func in functors_inp]
    # layer_outs is collected only for inspection; it is not used below
    layer_outs = [func([inp_batch]) for func in functors_outp]

    # for each activation-layer input, round the max absolute value up to a power of two
    act_max = np.zeros(shape=(len(layer_ins)), dtype=np.int32)
    for layer_id, layer_in in enumerate(layer_ins):
        min_value = tf.reduce_min(layer_in)
        max_value = tf.reduce_max(layer_in)
        int_bits = tf.cast(tf.math.ceil(tf.math.log(tf.math.maximum(tf.abs(min_value), tf.abs(max_value))) / tf.math.log(2.0)), dtype=tf.int32)
        act_max[layer_id] = int(2 ** int(int_bits))

    return act_max
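
A possible usage, assuming the deepxi object built by create_model/load_variables_for_model from the full script above (the print is only for inspection):

deepxi = create_model()
deepxi = load_variables_for_model(deepxi)
act_max = generate_activation_max(deepxi)
print(act_max)  # one power-of-two bound per ReLU/Activation layer, in model order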

Step 2: build a model with fake-quant ops inserted, based on the activation max values (the code below uses tf.quantization.fake_quant_with_min_max_vars).

from tensorflow.keras.layers import Conv1D, ReLU, LayerNormalization, Add, Activation
from deepxi.network.attention import MultiHeadAttention, AttentionMask, AttentionMaskV2
import tensorflow as tf
import tensorflow_addons as tfa

class MHANet_QuantAct:
	"""
	Multi-head attention network.
	"""
	def __init__(
		self,
		inp,
		n_outp,
		d_model,
		n_blocks,
		n_heads,
		warmup_steps,
		causal,
		outp_act,
		# inp_all,
		):
		"""
		Argument/s:
			inp - input placeholder.
			n_outp - number of outputs.
			d_model - model size.
			n_blocks - number of blocks.
			n_heads - number of attention heads.
			warmup_steps - number of warmup steps.
			causal - causal flag.
			outp_act - output activation function.
		"""
		self.n_outp = n_outp
		self.d_model = d_model
		self.n_blocks = n_blocks
		self.n_heads = n_heads
		self.d_ff = d_model*4
		self.warmup_steps = warmup_steps
		self.d_k = self.d_model // self.n_heads

		# if self.inp_all is None:
		# 	self.inp_all = tf.zeros(shape=(None, max_speech_len, 257))
		# else:
		# 	self.inp

		att_mask, seq_mask = AttentionMask(causal, -1.0e9)(inp)

		x = Conv1D(self.d_model, 1, use_bias=False)(inp)
		x = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(x)
		x = ReLU()(x)

		for _ in range(self.n_blocks): x = self.block(x, att_mask, seq_mask)

		self.outp = Conv1D(self.n_outp, 1, use_bias=True)(x)

		if outp_act == "Sigmoid": self.outp = Activation('sigmoid')(self.outp)
		elif outp_act == "ReLU": self.outp = ReLU()(self.outp)
		elif outp_act == "Linear": self.outp = self.outp
		else: raise ValueError("Invalid outp_act")

	def block(self, x, att_mask, seq_mask):
		"""
		MHANet block.

		Argument/s:
			x - input.
			att_mask - attention mask.
			seq_mask - sequence mask.

		Returns:
			layer_2 - output of second layer.
		"""
		layer_1 = MultiHeadAttention(d_model=self.d_model,
			n_heads=self.n_heads)(x, x, x, att_mask, seq_mask)
		layer_1 = Add()([x, layer_1])
		layer_1 = LayerNormalization(axis=2, epsilon=1e-6, center=True,
			scale=True)(layer_1)

		layer_2 = self.feed_forward_network(layer_1)
		layer_2 = Add()([layer_1, layer_2])
		layer_2 = LayerNormalization(axis=2, epsilon=1e-6, center=True,
			scale=True)(layer_2)
		return layer_2

	def feed_forward_network(self, x):
		"""
		Feed forward network.

		Argument/s:
			inp - input placeholder.

		Returns:
			x - output of second feed forward layer.
		"""
		x = Conv1D(self.d_ff, 1, use_bias=True)(x)
		x = ReLU()(x)
		x = Conv1D(self.d_model, 1, use_bias=True)(x)
		return x


class MHANetV2_QuantAct(MHANet_QuantAct):
	"""
	Multi-head attention network implemented using tfa.layers.MultiHeadAttention.
	"""
	def __init__(
		self,
		inp,
		n_outp,
		d_model,
		n_blocks,
		n_heads,
		warmup_steps,
		causal,
		outp_act,
		act_max,
		):
		"""
		Argument/s:
			inp - input placeholder.
			n_outp - number of outputs.
			d_model - model size.
			n_blocks - number of blocks.
			n_heads - number of attention heads.
			warmup_steps - number of warmup steps.
			causal - causal flag.
			outp_act - output activation function.
			act_max - per-activation max values (powers of two) used by the fake-quant ops.
		"""
		self.n_outp = n_outp
		self.d_model = d_model
		self.n_blocks = n_blocks
		self.n_heads = n_heads
		self.d_ff = d_model*4
		self.warmup_steps = warmup_steps
		self.d_k = self.d_model // self.n_heads
		# added by zhaodeng: per-activation max values for the fake-quant ops
		self.act_max = act_max

		att_mask = AttentionMaskV2(causal)(inp)

		x = Conv1D(self.d_model, 1, use_bias=False)(inp)
		x = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(x)
		
		#######################################################################################################
		###################tf.quantization.fake_quant_with_min_max_vars########################################
		relu_layer_no = 0
		if self.act_max[relu_layer_no] > 0:
			x = tf.quantization.fake_quant_with_min_max_vars(x,
															 min=-self.act_max[relu_layer_no],
															 max=self.act_max[relu_layer_no] - (self.act_max[relu_layer_no] / 128.0),
															 num_bits=8)
		relu_layer_no += 1
		#######################################################################################################
		x = ReLU()(x)

		for _ in range(self.n_blocks):
			x = self.block(x, att_mask, relu_layer_no)
			relu_layer_no += 1

		self.outp = Conv1D(self.n_outp, 1, use_bias=True)(x)

		#######################################################################################################
		###################tf.quantization.fake_quant_with_min_max_vars########################################
		if self.act_max[relu_layer_no] > 0:
			self.outp = tf.quantization.fake_quant_with_min_max_vars(self.outp,
																	 min=-self.act_max[relu_layer_no],
																	 max=self.act_max[relu_layer_no] - (
																				 self.act_max[relu_layer_no] / 128.0),
																	 num_bits=8)
		#######################################################################################################

		if outp_act == "Sigmoid": self.outp = Activation('sigmoid')(self.outp)
		elif outp_act == "ReLU": self.outp = ReLU()(self.outp)
		elif outp_act == "Linear": self.outp = self.outp
		else: raise ValueError("Invalid outp_act")


	def block(self, x, att_mask, relu_layer_no):
		"""
		MHANet block.

		Argument/s:
			x - input.
			att_mask - attention mask.
			relu_layer_no - index into act_max for the fake-quant op before this block's ReLU.

		Returns:
			layer_2 - output of second layer.
		"""
		layer_1 = tfa.layers.MultiHeadAttention(
			head_size=self.d_k,
			num_heads=self.n_heads,
			output_size=self.d_model,
			dropout=0.0,
			use_projection_bias=False,
		)([x, x, x, att_mask])
		layer_1 = Add()([x, layer_1])
		layer_1 = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(layer_1)

		layer_2 = self.feed_forward_network(layer_1, relu_layer_no)
		layer_2 = Add()([layer_1, layer_2])
		layer_2 = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(layer_2)
		return layer_2

	def feed_forward_network(self, x, relu_layer_no):
		"""
		Feed forward network.

		Argument/s:
			x - input.
			relu_layer_no - index into act_max for the fake-quant op before the ReLU.

		Returns:
			x - output of second feed forward layer.
		"""
		x = Conv1D(self.d_ff, 1, use_bias=True)(x)
		#######################################################################################################
		###################tf.quantization.fake_quant_with_min_max_vars########################################
		if self.act_max[relu_layer_no] > 0:
			x = tf.quantization.fake_quant_with_min_max_vars(x,
															 min=-self.act_max[relu_layer_no],
															 max=self.act_max[relu_layer_no] - (self.act_max[relu_layer_no] / 128.0),
															 num_bits=8)
		#######################################################################################################
		x = ReLU()(x)
		x = Conv1D(self.d_model, 1, use_bias=True)(x)
		return x

Finally, build the model from act_max and you are done. On top of this activation-quantized model, the weights can additionally be fake-quantized (e.g. with fake_quant_weights_and_biases above).


Reposted from blog.csdn.net/u010637291/article/details/110272797