An Introduction to Fake Quant
Why quantize a model
: Deep learning models are typically trained with floating-point data, but they can be quantized to integers for inference with little or no loss of performance (i.e. accuracy).
What to quantize
: Quantizing a model means quantizing both the weights and the activation data (i.e. the layer inputs/outputs).
How to quantize
: In this work, we quantize the floating-point weights/activation data to the Qm.n fixed-point format (m integer bits, n fractional bits), where m and n are fixed within a layer but can vary across network layers.
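As a small worked example of my own (not from the ARM guide), take an 8-bit Q2.5 layout: 1 sign bit, 2 integer bits, 5 fractional bits. A float x is stored as round(x * 2^5) and recovered by dividing that integer by 2^5:
# Worked Qm.n example (8-bit Q2.5 assumed: 1 sign bit, 2 integer bits, 5 fractional bits)
n = 5                        # fractional bits
x = 1.40625                  # float value to represent
stored = round(x * 2**n)     # 45, fits in a signed 8-bit integer
recovered = stored / 2**n    # 1.40625 (exact here; in general rounded to the nearest 1/2**n)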
Fake quant
It is called fake quantization because, although the weights/activations are quantized, it is not quantization in the strict sense: the variables are still floating point rather than integer, but their values have been passed through a quantize operation followed by a dequantize operation. The reason for quantizing this way is that it is simple and convenient, and the model can later be ported to C for true fixed-point inference (see the Qm.n format above).
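A minimal sketch of that quantize-then-dequantize round trip (a toy example of mine, assuming a Q2.5 layout): the values pass through 8-bit integer levels, but the result stays float32.
import numpy as np

# Toy fake-quant round trip: quantize to 8-bit integer levels, then dequantize immediately.
frac_bits = 5                                          # assume Q2.5
w = np.array([0.1, -1.37, 2.0], dtype=np.float32)
q = np.clip(np.round(w * 2**frac_bits), -128, 127)     # integer levels in [-128, 127]
w_fq = (q / 2**frac_bits).astype(np.float32)           # still float32: [0.09375, -1.375, 2.0]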
References
[1] Arm: ML-KWS-for-MCU
GitHub: https://github.com/ARM-software/ML-KWS-for-MCU
Quant guide: https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Quant_guide.md
[2] DeepXi
GitHub: https://github.com/anicolson/DeepXi
[3] tf.quantization
TensorFlow 2.3:
tf.quantization: https://tensorflow.google.cn/api_docs/python/tf/quantization
tf.quantization.fake_quant_with_min_max_args: https://tensorflow.google.cn/api_docs/python/tf/quantization/fake_quant_with_min_max_args
TensorFlow 1.15:
tf.quantization: https://tensorflow.google.cn/versions/r1.15/api_docs/python/tf/quantization
[4] Inspecting the inputs/outputs of each layer of a Keras model: https://editor.csdn.net/md/?articleId=110677379
Note: everything below is implemented with TensorFlow 2.3.
Quantize weights
Quantizing weights is fairly simple, as the weights are fixed after training and we know their min/max range. Using these ranges, the weights are quantized or discretized to 256 levels. Here is the code snippet for quantizing the weights and biases to 8-bit integers.
import numpy as np

min_wt = weight.min()
max_wt = weight.max()
# find the number of integer bits needed to represent this range
int_bits = int(np.ceil(np.log2(max(abs(min_wt), abs(max_wt)))))
frac_bits = 7 - int_bits  # remaining bits are fractional bits (1 bit for the sign)
#floating point weights are scaled and rounded to [-128,127], which are used in
#the fixed-point operations on the actual hardware (i.e., microcontroller)
quant_weight = np.round(weight*(2**frac_bits))
#To quantify the impact of quantized weights, scale them back to
#original range to run inference using quantized weights
weight = quant_weight/(2**frac_bits)
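As a quick sanity check, the snippet can be wrapped into a helper and the worst-case rounding error inspected (the helper name and the random weights below are just for illustration):
import numpy as np

def fake_quant_np(weight, total_bits=8):
    # same scheme as above: pick frac_bits from the dynamic range,
    # round to integer levels, then scale back to floating point
    max_abs = max(abs(weight.min()), abs(weight.max()))
    int_bits = int(np.ceil(np.log2(max_abs)))
    frac_bits = total_bits - 1 - int_bits      # 1 bit reserved for the sign
    return np.round(weight * 2**frac_bits) / 2**frac_bits

w = np.random.uniform(-0.9, 0.9, size=(16, 16)).astype(np.float32)
w_q = fake_quant_np(w)
print(np.abs(w - w_q).max())   # bounded by 2**-(frac_bits + 1)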
Putting it into practice:
Take the speech enhancement model DeepXi as an example (https://github.com/anicolson/DeepXi):
# tensorflow 2.3
def quant_weights_and_biases(deepxi):
    model_variables = deepxi.model.variables
    for v in model_variables:
        min_value = tf.reduce_min(v)
        max_value = tf.reduce_max(v)
        # number of integer bits needed to cover max(|min|, |max|)
        int_bits = tf.cast(tf.math.ceil(tf.math.log(tf.math.maximum(tf.abs(min_value), tf.abs(max_value))) / tf.math.log(2.0)), dtype=tf.int32)
        dec_bits = tf.math.subtract(7, int_bits)  # fractional bits (1 bit for the sign)
        # quantize to integer levels, then scale back to floating point (fake quant)
        new_v = tf.round(tf.math.multiply(v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32)))
        new_v = tf.math.divide(new_v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32))
        v.assign(new_v)
    # test: compare model_variables and quant_model_variables
    quant_model_variables = deepxi.model.variables
    return deepxi
The full code covers: 1) building the model structure, 2) loading the pretrained model parameters, 3) testing the model, and 4) fake quantization plus a comparison test.
## create model from args:
import os
from deepxi.args import get_args
import numpy as np
from deepxi.network.attention import MHANet, AttentionMaskV2
from deepxi.se_batch import Batch
from deepxi.model import DeepXi
from tensorflow import keras
# model test:
from tqdm import tqdm
from deepxi.utils import read_mat, save_wav
import tensorflow as tf
# class MHANETV2
from tensorflow.keras.layers import Activation, Add, \
Conv1D, Layer, LayerNormalization, Masking, ReLU
def create_model():
    args = get_args()
    if args.causal:
        args.padding = "causal"
    else:
        args.padding = "same"
    args.model_path = args.model_path + '/' + args.ver  # model save path.
    if args.set_path != "set": args.data_path = args.data_path + '/' + args.set_path.rsplit('/', 1)[-1]  # data path.
    N_d = int(args.f_s * args.T_d * 0.001)  # window duration (samples).
    N_s = int(args.f_s * args.T_s * 0.001)  # window shift (samples).
    K = int(pow(2, np.ceil(np.log2(N_d))))  # number of DFT components.
    if True:
        test_x, test_x_len, _, test_x_base_names = Batch(args.test_x_path)
    deepxi = DeepXi(
        N_d=N_d,
        N_s=N_s,
        K=K,
        sample_dir=args.data_path,
        train_s_list=None,
        train_d_list=None,
        **vars(args))
    keras.utils.plot_model(deepxi.model, args.ver + '-' + args.network_type + '.png', show_shapes=True)
    return deepxi
def load_variables_for_model(deepxi,
                             savedmodel_path='../deepxi_data_model_out/saved_model/mhanet-1.0c/epoch-199/variables/variables'):
    # variables before loading (kept for comparison)
    model_variables = deepxi.model.variables
    # load the pretrained variables
    deepxi.model.load_weights(savedmodel_path)
    # variables after loading (compare with model_variables)
    pretrained_model_variables = deepxi.model.variables
    return deepxi
def fake_quant_weights_and_biases(deepxi):
    model_variables = deepxi.model.variables
    for v in model_variables:
        min_value = tf.reduce_min(v)
        max_value = tf.reduce_max(v)
        # number of integer bits needed to cover max(|min|, |max|)
        int_bits = tf.cast(tf.math.ceil(tf.math.log(tf.math.maximum(tf.abs(min_value), tf.abs(max_value))) / tf.math.log(2.0)), dtype=tf.int32)
        dec_bits = tf.math.subtract(7, int_bits)  # fractional bits (1 bit for the sign)
        # quantize to integer levels, then scale back to floating point (fake quant)
        new_v = tf.round(tf.math.multiply(v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32)))
        new_v = tf.math.divide(new_v, tf.cast(tf.math.pow(2, dec_bits), dtype=tf.float32))
        v.assign(new_v)
    # test: compare model_variables and quant_model_variables
    quant_model_variables = deepxi.model.variables
    return deepxi
def save_model(deepxi, savedmodel_path='./my_model'):
    deepxi.model.save(savedmodel_path)
def test_model(deepxi, denoise_path='./out'):
    args = get_args()
    out_type = args.out_type  # 'y'
    gain = args.gain  # 'mmse-lsa'
    e = args.max_epochs  # 200
    out_path = denoise_path + '/' + deepxi.ver + '/e' + str(e) + '/' + out_type + '/' + gain
    # create the output directory if needed
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    test_x, test_x_len, _, test_x_base_names = Batch(args.test_x_path)
    print("Processing observations...")
    inp_batch, supplementary_batch, n_frames = deepxi.observation_batch(test_x, test_x_len)
    print("Performing inference...")
    tgt_hat_batch = deepxi.model.predict(inp_batch, batch_size=1, verbose=1)
    print("Saving outputs...")
    batch_size = len(test_x_len)
    for i in tqdm(range(batch_size)):
        base_name = test_x_base_names[i]
        inp = inp_batch[i, :n_frames[i], :]
        tgt_hat = tgt_hat_batch[i, :n_frames[i], :]
        # if tf.is_tensor(supplementary_batch):
        supplementary = supplementary_batch[i, :n_frames[i], :]
        saved_data_path = args.saved_data_path
        if args.saved_data_path is not None:
            saved_data = read_mat(saved_data_path + '/' + base_name + '.mat')
            supplementary = (supplementary, saved_data)
        if out_type == 'y':
            y = deepxi.inp_tgt.enhanced_speech(inp, supplementary, tgt_hat, gain).numpy()
            save_wav(out_path + '/' + base_name + '.wav', y, deepxi.inp_tgt.f_s)
            # also write a stereo file: noisy input in channel 0, enhanced speech in
            # channel 1 (this overwrites the mono file written above)
            x = tf.cast(test_x[i, :test_x_len[i]] / 32768, tf.float32).numpy()
            numsamples = np.min((len(x), len(y)))
            xy = tf.stack((x[:numsamples], y[:numsamples]), axis=-1)
            save_wav(out_path + '/' + base_name + '.wav', xy.numpy(), deepxi.inp_tgt.f_s)
        else:
            raise ValueError('Invalid output type.')
if __name__ == '__main__':
    # Step 1: create the model, load the pretrained weights, and test
    deepxi = create_model()
    pretrained_deepxi = load_variables_for_model(deepxi)
    test_model(pretrained_deepxi, denoise_path='./out/pretrained_model')
    # Step 2: fake-quantize the weights (in place) and test again
    quant_deepxi = fake_quant_weights_and_biases(deepxi)
    test_model(quant_deepxi, denoise_path='./out/quant_pretrained_model')
    print('done')
##############################################################################
## args:
# --ver
# mhanet-1.0c
# --network
# MHANetV2
# --d_model
# 256
# --n_blocks
# 5
# --n_heads
# 8
# --warmup_steps
# 40000
# --causal
# 1
# --outp_act
# Sigmoid
# --loss_fnc
# BinaryCrossentropy
# --max_epochs
# 200
# --resume_epoch
# 0
# --test_epoch
# 200
# --mbatch_size
# 4
# --inp_tgt_type
# MagXi
# --map_type
# DBNormalCDF
# --sample_size
# 1000
# --f_s
# 16000
# --T_d
# 32
# --T_s
# 16
# --min_snr
# -10
# --max_snr
# 20
# --snr_inter
# 1
# --out_type
# y
# --save_model
# 1
# --log_iter
# 0
# --eval_example
# 1
# --gain
# mmse-lsa
# --train
# 0
# --infer
# 1
# --test
# 0
# --gpu
# 0
# --set_path
# ../deepxi_dataset/deep_xi_training_set
# --data_path
# ../deepxi_data_model_out/data
# --test_x_path
# ../deepxi_dataset/deepxi_test_set/test_noisy_speech_100
# --test_s_path
# /home/user/tmp/t/Deepxi_data_model_out/test_clean_speech
# --test_d_path
# /home/user/tmp/t/Deepxi_data_model_out/test_noise
# --out_path
# ../deepxi_data_model_out/out
# --model_path
# ../deepxi_data_model_out/saved_model
Quantize activation data
Using a representative dataset: One approach for quantizing the activation data is to run inference on some representative input samples (or, ideally, the entire dataset) and find the min/max range of each layer input/output. Using these ranges, the activation data can be quantized in the same way as the weights in the code snippet above.
Limited by the finiteness of the dataset: any outliers in the dataset may inflate this range and hurt accuracy, so care must be taken with this approach.
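One common mitigation, shown here only as a general sketch (it is not part of the ARM guide or the DeepXi code), is to derive the range from percentiles of the collected activations instead of the raw min/max:
import numpy as np

# Percentile-based range instead of raw min/max (illustrative only):
# a single outlier no longer blows up the quantization range.
acts = np.concatenate([np.random.randn(100000), [50.0]])  # one huge outlier
lo, hi = np.percentile(acts, [0.1, 99.9])
print(acts.min(), acts.max())  # raw range, dominated by the outlier
print(lo, hi)                  # clipped range used for quantization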
Using fake_quant_with_min_max_args: another approach is to insert the TensorFlow op fake_quant_with_min_max_args after every operation (convolution, addition, multiplication or concatenation) and find the optimal power-of-2 min/max ranges that maximize the accuracy.
This same approach can also be used for quantizing the weights. Furthermore, the modified model with the fake_quant_with_min_max_args op and frozen min/max ranges can be used for retraining/fine-tuning, which may increase the accuracy as the network adapts to the quantization.
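A minimal standalone example of the op (the values here are chosen arbitrarily; note that the Step 2 code further below uses the closely related fake_quant_with_min_max_vars variant):
import tensorflow as tf

# Fake-quantize a float tensor to 256 levels inside a power-of-2 range.
x = tf.constant([-9.0, -0.3, 0.07, 5.5], dtype=tf.float32)
y = tf.quantization.fake_quant_with_min_max_args(
    x, min=-8.0, max=8.0 - 8.0 / 128.0, num_bits=8)
print(y.numpy())   # values are snapped to a 1/16 grid; -9.0 is clipped to -8.0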
Concretely there are two steps. Step 1: run the given dataset through the model and record the min/max of each activation. Step 2: build a model with fake_quant_with_min_max_args layers created from those activation maxima, inserted before each activation layer (e.g. right before the input is fed to a ReLU).
An example follows.
Step 1: collect the activation maxima
(Here the "maximum" is the larger of the absolute values of the min and max, taken over the inputs to each activation layer. For how to inspect the inputs/outputs of each layer of a Keras model, see [4]: https://editor.csdn.net/md/?articleId=110677379)
def generate_activation_max(deepxi, testing_path='../deepxi_dataset/deepxi_test_set/test_noisy_speech_100'):
    from tensorflow.keras import backend as K
    from deepxi.se_batch import Batch
    from tensorflow import keras
    import numpy as np
    import tensorflow as tf
    inp = deepxi.model.input  # model input
    # inputs/outputs of every activation layer (ReLU and Activation)
    inputs = [layer.input for layer in deepxi.model.layers if (isinstance(layer, keras.layers.ReLU) or isinstance(layer, keras.layers.Activation))]
    outputs = [layer.output for layer in deepxi.model.layers if (isinstance(layer, keras.layers.ReLU) or isinstance(layer, keras.layers.Activation))]
    functors_inp = [K.function([inp], [input]) for input in inputs]
    functors_outp = [K.function([inp], [output]) for output in outputs]
    # run the representative dataset through the model
    test_x, test_x_len, _, test_x_base_names = Batch(testing_path)
    print("Processing observations...")
    inp_batch, supplementary_batch, n_frames = deepxi.observation_batch(test_x, test_x_len)
    layer_ins = [func(inp_batch) for func in functors_inp]
    layer_outs = [func([inp_batch, 1.]) for func in functors_outp]  # layer outputs (not used below)
    # one power-of-2 bound per activation layer
    act_max = np.zeros(shape=(len(layer_ins)), dtype=np.int)
    layer_id = 0
    for layer_in in layer_ins:
        min_value = tf.reduce_min(layer_in)
        max_value = tf.reduce_max(layer_in)
        # smallest power of 2 that covers max(|min|, |max|)
        act_max[layer_id] = tf.cast(tf.math.ceil(tf.math.log(tf.math.maximum(tf.abs(min_value), tf.abs(max_value))) / tf.math.log(2.0)), dtype=tf.int32)
        act_max[layer_id] = tf.cast(tf.math.pow(2, act_max[layer_id]), dtype=tf.int8)
        layer_id += 1
    return act_max
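Intended usage, following the same pattern as the weight quantization script above (a sketch under the same path assumptions):
# Sketch: build the model, load the pretrained weights, then collect activation bounds.
deepxi = create_model()
deepxi = load_variables_for_model(deepxi)
act_max = generate_activation_max(deepxi)   # one power-of-2 bound per ReLU/Activation input
print(act_max)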
Step 2: build a model with fake_quant_with_min_max_args layers based on the activation maxima
from tensorflow.python.keras.layers import Conv1D, ReLU, LayerNormalization, Add, Activation
from deepxi.network.attention import MultiHeadAttention, AttentionMask, AttentionMaskV2
import tensorflow as tf
import tensorflow_addons as tfa
class MHANet_QuantAct:
    """
    Multi-head attention network.
    """
    def __init__(
        self,
        inp,
        n_outp,
        d_model,
        n_blocks,
        n_heads,
        warmup_steps,
        causal,
        outp_act,
        # inp_all,
        ):
        """
        Argument/s:
            inp - input placeholder.
            n_outp - number of outputs.
            d_model - model size.
            n_blocks - number of blocks.
            n_heads - number of attention heads.
            warmup_steps - number of warmup steps.
            causal - causal flag.
            outp_act - output activation function.
        """
        self.n_outp = n_outp
        self.d_model = d_model
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.d_ff = d_model*4
        self.warmup_steps = warmup_steps
        self.d_k = self.d_model // self.n_heads
        # if self.inp_all is None:
        #     self.inp_all = tf.zeros(shape=(None, max_speech_len, 257))
        # else:
        #     self.inp
        att_mask, seq_mask = AttentionMask(causal, -1.0e9)(inp)
        x = Conv1D(self.d_model, 1, use_bias=False)(inp)
        x = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(x)
        x = ReLU()(x)
        for _ in range(self.n_blocks): x = self.block(x, att_mask, seq_mask)
        self.outp = Conv1D(self.n_outp, 1, use_bias=True)(x)
        if outp_act == "Sigmoid": self.outp = Activation('sigmoid')(self.outp)
        elif outp_act == "ReLU": self.outp = ReLU()(self.outp)
        elif outp_act == "Linear": self.outp = self.outp
        else: raise ValueError("Invalid outp_act")
    def block(self, x, att_mask, seq_mask):
        """
        MHANet block.

        Argument/s:
            x - input.
            att_mask - attention mask.
            seq_mask - sequence mask.

        Returns:
            layer_2 - output of second layer.
        """
        layer_1 = MultiHeadAttention(d_model=self.d_model,
            n_heads=self.n_heads)(x, x, x, att_mask, seq_mask)
        layer_1 = Add()([x, layer_1])
        layer_1 = LayerNormalization(axis=2, epsilon=1e-6, center=True,
            scale=True)(layer_1)
        layer_2 = self.feed_forward_network(layer_1)
        layer_2 = Add()([layer_1, layer_2])
        layer_2 = LayerNormalization(axis=2, epsilon=1e-6, center=True,
            scale=True)(layer_2)
        return layer_2

    def feed_forward_network(self, x):
        """
        Feed forward network.

        Argument/s:
            x - input.

        Returns:
            x - output of second feed forward layer.
        """
        x = Conv1D(self.d_ff, 1, use_bias=True)(x)
        x = ReLU()(x)
        x = Conv1D(self.d_model, 1, use_bias=True)(x)
        return x
class MHANetV2_QuantAct(MHANet_QuantAct):
    """
    Multi-head attention network implemented using tfa.layers.MultiHeadAttention,
    with fake quantization of the activations.
    """
    def __init__(
        self,
        inp,
        n_outp,
        d_model,
        n_blocks,
        n_heads,
        warmup_steps,
        causal,
        outp_act,
        act_max,
        ):
        """
        Argument/s:
            inp - input placeholder.
            n_outp - number of outputs.
            d_model - model size.
            n_blocks - number of blocks.
            n_heads - number of attention heads.
            warmup_steps - number of warmup steps.
            causal - causal flag.
            outp_act - output activation function.
            act_max - power-of-2 activation ranges from Step 1 (one per ReLU/Activation input).
        """
        self.n_outp = n_outp
        self.d_model = d_model
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.d_ff = d_model*4
        self.warmup_steps = warmup_steps
        self.d_k = self.d_model // self.n_heads
        # added by zhaodeng: per-layer activation ranges
        self.act_max = act_max
        att_mask = AttentionMaskV2(causal)(inp)
        x = Conv1D(self.d_model, 1, use_bias=False)(inp)
        x = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(x)
        #######################################################################
        # fake quantization (fake_quant_with_min_max_vars) before the first ReLU
        #######################################################################
        relu_layer_no = 0
        if self.act_max[relu_layer_no] > 0:
            x = tf.quantization.fake_quant_with_min_max_vars(x,
                min=-self.act_max[relu_layer_no],
                max=self.act_max[relu_layer_no] - (self.act_max[relu_layer_no] / 128.0),
                num_bits=8)
        relu_layer_no += 1
        #######################################################################
        x = ReLU()(x)
        for _ in range(self.n_blocks):
            x = self.block(x, att_mask, relu_layer_no)
            relu_layer_no += 1
        self.outp = Conv1D(self.n_outp, 1, use_bias=True)(x)
        #######################################################################
        # fake quantization before the output activation
        #######################################################################
        if self.act_max[relu_layer_no] > 0:
            self.outp = tf.quantization.fake_quant_with_min_max_vars(self.outp,
                min=-self.act_max[relu_layer_no],
                max=self.act_max[relu_layer_no] - (self.act_max[relu_layer_no] / 128.0),
                num_bits=8)
        #######################################################################
        if outp_act == "Sigmoid": self.outp = Activation('sigmoid')(self.outp)
        elif outp_act == "ReLU": self.outp = ReLU()(self.outp)
        elif outp_act == "Linear": self.outp = self.outp
        else: raise ValueError("Invalid outp_act")
    def block(self, x, att_mask, relu_layer_no):
        """
        MHANet block.

        Argument/s:
            x - input.
            att_mask - attention mask.
            relu_layer_no - index into act_max for this block's ReLU input.

        Returns:
            layer_2 - output of second layer.
        """
        layer_1 = tfa.layers.MultiHeadAttention(
            head_size=self.d_k,
            num_heads=self.n_heads,
            output_size=self.d_model,
            dropout=0.0,
            use_projection_bias=False,
        )([x, x, x, att_mask])
        layer_1 = Add()([x, layer_1])
        layer_1 = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(layer_1)
        layer_2 = self.feed_forward_network(layer_1, relu_layer_no)
        layer_2 = Add()([layer_1, layer_2])
        layer_2 = LayerNormalization(axis=2, epsilon=1e-6, center=True, scale=True)(layer_2)
        return layer_2

    def feed_forward_network(self, x, relu_layer_no):
        """
        Feed forward network.

        Argument/s:
            x - input.
            relu_layer_no - index into act_max for this block's ReLU input.

        Returns:
            x - output of second feed forward layer.
        """
        x = Conv1D(self.d_ff, 1, use_bias=True)(x)
        #######################################################################
        # fake quantization before the ReLU inside the feed forward network
        #######################################################################
        if self.act_max[relu_layer_no] > 0:
            x = tf.quantization.fake_quant_with_min_max_vars(x,
                min=-self.act_max[relu_layer_no],
                max=self.act_max[relu_layer_no] - (self.act_max[relu_layer_no] / 128.0),
                num_bits=8)
        #######################################################################
        x = ReLU()(x)
        x = Conv1D(self.d_model, 1, use_bias=True)(x)
        return x
Finally, build the model from act_max as shown above. Weight quantization can then be applied on top of this activation-quantized model.
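Putting the two steps together might look roughly like this. This is only a sketch: create_model_with_act_max is a hypothetical helper that would build the DeepXi network with MHANetV2_QuantAct in place of MHANetV2, which is not shown here.
# Rough outline (create_model_with_act_max is hypothetical: it would build the
# DeepXi model using MHANetV2_QuantAct instead of MHANetV2):
deepxi = create_model()
deepxi = load_variables_for_model(deepxi)
act_max = generate_activation_max(deepxi)              # Step 1: activation bounds
quant_act_deepxi = create_model_with_act_max(act_max)  # Step 2: model with fake-quant layers
quant_act_deepxi = load_variables_for_model(quant_act_deepxi)
fake_quant_weights_and_biases(quant_act_deepxi)        # optionally fake-quantize the weights too
test_model(quant_act_deepxi, denoise_path='./out/quant_act_model')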