[Quantization][TensorFlow] Implementation Details + Key Code Snippets

QuantizationParams

// Quantization parameters, determining the mapping of quantized values
// to real values (i.e. determining how quantized values are mathematically
// interpreted).
//
// The correspondence is as follows:
//
//   real_value = scale * (quantized_value - zero_point);
//
// In other words, zero_point designates which quantized value corresponds to
// the real 0 value, and scale designates the difference between the real values
// corresponding to consecutive quantized values differing by 1.
struct QuantizationParams {
  int32 zero_point = 0;
  double scale = 0.;
};
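
For concreteness, here is a minimal Python sketch (not TFLite code; the helper names are made up for illustration) of choosing such parameters for uint8 from a real range [rmin, rmax], and of applying the mapping in both directions:

# Minimal sketch, not TFLite code: pick uint8 quantization parameters
# for a real range [rmin, rmax], then quantize/dequantize with them.
def choose_quantization_params(rmin, rmax):
    qmin, qmax = 0, 255
    # The represented range must contain the real value 0 so that
    # zero_point itself is a valid quantized value.
    rmin, rmax = min(rmin, 0.0), max(rmax, 0.0)
    scale = (rmax - rmin) / (qmax - qmin)
    if scale == 0.0:  # degenerate all-zero range
        return 1.0, 0
    # zero_point is the quantized value that represents real 0.
    zero_point = int(round(qmin - rmin / scale))
    return scale, max(qmin, min(qmax, zero_point))

def quantize_value(real, scale, zero_point):
    return int(max(0, min(255, round(zero_point + real / scale))))

def dequantize_value(q, scale, zero_point):
    # real_value = scale * (quantized_value - zero_point)
    return scale * (q - zero_point)

# e.g. [rmin, rmax] = [-1.0, 1.0] gives scale ~= 0.00784, zero_point == 128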

FullyConnected

inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(filter_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
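  // Note: in TFLite, input_offset and filter_offset are the negated
  // zero points of the input and filter tensors, so adding them below
  // reconstructs (value - zero_point) for each operand.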
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
                                                        output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}
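
The helper MultiplyByQuantizedMultiplierSmallerThanOne is not shown above. In TFLite it is built on gemmlowp's SaturatingRoundingDoublingHighMul and RoundingDivideByPOT: output_multiplier is a Q31 fixed-point encoding of a real multiplier M = output_multiplier / 2^31 in [0.5, 1), and output_shift is an extra rounding right shift. A rough Python model of the arithmetic (not bit-exact; saturation and tie-breaking differ from the 64-bit integer version):

# Rough model only; the real helper uses 64-bit integer ops from gemmlowp.
def multiply_by_quantized_multiplier(acc, quantized_multiplier, right_shift):
    # quantized_multiplier encodes a real multiplier M in [0.5, 1) as a
    # Q31 integer; the effective scale is M * 2**-right_shift.
    return int(round(acc * quantized_multiplier / float(1 << (31 + right_shift))))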

GetOrComputeMinMax

const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
  auto& array = model->GetArray(array_name);
  // Normally we should have a MinMax recorded on this Array,
  // so we just use it.
  if (array.minmax != nullptr) {
    return *array.minmax;
  }

  // We don't have a MinMax. That's bad news: we need
  // the graph to provide MinMax info for all arrays in order
  // for inference to reproduce faithfully the same quantization
  // error as the training process had.
  //
  // But we still want to support a fallback for constant arrays,
  // just using the plain min and max computed from array elements.
  // We should hopefully never rely on that in production, as that
  // will not give very good accuracy as that typically won't be
  // exactly what the training process used. But it will be useful
  // to allow easily trying out quantization even if the graph
  // lacks some minmax information.
  if (array.buffer != nullptr) {
    LOG(WARNING)
        << "Constant array " << array_name
        << " lacks MinMax information. To make up for that, we will now compute"
        << " the MinMax from actual array elements. That will result in"
        << " quantization parameters that probably do not match whichever "
           "arithmetic"
        << " was used during training, and thus will probably be a cause of "
           "poor"
        << " inference accuracy.";
    CHECK(array.buffer->type == ArrayDataType::kFloat);
    const auto& data = array.GetBuffer<ArrayDataType::kFloat>().data;
    // We always want [min, max] to contain 0.
    float min = 0.f;
    float max = 0.f;
    for (auto val : data) {
      min = std::min(min, val);
      max = std::max(max, val);
    }
    auto& minmax = array.GetOrCreateMinMax();
    minmax.min = min;
    minmax.max = max;
    return minmax;
  }

  LOG(FATAL) << "Array " << array_name
             << " does not have MinMax information, "
                "and is not a constant array. Cannot "
                "proceed with quantization.";
}
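
For example, a constant weight array whose elements span [-0.4, 2.1] gets minmax {-0.4, 2.1} from this fallback; feeding that range into the (hypothetical) parameter-selection sketch from earlier gives the quantization parameters directly:

# Using the choose_quantization_params sketch from above:
scale, zero_point = choose_quantization_params(-0.4, 2.1)
# scale ~= 0.0098, zero_point == 41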

quantize_v2

# Excerpt from tensorflow/python/ops/array_ops.py (TF 1.x); it relies on
# these imports from that file:
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export


# Define quantize_v2 here in order to make name the second-to-last attribute,
# because round_mode was added later.
@tf_export("quantize_v2")
@deprecation.deprecated(
    "2017-10-25",
    "`tf.quantize_v2` is deprecated, please use `tf.quantize` instead.")
def quantize_v2(input,  # pylint: disable=redefined-builtin
                min_range,
                max_range,
                T,
                mode="MIN_COMBINED",
                name=None,
                round_mode="HALF_AWAY_FROM_ZERO"):
  return gen_array_ops.quantize_v2(input,
                                   min_range,
                                   max_range,
                                   T=T,
                                   mode=mode,
                                   name=name,
                                   round_mode=round_mode)


quantize_v2.__doc__ = """Please use `tf.quantize` instead."""


# We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
# tf.quantize_v2 in next version of TensorFlow.
@tf_export("quantize")
def quantize(input,  # pylint: disable=redefined-builtin
             min_range,
             max_range,
             T,
             mode="MIN_COMBINED",
             round_mode="HALF_AWAY_FROM_ZERO",
             name=None):
  return gen_array_ops.quantize_v2(
      input,
      min_range,
      max_range,
      T,
      mode=mode,
      round_mode=round_mode,
      name=name)

op_def_library.py

The generated gen_array_ops.quantize_v2 wrapper above does not contain the kernel itself: it goes through the generic machinery in tensorflow/python/framework/op_def_library.py, which checks the supplied attributes (T, mode, round_mode, ...) against the registered OpDef for QuantizeV2 and adds the corresponding node to the graph.

TensorFlow Quantize API

tf.quantize(
    input,
    min_range,
    max_range,
    T,
    mode='MIN_COMBINED',
    round_mode='HALF_AWAY_FROM_ZERO',
    name=None
)
In MIN_COMBINED mode, each value is quantized as

out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
if T == qint8: out[i] -= (range(T) + 1) / 2.0

where range(T) is the width of the target type's representable range (255 for quint8); the qint8 correction recenters the output around 0.
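
A minimal TF 1.x usage example (the printed values assume MIN_COMBINED mode and the [0.0, 3.0] range used here):

import tensorflow as tf

# Quantize four floats in [0.0, 3.0] to quint8 (TF 1.x graph mode).
x = tf.constant([0.0, 1.0, 2.0, 3.0])
q = tf.quantize(x, min_range=0.0, max_range=3.0, T=tf.quint8)

with tf.Session() as sess:
    result = sess.run(q)
    print(result.output)                          # expected: [  0  85 170 255]
    print(result.output_min, result.output_max)   # expected: 0.0 3.0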

Reposted from blog.csdn.net/yifen4234/article/details/80306835