# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import logging
from dataclasses import dataclass
from enum import Enum
from typing import Any

import numpy as np
import onnx
import onnx.numpy_helper
from onnx import TensorProto
from onnx import onnx_pb as onnx_proto

from .base_quantizer import BaseQuantizer, QuantizationParams
from .calibrate import TensorData
from .quant_utils import (
    DEQUANT_OP_NAME,
    QUANT_OP_NAME,
    QuantizedValue,
    QuantizedValueType,
    __producer__,
    __version__,
    add_dequant_output_suffix,
    add_dequant_suffix,
    add_quant_input_suffix,
    add_quant_output_suffix,
    add_quant_suffix,
    compute_scale_zp,
    compute_scale_zp_float8,
    find_by_name,
    get_qmin_qmax_for_qType,
    ms_domain,
    normalize_axis,
    tensor_proto_to_array,
)
from .registry import CreateQDQQuantizer


class QDQQuantTensorType(Enum):
    ACTIVATION = 0
    WEIGHT = 1
    BIAS = 2


# Holds the name of the node input from which a node output will share the
# same quantization param initializers (zero-point and scale initializers).
# Ex: A Transpose node's output will use the same quant param initializers used at the input.
@dataclass
class QDQQuantParamProvider:
    input_name: str
    node_name: str


# Holds information for tensors that have been marked for quantization by operator quantizers.
# Does not hold information for bias tensors.
class QDQTensorQuantInfo:
    def __init__(self, tensor_type=QDQQuantTensorType.ACTIVATION, quant_para_provider=None, axis=None, data_type=None):
        self.tensor_type = tensor_type
        self.quant_para_provider = quant_para_provider
        self.axis = axis
        self.is_shared = quant_para_provider is not None
        assert data_type is not None
        self.data_type = data_type


# Holds information for bias tensors that have been marked for quantization by operator quantizers.
@dataclass
class QDQBiasQuantInfo:
    node_name: str
    input_name: str
    weight_name: str
    beta: float


# Holds quantization parameter values (scale, zp) for a tensor.
# A tensor typically has a single set of quantization parameters, unless the tensor is
# at a "mixed-precision" boundary where the activation quantization type changes (e.g., from uint8 to uint16).
@dataclass
class QDQTensorQuantParams:
    original: QuantizationParams  # Generated by producer node.
    converted: QuantizationParams | None  # Converted type consumed by some (or all/none) consumer nodes.
    converted_recv_nodes: set[str] | None  # The names of nodes that consume the converted type.


# Holds scale and zero_point initializer TensorProtos.
@dataclass
class QDQScaleZpInitializers:
    scale: TensorProto
    zero_point: TensorProto


# Holds all scale and zero-point initializers for a tensor.
# A tensor typically has a single set of quantization parameters, unless the tensor is
# at a "mixed-precision" boundary where the activation quantization type changes (e.g., from uint8 to uint16).
@dataclass
class QDQTensorScaleZpInitializers:
    original: QDQScaleZpInitializers
    converted: QDQScaleZpInitializers | None
    converted_recv_nodes: set[str] | None


# Holds cached information of a tensor's quantized values (types, zp/scale initializer names, etc.).
# A tensor typically has a single set of quantization parameters, unless the tensor is
# at a "mixed-precision" boundary where the activation quantization type changes (e.g., from uint8 to uint16).
@dataclass
class QDQTensorQuantizedValue:
    original: QuantizedValue
    converted: QuantizedValue | None
    converted_recv_nodes: set[str] | None

    def get_for_consumer(self, consumer_node_name) -> QuantizedValue:
        if self.converted is None:  # Quantized value is not converted, return original
            return self.original

        if self.converted_recv_nodes is None:  # All consumers receive the converted value
            return self.converted

        # Check if the consumer node name is in the set of nodes that
        # receive the converted quantization value. If not, return the original value generated
        # by the tensor's producer.
        return self.converted if (consumer_node_name in self.converted_recv_nodes) else self.original


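# Example (illustrative): if a tensor produced as uint8 is converted to uint16 for only some consumers,
# QDQTensorQuantizedValue.get_for_consumer("NodeA") returns the converted QuantizedValue when "NodeA" is in
# converted_recv_nodes, and the original producer-side QuantizedValue for every other consumer.

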
class QDQQuantizer(BaseQuantizer):
    def __init__(
        self,
        model,
        per_channel,
        reduce_range,
        weight_qType,
        activation_qType,
        tensors_range,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options=None,
    ):
        BaseQuantizer.__init__(
            self,
            model,
            per_channel,
            reduce_range,
            weight_qType,
            activation_qType,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )
        self.tensors_to_quantize = {}
        self.bias_to_quantize = {}

        self.nodes_to_remove = []

        # Specific op types to exclude qdq quantization for their outputs.
        # In TRT, it's not recommended to quantize outputs for weighted ops such as Conv, Matmul, Gemm
        # because those ops may be followed by nodes that require high resolution inputs.
        # Adding QDQ for those ops' outputs may end up with worse accuracy,
        # so we don't recommend adding QDQ to a node's output under such conditions.
        self.op_types_to_exclude_output_quantization = extra_options.get("OpTypesToExcludeOutputQuantization", [])

        # We quantize DequantizeLinear's input to remove the QuantizeLinear for weights as an optimization.
        # In some cases, for example a QDQ BERT model for TensorRT, QDQ should always appear as a pair.
        # Therefore, we need to disable this optimization and add a QDQ pair to the weight.
        self.add_qdq_pair_to_weight = extra_options.get("AddQDQPairToWeight", False)

        # Some scenarios do not need the bias quantized. For example, in the case of Quantization Aware Training,
        # quantizing the bias is not needed. This is because in QAT, all model parameters are expected to be in
        # floating point format. To that end, we can use the FakeQuant operator for weights and activations that
        # can always have QDQ pairs (by using AddQDQPairToWeight). But for biases in a quantized model, we can't use
        # FakeQuant because it only ever appears before a DQ (since it is quantized as int32).
        self.quantize_bias = extra_options.get("QuantizeBias", True)

        # The default behavior is that multiple nodes can share a QDQ pair as their inputs.
        # In TRT, a QDQ pair can't be shared between nodes, so dedicated QDQ pairs are created for each node.
        self.dedicated_qdq_pair = extra_options.get("DedicatedQDQPair", False)
        self.tensor_to_its_receiving_nodes = {}

        # Let the user set the channel axis for a specific op type. This is effective only when per-channel
        # quantization is supported for that op and per_channel is True.
        self.qdq_op_type_per_channel_support_to_axis = extra_options.get("QDQOpTypePerChannelSupportToAxis", {})

        self.qdq_op_domain = ms_domain if extra_options.get("UseQDQContribOps", False) else None

        # The user can specify whether removable activations, like Clip/Relu, should be kept in the graph.
        # Used in the QDQRemovableActivation class.
        self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False)

        # The ONNX spec did not support 16-bit Q/DQ ops before opset 21.
        # So, we may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types
        # are 16-bit or 4-bit integers.
        if self.opset_version < 21:
            opset21_types = (TensorProto.UINT16, TensorProto.INT16, TensorProto.UINT4, TensorProto.INT4)
            overrides_have_opset21_types = any(
                t.tensor_type in opset21_types for t in self.tensor_quant_override_qtypes
            )
            if not self.qdq_op_domain and (
                self.activation_qType in opset21_types
                or self.weight_qType in opset21_types
                or overrides_have_opset21_types
            ):
                logging.warning(
                    "ONNX QuantizeLinear and DequantizeLinear operators do not support "
                    "16-bit/4-bit integer quantization types prior to opset 21. "
                    f"The domain of QuantizeLinear and DequantizeLinear operators will be set to '{ms_domain}' to "
                    "enable support."
                )
                self.qdq_op_domain = ms_domain

        self.quantization_params = self.calc_graph_quant_params()

        # Map of all original value names to quantized value names
        self.quantized_value_map = {}

    def _get_tensor_type(self, tensor_name):
        """
        Returns the element data type of the given tensor, or None if the type cannot be determined.
        """
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight is not None:
            return weight.data_type
        elif tensor_name in self.value_infos:
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type"):
                return vi.type.tensor_type.elem_type
        return None

    def _is_tensor_quantizable(self, tensor_name):
        """
        Check if tensor can be quantized
        """
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight is not None:
            if weight.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
                return True
        elif tensor_name in self.value_infos:
            vi = self.value_infos[tensor_name]
            if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
                TensorProto.FLOAT,
                TensorProto.FLOAT16,
            ):
                return True
        else:
            logging.warning(
                f"Failed to infer the type of tensor: {tensor_name}. Skipping its quantization. Please check if this is expected."
            )

        return False

    def __quantize_tensor(self, tensor_name, quant_sharing_provider=None, tensor_type=QDQQuantTensorType.ACTIVATION):
        """
        Adds a tensor to the list (actually a dict) of tensors to quantize. Called indirectly by op quantizers that
        want to quantize a tensor (i.e., "mark" a tensor for quantization).

        If quant_sharing_provider is not None, the tensor with name tensor_name will be quantized with the same
        quantization parameters as the node input specified in quant_sharing_provider. Ex: A Transpose node's output
        will typically use the same quantization parameter initializers used at the Transpose node's input.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_provider: name of the tensor and node that provides quantization parameters
            tensor_type: QDQQuantTensorType, defaults to ACTIVATION
        """
        if self._is_tensor_quantizable(tensor_name):
            if quant_sharing_provider:
                if not isinstance(quant_sharing_provider, QDQQuantParamProvider):
                    raise TypeError(
                        f"quant_sharing_provider must be of type QDQQuantParamProvider, not {type(quant_sharing_provider)}."
                    )

                data_type = self._get_tensor_type(tensor_name)
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(
                    tensor_type=tensor_type, quant_para_provider=quant_sharing_provider, data_type=data_type
                )
            elif tensor_name not in self.tensors_to_quantize:
                data_type = self._get_tensor_type(tensor_name)
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(tensor_type=tensor_type, data_type=data_type)

    def quantize_activation_tensor(self, tensor_name: str):
        """
        Adds a tensor to the list of tensors to quantize. Called by op quantizers that
        want to quantize a tensor (i.e., "mark" a tensor for quantization).

        Args:
            tensor_name: name of the tensor to quantize
        """
        return self.__quantize_tensor(tensor_name, None, QDQQuantTensorType.ACTIVATION)

    def quantize_output_same_as_input(self, output_name: str, input_name: str, node_name: str):
        """
        Adds a tensor to the list of tensors to quantize. Called by op quantizers that
        want to quantize an output tensor using the same quantization parameters as one of the node's inputs.

        Ex: A Transpose node's output will typically use the same quantization parameter initializers used at
        the Transpose node's input.

        Args:
            output_name: name of the node output to quantize so that it uses the same quantization params as an input.
            input_name: name of the node input from which the output tensor will get its quantization params.
            node_name: name of the node that consumes `input_name`.
        """
        return self.__quantize_tensor(
            output_name, QDQQuantParamProvider(input_name, node_name), QDQQuantTensorType.ACTIVATION
        )

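    # Example (illustrative) for quantize_output_same_as_input above: for a Transpose node "t0" with input "x" and
    # output "y", an op quantizer would call quantize_output_same_as_input("y", "x", "t0") so that "y" reuses the
    # scale/zero-point initializers created for "x" instead of getting its own quantization parameters.
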
    def quantize_weight_tensor(self, tensor_name: str):
        """
        Adds a tensor to the list of weight tensors to quantize. Called by op quantizers that
        want to quantize a weight (i.e., "mark" a weight for quantization).

        Args:
            tensor_name: name of the weight to quantize
        """
        return self.__quantize_tensor(tensor_name, None, QDQQuantTensorType.WEIGHT)

    def quantize_weight_tensor_per_channel(self, tensor_name, axis):
        weight = find_by_name(tensor_name, self.model.initializer())
        if weight:
            if weight.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
                self.tensors_to_quantize[tensor_name] = QDQTensorQuantInfo(
                    tensor_type=QDQQuantTensorType.WEIGHT, axis=axis, data_type=weight.data_type
                )
        else:
            logging.warning(f"Per-channel quantization is only supported for weights. Tensor: {tensor_name} is not quantized.")

    def quantize_bias_tensor(self, node_name, bias_name, input_name, weight_name, beta=1.0):
        """
        Adds a bias tensor to the list of bias tensors to quantize. Called by op quantizers that
        want to quantize a bias with bias_zero_point = 0 and bias_scale = input_scale * weight_scale * beta.
        TODO: Explain the reasoning for using this formula.

        Args:
            node_name: name of the node that consumes the bias, input, and weight tensors.
            bias_name: name of the bias tensor to quantize.
            input_name: name of the input tensor whose scale is used to compute the bias's scale.
            weight_name: name of the weight tensor whose scale is used to compute the bias's scale.
            beta: Multiplier used to compute the bias's scale.
        """
        # If the user provided quantization overrides for this tensor, treat it as a regular weight.
        if self.tensor_quant_overrides.get(bias_name):
            logging.info(
                f"Quantizing bias tensor '{bias_name}' as a weight due to the presence of user-specified overrides"
            )
            is_per_channel, axis = self.is_tensor_per_channel(bias_name, default_axis=0)
            if is_per_channel:
                self.quantize_weight_tensor_per_channel(bias_name, axis)
            else:
                self.quantize_weight_tensor(bias_name)
            return

        weight = find_by_name(bias_name, self.model.initializer())
        if weight is not None:
            if weight.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
                if bias_name not in self.bias_to_quantize:
                    self.bias_to_quantize[bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta)
                else:
                    logging.warning(f"Bias {bias_name} has already been marked for quantization")
            else:
                logging.warning(f"Expected {bias_name} to be a weight")

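    # Example (illustrative) of the bias-scale formula above: with input_scale = 0.02, weight_scale = 0.5, and
    # beta = 1.0, the bias is later quantized with bias_scale = 0.02 * 0.5 * 1.0 = 0.01 and bias_zero_point = 0
    # (the actual computation happens in quantize_bias_static / quantize_bias_static_impl).
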
    def remove_node(self, node):
        self.nodes_to_remove.append(node)

    def remove_nodes(self):
        self.model.remove_nodes(self.nodes_to_remove)

    def quantize_model(self):
        for node in self.model.nodes():
            if self.should_quantize_node(node):
                op_quantizer = CreateQDQQuantizer(self, node)
                op_quantizer.quantize()

                for tensor_name in node.input:
                    if tensor_name not in self.tensor_to_its_receiving_nodes:
                        self.tensor_to_its_receiving_nodes[tensor_name] = []
                    self.tensor_to_its_receiving_nodes[tensor_name].append(node)

        self._quantize_normal_tensors()
        self._quantize_sharing_param_tensors()
        if self.quantize_bias:
            self._quantize_bias_tensors()
        self.remove_nodes()
        if not self.add_qdq_pair_to_weight:
            self.model.clean_initializers()

        self.model.model.producer_name = __producer__
        self.model.model.producer_version = __version__
        if self.qdq_op_domain == ms_domain:
            self.model.set_opset_import(ms_domain, 1)

        return self.model.model

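    # Note on quantize_model() above: op quantizers first mark tensors (self.tensors_to_quantize,
    # self.bias_to_quantize) while consumer nodes are recorded in self.tensor_to_its_receiving_nodes. Q/DQ ops are
    # then materialized for normal tensors, parameter-sharing tensors, and (optionally) biases; marked nodes are
    # removed; dangling float initializers are cleaned up unless AddQDQPairToWeight is set; and producer/opset
    # metadata is updated before the model is returned.
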
    def try_replacing_upstream_output(self, upstream_output_name, output_name):
        if (
            output_name in self.quantization_params
            and self.quantization_params[output_name].converted is None
            and self.quantization_params[upstream_output_name].converted is None
            and len(self.model.input_name_to_nodes()[upstream_output_name]) == 1
            and not self.model.is_graph_output(upstream_output_name)
            and not self.model.is_graph_input(upstream_output_name)
        ):
            self.model.replace_output_of_all_nodes(upstream_output_name, output_name)
            if upstream_output_name in self.tensors_to_quantize:
                del self.tensors_to_quantize[upstream_output_name]
            return True
        return False

    def _create_q_node(
        self,
        q_input: str,
        q_output: str,
        quant_node_name: str,
        scale_name: str,
        zp_name: str,
        axis: int | None = None,
    ):
        """
        Creates a QuantizeLinear node and adds it to the model.
        """
        qlinear_node = onnx.helper.make_node(
            QUANT_OP_NAME,
            [q_input, scale_name, zp_name],
            [q_output],
            quant_node_name,
            axis=axis,
            domain=self.qdq_op_domain,
        )
        self.model.add_nodes([qlinear_node])

    def _create_dq_node(
        self,
        dq_input: str,
        dq_output: str,
        dequant_node_name: str,
        scale_name: str,
        zp_name: str,
        axis: int | None = None,
    ):
        """
        Creates a DequantizeLinear node and adds it to the model.
        """
        dequant_node = onnx.helper.make_node(
            DEQUANT_OP_NAME,
            [dq_input, scale_name, zp_name],
            [dq_output],
            dequant_node_name,
            axis=axis,
            domain=self.qdq_op_domain,
        )
        self.model.add_nodes([dequant_node])

    def _create_qdq_nodes(
        self, q_input, q_output, quant_node_name, dq_input, dq_output, dequant_node_name, scale_name, zp_name, axis=None
    ):
        qlinear_node = onnx.helper.make_node(
            QUANT_OP_NAME,
            [q_input, scale_name, zp_name],
            [q_output],
            quant_node_name,
            axis=axis,
            domain=self.qdq_op_domain,
        )
        dequant_node = onnx.helper.make_node(
            DEQUANT_OP_NAME,
            [dq_input, scale_name, zp_name],
            [dq_output],
            dequant_node_name,
            axis=axis,
            domain=self.qdq_op_domain,
        )
        self.model.add_nodes([qlinear_node, dequant_node])

    def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None):
        weight_name = weight_proto.name
        if axis is not None:
            if self.opset_version < 13:
                raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.")

            qtype = self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType
            if qtype == onnx.onnx_pb.TensorProto.UINT8:
                qtype = onnx_proto.TensorProto.INT8

            q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel(
                weight_name,
                # The quantization type is forced to TensorProto.INT8, whereas the expected value would be
                # self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType.
                # QLinearConv expects a unique value for all channels.
                # This code does not enforce that, but it is necessarily the case when the
                # quantization is symmetric (as it is for INT8).
                qtype,
                axis,
                keep_float_weight=self.add_qdq_pair_to_weight,
            )
        else:
            q_weight_name, zp_name, scale_name = self.quantize_initializer(
                weight_proto,
                self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType,
                keep_float_weight=self.add_qdq_pair_to_weight,
            )

        weight_dequant_output = add_dequant_output_suffix(weight_name)
        self.model.replace_input_of_all_nodes(weight_name, weight_dequant_output)
        if self.add_qdq_pair_to_weight:
            weight_quant_output = add_quant_output_suffix(weight_name)

            self._create_qdq_nodes(
                weight_name,
                weight_quant_output,
                add_quant_suffix(weight_name),
                weight_quant_output,
                weight_dequant_output,
                add_dequant_suffix(weight_name),
                scale_name,
                zp_name,
                axis,
            )
        else:
            dequant_node = onnx.helper.make_node(
                DEQUANT_OP_NAME,
                [q_weight_name, scale_name, zp_name],
                [weight_dequant_output],
                add_dequant_suffix(weight_name),
                axis=axis,
                domain=self.qdq_op_domain,
            )
            self.model.add_node(dequant_node)

    def _add_qdq_pair_for_activation(self, tensor_name, scale_name, zp_name, data_type=None):
        if (
            self.dedicated_qdq_pair
            and tensor_name in self.tensor_to_its_receiving_nodes
            and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1
        ):
            num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
            for i in range(num_dedicated_qdq_pair):
                postfix = f"_{i + 1}"
                tensor_name_quant_output_postfix = add_quant_output_suffix(tensor_name) + postfix
                tensor_name_dequant_output_postfix = add_dequant_output_suffix(tensor_name) + postfix
                quant_node_name_postfix = add_quant_suffix(tensor_name) + postfix
                dequant_node_name_postfix = add_dequant_suffix(tensor_name) + postfix
                self._create_qdq_nodes(
                    tensor_name,
                    tensor_name_quant_output_postfix,
                    quant_node_name_postfix,
                    tensor_name_quant_output_postfix,
                    tensor_name_dequant_output_postfix,
                    dequant_node_name_postfix,
                    scale_name,
                    zp_name,
                )

                node = self.tensor_to_its_receiving_nodes[tensor_name][i]
                self.model.replace_node_input(node, tensor_name, tensor_name_dequant_output_postfix)
                if i == 0:
                    quantized_value = QuantizedValue(
                        tensor_name,
                        tensor_name_dequant_output_postfix,
                        scale_name,
                        zp_name,
                        QuantizedValueType.Input,
                        scale_type=data_type,
                    )
                    self.quantized_value_map[tensor_name] = QDQTensorQuantizedValue(quantized_value, None, None)
        else:
            q_input = tensor_name
            dq_output = add_dequant_output_suffix(tensor_name)
            if self.model.is_graph_output(tensor_name):
                q_input = add_quant_input_suffix(tensor_name)
                dq_output = tensor_name
                self.model.replace_output_of_all_nodes(tensor_name, q_input)
            else:
                self.model.replace_input_of_all_nodes(tensor_name, dq_output)

            self._create_qdq_nodes(
                q_input,
                add_quant_output_suffix(tensor_name),
                add_quant_suffix(tensor_name),
                add_quant_output_suffix(tensor_name),
                dq_output,
                add_dequant_suffix(tensor_name),
                scale_name,
                zp_name,
            )

            quantized_value = QuantizedValue(
                tensor_name,
                dq_output,
                scale_name,
                zp_name,
                QuantizedValueType.Input,
                scale_type=data_type,
            )
            self.quantized_value_map[tensor_name] = QDQTensorQuantizedValue(quantized_value, None, None)

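    # Example (illustrative) of the rewrite performed by _add_qdq_pair_for_activation above: for an activation "x"
    # with a single consumer,
    #   <producer> --> x --> <consumer>
    # becomes
    #   <producer> --> x --> QuantizeLinear --> DequantizeLinear --> <consumer>
    # with the consumer re-wired to the DQ output. If "x" is a graph output, the producer output is renamed instead
    # and the final DQ output keeps the original name "x", so the graph interface is unchanged.
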
    def _add_qdq_ops_for_converted_activation(
        self,
        tensor_name,
        first_scale_name,
        first_zp_name,
        scale_data_type,
        convert_scale_name,
        convert_zp_name,
        convert_recv_nodes,
    ):
        """
        Adds Q and DQ ops to a tensor whose quantized data type is converted. That is, some consumers may use the
        original data type from the producer, while other consumers use the converted data type.
        This is generally done by adding a sequence of ops that convert from one data type (e.g., uint8) to another (e.g., uint16).

            T_float ---> Quant(to u8) ---> Convert(to u16) ---> Dequant(to float) ---> T_float'
            where Convert(to u16) is equivalent to: ---> Dequant(to float) ---> Quant(to u16) --->

        This function handles the following scenarios:

        1) Tensor T is not a graph output; all consumers use the converted type

            <Producer> ---> Q1 ---> DQ1 ---> Q2 ---> DQ2 ---> <Consumers>

        2) Tensor T is not a graph output; some consumers use the original type, others use the converted type

            <Producer> ---> Q1 -+-> DQ1 ---> <Consumers of original type>
                                |
                                +-> DQ1' ---> Q2 ---> DQ2 ---> <Consumers of converted type>

        3) Tensor T is a graph output; all consumers use the converted type

            <Producer> ---> Q1 ---> DQ1 ---> Q2 ---> DQ2 -+-> <Consumers>
                                                           |
                                                           +-> <Graph output>

        4) Tensor T is a graph output; some consumers use the original type, others use the converted type

            <Producer> ---> Q1 -+-> DQ1 -+-> <Consumers of original type>
                                |        |
                                |        +-> <Graph output>
                                |
                                +-> DQ1' ---> Q2 ---> DQ2 ---> <Consumers of converted type>

        5) Tensor T is a graph output that is not consumed by any other nodes.

            <Producer> ---> Q1 ---> DQ1 ---> Q2 ---> DQ2 ---> <Graph output>
        """
        tensor_recv_nodes = {node.name for node in self.tensor_to_its_receiving_nodes.get(tensor_name, [])}

        if (
            self.dedicated_qdq_pair
            and tensor_name in self.tensor_to_its_receiving_nodes
            and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1
        ):
            # TODO: Add support for dedicated_qdq_pair if/when needed.
            raise ValueError(
                "Do not currently support converted quant_types in TensorQuantOverrides when the `dedicated_qdq_pair` extra_option is enabled"
            )

        # Determine which nodes consume the original quantized type and which nodes
        # consume the converted quantized type.
        original_recv_nodes = tensor_recv_nodes
        if convert_recv_nodes is None:  # In this case, all consumers receive the converted type.
            convert_recv_nodes = tensor_recv_nodes
            original_recv_nodes = set()
        else:
            original_recv_nodes = original_recv_nodes - convert_recv_nodes

        all_use_converted = len(convert_recv_nodes) == len(tensor_recv_nodes)
        is_graph_output = self.model.is_graph_output(tensor_name)

        # Create first Q op.
        first_q_input = tensor_name
        if is_graph_output:
            first_q_input = add_quant_input_suffix(tensor_name)
            self.model.replace_output_of_all_nodes(tensor_name, first_q_input)

        first_q_output = add_quant_output_suffix(tensor_name)
        self._create_q_node(
            first_q_input, first_q_output, add_quant_suffix(tensor_name), first_scale_name, first_zp_name
        )

        # Create first DQ op.
        first_dq_output = add_dequant_output_suffix(tensor_name)
        if is_graph_output and not all_use_converted:
            first_dq_output = tensor_name
        if original_recv_nodes and first_dq_output != tensor_name:
            self.model.replace_input_of_nodes(tensor_name, first_dq_output, original_recv_nodes)

        self._create_dq_node(
            first_q_output, first_dq_output, add_dequant_suffix(tensor_name), first_scale_name, first_zp_name
        )

        # Create parallel clone of first DQ op if _not all_ consumers use the converted type.
        # --> DQ1' --> Q2 --> DQ2 --> <Consumers of converted type>
        #
        # This DQ clone would only have one consumer Q node (Q2) and could be potentially fused with
        # it by some EPs (e.g., QNN) without breaking other "node units".
        # Ex QNN fusion:
        # --> Convert (fused) --> DQ2 --> <Consumers of converted type>
        second_q_input = first_dq_output
        if not all_use_converted:
            second_q_input = add_quant_input_suffix(f"{tensor_name}_convert")
            self._create_dq_node(
                first_q_output,
                second_q_input,
                add_dequant_suffix(f"{tensor_name}_convert_clone"),
                first_scale_name,
                first_zp_name,
            )

        # Create second Q op.
        second_q_output = add_quant_output_suffix(f"{tensor_name}_convert")
        self._create_q_node(
            second_q_input,
            second_q_output,
            add_quant_suffix(f"{tensor_name}_convert"),
            convert_scale_name,
            convert_zp_name,
        )

        # Create second DQ op.
        second_dq_output = add_dequant_output_suffix(f"{tensor_name}_convert")
        if is_graph_output and all_use_converted:
            second_dq_output = tensor_name
        if convert_recv_nodes and second_dq_output != tensor_name:
            self.model.replace_input_of_nodes(tensor_name, second_dq_output, convert_recv_nodes)
        self._create_dq_node(
            second_q_output,
            second_dq_output,
            add_dequant_suffix(f"{tensor_name}_convert"),
            convert_scale_name,
            convert_zp_name,
        )

        # Store in quantized_value_map
        original_quantized_value = QuantizedValue(
            tensor_name,
            first_dq_output,
            first_scale_name,
            first_zp_name,
            QuantizedValueType.Input,
            scale_type=scale_data_type,
        )
        converted_quantized_value = QuantizedValue(
            tensor_name,
            second_dq_output,
            convert_scale_name,
            convert_zp_name,
            QuantizedValueType.Input,
            scale_type=scale_data_type,
        )
        self.quantized_value_map[tensor_name] = QDQTensorQuantizedValue(
            original_quantized_value, converted_quantized_value, convert_recv_nodes
        )

    def _quantize_normal_tensors(self):
        """
        Adds Q/DQ ops to tensors (activations and weights) that have been marked for quantization by op quantizers.
        """
        for tensor_name, tensor_info in self.tensors_to_quantize.copy().items():
            if tensor_name in self.quantized_value_map:
                continue

            if not tensor_info.is_shared:
                # Quantize the input
                initializer = find_by_name(tensor_name, self.model.initializer())
                if initializer:
                    self._add_qdq_pair_for_initializer(initializer, tensor_info.tensor_type, tensor_info.axis)
                else:
                    tensor_qparam_initializers = self._make_tensor_scale_zp_initializers(tensor_name)
                    if not tensor_qparam_initializers:
                        raise ValueError(
                            f"Quantization parameters are not specified for param {tensor_name}. "
                            "In static mode quantization params for inputs and outputs of nodes to be quantized are required."
                        )

                    if tensor_qparam_initializers.converted is None:
                        # Normal case: <producer> --> Q --> DQ --> <consumers>
                        self._add_qdq_pair_for_activation(
                            tensor_name,
                            tensor_qparam_initializers.original.scale.name,
                            tensor_qparam_initializers.original.zero_point.name,
                            data_type=tensor_info.data_type,
                        )
                    else:
                        # Conversion case: <producer> ---> Q1 -+-> DQ1 --> <consumers of original type>
                        #                                      |
                        #                                      +-> DQ1' --> Q2 --> DQ2 --> <consumers of converted type>
                        assert tensor_info.data_type == tensor_qparam_initializers.original.scale.data_type
                        self._add_qdq_ops_for_converted_activation(
                            tensor_name,
                            tensor_qparam_initializers.original.scale.name,
                            tensor_qparam_initializers.original.zero_point.name,
                            tensor_info.data_type,
                            tensor_qparam_initializers.converted.scale.name,
                            tensor_qparam_initializers.converted.zero_point.name,
                            tensor_qparam_initializers.converted_recv_nodes,
                        )

                del self.tensors_to_quantize[tensor_name]

    def _quantize_sharing_param_tensors(self):
        """
        Adds Q/DQ ops to tensors that have been marked for quantization by op quantizers.
        Only operates on tensors that want to use the quantization parameter initializers from an upstream tensor.
        For example, a Transpose node's output tensor will typically want to use the same quantization parameter
        initializers as the Transpose node's input.
        """
        while self.tensors_to_quantize:
            for tensor_name, tensor_info in self.tensors_to_quantize.copy().items():
                quant_provider = tensor_info.quant_para_provider
                if quant_provider and quant_provider.input_name in self.quantized_value_map:
                    del self.tensors_to_quantize[tensor_name]

                    quantized_value = self.quantized_value_map[quant_provider.input_name].get_for_consumer(
                        quant_provider.node_name
                    )
                    if self.is_input_a_initializer(tensor_name):
                        raise ValueError("Quantization parameter shared mode is not supported for weight yet")

                    # Need to check if this tensor's quant_type is converted for some consumers.
                    # If so, create new scale/zp initializers for these consumers.
                    converted_qparam_inits = None
                    converted_recv_nodes = None
                    if tensor_name in self.quantization_params:
                        tensor_params = self.quantization_params[tensor_name]
                        if tensor_params.converted:
                            converted_qparam_inits = self._make_scale_zp_initializers(
                                tensor_name, tensor_params.converted, "_convert"
                            )
                            converted_recv_nodes = tensor_params.converted_recv_nodes

                    if converted_qparam_inits is None:
                        # Normal case: <producer> --> Q_shared --> DQ_shared --> <consumers>
                        self._add_qdq_pair_for_activation(
                            tensor_name, quantized_value.scale_name, quantized_value.zp_name
                        )
                    else:
                        # Conversion case: <producer> ---> Q_shared -+-> DQ_shared --> <consumers of original type>
                        #                                            |
                        #                                            +-> DQ_shared' --> Q2 --> DQ2 --> <consumers of converted type>
                        self._add_qdq_ops_for_converted_activation(
                            tensor_name,
                            quantized_value.scale_name,
                            quantized_value.zp_name,
                            converted_qparam_inits.scale.data_type,
                            converted_qparam_inits.scale.name,
                            converted_qparam_inits.zero_point.name,
                            converted_recv_nodes,
                        )

    def _quantize_bias_tensors(self):
        """
        Adds DQ ops (or Cast) for bias tensors that have been marked for quantization by op quantizers.
        """
        for bias_name, bias_info in self.bias_to_quantize.items():
            if bias_name in self.quantized_value_map:
                continue
            # Quantize the input
            self.quantize_bias_static(bias_name, bias_info)
            init = find_by_name(bias_name, self.model.initializer())
            self.model.remove_initializer(init)
            quant_value = self.quantized_value_map[bias_name].original
            if quant_value.node_type == "Cast":
                # simple cast to float 16 and not DequantizeLinear
                # cublasLtMatmul only supports (b)float16, float bias.
                if not isinstance(init.data_type, int):
                    raise TypeError(f"Unexpected type {type(init.data_type)} for input={bias_info.input_name!r}")
                node_name = add_dequant_suffix(bias_name)
                dequant_node = onnx.helper.make_node(
                    "Cast",
                    [quant_value.q_name],
                    [bias_name],
                    name=node_name,
                    to=init.data_type,
                )
            elif quant_value.node_type in (None, "DequantizeLinear"):
                if quant_value.node_qtype in {
                    onnx.TensorProto.FLOAT16,
                    onnx.TensorProto.BFLOAT16,
                    onnx.TensorProto.FLOAT,
                }:
                    raise RuntimeError(f"Unexpected quantize type {quant_value.node_qtype} for DequantizeLinear.")
                inputs = [quant_value.q_name, quant_value.scale_name, quant_value.zp_name]
                node_name = add_dequant_suffix(bias_name)
                if quant_value.axis is not None:
                    dequant_node = onnx.helper.make_node(
                        "DequantizeLinear",
                        inputs,
                        [bias_name],
                        node_name,
                        axis=quant_value.axis,
                        domain=self.qdq_op_domain,
                    )
                else:
                    dequant_node = onnx.helper.make_node(
                        "DequantizeLinear",
                        inputs,
                        [bias_name],
                        node_name,
                        domain=self.qdq_op_domain,
                    )
            else:
                raise RuntimeError(f"Unexpected operator type {quant_value.node_type!r}.")
            self.model.add_node(dequant_node)

    def is_tensor_quantized(self, tensor_name: str):
        return tensor_name in self.tensors_to_quantize or tensor_name in self.bias_to_quantize

    def quantize_initializer(
        self,
        weight: onnx.TensorProto,
        qType: onnx.TensorProto.DataType,
        reduce_range: bool = False,
        keep_float_weight: bool = False,
    ) -> tuple[str, str, str]:
        """
        :param weight: TensorProto initializer
        :param qType: type to quantize to
        :param keep_float_weight: Whether to keep the float weight initializer. In some cases, we only want to
            quantize the scale and zero point. If keep_float_weight is False, the weight is quantized; otherwise
            only the scale and zero point are created and the float weight is kept.
        :return: quantized weight name, zero point name, scale name
        """
        # Find if this input is already quantized
        if weight.name in self.quantized_value_map:
            quantized_value = self.quantized_value_map[weight.name].original
            return (
                quantized_value.q_name,
                quantized_value.zp_name,
                quantized_value.scale_name,
            )

        q_weight_name, zp_name, scale_name = self.quantize_initializer_impl(
            weight, qType, reduce_range, keep_float_weight
        )

        # Log entry for this quantized weight
        quantized_value = QuantizedValue(
            weight.name,
            q_weight_name,
            scale_name,
            zp_name,
            QuantizedValueType.Initializer,
            None,
        )
        self.quantized_value_map[weight.name] = QDQTensorQuantizedValue(quantized_value, None, None)
        return q_weight_name, zp_name, scale_name

    def is_tensor_per_channel(
        self,
        tensor_name: str,
        default_axis: int,
        op_type: str | None = None,
    ) -> tuple[bool, int | None]:
        """
        Checks if a given tensor is configured to be quantized per-channel. If so, also returns the channel axis.

        ORT only supports per-channel quantization on static weights (i.e., ONNX initializers). If the user did not
        provide tensor quantization overrides for this tensor, then the value of self.per_channel determines if the
        weight is to be quantized per-channel.

        Params:
            tensor_name: The name of the tensor to check.
            default_axis: The default channel axis. This method checks if the normalized axis is within bounds.
                          Can be overridden via the extra_options 'QDQOpTypePerChannelSupportToAxis'
                          and 'TensorQuantOverrides'.
            op_type: Optional, defaults to None. The operator type that is the only consumer of this weight.
                     Used to access the extra option 'QDQOpTypePerChannelSupportToAxis'.
        Returns:
            A tuple (is_per_channel, axis) in which the first element indicates whether the tensor is
            quantized per-channel and the second element is the channel axis.
            The returned axis is only None if the tensor is not per-channel or the axis is out of bounds.
        """
        weight_initializer = self.initializers.get(tensor_name)
        if weight_initializer is None:
            return False, None  # Only support per-channel weights

        if self.tensor_quant_overrides.has_per_tensor_overrides(tensor_name):
            return False, None  # User provided per-tensor overrides for this initializer

        has_per_chan_overrides = self.tensor_quant_overrides.has_per_channel_overrides(tensor_name)
        if not self.per_channel and not has_per_chan_overrides:
            return False, None  # global self.per_channel is off and user did not provide per-channel overrides.

        axis = self.qdq_op_type_per_channel_support_to_axis.get(op_type, default_axis) if op_type else default_axis
        if has_per_chan_overrides:
            per_chan_overrides = self.tensor_quant_overrides.get_per_channel_overrides(tensor_name)
            axis = per_chan_overrides[0]["axis"]  # Prefer axis from user-specified tensor-level overrides if available

        weight_nparray = tensor_proto_to_array(weight_initializer)
        weight_rank = len(weight_nparray.shape)
        axis_valid, axis = normalize_axis(axis, weight_rank)
        if not axis_valid:
            logging.warning(f"Axis {axis} is out-of-range for weight '{tensor_name}' with rank {weight_rank}")
            return False, None

        return True, axis

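    # Example (illustrative) for is_tensor_per_channel above: a weight initializer of shape (64, 3, 3, 3) checked
    # with default_axis=0 and per_channel enabled returns (True, 0), i.e. one scale/zero-point per output channel.
    # A negative axis such as -1 would typically be normalized to 3 by normalize_axis before the bounds check.
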
    def quantize_weight_per_channel(
        self,
        weight_name: str,
        weight_qType: onnx.TensorProto.DataType,
        channel_axis: int,
        reduce_range: bool = True,
        keep_float_weight: bool = False,
    ) -> tuple[str, str, str]:
        # Find if this input is already quantized
        if weight_name in self.quantized_value_map:
            quantized_value = self.quantized_value_map[weight_name].original
            return (
                quantized_value.q_name,
                quantized_value.zp_name,
                quantized_value.scale_name,
            )

        q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel_impl(
            weight_name, weight_qType, channel_axis, reduce_range, keep_float_weight
        )
        quantized_value = QuantizedValue(
            weight_name,
            q_weight_name,
            scale_name,
            zp_name,
            QuantizedValueType.Initializer,
            None,
        )
        self.quantized_value_map[weight_name] = QDQTensorQuantizedValue(quantized_value, None, None)

        return q_weight_name, zp_name, scale_name

    def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str:
        """
        Quantizes the bias with zero_point == 0 and scale == input_scale * weight_scale.
        """

        # Handle case where bias already in quantization map
        if bias_name in self.quantized_value_map:
            return self.quantized_value_map[bias_name].original.q_name

        # get scale for weight
        weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name
        weight_initializer = find_by_name(weight_scale_name, self.model.initializer())
        weight_scale = tensor_proto_to_array(weight_initializer)

        # get scale for input
        input_scale_name = (
            self.quantized_value_map[bias_info.input_name].get_for_consumer(bias_info.node_name).scale_name
        )
        inputscale_initializer = find_by_name(input_scale_name, self.model.initializer())
        input_scale = tensor_proto_to_array(inputscale_initializer)

        (
            quantized_bias_name,
            quantized_bias_scale_name,
            quantized_bias_zp_name,
            bias_scale_data,
            node_type,
            node_qtype,
        ) = self.quantize_bias_static_impl(bias_name, input_scale, weight_scale, bias_info.beta)

        quantized_value = QuantizedValue(
            bias_name,
            quantized_bias_name,
            quantized_bias_scale_name,
            quantized_bias_zp_name,
            QuantizedValueType.Initializer,
            0 if bias_scale_data.size > 1 else None,
            node_type=node_type,
            node_qtype=node_qtype,
        )
        self.quantized_value_map[bias_name] = QDQTensorQuantizedValue(quantized_value, None, None)

        return quantized_bias_name

    def _make_scale_zp_initializers(
        self, param_name: str, params: QuantizationParams, init_name_suffix: str = ""
    ) -> QDQScaleZpInitializers:
        """
        Creates and returns scale and zero-point initializers for the given quantization params. The initializers are
        named:
            - {param_name}_zero_point{init_name_suffix}
            - {param_name}_scale{init_name_suffix}
        """
        zero_point_values = np.array([params["zero_point"]])
        if not hasattr(params["scale"], "dtype") or params["scale"].dtype not in (np.float32, np.float16):
            raise ValueError(f"Unexpected type {type(params['scale'])} and param_name={param_name!r}")
        scale_values = np.array([params["scale"]])
        assert scale_values.dtype != np.float64
        zero_point_type = params.data.get("quant_type", self.activation_qType)

        zero_point_shape = []
        zero_point_name = param_name + "_zero_point" + init_name_suffix
        scale_shape = []
        scale_name = param_name + "_scale" + init_name_suffix

        # Add initializers to model
        init_zp = onnx.helper.make_tensor(
            zero_point_name, zero_point_type, zero_point_shape, zero_point_values.ravel().tolist()
        )
        self.model.add_initializer(init_zp)

        if scale_values.dtype == np.float32:
            scale_type = onnx_proto.TensorProto.FLOAT
        elif scale_values.dtype == np.float16:
            scale_type = onnx_proto.TensorProto.FLOAT16
        else:
            raise ValueError(f"Unexpected dtype={scale_values.dtype} for param_name={param_name!r}")
        init_scale = onnx.helper.make_tensor(scale_name, scale_type, scale_shape, scale_values.reshape((-1,)).tolist())
        self.model.add_initializer(init_scale)

        return QDQScaleZpInitializers(init_scale, init_zp)

    def _make_tensor_scale_zp_initializers(self, tensor_name: str) -> QDQTensorScaleZpInitializers | None:
        """
        Creates and returns all scale/zero_point initializers for a given tensor. If the tensor is converted
        to a different quantization type, this function creates two pairs of zp/scale initializers. Otherwise,
        only one pair of zp/scale initializers is created.
        """
        if self.quantization_params is None or tensor_name not in self.quantization_params:
            logging.info(f'Quantization parameters for tensor:"{tensor_name}" not specified')
            return None

        tensor_params = self.quantization_params[tensor_name]
        if not isinstance(tensor_params, QDQTensorQuantParams):
            raise TypeError(f"Unexpected type {type(tensor_params)} for {tensor_name!r}.")

        original_inits = self._make_scale_zp_initializers(tensor_name, tensor_params.original)
        converted_inits = (
            self._make_scale_zp_initializers(tensor_name, tensor_params.converted, "_convert")
            if tensor_params.converted
            else None
        )

        return QDQTensorScaleZpInitializers(original_inits, converted_inits, tensor_params.converted_recv_nodes)

    def calc_quant_params(self, tensor_data: TensorData, quant_overrides: dict[str, Any]) -> QuantizationParams:
        """
        Calculates quantization parameters (scale/zero-point) given a tensor's min/max range and optional
        user-provided overrides.
        """
        quant_type = self.activation_qType
        if "quant_type" in quant_overrides:
            quant_type = quant_overrides["quant_type"].tensor_type

        if "scale" in quant_overrides and "zero_point" in quant_overrides:
            zero, scale = quant_overrides["zero_point"], quant_overrides["scale"]
        elif quant_type == onnx.TensorProto.FLOAT8E4M3FN:
            zero, scale = compute_scale_zp_float8(quant_type, tensor_data.avg_std[1])
        else:
            rmin = quant_overrides.get("rmin", tensor_data.range_value[0])
            rmax = quant_overrides.get("rmax", tensor_data.range_value[1])
            symmetric = quant_overrides.get("symmetric", self.is_activation_symmetric)
            reduce_range = quant_overrides.get("reduce_range", False)
            qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric)
            zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range)

        return QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type)

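    # For reference (illustrative; not necessarily the exact behavior of compute_scale_zp): affine quantization
    # parameters are commonly derived from the calibrated range as
    #   scale = (rmax - rmin) / (qmax - qmin)
    #   zero_point = round(qmin - rmin / scale)
    # with the symmetric variant widening the range to rmax = max(|rmin|, |rmax|), rmin = -rmax so that the
    # zero point lands at the center of [qmin, qmax].
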
    def calc_graph_quant_params(self) -> dict[str, QDQTensorQuantParams]:
        """
        Calculates quantization parameters (scale/zero-point) for all tensors in the graph using each tensor's
        min/max range and optional user-provided overrides.
        """
        if self.tensors_range is None:
            return {}

        self.adjust_tensor_ranges()

        quantization_params = {}
        for tensor_name in self.tensors_range:
            td = self.tensors_range[tensor_name]
            if not isinstance(td, TensorData):
                raise TypeError(f"Unexpected type {type(td)} for {tensor_name!r}.")

            quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(tensor_name, default_val={})
            original = self.calc_quant_params(td, quant_overrides)
            converted = None
            converted_recv_nodes = None

            if "convert" in quant_overrides:
                converted = self.calc_quant_params(td, quant_overrides["convert"])
                converted_recv_nodes = quant_overrides["convert"].get("recv_nodes")

            quantization_params[tensor_name] = QDQTensorQuantParams(original, converted, converted_recv_nodes)

        return quantization_params
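
    # Example (illustrative) of a per-tensor override consumed above; the exact override schema is defined by the
    # tensor_quant_overrides helper, so treat this as a sketch:
    #   {"quant_type": QuantType.QUInt8, "convert": {"quant_type": QuantType.QUInt16, "recv_nodes": {"node_a"}}}
    # With such an entry, only "node_a" would consume the converted (uint16) values; all other consumers keep the
    # original (uint8) quantization parameters.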