I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,78 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: CalTableFlatBuffers
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class KeyValue:
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0): # noqa: N802
n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
x = KeyValue()
x.Init(buf, n + offset)
return x
@classmethod
def GetRootAsKeyValue(cls, buf, offset=0): # noqa: N802
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# KeyValue
def Init(self, buf, pos): # noqa: N802
self._tab = flatbuffers.table.Table(buf, pos)
# KeyValue
def Key(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
return self._tab.String(o + self._tab.Pos)
return None
# KeyValue
def Value(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
if o != 0:
return self._tab.String(o + self._tab.Pos)
return None
def Start(builder): # noqa: N802
builder.StartObject(2)
def KeyValueStart(builder): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddKey(builder, key): # noqa: N802
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
def KeyValueAddKey(builder, key): # noqa: N802
"""This method is deprecated. Please switch to AddKey."""
return AddKey(builder, key)
def AddValue(builder, value): # noqa: N802
builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
def KeyValueAddValue(builder, value): # noqa: N802
"""This method is deprecated. Please switch to AddValue."""
return AddValue(builder, value)
def End(builder): # noqa: N802
return builder.EndObject()
def KeyValueEnd(builder): # noqa: N802
"""This method is deprecated. Please switch to End."""
return End(builder)
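
A minimal writer-side sketch (not part of the commit) of how the generated builder helpers above are typically driven with the standard flatbuffers Python API; the key and value strings are placeholders:

```
import flatbuffers

# Import path taken from the generated code's own namespace; the module-level
# builder helpers (Start/AddKey/AddValue/End) live next to the KeyValue class.
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import (
    AddKey,
    AddValue,
    End,
    KeyValue,
    Start,
)

builder = flatbuffers.Builder(0)
key_off = builder.CreateString("conv1_output")  # placeholder tensor name
val_off = builder.CreateString("0.0123")        # placeholder calibration value

Start(builder)
AddKey(builder, key_off)
AddValue(builder, val_off)
builder.Finish(End(builder))

buf = builder.Output()              # serialized bytes
kv = KeyValue.GetRootAs(buf, 0)
assert kv.Key() == b"conv1_output"  # accessors return raw bytes
```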


@@ -0,0 +1,90 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: CalTableFlatBuffers
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class TrtTable:
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0): # noqa: N802
n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
x = TrtTable()
x.Init(buf, n + offset)
return x
@classmethod
def GetRootAsTrtTable(cls, buf, offset=0): # noqa: N802
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# TrtTable
def Init(self, buf, pos): # noqa: N802
self._tab = flatbuffers.table.Table(buf, pos)
# TrtTable
def Dict(self, j): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
x = self._tab.Vector(o)
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
x = self._tab.Indirect(x)
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue
obj = KeyValue()
obj.Init(self._tab.Bytes, x)
return obj
return None
# TrtTable
def DictLength(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
return self._tab.VectorLen(o)
return 0
# TrtTable
def DictIsNone(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
return o == 0
def Start(builder): # noqa: N802
builder.StartObject(1)
def TrtTableStart(builder): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddDict(builder, dict): # noqa: N802
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
def TrtTableAddDict(builder, dict): # noqa: N802
"""This method is deprecated. Please switch to AddDict."""
return AddDict(builder, dict)
def StartDictVector(builder, numElems): # noqa: N802
return builder.StartVector(4, numElems, 4)
def TrtTableStartDictVector(builder, numElems): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return StartDictVector(builder, numElems)
def End(builder): # noqa: N802
return builder.EndObject()
def TrtTableEnd(builder): # noqa: N802
"""This method is deprecated. Please switch to End."""
return End(builder)
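
A matching reader-side sketch (also not part of the commit) that walks a serialized table with the accessors above; `buf` is assumed to hold the bytes of a TrtTable written elsewhere (e.g., by write_calibration_table):

```
from onnxruntime.quantization.CalTableFlatBuffers.TrtTable import TrtTable


def decode_calibration_table(buf: bytes) -> dict[str, str]:
    """Decode a serialized TrtTable into a plain dict of tensor name -> value string."""
    table = TrtTable.GetRootAs(buf, 0)
    result = {}
    for i in range(table.DictLength()):
        kv = table.Dict(i)  # KeyValue entry at index i
        result[kv.Key().decode()] = kv.Value().decode()
    return result
```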


@@ -0,0 +1,16 @@
from .calibrate import ( # noqa: F401
CalibraterBase,
CalibrationDataReader,
CalibrationMethod,
MinMaxCalibrater,
create_calibrator,
)
from .qdq_quantizer import QDQQuantizer # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401
from .quantize import DynamicQuantConfig # noqa: F401
from .quantize import QuantizationMode # noqa: F401
from .quantize import StaticQuantConfig # noqa: F401
from .quantize import quantize # noqa: F401
from .quantize import quantize_dynamic # noqa: F401
from .quantize import quantize_static # noqa: F401
from .shape_inference import quant_pre_process # noqa: F401
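
For orientation, a minimal static-quantization sketch using only the names re-exported above; the model paths, input name, and shape are placeholders, and the random reader stands in for a real calibration data set:

```
import numpy as np

from onnxruntime.quantization import (
    CalibrationDataReader,
    QuantFormat,
    QuantType,
    quantize_static,
)


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random batches; a real reader yields representative calibration data."""

    def __init__(self, input_name: str, shape: tuple, num_batches: int = 8):
        self._batches = iter(
            [{input_name: np.random.rand(*shape).astype(np.float32)} for _ in range(num_batches)]
        )

    def get_next(self):
        return next(self._batches, None)  # None signals the end of the calibration data


quantize_static(
    "model_fp32.onnx",  # placeholder input path
    "model_int8.onnx",  # placeholder output path
    RandomDataReader("input", (1, 3, 224, 224)),  # placeholder input name/shape
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)
```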


@@ -0,0 +1,536 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from typing import Any, Dict
import numpy as np
import onnx
import onnx.numpy_helper
try:
from onnx.reference.op_run import to_array_extended
except ImportError:
# old version of onnx.
to_array_extended = None
from .calibrate import TensorData
from .onnx_model import ONNXModel
from .quant_utils import (
ONNX_TYPE_TO_NP_TYPE,
TENSOR_NAME_QUANT_SUFFIX,
QuantType,
find_by_name,
model_has_infer_metadata,
normalize_axis,
pack_bytes_to_4bit,
quantize_data,
quantize_nparray,
save_and_reload_model_with_shape_infer,
tensor_proto_to_array,
)
from .tensor_quant_overrides import TensorQuantOverridesHelper
class QuantizationParams:
def __init__(self, **data: Dict[str, Any]):
self.data = {}
for k, v in data.items():
if not isinstance(k, str):
raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
if not isinstance(v, (int, str, np.ndarray)):
raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
if k == "scale" and v.dtype not in (np.float32, np.float16):
raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
self.data[k] = v
def __iter__(self):
yield from self.data
def __getitem__(self, key):
return self.data[key]
def __len__(self):
return len(self.data)
class BaseQuantizer:
def __init__(
self,
model,
per_channel,
reduce_range,
weight_qType,
activation_qType,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options=None,
):
if not model_has_infer_metadata(model):
model = save_and_reload_model_with_shape_infer(model)
self.value_infos = {vi.name: vi for vi in model.graph.value_info}
self.value_infos.update({ot.name: ot for ot in model.graph.output})
self.value_infos.update({it.name: it for it in model.graph.input})
self.model = ONNXModel(model)
self.per_channel = per_channel # weight-pack per channel
self.reduce_range = reduce_range
self.extra_options = extra_options if extra_options else {}
self.enable_subgraph_quantization = (
"EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
)
self.parent = None
self.force_quantize_no_input_check = (
"ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
)
self.is_weight_symmetric = self.extra_options.get(
"WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN)
)
self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
self.min_real_range = self.extra_options.get("MinimumRealRange")
self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)
"""
Dictionary specifying the min and max values for tensors. It has the following format:
{
"param_name": [min, max]
}
example:
{
'Conv_3:0': [np.float32(0), np.float32(0.5)],
'Conv_4:0': [np.float32(1), np.float32(3.5)]
}
"""
if tensors_range is not None and any(map(lambda t: not isinstance(t, TensorData), tensors_range.values())):
raise TypeError(
f"tensors_range contains unexpected types {set(type(v) for v in tensors_range.values())}, not TensorData."
)
self.tensors_range = tensors_range
self.nodes_to_quantize = nodes_to_quantize # specific nodes to quantize
self.nodes_to_exclude = nodes_to_exclude # specific nodes to exclude
self.op_types_to_quantize = op_types_to_quantize
self.opset_version = self.check_opset_version()
# Get tensor-level quantization overrides and ensure they are valid.
self.tensor_quant_overrides = TensorQuantOverridesHelper(self.extra_options.get("TensorQuantOverrides", {}))
self.initializers = {initzer.name: initzer for initzer in self.model.initializer()}
overrides_valid, overrides_err = self.tensor_quant_overrides.is_valid(
self.initializers, self.value_infos.keys(), activation_qType
)
if not overrides_valid:
raise ValueError(overrides_err)
self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()
def quantize_model(self):
raise NotImplementedError
def is_input_a_initializer(self, input_name):
initializer = find_by_name(input_name, self.model.initializer())
return initializer is not None
def is_per_channel(self):
return self.per_channel
def is_valid_quantize_weight(self, weight_name):
weight = find_by_name(weight_name, self.model.initializer())
if weight is not None:
return weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16)
if (not self.enable_subgraph_quantization) or (self.parent is None):
return False
return self.parent.is_valid_quantize_weight(weight_name)
def should_quantize_node(self, node):
if (
self.nodes_to_quantize is not None
and len(self.nodes_to_quantize) != 0
and node.name not in self.nodes_to_quantize
):
return False
if node.op_type not in self.op_types_to_quantize:
return False
if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
return False
return True
def check_opset_version(self):
ai_onnx_domain = [
opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx"
]
if len(ai_onnx_domain) != 1:
raise ValueError("Failed to find proper ai.onnx domain")
opset_version = ai_onnx_domain[0].version
if opset_version == 10:
logging.warning(
f"The original model opset version is {opset_version}, which does not support node fusions. Please update the model to opset >= 11 for better performance."
)
return 10
if opset_version < 10:
logging.warning(
f"The original model opset version is {opset_version}, which does not support quantization. Please update the model to opset >= 11. Updating the model automatically to opset 11. Please verify the quantized model."
)
self.model.model.opset_import.remove(ai_onnx_domain[0])
self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)])
opset_version = 11
if opset_version < 19 and self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
logging.warning(
f"The original model opset version is {opset_version}, which does not support quantization to float 8. "
"Please update the model to opset >= 19. Updating the model automatically to opset 19. "
"Please verify the quantized model."
)
self.model.model.opset_import.remove(ai_onnx_domain[0])
self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 19)])
self.model.model.ir_version = 9
opset_version = 19
return opset_version
def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1.0):
"""
Quantizes the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
"""
# get bias
bias_initializer = find_by_name(bias_name, self.model.initializer())
bias_data = tensor_proto_to_array(bias_initializer)
quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX
# quantize bias
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
data = np.asarray(bias_data)
if data.dtype == np.float16:
node_qtype = onnx.TensorProto.FLOAT16
elif data.dtype == np.float32:
node_qtype = onnx.TensorProto.FLOAT
else:
raise TypeError(f"Only float16 or float32 are supported with float 8 but bias dtype is {data.dtype}.")
quantized_data = data.astype(np.float32)
bias_scale = np.array([1], dtype=quantized_data.dtype)
bias_scale_data = bias_scale.reshape(-1)
packed_bias_initializer = onnx.numpy_helper.from_array(quantized_data, quantized_bias_name)
self.model.initializer_extend([packed_bias_initializer])
node_type = "Cast"
else:
# calculate scale for bias
# TODO: This formula should be explained including why the scale is not estimated for the bias as well.
bias_scale = input_scale * weight_scale * beta
quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
# update bias initializer
bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
self.model.initializer_extend([packed_bias_initializer])
# Bias's scale dtype should match the original bias data's unquantized type (float32 or float16).
bias_scale_data = np.asarray(bias_scale, dtype=bias_data.dtype).reshape(-1)
node_type = "DequantizeLinear"
node_qtype = self.weight_qType
# update scale initializer
quantized_bias_scale_name = quantized_bias_name + "_scale"
packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
self.model.initializer_extend([packed_bias_scale_initializer])
# update zero initializer
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
tensor_type = self.weight_qType
else:
tensor_type = onnx.TensorProto.INT32
quantized_bias_zp_name = quantized_bias_name + "_zero_point"
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, self.weight_qType, [1], [0.0])
elif bias_scale.size > 1:
bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
else:
packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, tensor_type, [], [0])
self.model.initializer_extend([packed_bias_zp_initializer])
return (
quantized_bias_name,
quantized_bias_scale_name,
quantized_bias_zp_name,
bias_scale_data,
node_type,
node_qtype,
)
def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_float_weight=False):
"""
:param weight: TensorProto initializer
:param qType: type to quantize to
:param keep_float_weight: Whether to keep the original float weight. In some cases we only want to quantize the
scale and zero point; if keep_float_weight is False, the weight itself is quantized as well.
:return: quantized weight name, zero point name, scale name
"""
q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
zp_name = weight.name + "_zero_point"
scale_name = weight.name + "_scale"
# Quantize weight data. Use quantization overrides if provided by the user.
weight_data = tensor_proto_to_array(weight)
quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(weight.name, default_val={})
if "quant_type" in quant_overrides:
qType = quant_overrides["quant_type"].tensor_type # noqa: N806
if "scale" in quant_overrides and "zero_point" in quant_overrides:
zero_point = np.array(quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[qType])
scale = np.array(quant_overrides["scale"])
q_weight_data = quantize_nparray(qType, weight_data.flatten(), scale, zero_point)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
else:
_, _, zero_point, scale, q_weight_data = quantize_data(
weight_data.flatten(),
qType,
quant_overrides.get("symmetric", self.is_weight_symmetric),
reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
min_real_range=self.min_real_range,
rmin_override=quant_overrides.get("rmin"),
rmax_override=quant_overrides.get("rmax"),
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
scale_dtype = weight.data_type
scale_initializer = onnx.helper.make_tensor(scale_name, scale_dtype, [], scale.reshape((-1,)).tolist())
zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], zero_point.reshape((-1,)).tolist())
self.model.initializer_extend([scale_initializer, zero_initializer])
if not keep_float_weight:
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
q_weight_initializer = onnx.TensorProto()
q_weight_initializer.data_type = self.weight_qType
q_weight_initializer.dims.extend(weight.dims)
q_weight_initializer.name = q_weight_name
# Do not remove .flatten().copy(); numpy is not clear about data persistence.
q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes()
if to_array_extended is not None:
# This test should not be needed but it helped catch some issues
# with data persistence and tobytes.
check = to_array_extended(q_weight_initializer)
if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes():
raise RuntimeError(
f"The initializer of shape {weight_data.shape} could not be created, expecting "
f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}"
f"\nraw={str(q_weight_initializer)[:200]}."
)
elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
if q_weight_data.dtype not in (np.int8, np.uint8):
raise RuntimeError(
f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
)
# We do not use onnx.helper.pack_float32_to_4bit() due to performance.
# This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))
# We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
else:
q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
weight.dims
)
q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
self.model.initializer_extend([q_weight_initializer])
return q_weight_name, zp_name, scale_name
def quantize_weight_per_channel_impl(
self,
weight_name,
weight_qType,
channel_axis,
reduce_range=True,
keep_float_weight=False,
):
initializer = find_by_name(weight_name, self.model.initializer())
if initializer is None:
raise ValueError("{} is not an initializer", weight_name)
weights = tensor_proto_to_array(initializer)
weights_rank = len(weights.shape)
is_axis_valid, axis_norm = normalize_axis(channel_axis, weights_rank)
if not is_axis_valid:
raise ValueError(
f"Weight {weight_name} has a per-channel axis with value {channel_axis} that is "
f"out-of-bounds for rank {weights_rank}"
)
channel_axis = axis_norm
channel_count = weights.shape[channel_axis]
quant_overrides_for_channels = self.tensor_quant_overrides.get_per_channel_overrides(
weight_name, default_val=[{"axis": channel_axis}]
)
num_channel_overrides = len(quant_overrides_for_channels)
if num_channel_overrides != 1 and num_channel_overrides != channel_count:
raise ValueError(
f"Per-channel tensor quantization overrides for {weight_name} must have "
f"either 1 or {channel_count} elements in the list of dictionaries."
)
is_axis_override_valid, axis_override = normalize_axis(quant_overrides_for_channels[0]["axis"], weights_rank)
if not is_axis_override_valid or axis_override != channel_axis:
raise ValueError(
f"Tensor quantization overrides for {weight_name} specify an unexpected axis. "
f"Expected {channel_axis}, but got {quant_overrides_for_channels[0]['axis']}."
)
# If user provides per-channel quantization overrides, all channels must use the same quant_type,
# axis, symmetric, and reduce_range values. So, just use the first channel's values.
if "quant_type" in quant_overrides_for_channels[0]:
weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type # noqa: N806
symmetric = quant_overrides_for_channels[0].get(
"symmetric",
(
self.is_weight_symmetric
or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
),
)
reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
zero_point_list = []
scale_list = []
quantized_per_channel_data_list = []
for i in range(channel_count):
per_channel_data = weights.take(i, channel_axis)
channel_override_index = i if i < num_channel_overrides else 0
channel_quant_overrides = quant_overrides_for_channels[channel_override_index]
if "scale" in channel_quant_overrides and "zero_point" in channel_quant_overrides:
zero_point = np.array(channel_quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[weight_qType])
scale = np.array(channel_quant_overrides["scale"])
quantized_per_channel_data = quantize_nparray(
weight_qType, per_channel_data.flatten(), scale, zero_point
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
assert isinstance(
quantized_per_channel_data, np.ndarray
), f"Unexpected type {type(quantized_per_channel_data)}"
else:
_, _, zero_point, scale, quantized_per_channel_data = quantize_data(
per_channel_data.flatten(),
weight_qType,
symmetric,
reduce_range=reduce_range,
min_real_range=self.min_real_range,
rmin_override=channel_quant_overrides.get("rmin"),
rmax_override=channel_quant_overrides.get("rmax"),
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
assert isinstance(
quantized_per_channel_data, np.ndarray
), f"Unexpected type {type(quantized_per_channel_data)}"
zero_point_list.append(zero_point)
scale_list.append(scale)
quantized_per_channel_data_list.append(quantized_per_channel_data)
# combine per_channel_data into one
weights_shape = list(weights.shape)
reshape_dims = list(weights_shape) # deep copy
reshape_dims[channel_axis] = 1 # only one per channel for reshape
quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
for i in range(1, len(quantized_per_channel_data_list)):
channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)
q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
zp_name = weight_name + "_zero_point"
scale_name = weight_name + "_scale"
# Update packed weight, zero point, and scale initializers
zero_scale_shape = [initializer.dims[channel_axis]]
scale_initializer = onnx.helper.make_tensor(
scale_name, initializer.data_type, zero_scale_shape, np.hstack(scale_list).tolist()
)
zero_initializer = onnx.helper.make_tensor(
zp_name, weight_qType, zero_scale_shape, np.hstack(zero_point_list).tolist()
)
self.model.initializer_extend([scale_initializer, zero_initializer])
if not keep_float_weight:
if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
if quantized_weights.dtype not in (np.int8, np.uint8):
raise RuntimeError(
f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
)
# We do not use onnx.helper.pack_float32_to_4bit() due to performance.
# This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))
# We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
q_weight_initializer = onnx.helper.make_tensor(
q_weight_name, weight_qType, weights_shape, packed_data, raw=True
)
self.model.initializer_extend([q_weight_initializer])
else:
quantized_weights = np.asarray(
quantized_weights,
dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
).reshape(initializer.dims)
q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
self.model.initializer_extend([q_weight_initializer])
return q_weight_name, zp_name, scale_name
def adjust_tensor_ranges(self):
if self.tensors_range is None:
return
for node in self.model.nodes():
# adjust tensor_ranges for input of Clip and Relu node
if node.op_type in ["Clip", "Relu"]:
if self.is_activation_symmetric:
continue
if not self.should_quantize_node(node):
continue
if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
continue
if node.input[0] not in self.tensors_range or node.output[0] not in self.tensors_range:
continue
td = self.tensors_range[node.output[0]]
if not isinstance(td, TensorData):
raise TypeError(f"Unexpected type {type(td)} for {node.output[0]!r}.")
self.tensors_range[node.input[0]] = td
# Adjust Softmax to range from 0.0 to 1.0
elif node.op_type == "Softmax":
self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
from .preprocess import qnn_preprocess_model # noqa: F401
from .quant_config import get_qnn_qdq_config # noqa: F401


@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ...fusions import Fusion
from ...onnx_model import ONNXModel
class FusionLpNormalization(Fusion):
def __init__(self, model: ONNXModel, epsilon: float = 1e-12):
super().__init__(model, "LpNormalization", "ReduceL2")
self.epsilon = epsilon
def fuse(
self,
reduce_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceL2 node into a single
LpNormalization node.
Pattern 1:
[root] --> ReduceL2 -----> Clip --> Expand ----> Div -->
| (axis=-1) (min=epsilon) (shape=root) ^
| (keepdims=True) |
| |
+-----------------------------------------------+
Notes:
- ReduceL2 must use the last axis, and keepdims == True
- Clip must only have a min attribute that is ~1e-12
- Expand must restore the shape to root.shape
- The output of Expand must be the second input to Div.
"""
if reduce_node.output[0] not in input_name_to_nodes:
return
# ReduceL2 must have one Clip child
children = input_name_to_nodes[reduce_node.output[0]]
if len(children) != 1 or children[0].op_type != "Clip":
return
# ReduceL2 must have keepdims == True
keepdims = self.get_node_attribute(reduce_node, "keepdims")
if not keepdims:
return
# ReduceL2 axes must refer only to the last dimension.
# Axes became an input in opset 18. Before then, axes was an attribute
reduce_input_ttype = self.model.get_tensor_type(reduce_node.input[0])
if not reduce_input_ttype:
return
reduce_input_shape = self.tensor_shape_to_list(reduce_input_ttype)
if not reduce_input_shape:
return
axes = self.get_node_attribute(reduce_node, "axes")
if not axes and len(reduce_node.input) > 1:
axes = self.model.get_constant_value(reduce_node.input[1])
if not axes or len(axes) != 1:
return
last_dim = len(reduce_input_shape) - 1
if axes[0] != -1 and axes[0] != last_dim:
return
# Clip node must have a min attribute approximately equal to 1e-12
clip_node = children[0]
clip_min = self.get_node_attribute(clip_node, "min")
if clip_min is None and len(clip_node.input) > 1:
clip_min = self.model.get_constant_value(clip_node.input[1])
clip_max = self.get_node_attribute(clip_node, "max") # TODO: clip_max could be FLOAT_MAX
if clip_max is None and len(clip_node.input) > 2:
clip_max = self.model.get_constant_value(clip_node.input[2])
if not (clip_max is None and clip_min is not None and clip_min > 0 and abs(clip_min - self.epsilon) < 1e-13):
return
if clip_node.output[0] not in input_name_to_nodes:
return
# Clip must have a single Expand child.
children = input_name_to_nodes[clip_node.output[0]]
if len(children) != 1 or children[0].op_type != "Expand":
return
expand_node = children[0]
if expand_node.output[0] not in input_name_to_nodes:
return
# Expand must have a single Div child
children = input_name_to_nodes[expand_node.output[0]]
if len(children) != 1 or children[0].op_type != "Div":
return
div_node = children[0]
# The first input to Div must be the root of the subgraph (i.e., reduce_node.input[0])
# The second input to Div must be the output of the Expand.
# As long as these two inputs go to the same Div node, then ONNX validation will ensure that
# their shapes match.
if div_node.input[0] != reduce_node.input[0]:
return
if div_node.input[1] != expand_node.output[0]:
return
subgraph_input = reduce_node.input[0]
subgraph_output = div_node.output[0]
subgraph_nodes = [reduce_node, clip_node, expand_node, div_node]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
self.fused_op_type,
name=self.create_unique_node_name(),
inputs=[subgraph_input],
outputs=[subgraph_output],
p=2,
axis=-1,
)
self.nodes_to_add.append(fused_node)
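
A short sketch of how this fusion is driven on its own (mirroring its use in qnn_preprocess_model later in this commit); the model path is a placeholder and the import paths assume this commit's package layout:

```
import onnx

from onnxruntime.quantization.onnx_model import ONNXModel
# Assumed module path for the class defined above.
from onnxruntime.quantization.execution_providers.qnn.fusion_lpnorm import FusionLpNormalization

model = onnx.load("model.onnx")  # placeholder path
onnx_model = ONNXModel(model)

fusion = FusionLpNormalization(onnx_model)
if fusion.apply():  # True when a ReduceL2 sequence was replaced by LpNormalization
    onnx_model.topological_sort()
    onnx.save(onnx_model.model, "model_fused.onnx")
```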


@@ -0,0 +1,413 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from dataclasses import dataclass
import onnx
from ...quant_utils import QuantType
from ...tensor_quant_overrides import QuantTypeInfo, TensorQuantOverridesHelper
@dataclass
class TensorTypeRequest:
"""
Bundles desired quantization type requests for a tensor. A distinction is made between the
produced type and the consumed type.
"""
# The tensor's quant type at the producer end. If None, assumed to be the default activation quant type.
producer: QuantTypeInfo | None
# The tensor's quant type received by a set of consumer nodes.
# If None, assumed to be the default activation quant type for all consumers.
# consumers[1] is a set of consumer node names.
consumers: tuple[QuantTypeInfo, set[str]] | None
class MixedPrecisionTensorQuantOverridesFixer:
"""
Helper that generates tensor quantization overrides for mixed-precision QDQ models.
Specifically, this helper fixes an initial set of quantization overrides that assign a non-default
activation quantization type to one or more tensors by doing the following:
- Inferring which other tensors need to be overridden to the non-default activation quantization type.
- Inserting quantization data type conversions.
Example:
--------
Float model:
input_0 --> Op1 --> Op3 --> Op5 --> Op6 --> output_0
^
|
input_1 --> Op2 -+-> Op4 ----+
|
+-> Op7 --> output_1
|
+-> Op8 --> output_2
If we'd like to quantize this model to uint8 precision, but would like to make sure tensor "Op4_out"
is quantized to 16-bit, then we would specify the following initial tensor quantization overrides:
```
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
```
These initial overrides may not create a valid model because Op4 and Op5 may require both the input and output
to be the same type (e.g., uint16). This helper fixes the overrides so that input/output data types
are valid:
```
overrides = TensorQuantOverridesHelper(init_overrides)
fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
fixer.apply(
default_activation_qtype=QuantType.QUInt8,
default_activation_symmetric=False,
)
```
The above snippet generates the following "fixed" overrides (get via overrides.get_dict()):
{
"Op2_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op4"}}}],
"Op3_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op5"}}}],
"Op4_out": [{"quant_type": QUInt16}],
"Op5_out": [{"quant_type": QUInt16, "convert": {"quant_type": QUInt8, "recv_nodes": {"Op6"}}}]
}
How to interpret the fixed overrides:
- Op2's output is consumed by Op4, Op7, and Op8. Op4 consumes the converted u16 type,
but Op7 and Op8 consume the original u8 type.
- Op3's output is converted from u8 to u16. Op5 consumes the converted u16 type.
- Op4's output is just u16 (not converted). All consumers of Op4_out get the u16 type.
- Op5's output is converted from u16 to u8. Op6 consumes the u8 type.
"""
def __init__(
self,
overrides: TensorQuantOverridesHelper,
producers: dict[str, onnx.NodeProto],
consumers: dict[str, list[onnx.NodeProto]],
value_infos: dict[str, onnx.ValueInfoProto],
initializers: dict[str, onnx.TensorProto],
):
"""
Params:
overrides: The initial tensor quantization overrides to fix.
producers: Dictionary that maps a tensor name to the producer node that generates the tensor.
consumers: Dictionary that maps a tensor name to the consumer nodes that take the tensor as input.
value_infos: Dictionary that maps a tensor name to its onnx.ValueInfoProto.
initializers: Dictionary that maps an initializer name to its onnx.TensorProto.
"""
self.overrides = overrides
self.consumers = consumers
self.producers = producers
self.value_infos = value_infos
self.initializers = initializers
@staticmethod
def create_from_model(
overrides: TensorQuantOverridesHelper, model: onnx.ModelProto, default_activation_qtype: QuantType
) -> MixedPrecisionTensorQuantOverridesFixer:
"""
Helper function that creates an instance of this class from a loaded ONNX model.
Params:
overrides: The initial tensor quantization overrides to fix.
model: Loaded ONNX model
default_activation_qtype: The intended default activation quantization type.
Used to validate the initial overrides.
Returns:
Initialized MixedPrecisionTensorQuantOverridesFixer object
"""
model = onnx.shape_inference.infer_shapes(model) # Need to infer shapes to get value_infos
# Build dictionaries that enable convenient lookups of initializers and value_infos by name.
initializers = {initializer.name: initializer for initializer in model.graph.initializer}
value_infos = {vi.name: vi for vi in model.graph.value_info}
value_infos.update({ot.name: ot for ot in model.graph.output})
value_infos.update({it.name: it for it in model.graph.input})
# Ensure that the user-provided initial overrides are actually valid.
valid, err = overrides.is_valid(initializers, set(value_infos), default_activation_qtype)
if not valid:
pprint_overrides = overrides.pprint_str(indent=4)
logging.error(f"Provided invalid tensor quantization overrides:\n{pprint_overrides}")
raise ValueError(err)
consumers = {}
producers = {}
# Build dictionaries that map a tensor name to the consumer or producer nodes.
for node in model.graph.node:
for input_name in node.input:
if input_name:
if input_name not in consumers:
consumers[input_name] = []
consumers[input_name].append(node)
for output_name in node.output:
producers[output_name] = node
return MixedPrecisionTensorQuantOverridesFixer(overrides, producers, consumers, value_infos, initializers)
def apply(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
):
"""
Fixes the initial tensor quantization overrides (in-place) for use in mixed-precision QDQ models.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
"""
type_requests = self.get_desired_tensor_types(default_activation_qtype, default_activation_symmetric)
# Use type requests to "fix" tensor quantization overrides by adding
# quantization type conversions where necessary.
for tensor_name, type_req in type_requests.items():
all_consumers = set([node.name for node in self.consumers.get(tensor_name, [])])
has_producer_req = type_req.producer is not None
has_consumer_req = bool(type_req.consumers)
# Only producer type: Add conversion back to default activation type
if has_producer_req and not has_consumer_req:
self._update_converted_tensor(
tensor_name, type_req.producer, QuantTypeInfo(default_activation_qtype), all_consumers
)
# Only consumers
elif not has_producer_req and has_consumer_req:
prod_type_info = self.overrides.get_node_output_qtype_info(tensor_name, default_activation_qtype)
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
if not self._check_nodes_are_not_convert_consumers(tensor_name, type_req.consumers[1]):
raise ValueError(
f"Tensor override for '{tensor_name}' converts the type for consumers that need the original type."
)
# Both producer and consumers
elif has_producer_req and has_consumer_req:
prod_type_info = type_req.producer
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
consumers_for_original_type = all_consumers.difference(type_req.consumers[1])
if len(consumers_for_original_type) == 0:
# All consumers want the overridden type, so no need for convert nodes!
# Just add the override if not already present.
if tensor_name not in self.overrides:
self.overrides[tensor_name] = [{}]
prod_type_info.save_to_dict(self.overrides[tensor_name][0])
assert "convert" not in self.overrides[tensor_name][0]
else:
# Some consumers don't want the overridden type.
self._update_converted_tensor(
tensor_name,
prod_type_info,
QuantTypeInfo(default_activation_qtype),
consumers_for_original_type,
)
else:
raise ValueError(f"TypeRequest for tensor {tensor_name} has no producer or consumers.")
# Done. Check if the overrides are valid.
valid, err = self.overrides.is_valid(self.initializers, set(self.value_infos), default_activation_qtype)
if not valid:
pprint_overrides = self.overrides.pprint_str(indent=4)
logging.error(
f"Generated invalid tensor quantization overrides for mixed-precision QDQ model:\n{pprint_overrides}"
)
raise ValueError(err)
def get_desired_tensor_types(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
) -> dict[str, TensorTypeRequest]:
"""
Iterates through the initial tensor quantization overrides and builds a set of TensorTypeRequest objects
that describe the quantization types required at each tensor. These TensorTypeRequest objects are ultimately
used to generate the "fixed" overrides.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
Returns:
TensorTypeRequest objects as a dict that maps a tensor name to its requested types.
"""
type_requests = {}
default_activation_type_info = QuantTypeInfo(default_activation_qtype, default_activation_symmetric)
# Scan tensor overrides for type conversion requests.
for tensor_name, override_list in self.overrides.items():
if not self.__is_tensor_quantizable(tensor_name):
continue # Skip non-quantizable tensors (e.g., not a float)
if tensor_name in self.initializers:
continue # Skip initializers
if not override_list or len(override_list) > 1:
continue # Skip per-channel stuff
override_dict = override_list[0]
quant_type_info = QuantTypeInfo.load_from_dict(override_dict, default_activation_type_info.quant_type)
producer_node = self.producers.get(tensor_name) # None if this is a model input
if quant_type_info != default_activation_type_info and "convert" not in override_dict:
if producer_node is not None:
self._add_type_requests_for_node(type_requests, quant_type_info, producer_node)
# Find all consumer nodes of `tensor_name` and update their inputs/outputs to the new type.
for consumer_node in self.consumers.get(tensor_name, []):
self._add_type_requests_for_node(type_requests, quant_type_info, consumer_node)
return type_requests
def _add_type_requests_for_node(
self,
type_requests: dict[str, TensorTypeRequest],
quant_type_info: QuantTypeInfo,
node: onnx.NodeProto,
):
"""
Adds TensorTypeRequest objects for a given node, assuming that we want all its inputs and outputs
to have the same quantization type (as specified by the `quant_type_info` parameter).
Params:
type_requests: Dictionary of type requests to append to for this node.
quant_type_info: The quantization type to use for inputs and outputs.
node: The node for which the TensorTypeRequest objects are created and added to type_requests.
"""
# Add output side
for output_name in node.output:
if not self.__is_tensor_quantizable(output_name):
continue
if output_name not in type_requests:
type_requests[output_name] = TensorTypeRequest(quant_type_info, None)
else:
if (
type_requests[output_name].producer is not None
and type_requests[output_name].producer != quant_type_info
):
raise ValueError(f"Tensor {output_name} has multiple types.")
type_requests[output_name].producer = quant_type_info
# Add the consumer side
for input_name in node.input:
if input_name and input_name not in self.initializers and self.__is_tensor_quantizable(input_name):
if input_name not in type_requests:
type_requests[input_name] = TensorTypeRequest(None, None)
if type_requests[input_name].consumers is None:
type_requests[input_name].consumers = (quant_type_info, set())
if type_requests[input_name].consumers[0] != quant_type_info:
raise ValueError(f"Tensor {input_name} has consumers requesting different types.")
if not node.name:
raise ValueError(
f"Node of type {node.op_type} with output 0 {node.output[0]} does not have a name!"
)
type_requests[input_name].consumers[1].add(node.name)
def _update_converted_tensor(
self,
tensor_name: str,
producer_type_info: QuantTypeInfo,
consumer_type_info: QuantTypeInfo,
consumer_names: set[str],
):
"""
Updates the tensor quantization overrides for a tensor that is converted from one type to another.
Params:
tensor_name: The name of the tensor for which to update overrides.
producer_type_info: Info for the tensor's produced type.
consumer_type_info: Info for the tensor's consumed (i.e., converted) type.
consumer_names: Nodes names of consumers that consume the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
self.overrides[tensor_name] = [{}]
producer_type_info.save_to_dict(self.overrides[tensor_name][0])
overrides = self.overrides[tensor_name][0]
if producer_type_info != QuantTypeInfo.load_from_dict(overrides):
raise ValueError(f"Desired producer quant_type for {tensor_name} doesn't match existing type.")
if consumer_names:
if "convert" not in overrides:
overrides["convert"] = {}
consumer_type_info.save_to_dict(overrides["convert"])
convert_dict = overrides["convert"]
if consumer_type_info != QuantTypeInfo.load_from_dict(convert_dict):
raise ValueError(f"Desired consumer quant_type for {tensor_name} doesn't match existing type.")
if "recv_nodes" not in convert_dict:
convert_dict["recv_nodes"] = set()
convert_dict["recv_nodes"].update(consumer_names)
def _check_nodes_are_not_convert_consumers(self, tensor_name: str, node_names: set[str]):
"""
Returns true if the given nodes do not consume/receive a converted quantization type.
Params:
tensor_name: The name of the tensor to check.
node_names: Set of node names that should not be consumers of the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
return True
overrides = self.overrides[tensor_name][0]
if "convert" not in overrides:
return True
convert_dict = overrides["convert"]
if "recv_nodes" not in convert_dict:
return False
return not convert_dict["recv_nodes"].intersection(node_names)
def __is_tensor_quantizable(self, tensor_name):
weight = self.initializers.get(tensor_name)
if weight is not None:
if weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
return True
elif tensor_name in self.value_infos:
vi = self.value_infos[tensor_name]
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
onnx.TensorProto.FLOAT,
onnx.TensorProto.FLOAT16,
):
return True
return False


@@ -0,0 +1,307 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from pathlib import Path
import onnx
from ...fusions import FusionGelu, FusionLayerNormalization
from ...onnx_model import ONNXModel
from .fusion_lpnorm import FusionLpNormalization
def qnn_preprocess_model(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
fuse_layernorm: bool = False,
save_as_external_data: bool = False,
all_tensors_to_one_file: bool = False,
external_data_location: str | None = None,
external_data_size_threshold: int = 1024,
external_data_convert_attribute: bool = False,
inputs_to_make_channel_last: list[str] | None = None,
outputs_to_make_channel_last: list[str] | None = None,
) -> bool:
"""
If necessary, this method creates a new "pre-processed" model in preparation for
quantization of a model to be used in QNN EP. Returns true if a new model was created.
This method performs the following operations:
- Fuse Erf sequence into a single Gelu node.
- Fuse ReduceL2 sequence into a single LpNormalization node (p == 2).
- (Optional) Fuse ReduceMean sequence into a single LayerNormalization node.
Args:
model_input: Path to the input model file or ModelProto.
model_output: Path to the output model file, which is only created if this method returns True.
fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes.
Defaults to False.
save_as_external_data: True if output model should be saved with external data. Defaults to false.
all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false.
If true, save all tensors to one external file specified by external_data_location.
If false, save each tensor to a file named with the tensor name.
external_data_location: Effective only if save_as_external_data is true. Defaults to None.
Specify the external file to which all tensors are saved. Path is relative
to the model path. If not specified, the model's name is used.
external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024.
Tensors with a data size >= external_data_size_threshold are converted to external data.
To convert every tensor with raw data to external data, set to 0.
external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false.
If true, convert all tensors to external data.
If false, convert only non-attribute tensors to external data.
inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example,
if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it.
Original:
input0 (N, C, D1, D2, ..., Dn) --> <Nodes>
Updated:
input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> <Nodes>
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example,
if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it.
Original:
<Nodes> --> output0 (N, C, D1, D2, ..., Dn)
Updated:
<Nodes> --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C)
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
"""
modified = False
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
onnx_model = ONNXModel(model)
# Fuse Erf sequence into a single Gelu
fusion_gelu = FusionGelu(onnx_model)
if fusion_gelu.apply():
modified = True
# Fuse ReduceL2 sequence into a single LpNormalization node with p == 2.
fusion_lpnorm = FusionLpNormalization(onnx_model)
if fusion_lpnorm.apply():
modified = True
# Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
if fuse_layernorm:
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
# Need opset >= 17 to use LayerNormalization.
if onnx_opset.version < 17:
logging.warning(
"Unable to fuse ReduceMean sequence into a LayerNormalization node. "
"ONNX model must use an opset >= 17 in order to use LayerNormalization, "
f"but found version {onnx_opset.version}. Please use onnx.version_converter to update your model."
)
else:
fusion_layernorm = FusionLayerNormalization(onnx_model)
if fusion_layernorm.apply():
modified = True
# Optionally, transpose inputs and/or outputs to make them "channel-last".
if inputs_to_make_channel_last or outputs_to_make_channel_last:
transpose_node_prefix = "Transpose_channel_"
transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1
update_io_to_channel_last(
onnx_model.model,
inputs_to_make_channel_last,
outputs_to_make_channel_last,
transpose_node_name_prefix=transpose_node_prefix,
transpose_node_name_start_suffix=transpose_node_suffix,
)
modified = True
# Make sure all nodes have a name.
unnamed_node_prefix = "qnn_preproc_node_"
available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1
for node in onnx_model.model.graph.node:
if node.op_type != "Constant" and not node.name:
new_node_name = f"{unnamed_node_prefix}{available_suffix!s}"
available_suffix += 1
node.name = new_node_name
modified = True
logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.")
if modified:
onnx_model.topological_sort()
onnx.save_model(
model,
model_output,
save_as_external_data=save_as_external_data,
all_tensors_to_one_file=all_tensors_to_one_file,
location=external_data_location,
size_threshold=external_data_size_threshold,
convert_attribute=external_data_convert_attribute,
)
return modified
class InputOutputNameMap:
def __init__(
self,
orig_tensor_names: set[str],
orig_graph_inputs: dict[str, onnx.ValueInfoProto],
orig_graph_outputs: dict[str, onnx.ValueInfoProto],
):
self.orig_tensor_names = orig_tensor_names
self.orig_graph_inputs = orig_graph_inputs
self.orig_graph_outputs = orig_graph_outputs
self.updated_io_names = {}
self.new_value_infos = []
def get_new_name(self, orig_name: str):
if orig_name in self.updated_io_names:
return self.updated_io_names[orig_name]
# Make a new tensor name that is unique among all tensors in the graph.
prefix: str = f"{orig_name}_channel_first_"
suffix: int = -1
for tensor_name in self.orig_tensor_names:
if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit():
index = int(tensor_name[len(prefix) :])
suffix = max(suffix, index)
suffix += 1 # This is the first available suffix.
new_name = f"{prefix}{suffix!s}"
# Add new value_info objects for these new tensors.
orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name]
value_info_proto = onnx.ValueInfoProto()
value_info_proto.CopyFrom(orig_value_info)
value_info_proto.name = new_name
self.new_value_infos.append(value_info_proto)
self.updated_io_names[orig_name] = new_name
return self.updated_io_names[orig_name]
def update_io_to_channel_last(
model: onnx.ModelProto,
inputs_to_update: list[str] | None,
outputs_to_update: list[str] | None,
transpose_node_name_prefix: str = "Transpose_channel_",
transpose_node_name_start_suffix: int = 0,
):
inputs_to_update = set(inputs_to_update or [])
outputs_to_update = set(outputs_to_update or [])
if not inputs_to_update and not outputs_to_update:
return
graph = model.graph
orig_graph_inputs = {ginput.name: ginput for ginput in graph.input}
orig_graph_outputs = {goutput.name: goutput for goutput in graph.output}
# Check that the user passed in actual input and output names.
for input_name in inputs_to_update:
if input_name not in orig_graph_inputs:
raise ValueError(f"{input_name} is not a graph input")
for output_name in outputs_to_update:
if output_name not in orig_graph_outputs:
raise ValueError(f"{output_name} is not a graph output")
orig_tensor_names = set()
orig_tensor_names.update(set(orig_graph_inputs))
orig_tensor_names.update(set(orig_graph_outputs))
orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
# Maps original input (or output) name to its updated name used within the graph.
io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs)
# Update each node's inputs/outputs to use the transposed versions.
for node in graph.node:
for i in range(len(node.input)):
if node.input[i] and node.input[i] in inputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
elif node.input[i] and node.input[i] in outputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
for i in range(len(node.output)):
if node.output[i] in outputs_to_update:
node.output[i] = io_map.get_new_name(node.output[i])
# Update graph inputs to channel-last and a Transpose (to channel-first) after each.
for g_input_name in inputs_to_update:
g_input = orig_graph_inputs[g_input_name]
if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape")
input_shape = g_input.type.tensor_type.shape
input_rank = len(input_shape.dim)
if input_rank < 3:
raise ValueError(f"Expected input {g_input.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(input_shape.dim[1])
for i in range(1, input_rank - 1):
input_shape.dim[i].CopyFrom(input_shape.dim[i + 1])
input_shape.dim[input_rank - 1].CopyFrom(channel_dim)
transpose_perm = list(range(input_rank))
for i in range(input_rank):
transpose_perm[i] = i if i < 1 else i - 1
transpose_perm[1] = input_rank - 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[g_input.name],
outputs=[io_map.get_new_name(g_input.name)],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
# Update graph outputs to channel-last and a Transpose (from channel-first) before each.
for g_output_name in outputs_to_update:
g_output = orig_graph_outputs[g_output_name]
if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape")
output_shape = g_output.type.tensor_type.shape
output_rank = len(output_shape.dim)
if output_rank < 3:
raise ValueError(f"Expected output {g_output.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(output_shape.dim[1])
for i in range(1, output_rank - 1):
output_shape.dim[i].CopyFrom(output_shape.dim[i + 1])
output_shape.dim[output_rank - 1].CopyFrom(channel_dim)
transpose_perm = list(range(output_rank))
for i in range(output_rank):
transpose_perm[i] = i if i == 0 else i + 1
transpose_perm[output_rank - 1] = 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[io_map.get_new_name(g_output.name)],
outputs=[g_output.name],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
graph.value_info.extend(io_map.new_value_infos)
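
Putting the two QNN helpers together, a hedged end-to-end sketch; paths are placeholders, the one-batch reader is a stand-in for a real CalibrationDataReader, and the qnn package path is assumed from this commit's layout:

```
import numpy as np

from onnxruntime.quantization import CalibrationDataReader, QuantType, quantize
from onnxruntime.quantization.execution_providers.qnn import (  # assumed package path
    get_qnn_qdq_config,
    qnn_preprocess_model,
)


class OneBatchReader(CalibrationDataReader):
    """Stand-in reader: a real one yields representative calibration batches."""

    def __init__(self):
        self._data = iter([{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}])

    def get_next(self):
        return next(self._data, None)


fp32_path, prep_path, qdq_path = "model_fp32.onnx", "model_prep.onnx", "model_qdq.onnx"  # placeholders

# Fuse Gelu / LpNormalization (and optionally LayerNormalization) patterns before quantizing.
preprocessed = qnn_preprocess_model(fp32_path, prep_path, fuse_layernorm=True)
model_to_quantize = prep_path if preprocessed else fp32_path

qnn_config = get_qnn_qdq_config(
    model_to_quantize,
    OneBatchReader(),
    activation_type=QuantType.QUInt16,
    weight_type=QuantType.QUInt8,
)
quantize(model_to_quantize, qdq_path, qnn_config)
```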


@@ -0,0 +1,387 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import copy
import logging
from pathlib import Path
from typing import Any
import numpy as np
import onnx
from ...calibrate import CalibrationDataReader, CalibrationMethod
from ...quant_utils import QuantType
from ...quantize import StaticQuantConfig
from ...tensor_quant_overrides import TensorQuantOverridesHelper
from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
OP_TYPES_TO_EXCLUDE = {"Cast"}
MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB
def warn_unable_to_override(
node: onnx.NodeProto,
what_str: str,
tensor_name: str,
io_kind: str,
):
logging.warning(
f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
"because it has already been overridden! Check the initial quantization overrides provided "
"to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
f"Node name: {node.name}, {io_kind} name: {tensor_name}"
)
def get_qnn_qdq_config(
model_input: str | Path | onnx.ModelProto,
calibration_data_reader: CalibrationDataReader,
calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
activation_type: QuantType = QuantType.QUInt8,
weight_type: QuantType = QuantType.QUInt8,
per_channel: bool = False,
init_overrides: dict[str, list[dict[str, Any]]] | None = None,
add_qtype_converts: bool = True,
activation_symmetric: bool = False,
weight_symmetric: bool | None = None,
keep_removable_activations: bool = False,
stride: int | None = None,
) -> StaticQuantConfig:
"""
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
This is done primarily by setting tensor-level quantization overrides.
Params:
model_input: Path to the input model file or ModelProto.
calibration_data_reader: Calibration data reader.
calibrate_method: The calibration method. Defaults to MinMax.
activation_type: The default activation quantization type. Defaults to QUInt8.
weight_type: The default weight quantization type. Defaults to QUInt8.
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
and their quantization axes.
If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
- Conv:
- input[1] on axis 0
- input[2] (bias) on axis 0
- ConvTranspose:
- input[1] on axis 1
- input[2] (bias) on axis 0
init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates a copy
of these overrides with any necessary adjustments and includes them in the returned
configuration object (i.e., config.extra_options['TensorQuantOverrides']).
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
key must be present in the first dictionary for per-channel quantization.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` are also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` are also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'convert' = Dict : A nested dictionary with the same keys for an activation
tensor that should be converted to another quantization type.
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
other nodes get the original type. If not specified,
assume all consumer nodes get the converted type.
add_qtype_converts: True if this function should automatically add "convert" entries to the provided
`init_overrides` to ensure that operators use valid input/output types (activations only).
Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
the zero-point values are 128 and 32,768, respectively.
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
be removed, and will be explicitly represented in the QDQ model. If false, these activations
are automatically removed if activations are asymmetrically quantized. Keeping these activations
is necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
Returns:
A StaticQuantConfig object
"""
if weight_symmetric is None:
weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
model = (
model_input
if isinstance(model_input, onnx.ModelProto)
else onnx.load_model(model_input, load_external_data=False)
)
op_types = set()
model_has_external_data = False
name_to_initializer = {}
# Build map of initializers (name -> initializer) and
# check if the model has external data.
for initializer in model.graph.initializer:
name_to_initializer[initializer.name] = initializer
if onnx.external_data_helper.uses_external_data(initializer):
model_has_external_data = True
overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
if not overrides_helper.empty() and add_qtype_converts:
# Fix mixed-precision overrides.
overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
overrides_helper, model, activation_type
)
overrides_fixer.apply(activation_type, activation_symmetric)
# Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
qnn_compat = QnnCompatibilityOverrides(
activation_type,
weight_type,
activation_symmetric,
weight_symmetric,
per_channel,
overrides_helper,
name_to_initializer,
)
for node in model.graph.node:
op_types.add(node.op_type)
qnn_compat.process_node(node)
extra_options = {
"MinimumRealRange": 0.0001,
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
"QDQKeepRemovableActivations": keep_removable_activations,
"TensorQuantOverrides": overrides_helper.get_dict(),
"ActivationSymmetric": activation_symmetric,
"WeightSymmetric": weight_symmetric,
"CalibStridedMinMax": stride,
}
# ONNX opset < 21 does not support 16-bit or 4-bit quantization types, so the 'com.microsoft' domain
# must be used on Q/DQ operators if 16-bit or 4-bit quantization is used.
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
if onnx_opset.version < 21:
opset21_types = Q16_TYPES.union(Q4_TYPES)
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
extra_options["UseQDQContribOps"] = True
return StaticQuantConfig(
calibration_data_reader,
calibrate_method=calibrate_method,
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=list(op_types.difference(OP_TYPES_TO_EXCLUDE)),
per_channel=per_channel,
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
extra_options=extra_options,
)
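# Example (hypothetical usage sketch): "MyDataReader" and the file names below are placeholders,
# not part of this module. The returned StaticQuantConfig is passed to the regular quantize() API.
#
#   from onnxruntime.quantization import QuantType, quantize
#
#   qnn_config = get_qnn_qdq_config(
#       "model.onnx",
#       MyDataReader("model.onnx"),
#       activation_type=QuantType.QUInt16,
#       weight_type=QuantType.QUInt8,
#   )
#   quantize("model.onnx", "model.qdq.onnx", qnn_config)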
class QnnCompatibilityOverrides:
"""
Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
compatible with QNN EP.
"""
def __init__(
self,
default_activation_qtype: QuantType,
default_weight_qtype: QuantType,
activation_symmetric: bool,
weight_symmetric: bool,
per_channel: bool,
overrides: TensorQuantOverridesHelper,
initializers: dict[str, onnx.TensorProto],
):
self.default_activation_qtype = default_activation_qtype
self.default_weight_qtype = default_weight_qtype
self.activation_symmetric = activation_symmetric
self.weight_symmetric = weight_symmetric
self.per_channel = per_channel
self.overrides = overrides
self.initializers = initializers
self.process_fns = {
"MatMul": self._process_matmul,
"LayerNormalization": self._process_layernorm,
"Sigmoid": self._process_sigmoid,
"Tanh": self._process_tanh,
}
def process_node(self, node: onnx.NodeProto):
process_fn = self.process_fns.get(node.op_type)
if process_fn is not None:
process_fn(node)
def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
"""
Overrides initializer input(s) to use the default weight type if:
- The default weight type is 8-bit
- One of the inputs is a 16-bit activation
- The other input is an initializer (per-tensor quantized)
This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
inputs the default weight type. Instead, it assigns the default activation type.
"""
if self.default_weight_qtype not in Q8_TYPES:
return
input_16bit_act_name = None
input_weight_name = None
# Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
for i in range(2):
input_name = node.input[i]
if not input_name:
continue
is_weight = input_name in self.initializers
qtype_info = self.overrides.get_node_input_qtype_info(
input_name,
node.name,
default_qtype=None if is_weight else self.default_activation_qtype,
)
if qtype_info.axis is not None:
return # Don't process MatMul with a per-channel quantized input.
if (
is_weight
and qtype_info.quant_type == self.default_weight_qtype
and qtype_info.symmetric == self.weight_symmetric
):
return # Return. Weight is already overridden to use the desired weight type.
if is_weight:
input_weight_name = input_name
elif qtype_info.quant_type in Q16_TYPES:
input_16bit_act_name = input_name
# Override initializer input to use the default weight type.
if input_16bit_act_name and input_weight_name:
did_update = self.overrides.update_tensor_overrides(
input_weight_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
overwrite=False,
)
if not did_update:
warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
def _process_matmul(self, node: onnx.NodeProto):
assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
# QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
# quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
# provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
# the user did not provide any other overrides.
for input_name in node.input:
is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
if is_weight_no_overrides:
self.overrides.update_tensor_overrides(
input_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
)
def _process_layernorm(self, node: onnx.NodeProto):
assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
has_bias_no_overrides = (
len(node.input) > 2
and node.input[2]
and node.input[2] in self.initializers
and node.input[2] not in self.overrides
)
if has_weight_no_overrides or has_bias_no_overrides:
# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
# tries to make it per-channel if the weight is also per-channel.
raise ValueError(
"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
" Please try using custom overrides that make bias per-tensor quantized."
)
def _process_sigmoid(self, node: onnx.NodeProto):
"""
Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 65536.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
def _process_tanh(self, node: onnx.NodeProto):
"""
Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(32768, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
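# Note on the fixed 16-bit overrides above (dequantized value = scale * (q - zero_point)):
#   Sigmoid, QUInt16: scale = 1/65536, zero_point = 0     -> range [0, 65535/65536] ~ [0, 1)
#   Tanh,    QUInt16: scale = 1/32768, zero_point = 32768 -> range [-1, 32767/32768] ~ [-1, 1)
#   Both,    QInt16:  scale = 1/32768, zero_point = 0     -> range [-1, 32767/32768] ~ [-1, 1)
# i.e., the quantization grid exactly covers each op's output range, which is what QNN expects
# for 16-bit Sigmoid/Tanh outputs.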

View File

@ -0,0 +1,3 @@
from .fusion import Fusion # noqa: F401
from .fusion_gelu import FusionGelu # noqa: F401
from .fusion_layernorm import FusionLayerNormalization # noqa: F401

View File

@ -0,0 +1,311 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
from collections import deque
import onnx
from ..onnx_model import ONNXModel
class Fusion:
"""
Base class for fusions.
"""
def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str):
self.search_op_type: str = search_op_type
self.fused_op_type: str = fused_op_type
self.model: ONNXModel = model
self.nodes_to_remove: list = []
self.nodes_to_add: list = []
self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_"
self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops.
def fuse(
self,
node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function for derived fusion classes. Tries to fuse a node sequence containing
the specified node.
"""
raise NotImplementedError
def apply(self) -> bool:
"""
Apply graph fusion on the entire model graph.
"""
input_name_to_nodes = self.model.input_name_to_nodes()
output_name_to_node = self.model.output_name_to_node()
for node in self.model.nodes():
if node.op_type == self.search_op_type:
self.fuse(node, input_name_to_nodes, output_name_to_node)
self.model.remove_nodes(self.nodes_to_remove)
self.model.add_nodes(self.nodes_to_add)
graph_updated = bool(self.nodes_to_remove or self.nodes_to_add)
if graph_updated:
self.model.remove_unused_constant()
return graph_updated
def create_unique_node_name(self):
prefix = self._new_node_name_prefix
if self._new_node_name_suffix is None:
largest_suffix: int = self.model.get_largest_node_name_suffix(prefix)
self._new_node_name_suffix = largest_suffix + 1
new_name = f"{prefix}{self._new_node_name_suffix!s}"
self._new_node_name_suffix += 1
return new_name
@staticmethod
def is_safe_to_fuse_nodes(
nodes_to_remove: list[onnx.NodeProto],
keep_outputs: list[str],
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
for node_to_remove in nodes_to_remove:
for output_to_remove in node_to_remove.output:
if output_to_remove in keep_outputs:
continue
if output_to_remove in input_name_to_nodes:
for impacted_node in input_name_to_nodes[output_to_remove]:
if impacted_node not in nodes_to_remove:
# Not safe to remove nodes since output is used by impacted_node
return False
return True
@staticmethod
def get_node_attribute(node: onnx.NodeProto, attribute_name: str):
for attr in node.attribute:
if attr.name == attribute_name:
value = onnx.helper.get_attribute_value(attr)
return value
return None
@staticmethod
def input_index(node_output: str, child_node: onnx.NodeProto) -> int:
for index, input_name in enumerate(child_node.input):
if input_name == node_output:
return index
return -1
@staticmethod
def tensor_shape_to_list(tensor_type) -> list[int]:
shape_list = []
for d in tensor_type.shape.dim:
if d.HasField("dim_value"):
shape_list.append(d.dim_value) # known dimension
elif d.HasField("dim_param"):
shape_list.append(d.dim_param) # unknown dimension with symbolic name
else:
shape_list.append("?") # shall not happen
return shape_list
def get_constant_input(self, node: onnx.NodeProto):
for i, inp in enumerate(node.input):
value = self.model.get_constant_value(inp)
if value is not None:
return i, value
return None, None
def find_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> int:
i, value = self.get_constant_input(node)
if value is not None and value.size == 1 and abs(value - expected_value) < delta:
return i
return -1
def has_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> bool:
return self.find_constant_input(node, expected_value, delta) >= 0
def is_constant_with_specified_rank(self, output_name: str, rank: int) -> bool:
value = self.model.get_constant_value(output_name)
if value is None:
return False # Not an initializer
if len(value.shape) != rank:
return False # Wrong dimensions
return True
def match_first_parent(
self,
node: onnx.NodeProto,
parent_op_type: str,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
exclude: list[onnx.NodeProto] = [], # noqa: B006
) -> tuple[onnx.NodeProto | None, int | None]:
"""
Find parent node based on constraints on op_type.
Args:
node: current node.
parent_op_type (str): constraint of parent node op_type.
output_name_to_node (dict): dictionary with output name as key, and node as value.
exclude (list): list of nodes that are excluded (not allowed to match as parent).
Returns:
parent: The matched parent node. None if not found.
index: The input index of matched parent node. None if not found.
"""
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
for i, inp in enumerate(node.input):
if inp in output_name_to_node:
parent = output_name_to_node[inp]
if parent.op_type == parent_op_type and parent not in exclude:
return parent, i
return None, None
def match_parent(
self,
node: onnx.NodeProto,
parent_op_type: str,
input_index: int | None = None,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
exclude: list[onnx.NodeProto] = [], # noqa: B006
return_indice: list[int] | None = None,
) -> onnx.NodeProto | None:
"""
Find parent node based on constraints on op_type and index.
When input_index is None, we will find the first parent node based on constraints,
and the corresponding input index will be appended to return_indice.
Args:
node (onnx.NodeProto): current node.
parent_op_type (str): constraint of parent node op_type.
input_index (int or None): only check the parent given input index of current node.
output_name_to_node (dict): dictionary with output name as key, and node as value.
exclude (list): list of nodes that are excluded (not allowed to match as parent).
return_indice (list): a list to append the input index when input_index is None.
Returns:
parent: The matched parent node.
"""
assert node is not None
assert input_index is None or input_index >= 0
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
if input_index is None:
parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node, exclude)
if return_indice is not None:
return_indice.append(index)
return parent
if input_index >= len(node.input):
# Input index out of bounds.
return None
parent = self.model.get_parent(node, input_index, output_name_to_node)
if parent is not None and parent.op_type == parent_op_type and parent not in exclude:
return parent
return None
def match_parent_path(
self,
node: onnx.NodeProto,
parent_op_types: list[str],
parent_input_index: list[int] | None = None,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
return_indice: list[int] | None = None,
) -> list[onnx.NodeProto] | None:
"""
Find a sequence of input edges based on constraints on parent op_type and index.
When input_index is None, we will find the first parent node based on constraints,
and the corresponding input index will be appended to return_indice.
Args:
node (onnx.NodeProto): current node.
parent_op_types (list): constraint on the parent node op_type of each input edge.
parent_input_index (list): constraint on the input index of each input edge. None means no constraint.
output_name_to_node (dict): dictionary with output name as key, and node as value.
return_indice (list): a list to which the matched input index is appended when there is no
constraint on the input index of an edge.
Returns:
parents: a list of matched parent nodes. None if no match is found.
"""
if parent_input_index is not None:
assert len(parent_input_index) == len(parent_op_types)
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
current_node = node
matched_parents = []
for i, op_type in enumerate(parent_op_types):
matched_parent = self.match_parent(
current_node,
op_type,
parent_input_index[i] if parent_input_index is not None else None,
output_name_to_node,
exclude=[],
return_indice=return_indice,
)
if matched_parent is None:
return None
matched_parents.append(matched_parent)
current_node = matched_parent
return matched_parents
def match_parent_paths(
self,
node: onnx.NodeProto,
paths: list[tuple[list[str], list[int]]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> tuple[int, list[onnx.NodeProto] | None, list[int] | None]:
"""
Find a matching parent path to the given node.
"""
for i, path in enumerate(paths):
return_indice = []
matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice)
if matched:
return i, matched, return_indice
return -1, None, None
def find_first_child_by_type(
self,
node: onnx.NodeProto,
child_type: str,
input_name_to_nodes: dict[str, list[onnx.NodeProto]] | None = None,
recursive: bool = True,
) -> onnx.NodeProto | None:
children = self.model.get_children(node, input_name_to_nodes)
dq = deque(children)
while len(dq) > 0:
current_node = dq.pop()
if current_node.op_type == child_type:
return current_node
if recursive:
children = self.model.get_children(current_node, input_name_to_nodes)
for child in children:
dq.appendleft(child)
return None
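# Example (illustrative sketch; variable names are hypothetical): matching the parent chain
# Sqrt <- Add <- ReduceMean <- Pow <- Sub feeding input 1 of a Div node, as the LayerNormalization
# fusion does:
#
#   parents = fusion.match_parent_path(
#       div_node,
#       ["Sqrt", "Add", "ReduceMean", "Pow", "Sub"],
#       [1, 0, 0, 0, 0],
#   )
#   # `parents` is [sqrt, add, reduce_mean, pow, sub] on success, or None if any edge fails to match.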

View File

@ -0,0 +1,272 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class FusionGelu(Fusion):
def __init__(self, model: ONNXModel):
super().__init__(model, "Gelu", "Erf")
def fuse(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing an Erf node into a single
Gelu node.
"""
if (
self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node)
or self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node)
or self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node)
):
self.model.set_opset_import("com.microsoft", 1)
def fuse_1(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a PyTorch model.
Fuse Gelu with Erf into one node:
Pattern 1:
+-------Mul(0.5)---------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->
(B=1.4142...) (1)
Pattern 2:
+------------------------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->Mul -->
(B=1.4142...) (1) (0.5)
Note that the constant input for Add and Mul could be the first or second input: either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_after_erf = children[0]
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return False
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
return False
subgraph_input = div.input[0]
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
if subgraph_input == mul_after_erf.input[another]: # pattern 2
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_half = children[0]
if not self.has_constant_input(mul_half, 0.5):
return False
subgraph_output = mul_half.output[0]
else: # pattern 1
mul_half = self.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
if mul_half is None:
return False
if not self.has_constant_input(mul_half, 0.5):
return False
if subgraph_input not in mul_half.input:
return False
subgraph_output = mul_after_erf.output[0]
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
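# The patterns above are instances of Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))): the Div constant
# 1.4142... is sqrt(2), the Add constant is 1, and the factor 0.5 is applied either before the final
# multiplication by x (pattern 1) or after it (pattern 2). The fused com.microsoft Gelu node computes
# the same expression in a single op.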
def fuse_2(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a Keras model.
Fuse Gelu with Erf into one node:
+------------------------------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->Mul
(B=1.4142...) (A=1) (A=0.5)
Note that the constant input for Add and Mul could be the first or second input: either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_after_erf = children[0]
if not self.has_constant_input(mul_after_erf, 0.5):
return False
if mul_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul = children[0]
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return False
sqrt_node = None
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
sqrt_node = self.match_parent(div, "Sqrt", 1, output_name_to_node)
if sqrt_node is None:
return False
if not self.has_constant_input(sqrt_node, 2.0):
return False
subgraph_input = div.input[0]
if subgraph_input not in mul.input:
return False
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
if sqrt_node:
subgraph_nodes.append(sqrt_node)
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
def fuse_3(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a TensorFlow model.
Fuse Gelu with Erf into one node:
+----------------------------------------------+
| |
| v
[root] --> Mul -----> Erf --> Add --> Mul -->Mul
(A=0.7071067690849304) (B=1) (B=0.5)
Note that the constant input for Add and Mul could be the first or second input: either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_half = children[0]
if not self.has_constant_input(mul_half, 0.5):
return False
first_mul = self.match_parent(erf_node, "Mul", 0, output_name_to_node)
if first_mul is None:
return False
i = self.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)
if i < 0:
return False
root_input_index = 1 - i
subgraph_input = first_mul.input[root_input_index]
if mul_half.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[mul_half.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
last_mul = children[0]
if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input):
return False
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
if not self.is_safe_to_fuse_nodes(
subgraph_nodes,
[last_mul.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True

View File

@ -0,0 +1,135 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class FusionLayerNormalization(Fusion):
def __init__(self, model: ONNXModel):
super().__init__(model, "LayerNormalization", "ReduceMean")
def fuse(
self,
reduce_mean_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceMean node into a single
LayerNormalization node.
+----------------------+
| |
| v
[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
(axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^
| |
+-------------------------------------------------+
It also handles cases of duplicated Sub nodes exported from older versions of PyTorch:
+----------------------+
| v
| +-------> Sub-----------------------------------------------+
| | |
| | v
[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
| ^
| |
+----------------------+
"""
children = self.model.get_children(reduce_mean_node, input_name_to_nodes)
if len(children) == 0 or len(children) > 2:
return
root_input = reduce_mean_node.input[0]
if children[0].op_type != "Sub" or children[0].input[0] != root_input:
return
if len(children) == 2:
if children[1].op_type != "Sub" or children[1].input[0] != root_input:
return
div_node = None
for child in children:
div_node = self.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
if div_node is not None:
break
if div_node is None:
return
path_id, parent_nodes, _ = self.match_parent_paths(
div_node,
[
(["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
(
["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
[1, 0, 0, 0, 0, 0],
),
],
output_name_to_node,
)
if path_id < 0:
return
sub_node = parent_nodes[-1]
if sub_node not in children:
return
second_add_node = parent_nodes[1]
i, add_weight = self.get_constant_input(second_add_node)
if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
# Skip fusion since epsilon value is not expected.
return
pow_node = parent_nodes[3]
if self.find_constant_input(pow_node, 2.0) != 1:
return
mul_node = input_name_to_nodes[div_node.output[0]][0]
if mul_node.op_type != "Mul":
return
last_add_node = input_name_to_nodes[mul_node.output[0]][0]
if last_add_node.op_type != "Add":
return
subgraph_nodes = [reduce_mean_node]
subgraph_nodes.extend(children)
subgraph_nodes.extend(parent_nodes[:-1])
subgraph_nodes.extend([last_add_node, mul_node, div_node])
if not self.is_safe_to_fuse_nodes(
subgraph_nodes,
last_add_node.output,
input_name_to_nodes,
output_name_to_node,
):
return
weight_input = mul_node.input[1 - self.input_index(div_node.output[0], mul_node)]
if not self.is_constant_with_specified_rank(weight_input, 1):
return
bias_input = last_add_node.input[1 - self.input_index(mul_node.output[0], last_add_node)]
if not self.is_constant_with_specified_rank(bias_input, 1):
return
self.nodes_to_remove.extend(subgraph_nodes)
normalize_node = onnx.helper.make_node(
"LayerNormalization",
name=self.create_unique_node_name(),
inputs=[reduce_mean_node.input[0], weight_input, bias_input],
outputs=[last_add_node.output[0]],
)
normalize_node.attribute.extend([onnx.helper.make_attribute("epsilon", float(add_weight))])
self.nodes_to_add.append(normalize_node)
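# The matched subgraph evaluates
#   y = (x - mean(x)) / sqrt(mean((x - mean(x))^2) + epsilon) * weight + bias,
# which is exactly LayerNormalization. The Add constant becomes the fused node's "epsilon"
# attribute, and the Mul/Add constants become the weight and bias inputs.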

View File

@ -0,0 +1,857 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import argparse
import copy
import importlib
import logging
import os
import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from packaging import version
from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits
from .calibrate import CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import QuantFormat, attribute_to_kwarg
logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
class WeightOnlyQuantConfig:
def __init__(self, algorithm, quant_format):
"""This is the Base class for Weight Only Quant Configuration.
Args:
algorithm:
weight only quantize algorithm name.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
"""
self.algorithm = algorithm
self.quant_format = quant_format
class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
ratios=None,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
RTN is the most straightforward way to quantize weights using scale maps.
Args:
ratios:
percentile of clip. Defaults to {}.
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format"
if ratios is None:
ratios = {}
super().__init__(
algorithm="RTN",
quant_format=quant_format,
)
self.ratios = ratios
class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
calibration_data_reader: CalibrationDataReader,
percdamp=0.01,
block_size=128,
actorder=False,
mse=False,
perchannel=True,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for GPTQ algorithm Weight Only Quant Configuration.
GPTQ algorithm provides more accurate quantization but requires more computational resources.
Args:
calibration_data_reader:
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
percdamp:
percent of the average Hessian diagonal to use for dampening.
block_size (int, optional):
channel number in one block to execute a GPTQ quantization iteration.
actorder (bool, optional):
whether to rearrange the Hessian matrix considering the diagonal's values.
mse (bool, optional):
whether to compute scale and zero point with MSE error.
perchannel (bool, optional):
whether to quantize weights per-channel.
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format"
super().__init__(
algorithm="GPTQ",
quant_format=quant_format,
)
self.calibration_data_reader = calibration_data_reader
self.percdamp = percdamp
self.block_size = block_size
self.actorder = actorder
self.mse = mse
self.perchannel = perchannel
class HQQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
block_size=128,
bits=4,
axis=1,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for HQQ algorithm Weight Only Quant Configuration.
The HQQ algorithm quantizes weights without needing calibration data.
Args:
block_size (int, optional):
channel number in one block to execute a HQQ quantization iteration.
bits (int, optional):
how many bits to represent weight.
axis (int, optional):
0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format"
super().__init__(
algorithm="HQQ",
quant_format=quant_format,
)
self.block_size = block_size
self.bits = bits
self.axis = axis
class DefaultWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
block_size: int = 128,
is_symmetric: bool = False,
accuracy_level: int | None = None,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for weight only affine quantization configuration.
Args:
block_size (int, optional):
channel number in one block to execute an affine quantization iteration.
is_symmetric (bool, optional):
whether quantize weight symmetrically.
accuracy_level (int, optional):
Accuracy level of the 4-bit quantized MatMul computation.
Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
super().__init__(algorithm="DEFAULT", quant_format=quant_format)
self.block_size = block_size
self.is_symmetric = is_symmetric
self.bits = 4
self.accuracy_level = accuracy_level
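# Example (hypothetical sketch): constructing an algorithm configuration for MatMul4BitsQuantizer
# below. Only the DEFAULT and HQQ configs are handled in-process; RTN and GPTQ delegate to
# Intel(R) Neural Compressor.
#
#   default_cfg = DefaultWeightOnlyQuantConfig(block_size=32, is_symmetric=True)
#   hqq_cfg = HQQWeightOnlyQuantConfig(block_size=64, bits=4)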
def is_divisible(val1, val2):
return int(val2 * np.ceil(val1 / val2)) == val1
class HQQWeightOnlyQuantizer:
def __init__(
self,
config: HQQWeightOnlyQuantConfig,
):
self.config = config
# Proximal solver || weight - dequantize(quantize(weight))||_p^p
@staticmethod
def optimize_weights(
tensor,
scale,
zero,
min_max: list[int],
axis: int = 0,
opt_params: dict = None, # noqa: RUF013
verbose=False,
):
import torch
opt_params = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20} if opt_params is None else opt_params
lp_norm, beta, kappa, iters = (
opt_params["lp_norm"],
opt_params["beta"],
opt_params["kappa"],
opt_params["iters"],
)
dtype = torch.float16 if tensor.is_cuda else torch.float32
w_f = tensor.to(dtype)
scale = scale.to(dtype)
zero = zero.to(dtype)
if lp_norm == 1:
def shrink_op(x, beta):
return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta)
else:
def shrink_op(x, beta, p=lp_norm):
return torch.sign(x) * torch.nn.functional.relu(
torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x) + 1e-8, p - 1)
)
best_error = 1e4
for i in range(iters):
w_q = torch.round(w_f * scale + zero).clamp(min_max[0], min_max[1])
w_r = (w_q - zero) / scale
w_e = shrink_op(w_f - w_r, beta)
zero = torch.mean(w_q - (w_f - w_e) * scale, axis=axis, keepdim=True)
beta *= kappa
current_error = float(torch.abs(w_f - w_r).mean())
if verbose:
print(i, np.round(current_error, 6))
if current_error < best_error:
best_error = current_error
else:
break
del w_f, w_q, w_r, w_e
return scale, zero
@staticmethod
def pack_on_row_fast_248bit(pack_tensor, ori_int_tensor, bits):
if pack_tensor.shape[0] == ori_int_tensor.shape[0]:
ori_int_tensor = ori_int_tensor.T
pack_tensor = pack_tensor.T
if bits in [2, 4, 8]:
compress_ratio = pack_tensor.element_size() * 8 // bits
for j in range(compress_ratio):
pack_tensor[0:] |= ori_int_tensor[j::compress_ratio] << (bits * (j))
else:
raise NotImplementedError("Only 2,4,8 bits are supported.")
# from Official implementation of Half-Quadratic Quantization (HQQ)
def quantize_internal(
self, tensor, bits=4, channel_wise=True, group_size=64, optimize=True, round_zero=True, axis=1
):
import torch
weight = tensor.float()
ori_shape = weight.shape
pad_len = (group_size - ori_shape[axis] % group_size) % group_size
if axis == 1:
weight = torch.nn.functional.pad(weight, (0, pad_len), "constant", 0)
else:
weight = torch.nn.functional.pad(weight, (0, 0, 0, pad_len), "constant", 0)
shape = weight.shape
# Reshape for grouping
if (group_size is not None) and channel_wise:
weight = weight.reshape([-1, group_size]) if (axis == 1) else weight.reshape([group_size, -1])
# Get min/max values
if channel_wise is False:
_min, _max = weight.min(), weight.max()
optimize = False
else:
_min = weight.min(axis=axis, keepdim=True)[0]
_max = weight.max(axis=axis, keepdim=True)[0]
max_v = 2**bits - 1
min_v = 0
min_max = [min_v, max_v]
# Note: here we work with the inverse of the scale to avoid division and quantize instead via weight*scale + zero; the scale is inverted back later on.
# clamp to avoid half-precision problems
scale = (max_v / (_max - _min)).clamp(max=2e4)
#!!!!!!!!!!!!!!!
min_max_axis = _max - _min
if (min_max_axis == 0).sum().item() > 0:
min_max_axis[min_max_axis == 0] = max_v
scale = (max_v / min_max_axis).clamp(max=2e4)
zero = -_min * scale
if round_zero:
zero = torch.round(zero)
# Fine-tune weights
if optimize:
scale, zero = self.optimize_weights(tensor=weight, scale=scale, zero=zero, min_max=min_max, axis=axis)
# Quantize
# Necessary for fake quantization backprop
w_q = torch.round(weight * scale + zero).clamp(min_max[0], min_max[1])
w_q = w_q.reshape(shape).int()
scale = 1.0 / scale
if axis == 1:
scale = scale.reshape(shape[0], -1)
zero = zero.reshape(shape[0], -1)
else:
scale = scale.reshape(-1, shape[-1])
zero = zero.reshape(-1, shape[-1])
# cleanup
del weight, _min, _max
return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype)
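# Note: quantize_internal() quantizes with the *inverse* scale (q = round(w * scale + zero)) and
# inverts it just before returning, so dequantization with the returned values is
# w ~= (q - zero) * scale.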
def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
"""
If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
"""
if node.op_type != "MatMul":
return [node] # only care about MatMul for now
import torch
logger.info(f"start to quantize {node.name} ...")
input_b = node.input[1]
b_pb, bs_graph = get_initializer(input_b, graph_stack)
if b_pb is None:
logger.info("MatMul doesn't have const weight. Skip to quantize")
return [node] # only care about constant weight
b_array = onnx.numpy_helper.to_array(b_pb)
if len(b_array.shape) != 2:
logger.info("MatMul weight is not 2D. Skip to quantize")
return [node] # can only process 2-D matrix
b_array_torch = torch.from_numpy(b_array)
if torch.cuda.is_available():
b_array_torch = b_array_torch.cuda()
quant_weight_torch, scales_torch, zero_points_torch = self.quantize_internal(
b_array_torch.T, bits=self.config.bits, group_size=self.config.block_size
)
quant_weight_torch = quant_weight_torch.contiguous()
scales_torch = scales_torch.contiguous()
zero_points_torch = zero_points_torch.contiguous()
packed_torch = torch.zeros(
(quant_weight_torch.shape[0], quant_weight_torch.shape[1] // 2),
dtype=torch.uint8,
device=quant_weight_torch.device,
)
self.pack_on_row_fast_248bit(packed_torch, quant_weight_torch, self.config.bits)
scales = scales_torch.cpu().numpy()
zero_points = zero_points_torch.cpu().numpy()
# reshape to the predefined shape expected by MatMulNBits
scales = scales.reshape(-1)
zero_points = zero_points.reshape(-1)
rows, cols = b_array_torch.shape
block_size = self.config.block_size
blob_size = block_size // 2
k_blocks = (rows + block_size - 1) // block_size
packed_torch = packed_torch.reshape(cols, k_blocks, blob_size)
b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy())
b_quant.name = b_pb.name + "_Q4"
for input in bs_graph.input:
if input.name == input_b:
bs_graph.input.remove(input)
break
scales_tensor = onnx.numpy_helper.from_array(scales)
scales_tensor.name = b_pb.name + "_scales"
bs_graph.initializer.extend([b_quant, scales_tensor])
input_names = [node.input[0], b_quant.name, scales_tensor.name]
zp_tensor = onnx.numpy_helper.from_array(zero_points)
zp_tensor.name = b_pb.name + "_zero_points"
bs_graph.initializer.extend([zp_tensor])
input_names.append(zp_tensor.name)
kwargs = {}
rows, cols = b_array.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["bits"] = self.config.bits
kwargs["block_size"] = self.config.block_size
matmul_q4_node = onnx.helper.make_node(
"MatMulNBits",
inputs=input_names,
outputs=[node.output[0]],
name=node.name + "_Q4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
logger.info(f"complete quantization of {node.name} ...")
return [matmul_q4_node]
def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
class DefaultWeightOnlyQuantizer:
def __init__(self, config: DefaultWeightOnlyQuantConfig):
self.config = config
def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""4b quantize fp32 weight to a blob"""
if len(fp32weight.shape) != 2:
raise ValueError("Current int4 block quantization only supports 2D tensors!")
rows, cols = fp32weight.shape
block_size = self.config.block_size
k_blocks = (rows + block_size - 1) // block_size
if self.config.quant_format == QuantFormat.QOperator:
blob_size = block_size // 2
padded_rows = k_blocks * block_size
pad_len = padded_rows - rows
if pad_len > 0:
fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")
# block wise quantization, each block comes from a single column
packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8")
scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
quantize_matmul_4bits(
packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
)
else:
packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype)
quantize_qdq_matmul_4bits(
packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
)
return (packed, scales, zero_point)
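# Shape sketch (assumed example, QOperator path): for a weight with rows = K = 4096,
# cols = N = 4096 and block_size = 32, k_blocks = 128 and blob_size = 16, so `packed` is a
# (4096, 128, 16) uint8 array, `scales` holds 4096 * 128 values, and `zero_point` packs two
# 4-bit zero points per byte into 4096 * 64 bytes.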
def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
"""
If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
"""
if node.op_type != "MatMul":
return [node] # only care about MatMul for now
logger.info(f"start to quantize {node.name} ...")
qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4
input_b = node.input[1]
b_tensor, b_graph = get_initializer(input_b, graph_stack)
if b_tensor is None:
logger.info("MatMul doesn't have const weight. Skip to quantize")
return [node] # only care about constant weight
b_ndarray = onnx.numpy_helper.to_array(b_tensor)
if len(b_ndarray.shape) != 2:
logger.info("MatMul weight is not 2D. Skip to quantize")
return [node] # can only process 2-D matrix
packed, scales, zero_points = self.int4_block_quant(b_ndarray)
if self.config.quant_format == QuantFormat.QOperator:
b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4")
scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales")
else:
b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True)
scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales")
for input in b_graph.input:
if input.name == input_b:
b_graph.input.remove(input)
break
b_graph.initializer.extend([b_quant, scales_tensor])
output_nodes = []
if self.config.quant_format == QuantFormat.QOperator:
input_names = [node.input[0], b_quant.name, scales_tensor.name]
if not self.config.is_symmetric:
zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points")
input_names.append(zp_tensor.name)
b_graph.initializer.extend([zp_tensor])
kwargs = {}
rows, cols = b_ndarray.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["bits"] = 4
kwargs["block_size"] = self.config.block_size
if self.config.accuracy_level is not None:
kwargs["accuracy_level"] = self.config.accuracy_level
matmul_q4_node = onnx.helper.make_node(
"MatMulNBits",
inputs=input_names,
outputs=[node.output[0]],
name=node.name + "_Q4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
output_nodes.append(matmul_q4_node)
else:
dq_input_names = [b_quant.name, scales_tensor.name]
dq_output_names = [b_quant.name + "_output"]
matmul_input_names = [node.input[0], dq_output_names[0]]
matmul_output_names = [node.output[0]]
if not self.config.is_symmetric:
zp_tensor = onnx.helper.make_tensor(
b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True
)
dq_input_names.append(zp_tensor.name)
b_graph.initializer.extend([zp_tensor])
dq_kwargs = {"axis": 0, "block_size": self.config.block_size}
dq_node = onnx.helper.make_node(
"DequantizeLinear",
inputs=dq_input_names,
outputs=dq_output_names,
name=node.name + "_DQ_Q4" if node.name else "",
**dq_kwargs,
)
matmul_node = onnx.helper.make_node(
"MatMul",
inputs=matmul_input_names,
outputs=matmul_output_names,
name=node.name + "_matmul_Q4" if node.name else "",
)
output_nodes.extend([dq_node, matmul_node])
logger.info(f"complete quantization of {node.name} ...")
return output_nodes
class MatMul4BitsQuantizer:
"""
Perform 4b quantization of constant MatMul weights.
If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which replaces the
MatMul node.
If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is
replaced by the DequantizeLinear + MatMul nodes.
"""
def __init__(
self,
model: ModelProto | str,
block_size: int = 128,
is_symmetric: bool = False,
accuracy_level: int | None = None,
nodes_to_exclude=None,
quant_format=QuantFormat.QOperator,
algo_config: WeightOnlyQuantConfig | None = None,
):
if nodes_to_exclude is None:
nodes_to_exclude = []
self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model)
self.model_path = model if isinstance(model, str) else None
self.block_size = block_size
self.is_symmetric = is_symmetric
self.accuracy_level = accuracy_level
self.nodes_to_exclude = set(nodes_to_exclude)
self.node_quantizer = None
if algo_config is None:
algo_config = DefaultWeightOnlyQuantConfig(
block_size=block_size,
is_symmetric=is_symmetric,
accuracy_level=accuracy_level,
quant_format=quant_format,
)
self.algo_config = algo_config
if algo_config.algorithm == "HQQ":
self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
elif algo_config.algorithm == "DEFAULT":
self.node_quantizer = DefaultWeightOnlyQuantizer(self.algo_config)
def _process_subgraph(self, graph_stack: list[GraphProto]):
new_nodes = []
graph = graph_stack[-1]
for node in graph.node:
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if len(graph_attrs):
kwargs = {}
for attr in node.attribute:
if attr.type == onnx.AttributeProto.GRAPH:
# recursive call to take care of sub-graph
graph_stack.append(attr.g)
kv = {attr.name: self._process_subgraph(graph_stack)}
elif attr.type == onnx.AttributeProto.GRAPHS:
value = []
for subgraph in attr.graphs:
# recursive call to take care of sub-graph
graph_stack.append(subgraph)
value.extend([self._process_subgraph(graph_stack)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx.helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
out_nodes = []
if node.name in self.nodes_to_exclude:
logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
out_nodes = [node]
elif self.algo_config is not None and self.algo_config.algorithm == "HQQ":
out_nodes = self.node_quantizer.quantize(node, graph_stack)
else:
out_nodes = self.node_quantizer.quantize(node, graph_stack)
new_nodes.extend(out_nodes)
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_stack.pop()
return graph
def _generate_q4_node_config(self):
"""Generate weight only quant configuration for nodes."""
q4_node_config = {}
template_config_q4 = {
"bits": 4,
"group_size": self.block_size,
"scheme": "sym" if self.is_symmetric else "asym",
}
for node in self.model.model.graph.node:
if node.op_type in ["MatMul"]:
if not all([self.model.get_initializer(i) is None for i in node.input]):
q4_node_config[node.name] = template_config_q4
return q4_node_config
def int4_quant_algo(self):
"""4b quantize a model with RTN or GPTQ algorithm. Please refer to
https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
for more details on weight only quantization using Intel® Neural Compressor.
"""
def inc_dataloader():
data_reader = copy.deepcopy(self.algo_config.calibration_data_reader)
for data in data_reader:
yield data, None
kwargs = {}
if self.accuracy_level is not None:
kwargs["accuracy_level"] = self.accuracy_level
weight_only_node_config = self._generate_q4_node_config()
algorithm = self.algo_config.algorithm
logger.info(f"start to quantize model with {algorithm} algorithm...")
if algorithm == "RTN":
from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize
kwargs["ratios"] = self.algo_config.ratios
self.model = rtn_quantize(
model=self.model_path if self.model_path is not None else self.model.model,
weight_config=weight_only_node_config,
**kwargs,
)
elif algorithm == "GPTQ":
from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize
kwargs["percdamp"] = self.algo_config.percdamp
kwargs["blocksize"] = self.algo_config.block_size
kwargs["actorder"] = self.algo_config.actorder
kwargs["mse"] = self.algo_config.mse
kwargs["perchannel"] = self.algo_config.perchannel
kwargs["n_samples"] = -1
dataloader = inc_dataloader()
self.model = gptq_quantize(
model=self.model_path if self.model_path is not None else self.model.model,
weight_config=weight_only_node_config,
dataloader=dataloader,
**kwargs,
)
logger.info(f"complete quantization of model with {algorithm} algorithm.")
def process(self):
if self.algo_config.algorithm in ["HQQ", "DEFAULT"]:
# use a stack to keep track of sub-graphs
graph_stack = [self.model.graph()]
# Update domain opset
if self.algo_config.quant_format == QuantFormat.QOperator:
self.model.set_opset_import("com.microsoft", 1)
else:
opset_import = self.model.opset_import()
for opset in opset_import:
if opset.domain in [None, "ai.onnx", ""] and opset.version < 21:
logger.warning(
"The opset of the input model is under 21 and doesn't support int4 data type. "
"Force to update it to opset 21, but the generated model may not be a valid model."
)
self.model.set_opset_import(opset.domain, 21)
self._process_subgraph(graph_stack)
self.model.clean_initializers()
else:
# use Intel® Neural Compressor for RTN or GPTQ weight-only quantize algorithm
try:
importlib.import_module("neural_compressor")
except Exception as e:
logging.error(f"{e}.")
raise RuntimeError(
"neural-compressor is not correctly installed. Please check your environment."
) from e
import neural_compressor
assert version.parse(neural_compressor.__version__) >= version.parse(
"2.3.2"
), "Require neural-compressor >= 2.3.2 to support weight only quantization!"
self.int4_quant_algo()
def ort_convert_str_to_bool(value):
return value.lower() in ("true", "1")
def parse_args():
parser = argparse.ArgumentParser(
description="""Blockwise int4 quantization for MatMul 2D weight matrices.
A weight matrix is partitioned into blocks, where each block is a
contiguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
"""
)
parser.add_argument("--input_model", required=True, help="Path to the input model file")
parser.add_argument("--output_model", required=True, help="Path to the output model file")
parser.add_argument("--block_size", required=False, default=32, type=int, help="Block size for quantization")
parser.add_argument(
"--quant_method",
default="default",
type=str,
choices=["default", "hqq", "rtn", "gptq"],
help="the algorithm used to quantize weight, \nrtn and gptq leverage Intel® Neural Compressor",
)
parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight")
parser.add_argument(
"--symmetric",
required=False,
default=True,
const=True,
nargs="?",
type=ort_convert_str_to_bool,
choices=[True, False],
help="Indicate whether to quantize the model symmetrically, symmetric is not supported by hqq",
)
parser.add_argument(
"--accuracy_level",
required=False,
type=int,
help="Accuracy level of the 4-bit quantized MatMul computation. "
"Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details "
"(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
type=str,
required=False,
default=[],
help="Specify the nodes to be excluded from quantization with node names",
)
parser.add_argument(
"--quant_format",
default="QOperator",
type=str,
choices=["QOperator", "QDQ"],
help="QuantFormat {QOperator, QDQ}"
"QOperator format quantizes the model with quantized operators directly."
"QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.verbose:
logger.setLevel(logging.DEBUG)
input_model_path = args.input_model
output_model_path = args.output_model
quant_format = QuantFormat[args.quant_format]
if os.path.exists(output_model_path):
logger.error(f"file {output_model_path} already exists")
raise Exception(f"file {output_model_path} already exists")
if args.symmetric and args.quant_method == "hqq":
logger.warning("symmetric is not supportted by hqq, will force to symmetric=False")
args.symmetric = False
model = onnx.load(input_model_path)
if args.quant_method == "hqq":
quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits)
elif args.quant_method == "default":
quant_config = DefaultWeightOnlyQuantConfig(
block_size=args.block_size,
is_symmetric=args.symmetric,
accuracy_level=args.accuracy_level,
quant_format=quant_format,
)
elif args.quant_method == "rtn":
quant_config = RTNWeightOnlyQuantConfig()
elif args.quant_method == "gptq":
quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size)
else:
raise ValueError(f"Unsupported quantization method: {args.quant_method}")
quant = MatMul4BitsQuantizer(
model=model,
accuracy_level=args.accuracy_level,
nodes_to_exclude=args.nodes_to_exclude,
algo_config=quant_config,
)
quant.process()
quant.model.save_model_to_file(output_model_path, True)
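# Illustrative usage sketch: a minimal programmatic equivalent of the CLI flow above. It assumes
# the names defined in this module (onnx, QuantFormat, DefaultWeightOnlyQuantConfig,
# MatMul4BitsQuantizer) are in scope; the file paths are hypothetical placeholders.
def _example_quantize_to_int4():
    fp_model = onnx.load("model_fp32.onnx")  # hypothetical input path
    config = DefaultWeightOnlyQuantConfig(
        block_size=32,
        is_symmetric=True,
        quant_format=QuantFormat.QDQ,  # weights end up in DequantizeLinear + MatMul
    )
    quantizer = MatMul4BitsQuantizer(fp_model, algo_config=config)
    quantizer.process()
    quantizer.model.save_model_to_file("model_int4.onnx", True)  # True: use external data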

View File

@ -0,0 +1,240 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import logging
import os
from typing import List, Tuple
import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from onnxruntime.capi._pybind_state import quantize_matmul_bnb4
from .onnx_model import ONNXModel
from .quant_utils import attribute_to_kwarg
logger = logging.getLogger(__name__)
class MatMulBnb4Quantizer:
"""Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type"""
##################
# quantization types, must be consistent with native code type
# Bnb_DataType_t defined in blockwise_quant_block_bnb4.h
# 4b floating point with bias of 3
FP4 = 0
# 4b NormalFloat
NF4 = 1
def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None):
nodes_to_exclude = nodes_to_exclude or []
assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4]
self.model = ONNXModel(model)
self.quant_type = quant_type
self.block_size = block_size
self.nodes_to_exclude = set(nodes_to_exclude)
@staticmethod
def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]:
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray:
"""4b quantize fp32/fp16 weight"""
if len(fpweight.shape) != 2:
raise ValueError("Current bnb4 block quantization only supports 2D tensors!")
# need to copy since the transposed weight still has the original memory layout
# Linear4bit quantizes its weight data which is the transposed weight
fpweight_t = fpweight.transpose().copy()
rows, cols = fpweight.shape
numel = rows * cols
block_size = self.block_size
num_blocks = (numel + block_size - 1) // block_size
quantized_numel = (numel + 1) // 2
packed = np.zeros(quantized_numel, dtype="uint8")
absmax = np.zeros(num_blocks, dtype=fpweight.dtype)
# block wise quantization, fpweight_t is flattened and divided into blocks
quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows)
return (packed, absmax)
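# Worked example of the buffer sizes above (weight shape is hypothetical): a 4096 x 4096 fp16
# weight with block_size=64 has numel = 16,777,216, so absmax holds num_blocks = 262,144 scale
# values and packed holds (numel + 1) // 2 = 8,388,608 uint8 bytes (two 4-bit codes per byte).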
def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto:
"""If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
if node.op_type != "MatMul":
return node # only care about MatMul for now
logger.debug(f"start to quantize {node.name} ...")
if node.name in self.nodes_to_exclude:
logger.debug(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
return node
inputB = node.input[1] # noqa: N806
B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack) # noqa: N806
if B is None:
logger.debug("MatMul doesn't have const weight. Skip to quantize")
return node # only care about constant weight
B_array = onnx.numpy_helper.to_array(B) # noqa: N806
if len(B_array.shape) != 2:
logger.debug("MatMul weight is not 2D. Skip to quantize")
return node # can only process 2-D matrix
packed, absmax = self.bnb4_block_quant(B_array)
B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806
B_quant.name = B.name + "_Bnb4"
for input in Bs_graph.input:
if input.name == inputB:
Bs_graph.input.remove(input)
break
absmax_tensor = onnx.numpy_helper.from_array(absmax)
absmax_tensor.name = B.name + "_absmax"
Bs_graph.initializer.extend([B_quant, absmax_tensor])
kwargs = {}
rows, cols = B_array.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["block_size"] = self.block_size
kwargs["quant_type"] = self.quant_type
matmul_bnb4_node = onnx.helper.make_node(
"MatMulBnb4",
inputs=[node.input[0], B_quant.name, absmax_tensor.name],
outputs=[node.output[0]],
name=node.name + "_Bnb4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
logger.debug(f"complete quantization of {node.name} ...")
return matmul_bnb4_node
def _process_subgraph(self, graph_stack: List[GraphProto]):
new_nodes = []
graph = graph_stack[-1]
for node in graph.node:
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if len(graph_attrs):
kwargs = {}
for attr in node.attribute:
if attr.type == onnx.AttributeProto.GRAPH:
# recursive call to take care of sub-graph
graph_stack.append(attr.g)
kv = {attr.name: self._process_subgraph(graph_stack)}
elif attr.type == onnx.AttributeProto.GRAPHS:
value = []
for subgraph in attr.graphs:
# recursive call to take care of sub-graph
graph_stack.append(subgraph)
value.extend([self._process_subgraph(graph_stack)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx.helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack))
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_stack.pop()
return graph
def process(self):
# use a stack to keep track of sub-graphs
graph_stack = [self.model.graph()]
opset_import = self.model.opset_import()
has_ms_domain = False
for opset in opset_import:
if opset.domain == "com.microsoft":
has_ms_domain = True
if not has_ms_domain:
opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])
self._process_subgraph(graph_stack)
self.model.clean_initializers()
def parse_args():
parser = argparse.ArgumentParser(
description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.
A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
"""
)
parser.add_argument("--input_model", required=True, help="Path to the input model file")
parser.add_argument("--output_model", required=True, help="Path to the output model file")
parser.add_argument(
"--quant_type",
required=False,
default=1,
choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],
help="Quantization data type. 0: FP4, 1: NF4",
)
parser.add_argument(
"--block_size",
required=False,
default=64,
help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
type=str,
required=False,
default=[],
help="Specify the nodes to be excluded from quantization with node names",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.verbose:
logger.setLevel(logging.DEBUG)
input_model_path = args.input_model
output_model_path = args.output_model
if os.path.exists(output_model_path):
logger.error(f"file {output_model_path} already exists")
raise Exception(f"file {output_model_path} already exists")
model = onnx.load(input_model_path)
quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude)
quant.process()
quant.model.save_model_to_file(output_model_path, True)
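# Illustrative usage sketch: programmatic NF4 quantization with the class defined above. It
# assumes MatMulBnb4Quantizer and onnx are in scope (as in this module); paths are hypothetical.
def _example_quantize_to_nf4():
    fp_model = onnx.load("model_fp32.onnx")  # hypothetical input path
    quantizer = MatMulBnb4Quantizer(fp_model, MatMulBnb4Quantizer.NF4, block_size=64)
    quantizer.process()
    quantizer.model.save_model_to_file("model_nf4.onnx", True)  # True: use external data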

View File

@ -0,0 +1,580 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from pathlib import Path
import onnx
import onnx.helper as onnx_helper
import onnx.numpy_helper as onnx_numpy_helper
from onnx.onnx_pb import ModelProto
from .quant_utils import attribute_to_kwarg, find_by_name
def _clean_initializers_helper(graph, model):
"""Clean unused initializers from graph.
Returns:
A cleaned graph without unused initializers
A list of tensor names, which are not produced by this graph and its subgraphs
"""
requesting_tensor_names = set()
requesting_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
requesting_tensor_names.update(g_out.name for g_out in graph.output if g_out.name)
new_nodes = []
for node in graph.node:
new_node = node
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if graph_attrs:
kwargs = {}
for attr in node.attribute:
new_attribute = {}
if attr.type == onnx.AttributeProto.GRAPH:
(
cleaned_sub_graph,
sub_requesting_tensor_names,
) = _clean_initializers_helper(attr.g, model)
new_attribute = {attr.name: cleaned_sub_graph}
requesting_tensor_names.update(sub_requesting_tensor_names)
elif attr.type == onnx.AttributeProto.GRAPHS:
cleaned_graphs = []
for subgraph in attr.graphs:
(
cleaned_sub_graph,
sub_requesting_tensor_names,
) = _clean_initializers_helper(subgraph, model)
cleaned_graphs.append(cleaned_sub_graph)
requesting_tensor_names.update(sub_requesting_tensor_names)
new_attribute = {attr.name: cleaned_graphs}
else:
new_attribute = attribute_to_kwarg(attr)
kwargs.update(new_attribute)
new_node = onnx_helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
new_nodes.append(new_node)
graph.ClearField("node")
graph.node.extend(new_nodes)
requesting_tensor_names.difference_update(output for node in graph.node for output in node.output)
unused_initializer = []
for initializer in graph.initializer:
if initializer.name in requesting_tensor_names:
requesting_tensor_names.remove(initializer.name)
else:
# mark it for removal; removing it here directly would cause misbehavior
unused_initializer.append(initializer)
name_to_input = {input.name: input for input in graph.input}
for initializer in unused_initializer:
graph.initializer.remove(initializer)
if initializer.name in name_to_input:
try:
graph.input.remove(name_to_input[initializer.name])
except StopIteration:
if model.ir_version < 4:
print(f"Warning: invalid weight name {initializer.name} found in the graph (not a graph input)")
requesting_tensor_names.difference_update(input.name for input in graph.input)
return graph, requesting_tensor_names
class ONNXModel:
def __init__(self, model: ModelProto):
self.model = model
def nodes(self):
return self.model.graph.node
def initializer(self):
return self.model.graph.initializer
def initializer_extend(self, inits):
if len(inits) == 0:
raise ValueError("Can add an empty list.")
for init in self.initializer():
self._check_init(init, "gain")
for init in inits:
self._check_init(init)
self.model.graph.initializer.append(init)
def graph(self):
return self.model.graph
def ir_version(self):
return self.model.ir_version
def opset_import(self):
return self.model.opset_import
def set_opset_import(self, domain, version):
for opset in self.model.opset_import:
if opset.domain == domain:
opset.version = version
return
self.model.opset_import.extend([onnx_helper.make_opsetid(domain, version)])
def remove_node(self, node):
if node in self.model.graph.node:
self.model.graph.node.remove(node)
def remove_nodes(self, nodes_to_remove):
for node in nodes_to_remove:
self.remove_node(node)
def add_node(self, node):
self.model.graph.node.extend([self._check_node(node)])
def add_nodes(self, nodes_to_add):
for node in nodes_to_add:
self.add_node(node)
def add_initializer(self, tensor):
if find_by_name(tensor.name, self.model.graph.initializer) is None:
self._check_init(tensor)
self.model.graph.initializer.extend([tensor])
def get_initializer(self, name):
for tensor in self.model.graph.initializer:
if tensor.name == name:
return tensor
return None
def find_graph_input(self, input_name):
for input in self.model.graph.input:
if input.name == input_name:
return input
return None
def find_graph_output(self, output_name):
for output in self.model.graph.output:
if output.name == output_name:
return output
return None
def get_tensor_type(self, tensor_name: str):
tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
if tensor_name in tensor_type_map:
return tensor_type_map[tensor_name].tensor_type
g_input = self.find_graph_input(tensor_name)
if g_input:
return g_input.type.tensor_type
g_output = self.find_graph_output(tensor_name)
if g_output:
return g_output.type.tensor_type
return None
def get_constant_value(self, output_name):
for node in self.model.graph.node:
if node.op_type == "Constant":
if node.output[0] == output_name:
for attr in node.attribute:
if attr.name == "value":
return onnx_numpy_helper.to_array(attr.t)
# Fallback to initializer since constant folding may have been applied.
initializer = self.get_initializer(output_name)
if initializer is not None:
return onnx_numpy_helper.to_array(initializer)
return None
def get_initializer_name_set(self):
return {initializer.name for initializer in self.model.graph.initializer}
def remove_initializer(self, tensor):
if tensor in self.model.graph.initializer:
self.model.graph.initializer.remove(tensor)
for input in self.model.graph.input:
if input.name == tensor.name:
self.model.graph.input.remove(input)
break
def remove_initializers(self, init_to_remove):
for initializer in init_to_remove:
self.remove_initializer(initializer)
def get_non_initializer_inputs(self):
initializer_names = self.get_initializer_name_set()
non_initializer_inputs = set()
for input in self.model.graph.input:
if input.name not in initializer_names:
non_initializer_inputs.add(input.name)
return non_initializer_inputs
def input_name_to_nodes(self):
input_name_to_nodes = {}
for node in self.model.graph.node:
for input_name in node.input:
if input_name: # Could be empty when it is optional
if input_name not in input_name_to_nodes:
input_name_to_nodes[input_name] = [node]
else:
input_name_to_nodes[input_name].append(node)
return input_name_to_nodes
def output_name_to_node(self):
output_name_to_node = {}
for node in self.model.graph.node:
for output_name in node.output:
if output_name: # Could be empty when it is optional
output_name_to_node[output_name] = node
return output_name_to_node
def get_children(self, node, input_name_to_nodes=None):
if input_name_to_nodes is None:
input_name_to_nodes = self.input_name_to_nodes()
children = []
for output in node.output:
if output in input_name_to_nodes:
for node in input_name_to_nodes[output]:
children.append(node) # noqa: PERF402
return children
def get_parents(self, node, output_name_to_node=None):
if output_name_to_node is None:
output_name_to_node = self.output_name_to_node()
parents = []
for input in node.input:
if input in output_name_to_node:
parents.append(output_name_to_node[input])
return parents
def get_parent(self, node, idx, output_name_to_node=None):
if output_name_to_node is None:
output_name_to_node = self.output_name_to_node()
if len(node.input) <= idx:
return None
input = node.input[idx]
if input not in output_name_to_node:
return None
return output_name_to_node[input]
def find_node_by_name(self, node_name, new_nodes_list, graph):
"""Find out if a node exists in a graph or a node is in the
new set of nodes created during quantization.
Returns:
The node found or None.
"""
graph_nodes_list = list(graph.node)  # shallow copy of the node list
graph_nodes_list.extend(new_nodes_list)
node = find_by_name(node_name, graph_nodes_list)
return node
def get_largest_node_name_suffix(self, node_name_prefix):
"""
Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`.
Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3.
"""
suffix = -1
for node in self.model.graph.node:
if node.name and node.name.startswith(node_name_prefix):
try:
index = int(node.name[len(node_name_prefix) :])
suffix = max(index, suffix)
except ValueError:
continue
return suffix
def find_nodes_by_initializer(self, graph, initializer):
"""
Find all nodes with given initializer as an input.
"""
nodes = []
for node in graph.node:
for node_input in node.input:
if node_input == initializer.name:
nodes.append(node)
return nodes
@staticmethod
def __get_initializer(name, graph_path):
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
@staticmethod
def __replace_gemm_with_matmul(graph_path):
new_nodes = []
graph = graph_path[-1]
for node in graph.node:
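# attr.type 5 is onnx.AttributeProto.GRAPH and 10 is onnx.AttributeProto.GRAPHS.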
graph_attrs = [attr for attr in node.attribute if attr.type == 5 or attr.type == 10]
if len(graph_attrs):
kwargs = {}
for attr in node.attribute:
if attr.type == 5:
graph_path.append(attr.g)
kv = {attr.name: ONNXModel.__replace_gemm_with_matmul(graph_path)}
elif attr.type == 10:
value = []
for subgraph in attr.graphs:
graph_path.append(subgraph)
value.extend([ONNXModel.__replace_gemm_with_matmul(graph_path)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx_helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
if node.op_type == "Gemm":
alpha = 1.0
beta = 1.0
transA = 0 # noqa: N806
transB = 0 # noqa: N806
for attr in node.attribute:
if attr.name == "alpha":
alpha = onnx_helper.get_attribute_value(attr)
elif attr.name == "beta":
beta = onnx_helper.get_attribute_value(attr)
elif attr.name == "transA":
transA = onnx_helper.get_attribute_value(attr) # noqa: N806
elif attr.name == "transB":
transB = onnx_helper.get_attribute_value(attr) # noqa: N806
if alpha == 1.0 and beta == 1.0 and transA == 0:
inputB = node.input[1] # noqa: N806
if transB == 1:
B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) # noqa: N806
if B:
# assume B is not used by any other node
B_array = onnx_numpy_helper.to_array(B) # noqa: N806
B_trans = onnx_numpy_helper.from_array(B_array.T) # noqa: N806
B_trans.name = B.name
Bs_graph.initializer.remove(B)
for input in Bs_graph.input:
if input.name == inputB:
Bs_graph.input.remove(input)
break
Bs_graph.initializer.extend([B_trans])
else:
inputB += "_Transposed" # noqa: N806
transpose_node = onnx_helper.make_node(
"Transpose",
inputs=[node.input[1]],
outputs=[inputB],
name=node.name + "_Transpose" if node.name else "",
)
new_nodes.append(transpose_node)
matmul_node = onnx_helper.make_node(
"MatMul",
inputs=[node.input[0], inputB],
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
name=node.name + "_MatMul" if node.name else "",
)
new_nodes.append(matmul_node)
if len(node.input) > 2:
add_node = onnx_helper.make_node(
"Add",
inputs=[node.output[0] + "_MatMul", node.input[2]],
outputs=node.output,
name=node.name + "_Add" if node.name else "",
)
new_nodes.append(add_node)
# unsupported
else:
new_nodes.append(node)
# not GEMM
else:
new_nodes.append(node)
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_path.pop()
return graph
def replace_gemm_with_matmul(self):
graph_path = [self.graph()]
ONNXModel.__replace_gemm_with_matmul(graph_path)
def save_model_to_file(self, output_path, use_external_data_format=False):
"""
Save the model to a file, optionally using the external data format, which is needed for model size > 2GB
"""
self.topological_sort()
if use_external_data_format:
onnx.external_data_helper.convert_model_to_external_data(
self.model,
all_tensors_to_one_file=True,
location=Path(output_path).name + ".data",
convert_attribute=True,
)
for init in self.model.graph.initializer:
self._check_init(init, "end")
onnx.save_model(self.model, output_path)
@staticmethod
def replace_node_input(node, old_input_name, new_input_name):
assert isinstance(old_input_name, str) and isinstance(new_input_name, str)
for j in range(len(node.input)):
if node.input[j] == old_input_name:
node.input[j] = new_input_name
def replace_input_of_all_nodes(self, old_input_name, new_input_name):
for node in self.model.graph.node:
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
def replace_input_of_nodes(self, old_input_name, new_input_name, node_names_set):
for node in self.model.graph.node:
if node.name in node_names_set:
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
@staticmethod
def replace_node_output(node, old_output_name, new_output_name):
assert isinstance(old_output_name, str) and isinstance(new_output_name, str)
for j in range(len(node.output)):
if node.output[j] == old_output_name:
node.output[j] = new_output_name
def replace_output_of_all_nodes(self, old_output_name, new_output_name):
for node in self.model.graph.node:
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
def replace_output_of_nodes(self, old_output_name, new_output_name, node_names_set):
for node in self.model.graph.node:
if node.name in node_names_set:
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
def remove_unused_constant(self):
input_name_to_nodes = self.input_name_to_nodes()
# remove unused constant
unused_nodes = []
nodes = self.nodes()
for node in nodes:
if (
node.op_type == "Constant"
and not self.is_graph_output(node.output[0])
and node.output[0] not in input_name_to_nodes
):
unused_nodes.append(node)
self.remove_nodes(unused_nodes)
unused_weights = []
for w in self.initializer():
if w.name not in input_name_to_nodes and not self.is_graph_output(w.name):
unused_weights.append(w)
# Remove from graph.input
for graph_input in self.graph().input:
if graph_input.name == w.name:
self.graph().input.remove(graph_input)
self.remove_initializers(unused_weights)
def is_graph_output(self, output_name):
return any(output.name == output_name for output in self.model.graph.output)
def is_graph_input(self, tensor_name: str) -> bool:
return any(input.name == tensor_name for input in self.model.graph.input)
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
def topological_sort(self):
deps_count = [0] * len(self.nodes()) # dependency count of each node
deps_to_nodes = {}  # input name to node indices
sorted_nodes = [] # initialize sorted_nodes
for node_idx, node in enumerate(self.nodes()):
# CANNOT use len(node.input) directly because input can be optional
deps_count[node_idx] = sum(1 for _ in node.input if _)
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
sorted_nodes.append(self.nodes()[node_idx])
continue
for input_name in node.input:
if not input_name:
continue
if input_name not in deps_to_nodes:
deps_to_nodes[input_name] = [node_idx]
else:
deps_to_nodes[input_name].append(node_idx)
initializer_names = [init.name for init in self.initializer()]
graph_input_names = [input.name for input in self.model.graph.input]
input_names = initializer_names + graph_input_names
input_names.sort()
prev_input_name = None
for input_name in input_names:
if prev_input_name == input_name:
continue
prev_input_name = input_name
if input_name in deps_to_nodes:
for node_idx in deps_to_nodes[input_name]:
deps_count[node_idx] = deps_count[node_idx] - 1
if deps_count[node_idx] == 0:
sorted_nodes.append(self.nodes()[node_idx])
start = 0
end = len(sorted_nodes)
while start < end:
for output in sorted_nodes[start].output:
if output in deps_to_nodes:
for node_idx in deps_to_nodes[output]:
deps_count[node_idx] = deps_count[node_idx] - 1
if deps_count[node_idx] == 0:
sorted_nodes.append(self.nodes()[node_idx])
end = end + 1
start = start + 1
assert end == len(self.graph().node), "Graph is not a DAG"
self.graph().ClearField("node")
self.graph().node.extend(sorted_nodes)
def clean_initializers(self):
return _clean_initializers_helper(self.graph(), self.model)
def _check_init(self, init, test=None):
if init.data_type == onnx.TensorProto.FLOAT8E4M3FN:
if init.HasField("raw_data"):
b = list(init.raw_data)
if any(map(lambda i: (i & 127) == 127, b)):
raise ValueError(f"Initializer {init.name!r} has nan.")
return init
def _check_node(self, node):
"""
A quantization to float 8 does not use quantized bias but float 16 bias.
This function checks that DequantizeLinear is not used to
dequantize from float 16.
"""
if node.op_type == "DequantizeLinear":
zero_point = node.input[2]
init = self.get_initializer(zero_point)
dtype = init.data_type
if dtype in {
onnx.TensorProto.FLOAT16,
onnx.TensorProto.FLOAT,
onnx.TensorProto.DOUBLE,
onnx.TensorProto.BFLOAT16,
}:
raise RuntimeError(f"Unsupported DequantizeLinear operator, dequantization from {dtype}.")
return node
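# Illustrative usage sketch of the wrapper above, assuming ONNXModel and onnx are in scope (as in
# this module); one possible pre-processing flow before quantization. File names are hypothetical.
def _example_prepare_model():
    wrapper = ONNXModel(onnx.load("model.onnx"))  # hypothetical path
    wrapper.replace_gemm_with_matmul()  # rewrite eligible Gemm nodes into MatMul (+ Add for bias)
    wrapper.clean_initializers()  # drop initializers that are no longer referenced
    wrapper.save_model_to_file("model_prepared.onnx", use_external_data_format=True)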

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger

View File

@ -0,0 +1,119 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearActivation(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def QuantizeClipRelu(self): # noqa: N802
node = self.node
assert node.op_type == "Relu" or node.op_type == "Clip"
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
# If input to this node is not quantized then keep this node
# If activation is symmetric, do not quantize the op and simply return
if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
return super().quantize()
quantized_value = self.quantizer.quantized_value_map[node.input[0]]
self.quantizer.quantized_value_map[node.output[0]] = quantized_value
def quantize(self):
node = self.node
if node.op_type == "Relu" or node.op_type == "Clip":
self.QuantizeClipRelu()
return
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
sigmoid_nnapi_mode = (
node.op_type == "Sigmoid"
and nnapi_sigmoid_option in self.quantizer.extra_options
and self.quantizer.extra_options[nnapi_sigmoid_option]
)
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
use_zeropoint = 0 if sigmoid_nnapi_mode else None
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_activation_name = ""
if node.name:
qlinear_activation_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_activation_inputs = [
quantized_input_names[0],
scale_names[0],
zero_point_names[0],
output_scale_name,
output_zp_name,
]
qlinear_activation_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_activation_inputs,
[qlinear_activation_output],
qlinear_activation_name,
**kwargs,
)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_activation_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
nodes.append(qlinear_activation_node)
self.quantizer.new_nodes += nodes
class QDQRemovableActivation(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# If input to this node is not quantized then keep this node
if not self.quantizer.is_tensor_quantized(node.input[0]):
return
if (
not self.quantizer.is_activation_symmetric
and not self.quantizer.qdq_keep_removable_activations
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
):
self.quantizer.remove_node(self.node)
else:
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])

View File

@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase
# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
node.input[0] = quantized_input_value.q_name
self.quantizer.new_nodes += [node]
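# Rationale: affine quantization with a positive scale is monotonically non-decreasing, so (up to
# rounding ties) the index of the maximum is unchanged and ArgMax can read the quantized tensor
# directly, without an intermediate DequantizeLinear.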

View File

@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantize Attention
"""
class AttentionQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
"""
parameter node: Attention node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
"""
node = self.node
assert node.op_type == "Attention"
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
# is implemented
for attr in node.attribute:
if attr.name == "qkv_hidden_sizes":
return super().quantize()
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if quantized_input_names is None:
return super().quantize()
qattention_name = "" if not node.name else node.name + "_quant"
inputs = []
inputs.extend(quantized_input_names)
inputs.extend([node.input[2]])
inputs.extend(scale_names)
inputs.extend([node.input[3] if len(node.input) > 3 else ""])
inputs.extend(zero_point_names)
inputs.extend([node.input[4] if len(node.input) > 4 else ""])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
nodes.append(qattention_node)
self.quantizer.new_nodes += nodes
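# The QAttention inputs assembled above are, in order: quantized input, quantized weight, bias,
# input scale, weight scale, mask_index (optional), input zero point, weight zero point, and
# past (optional), following how quantized_input_names, scale_names and zero_point_names are
# interleaved with the original inputs.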

View File

@ -0,0 +1,26 @@
class QuantOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_float_tensor(self.node.input[0])
def quantize(self):
"""
Given a node which does not support quantization, this method checks whether the input to
this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
parameter node: Current node
parameter new_nodes_list: List of new nodes created before processing current node
return: List of new nodes created
"""
for _, node_input in enumerate(self.node.input):
dequantize_node = self.quantizer._dequantize_value(node_input)
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)
# Append the original node
self.quantizer.new_nodes.append(self.node)

View File

@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearBinaryOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0, 1])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_binary_math_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_binary_math_inputs = []
# Input 0
qlinear_binary_math_inputs.append(quantized_input_names[0])
qlinear_binary_math_inputs.append(scale_names[0])
qlinear_binary_math_inputs.append(zero_point_names[0])
# Input 1
qlinear_binary_math_inputs.append(quantized_input_names[1])
qlinear_binary_math_inputs.append(scale_names[1])
qlinear_binary_math_inputs.append(zero_point_names[1])
# Output
qlinear_binary_math_inputs.append(output_scale_name)
qlinear_binary_math_inputs.append(output_zp_name)
qlinear_binary_math_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_binary_math_inputs,
[qlinear_binary_math_output],
qlinear_binary_math_name,
**kwargs,
)
nodes.append(qlinear_binary_math_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_binary_math_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes

View File

@ -0,0 +1,62 @@
import onnx
from ..quant_utils import ( # noqa: F401
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase # noqa: F401
class QLinearConcat(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
if not data_found or q_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qnode_name = node.name + "_quant" if node.name else ""
qlconcat_inputs = [output_scale_name, output_zp_name]
for i in range(len(q_input_names)):
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
qlconcat_node = onnx.helper.make_node(
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlconcat_node]

View File

@ -0,0 +1,258 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
find_by_name,
get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class ConvInteger(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def add_bias(self, nodes, scaled_output):
"""
Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node
parameter nodes: list to which the new nodes are appended
parameter node: current node (Conv)
parameter scaled_output: output of quant conv without bias
parameter output: output of Conv
parameter bias_name: bias of Conv
return: None (the final Add node writes directly to the original Conv output)
"""
node = self.node
model = self.quantizer.model
# Add tensors for the shape to be reshaped to
weight = find_by_name(node.input[1], model.initializer())
if weight is None:
raise ValueError(f"Expected {node.input[1]} to be an initializer")
# Add reshape for correct broadcast
output = node.output[0]
reshape_input_data = node.input[2] # bias of Conv
reshape_input_shape = output + "_bias_reshape_shape"
reshape_output = output + "_bias_reshape_output"
shape = np.ones((len(weight.dims)), dtype=np.int64)
shape[1] = -1
init_shape = onnx.helper.make_tensor(
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
)
model.add_initializer(init_shape)
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
nodes.append(reshape_node)
# Add an Add operation for bias
add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
nodes.append(add_node)
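# Example of the reshape target built above (assuming a rank-4 Conv weight): shape becomes
# [1, -1, 1, 1], so the per-output-channel bias broadcasts across N, H and W when added to the
# scaled ConvInteger output.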
def quantize(self):
node = self.node
assert node.op_type == "Conv"
# Get quantized inputs from both activation (input[0]) and weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
conv_integer_output = node.output[0] + "_output_quantized"
conv_integer_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
conv_integer_node = onnx.helper.make_node(
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
)
nodes.append(conv_integer_node)
# Add cast operation to cast convInteger output to float.
onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_op_output = conv_integer_output + "_cast_output"
cast_node = onnx.helper.make_node(
"Cast",
[conv_integer_output],
[cast_op_output],
conv_integer_output + "_cast",
to=onnx_type,  # TODO: FLOAT or FLOAT16
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
if conv_integer_name:
scales_mul_op = conv_integer_name + "_scales_mul"
else:
scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
has_bias = len(node.input) == 3
scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
# and make the output of this node the same as output of original conv node.
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
scaled_output_name,
output_scale_mul_op,
)
)
if has_bias:
self.add_bias(nodes, scaled_output_name)
self.quantizer.new_nodes += nodes
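# Summary of the rewrite above: ConvInteger yields an int32 accumulator, which is cast to float
# and multiplied by input_scale * weight_scale to recover the real-valued output; the float bias,
# if present, is reshaped and added afterwards by add_bias().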
class QLinearConv(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
bias_present = False
if len(node.input) == 3:
if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
bias_present = True
qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_conv_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
qlinear_conv_inputs = []
# Input 0
qlinear_conv_inputs.append(quantized_input_names[0])
qlinear_conv_inputs.append(scale_names[0])
qlinear_conv_inputs.append(zero_point_names[0])
# Input 1
qlinear_conv_inputs.append(quantized_input_names[1])
qlinear_conv_inputs.append(scale_names[1])
qlinear_conv_inputs.append(zero_point_names[1])
# Output
qlinear_conv_inputs.append(output_scale_name)
qlinear_conv_inputs.append(output_zp_name)
if bias_present:
qlinear_conv_inputs.append(quantized_bias_name)
qlinear_conv_node = onnx.helper.make_node(
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
)
nodes.append(qlinear_conv_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_conv_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQConv(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv" or node.op_type == "ConvTranspose"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if node.op_type == "Conv" else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

View File

@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
# For operators that support 8-bit operations directly, where the output can
# reuse input[0]'s type, zero point, and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
if not self.quantizer.force_quantize_no_input_check:
# Keep backward compatibility
# Quantize when input[0] is quantized already. Otherwise keep it.
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
else:
# Force quantize those ops if possible; use the exclude node list if this is not what you want
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
super().quantize()
return
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_names[0]
node.output[0] = quantized_output_value.q_name
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQDirect8BitOp(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
if self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(self.node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)

View File

@ -0,0 +1,121 @@
import logging
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""
class EmbedLayerNormalizationQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
node = self.node
assert node.op_type == "EmbedLayerNormalization"
if len(node.output) > 2:
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
return super().quantize()
"""
Pre-quantization EmbedLayerNorm inputs:
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (float32)
[3] position_embedding (float32)
[4] segment_embedding (float32)
[5] gamma (float32)
[6] beta (float32)
[7] mask (int32) (optional)
"""
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
if quantized_input_names is None:
return super().quantize()
qembed_layer_norm_name = "" if not node.name else node.name + "_quant"
"""
Quantized Input Tensor List
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (uint8)
[3] position_embedding (uint8)
[4] segment_embedding (uint8)
[5] gamma (uint8)
[6] beta (uint8)
[7] mask (int32) (optional)
[8] word_embedding_scale (float)
[9] position_embedding_scale (float)
[10] segment_embedding_scale (float)
[11] gamma_scale (float)
[12] beta_scale (float)
[13] word_embedding_zero_point (uint8)
[14] position_embedding_zero_point (uint8)
[15] segment_embedding_zero_point (uint8)
[16] gamma_zero_point (uint8)
[17] beta_zero_point (uint8)
"""
inputs = []
# 'input_ids'
inputs.extend([node.input[0]])
# 'segment_ids'
inputs.extend([node.input[1]])
# 'word_embedding_quant'
inputs.extend([quantized_input_names[0]])
# 'position_embedding_quant'
inputs.extend([quantized_input_names[1]])
# 'segment_embedding_quant'
inputs.extend([quantized_input_names[2]])
# 'gamma_quant'
inputs.extend([quantized_input_names[3]])
# 'beta_quant'
inputs.extend([quantized_input_names[4]])
# 'mask' (optional)
inputs.extend([node.input[7] if len(node.input) > 7 else ""])
# Add all scales:
inputs.extend([scale_names[0]])
inputs.extend([scale_names[1]])
inputs.extend([scale_names[2]])
inputs.extend([scale_names[3]])
inputs.extend([scale_names[4]])
# Add all zero points:
inputs.extend([zero_point_names[0]])
inputs.extend([zero_point_names[1]])
inputs.extend([zero_point_names[2]])
inputs.extend([zero_point_names[3]])
inputs.extend([zero_point_names[4]])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qembed_layer_norm_node = onnx.helper.make_node(
"QEmbedLayerNormalization",
inputs,
node.output,
qembed_layer_norm_name,
**kwargs,
)
nodes.append(qembed_layer_norm_node)
self.quantizer.new_nodes += nodes

View File

@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
"""
Quantize Gather
"""
class GatherQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_valid_quantize_weight(self.node.input[0])
def quantize(self):
node = self.node
assert node.op_type == "Gather"
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
gather_new_output,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
node.output[0] = gather_new_output
node.input[0] = quantized_input_names[0]
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQGather(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gather"
if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(node.input[0])
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
elif self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)

View File

@ -0,0 +1,62 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QGlobalAveragePool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "GlobalAveragePool"
# If input to this node is not quantized then keep this node.
if node.input[0] not in self.quantizer.quantized_value_map:
return super().quantize()
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
# Create an entry for output quantized value.
(
data_found,
output_scale_name_from_parameter,
output_zp_name_from_parameter,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# Just use the input scale and zero point if parameters for the output are not specified.
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
kwargs["channels_last"] = 0
qnode_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_value.q_name,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
output_scale_name,
output_zp_name,
],
[quantized_output_value.q_name],
qnode_name,
**kwargs,
)
self.quantizer.new_nodes += [qnode]

View File

@ -0,0 +1,166 @@
import logging
import numpy as np # noqa: F401
import onnx
from ..quant_utils import find_by_name # noqa: F401
from ..quant_utils import get_mul_node # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase
def is_B_transposed(gemm_node): # noqa: N802
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806
if len(transB_attribute):
return onnx.helper.get_attribute_value(transB_attribute[0]) > 0
return False
def get_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
return onnx.helper.get_attribute_value(beta_attribute[0])
return 1.0
def set_default_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
beta_attribute[0].f = 1.0
return 1.0
class QLinearGemm(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
self.quantizer.weight_qType,
0 if is_B_transposed(node) else 1,
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
if len(node.input) == 3:
if not self.quantizer.is_input_a_initializer(node.input[2]):
return super().quantize()
# Note: if the quantized type is float 8, the bias is converted into float 16.
# cublasLtMatMul only supports (b)float16 or float32 bias.
quantized_bias_name = self.quantizer.quantize_bias_static(
node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qgemm_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
if attribute.name != "beta":
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# generate input
qgemm_inputs = []
for i in range(2):
qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
nodes.append(qgemm_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qgemm_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
node_type=node.op_type,
node_qtype=self.quantizer.weight_qType,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQGemm(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if is_B_transposed(node) else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
if self.quantizer.is_input_a_initializer(node.input[2]):
self.quantizer.quantize_bias_tensor(
node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
set_default_beta(self.node)
else:
logging.warning(
f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
)
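The per-channel axis chosen above follows the Gemm transB attribute: when B is transposed its rows are the output channels (axis 0), otherwise its columns are (axis 1). A standalone sketch of reading that attribute with the onnx helper (the node here is built only for illustration):

import onnx

gemm = onnx.helper.make_node("Gemm", ["A", "B", "C"], ["Y"], transB=1)
trans_b = [attr for attr in gemm.attribute if attr.name == "transB"]
b_is_transposed = bool(trans_b) and onnx.helper.get_attribute_value(trans_b[0]) > 0
print(0 if b_is_transposed else 1)  # 0: quantize B per channel along axis 0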

View File

@ -0,0 +1,117 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401
from .base_operator import QuantOperatorBase
"""
Quantize LSTM
"""
class LSTMQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
"""
parameter node: LSTM node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents the quantized LSTM node.
"""
node = self.node
assert node.op_type == "LSTM"
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
node.input[2]
):
super().quantize()
return
model = self.quantizer.model
W = model.get_initializer(node.input[1]) # noqa: N806
R = model.get_initializer(node.input[2]) # noqa: N806
if len(W.dims) != 3 or len(R.dims) != 3:
super().quantize()
return
[W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806
[R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806
if self.quantizer.is_per_channel():
del W.dims[0]
del R.dims[0]
W.dims[0] = W_num_dir * W_4_hidden_size
R.dims[0] = R_num_dir * R_4_hidden_size
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[2], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806
W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806
R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806
W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806
R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806
W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806
R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806
W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806
model.remove_initializers([W_quant_weight, R_quant_weight])
model.add_initializer(W_quant_transposed)
model.add_initializer(R_quant_transposed)
W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806
R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806
W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806
R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806
if self.quantizer.is_per_channel():
W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]
inputs = []
input_len = len(node.input)
inputs.extend([node.input[0]])
inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
inputs.extend([node.input[3] if input_len > 3 else ""])
inputs.extend([node.input[4] if input_len > 4 else ""])
inputs.extend([node.input[5] if input_len > 5 else ""])
inputs.extend([node.input[6] if input_len > 6 else ""])
inputs.extend([node.input[7] if input_len > 7 else ""])
inputs.extend(
[
quant_input_weight_tuple[2],
quant_input_weight_tuple[1],
quant_recurrent_weight_tuple[2],
quant_recurrent_weight_tuple[1],
]
)
kwargs = {}
for attribute in node.attribute:
if attribute.name == "layout":
continue
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
quant_lstm_name = "" if not node.name else node.name + "_quant"
quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
self.quantizer.new_nodes.append(quant_lstm_node)
dequantize_node = self.quantizer._dequantize_value(node.input[0])
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)

View File

@ -0,0 +1,228 @@
import itertools
import logging
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QOpMatMul(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
logging.debug(f"Ignore MatMul {self.node.name}]")
return False
if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
not self.quantizer.is_float_tensor(self.node.input[0])
):
logging.info(f"Ignore MatMul due to non float inputs {self.node.name}]")
return False
# do not quantize non-constant B matrices for matmul
if self.quantizer.q_matmul_const_b_only:
if not self.quantizer.find_initializer_in_path(self.node.input[1]):
logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
return False
return True
"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""
class MatMulInteger(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
matmul_integer_output = node.output[0] + "_output_quantized"
matmul_integer_name = node.name + "_quant" if node.name else ""
matmul_integer_node = onnx.helper.make_node(
"MatMulInteger",
quantized_input_names + zero_point_names,
[matmul_integer_output],
matmul_integer_name,
)
nodes.append(matmul_integer_node)
# Add cast operation to cast matmulInteger output to float.
cast_op_output = matmul_integer_output + "_cast_output"
otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_node = onnx.helper.make_node(
"Cast",
[matmul_integer_output],
[cast_op_output],
matmul_integer_output + "_cast",
to=otype,
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
scales_mul_op = (
matmul_integer_name + "_scales_mul"
if matmul_integer_name
else scale_names[0] + "_" + scale_names[1] + "_mul"
)
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
# Add mul operation to multiply mul_scales_op result with output of MatMulInteger
# and make the output of this node the same as output of original matmul node.
output_scale_mul_op = ""
if matmul_integer_name:
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
node.output[0],
output_scale_mul_op,
)
)
self.quantizer.new_nodes += nodes
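The MatMulInteger / Cast / Mul decomposition above approximates the float matmul because (q_a - z_a) @ (q_b - z_b) * s_a * s_b ≈ A @ B. A standalone numpy sketch of the same arithmetic:

import numpy as np

a = np.random.rand(2, 4).astype(np.float32)
b = np.random.rand(4, 3).astype(np.float32)
scale_a, zp_a = np.float32(a.max() / 255), 0   # uint8 activation, zero point 0 for simplicity
scale_b, zp_b = np.float32(b.max() / 127), 0   # int8 weight, symmetric
qa = np.clip(np.round(a / scale_a) + zp_a, 0, 255).astype(np.uint8)
qb = np.clip(np.round(b / scale_b) + zp_b, -127, 127).astype(np.int8)
acc = (qa.astype(np.int32) - zp_a) @ (qb.astype(np.int32) - zp_b)  # what MatMulInteger computes
approx = acc.astype(np.float32) * (scale_a * scale_b)              # Cast + scale Mul
print(np.abs(approx - a @ b).max())  # small quantization error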
"""
Used when quantize mode is QuantizationMode.QLinearOps
"""
class QLinearMatMul(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_matmul_name = node.name + "_quant" if node.name else ""
qlinear_matmul_inputs = []
# Input 0
qlinear_matmul_inputs.append(quantized_input_names[0])
qlinear_matmul_inputs.append(scale_names[0])
qlinear_matmul_inputs.append(zero_point_names[0])
# Input 1
qlinear_matmul_inputs.append(quantized_input_names[1])
qlinear_matmul_inputs.append(scale_names[1])
qlinear_matmul_inputs.append(zero_point_names[1])
# Output quantization parameter
qlinear_matmul_inputs.append(output_scale_name)
qlinear_matmul_inputs.append(output_zp_name)
domain = (
"com.microsoft"
if self.quantizer.weight_qType
in {
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
}
else ""
)
qlinear_matmul_node = onnx.helper.make_node(
"QLinearMatMul",
qlinear_matmul_inputs,
[qlinear_matmul_output],
qlinear_matmul_name,
domain=domain,
)
nodes.append(qlinear_matmul_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_matmul_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQMatMul(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
if self.disable_qdq_for_node_output:
nodes_to_iterate = node.input
else:
nodes_to_iterate = itertools.chain(node.input, node.output)
for tensor_name in nodes_to_iterate:
is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
tensor_name, default_axis=1, op_type=node.op_type
)
if is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
else:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QMaxPool(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if version is less than 12, go to normal quantize.
if self.quantizer.opset_version < 12:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQMaxPool(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if opset version is less than 12, leave this node unchanged
if self.quantizer.opset_version < 12:
return
# Direct 8bits op
return super().quantize()

View File

@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from .qdq_base_operator import QDQOperatorBase
class QDQNormalization(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}
# Input
self.quantizer.quantize_activation_tensor(node.input[0])
# Scale
scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=1, op_type=node.op_type
)
if scale_is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
elif scale_is_initializer:
self.quantizer.quantize_weight_tensor(node.input[1])
else:
self.quantizer.quantize_activation_tensor(node.input[1])
# Bias
if len(node.input) > 2 and node.input[2]:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
# Output
if not self.disable_qdq_for_node_output:
for output_name in node.output:
self.quantizer.quantize_activation_tensor(output_name)

View File

@ -0,0 +1,100 @@
import onnx
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
quantize_nparray,
)
from .base_operator import QuantOperatorBase
class QPad(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Pad"
# The optional 'constant_value' input is only available since opset 11.
# If input[0] is not quantized, do not quantize this node.
if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
super().quantize()
return
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
kwargs = {}
for attribute in node.attribute:
kv = attribute_to_kwarg(attribute)
kwargs.update(kv)
if "mode" not in kwargs or kwargs["mode"] == b"constant":
if len(node.input) > 2 and node.input[2] != "": # There is 3rd input 'constant_value'
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
if zp_tensor is None or scale_tensor is None:
super().quantize()
return
padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
if padding_constant_initializer is not None:
zp_array = onnx.numpy_helper.to_array(zp_tensor)
zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
scale_array = onnx.numpy_helper.to_array(scale_tensor)
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
quantized_padding_constant_array = quantize_nparray(
self.quantizer.activation_qType,
padding_constant_array,
scale_value,
zp_value,
)
quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
quantized_padding_constant_array,
quantized_padding_constant_name,
)
# Assume this padding constant initializer is only used by this node.
self.quantizer.model.remove_initializer(padding_constant_initializer)
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
node.input[2] = quantized_padding_constant_name
else:
# TODO: check quantize_inputs after sub graph is supported
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
node,
2,
self.quantizer.activation_qType,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
initial_type=scale_tensor.data_type,
)
self.quantizer.new_nodes.extend(pad_value_qnodes)
node.input[2] = pad_value_qnodes[0].output[0]
else:
# In the quantized domain, the real value `zero` maps to
# quantized_input_value.zp_name. Thus, padding the original tensor
# with 0 becomes padding the quantized tensor with the zero point.
if len(node.input) == 2:
# Feed quantization's zero point to padding node.
node.input.append(quantized_input_value.zp_name)
else:
# Assign quantization's zero point to padding node.
assert node.input[2] == ""
node.input[2] = quantized_input_value.zp_name
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
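The zero-point padding above is correct because in the affine scheme q = round(x / s) + zp the real value 0.0 maps exactly to zp, so padding the quantized tensor with zp dequantizes back to zero padding. A quick standalone check:

import numpy as np

scale, zp = np.float32(0.1), 128
x = np.array([0.5, -0.3], dtype=np.float32)
q = np.clip(np.round(x / scale) + zp, 0, 255).astype(np.uint8)
q_padded = np.pad(q, (1, 1), constant_values=zp)      # pad with the zero point
print((q_padded.astype(np.float32) - zp) * scale)     # ~[ 0.   0.5 -0.3  0. ]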

View File

@ -0,0 +1,67 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearPool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear pool node for given type (AveragePool, etc)
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes

View File

@ -0,0 +1,22 @@
import itertools
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401
from .base_operator import QuantOperatorBase # noqa: F401
class QDQOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization
def quantize(self):
node = self.node
if self.disable_qdq_for_node_output:
tensors_to_quantize = node.input
else:
tensors_to_quantize = itertools.chain(node.input, node.output)
for tensor_name in tensors_to_quantize:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QResize(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, go to normal quantize.
if self.quantizer.opset_version < 11:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQResize(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, just keep this node
if self.quantizer.opset_version < 11:
return
# Direct 8bits op
return super().quantize()

View File

@ -0,0 +1,74 @@
import onnx
import onnx.helper
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearSoftmax(QuantOperatorBase):
def quantize(self):
node = self.node
# Constrain the softmax output scale and zero point, since softmax outputs always lie in [0, 1].
if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
out_scale = 1 / 256.0
out_zero_point = 0
else:
out_scale = 1 / 256.0
out_zero_point = -128
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear softmax node for given type
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# Give QLinearSoftmax the real opset_version; otherwise its default SinceVersion would be 1.
kwargs["opset"] = self.quantizer.opset_version
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes
return None
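The fixed output parameters above work because softmax outputs lie in [0, 1): with scale 1/256, uint8 with zero point 0 and int8 with zero point -128 cover exactly the same real range [0, 255/256]. A quick check:

import numpy as np

scale = 1 / 256.0
print((np.array([0, 255]) - 0) * scale)          # [0.         0.99609375]
print((np.array([-128, 127]) - (-128)) * scale)  # [0.         0.99609375]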

View File

@ -0,0 +1,63 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QSplit(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
quantized_node_name = ""
if node.name:
quantized_node_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
# Outputs simply derive their scale/zero point from the input.
quantized_output_names = []
for output_name in node.output:
quantized_output_name = output_name + "quantized"
quantized_output_names.append(quantized_output_name)
q_output = QuantizedValue(
output_name,
quantized_output_name,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[output_name] = q_output
if len(node.input) > 1:
quantized_input_names.extend(node.input[1:])
quantized_node = onnx.helper.make_node(
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
)
nodes.append(quantized_node)
self.quantizer.new_nodes += nodes
class QDQSplit(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Split"
if not self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)

View File

@ -0,0 +1,87 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearWhere(QuantOperatorBase):
def should_quantize(self):
return True
def quantize(self):
node = self.node
assert node.op_type == "Where"
if not self.quantizer.force_quantize_no_input_check:
self.quantizer.new_nodes += [node]
return
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [1, 2])
if not data_found or q_input_names is None:
return super().quantize()
qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_output_name = node.name + "_quant" if node.name else ""
q_output = QuantizedValue(
node.output[0],
qlinear_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlwhere_inputs = [
node.input[0],
q_input_names[0],
scale_names[0],
zero_point_names[0],
q_input_names[1],
scale_names[1],
zero_point_names[1],
output_scale_name,
output_zp_name,
]
qlwhere_node = onnx.helper.make_node(
"QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlwhere_node]
class QDQWhere(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Where"
if self.quantizer.force_quantize_no_input_check:
if not self.quantizer.is_tensor_quantized(node.input[1]):
self.quantizer.quantize_activation_tensor(node.input[1])
if not self.quantizer.is_tensor_quantized(node.input[2]):
self.quantizer.quantize_activation_tensor(node.input[2])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_activation_tensor(output)
elif (
self.quantizer.is_tensor_quantized(node.input[1])
and self.quantizer.is_tensor_quantized(node.input[2])
and not self.disable_qdq_for_node_output
):
for output in node.output:
self.quantizer.quantize_activation_tensor(output)

View File

@ -0,0 +1,141 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import logging
import sys
from .shape_inference import quant_pre_process
logger = logging.getLogger(__name__)
def parse_arguments():
parser = argparse.ArgumentParser(
description="""Model optimizer and shape inferencer, in preparation for quantization,
Consists of three optional steps:
1. Symbolic shape inference (best for transformer models).
2. Model optimization.
3. ONNX shape inference.
Model quantization with QDQ format, i.e. inserting QuantizeLinear/DeQuantizeLinear on
the tensor, requires tensor shape information to perform its best. Currently, shape inferencing
works best on an optimized model. As a result, it is highly recommended to run quantization
on an optimized model with shape information. This is the tool for optimization and shape
inferencing.
Essentially this tool performs the following three (skippable) steps:
1. Symbolic shape inference.
2. Model optimization
3. ONNX shape inference"""
)
parser.add_argument("--input", required=True, help="Path to the input model file")
parser.add_argument("--output", required=True, help="Path to the output model file")
parser.add_argument(
"--skip_optimization",
type=bool,
default=False,
help="Skip model optimization step if true. It's a known issue that ORT"
" optimization has difficulty with model size greater than 2GB, rerun with"
" this option to get around this issue.",
)
parser.add_argument(
"--skip_onnx_shape",
type=bool,
default=False,
help="Skip ONNX shape inference. Symbolic shape inference is most effective"
" with transformer based models. Skipping all shape inferences may"
" reduce the effectiveness of quantization, as a tensor with unknown"
" shape can not be quantized.",
)
parser.add_argument(
"--skip_symbolic_shape",
type=bool,
default=False,
help="Skip symbolic shape inference. Symbolic shape inference is most"
" effective with transformer based models. Skipping all shape"
" inferences may reduce the effectiveness of quantization, as a tensor"
" with unknown shape can not be quantized.",
)
parser.add_argument(
"--auto_merge",
help="Automatically merge symbolic dims when confliction happens",
action="store_true",
default=False,
)
parser.add_argument(
"--int_max",
help="maximum value for integer to be treated as boundless for ops like slice",
type=int,
default=2**31 - 1,
)
parser.add_argument(
"--guess_output_rank",
help="guess output rank to be the same as input 0 for unknown ops",
action="store_true",
default=False,
)
parser.add_argument(
"--verbose",
help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
type=int,
default=0,
)
parser.add_argument(
"--save_as_external_data",
help="Saving an ONNX model to external data",
action="store_true",
default=False,
)
parser.add_argument(
"--all_tensors_to_one_file",
help="Saving all the external data to one file",
action="store_true",
default=False,
)
parser.add_argument(
"--external_data_location",
help="The file location to save the external file",
default=None,
)
parser.add_argument(
"--external_data_size_threshold",
help="The size threshold for external data",
type=int,
default=1024,
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
if args.skip_optimization and args.skip_onnx_shape and args.skip_symbolic_shape:
logger.error("Skipping all three steps, nothing to be done. Quitting...")
sys.exit()
if (not args.skip_optimization) and args.save_as_external_data:
logger.error("ORT model optimization does not support external data yet!")
sys.exit()
logger.info("input model: %s", args.input)
logger.info("output model: %s", args.output)
quant_pre_process(
args.input,
args.output,
args.skip_optimization,
args.skip_onnx_shape,
args.skip_symbolic_shape,
args.auto_merge,
args.int_max,
args.guess_output_rank,
args.verbose,
args.save_as_external_data,
args.all_tensors_to_one_file,
args.external_data_location,
args.external_data_size_threshold,
)
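The same preprocessing can be driven from Python; a sketch mirroring the positional call in the __main__ block above, with the argument parser's defaults spelled out. The two model paths are placeholders.

from onnxruntime.quantization.shape_inference import quant_pre_process

quant_pre_process(
    "model.onnx",               # input model path (placeholder)
    "model.preprocessed.onnx",  # output model path (placeholder)
    False,                      # skip_optimization
    False,                      # skip_onnx_shape
    False,                      # skip_symbolic_shape
    False,                      # auto_merge
    2**31 - 1,                  # int_max
    False,                      # guess_output_rank
    0,                          # verbose
    False,                      # save_as_external_data
    False,                      # all_tensors_to_one_file
    None,                       # external_data_location
    1024,                       # external_data_size_threshold
)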

View File

@ -0,0 +1,389 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Utilities to run a given ONNX model, while saving input/output tensors of
eligible operator nodes.
A use case is to debug quantization induced accuracy drop. An AI engineer can
run the original float32 model and the quantized model with the same inputs,
then compare the corresponding activations between the two models to find
where the divergence is.
Example Usage:
```python
class ExampleDataReader(CalibrationDataReader):
def __init__(self):
...
def get_next(self):
...
input_data_reader = ExampleDataReader()
augmented_model_path = str(Path(self._tmp_model_dir.name).joinpath("augmented_model.onnx"))
modify_model_output_intermediate_tensors(path_to_onnx_model, augmented_model_path)
tensor_dict = collect_activations(augmented_model_path, input_data_reader)
```
`tensor_dict` points to a dictionary where the keys are tensor names and each value
is a list of tensors, one from each model run
"""
import logging
import math
import time
from pathlib import Path
from typing import Callable, Dict, List, Optional, Sequence, Union
import numpy
import onnx
from onnx import helper, numpy_helper
import onnxruntime
from .calibrate import CalibraterBase, CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import (
DEQUANT_OP_NAME,
DEQUANT_OUTPUT_SUFFIX,
QUANT_INPUT_SUFFIX,
TENSOR_NAME_QUANT_SUFFIX,
find_by_name,
load_model_with_shape_infer,
)
_TENSOR_SAVE_POSTFIX = "_ReshapedSavedOutput"
_TENSOR_SAVE_POSTFIX_LEN = len(_TENSOR_SAVE_POSTFIX)
def modify_model_output_intermediate_tensors(
input_model_path: Union[str, Path],
output_model_path: Union[str, Path],
op_types_for_saving: Optional[Sequence[str]] = None,
save_as_external_data: bool = False,
) -> None:
"""Augment a given ONNX model to save node input/output tensors.
Add all input/output tensors of operator nodes to model outputs
so that their values can be retrieved for debugging purposes.
Args:
input_model_path: the path to load the model from.
output_model_path: the path to save the augmented model to.
op_types_for_saving: Operator types for which the
input/output should be saved. By default, all the
float32/float16 tensors are saved.
save_as_external_data: whether to save the augmented model with external data.
Returns:
None. The augmented model is saved to output_model_path.
"""
if op_types_for_saving is None:
op_types_for_saving = []
saver = CalibraterBase(input_model_path, op_types_to_calibrate=op_types_for_saving)
model_to_augment = saver.model
tensors, value_infos = saver.select_tensors_to_calibrate(model_to_augment)
reshape_shape_name = "LinearReshape_" + str(time.time())
reshape_shape = numpy_helper.from_array(numpy.array([-1], dtype=numpy.int64), reshape_shape_name)
model_to_augment.graph.initializer.append(reshape_shape)
for tensor_name in tensors:
reshape_output = tensor_name + _TENSOR_SAVE_POSTFIX
reshape_node = onnx.helper.make_node(
"Reshape",
inputs=[tensor_name, reshape_shape_name],
outputs=[reshape_output],
name=reshape_output,
)
model_to_augment.graph.node.append(reshape_node)
reshape_output_value_info = helper.make_tensor_value_info(
reshape_output, value_infos[tensor_name].type.tensor_type.elem_type, [-1]
)
model_to_augment.graph.output.append(reshape_output_value_info)
onnx.save(
model_to_augment,
output_model_path,
save_as_external_data=save_as_external_data,
)
def collect_activations(
augmented_model: str,
input_reader: CalibrationDataReader,
session_options=None,
execution_providers: Optional[Sequence[str]] = None,
) -> Dict[str, List[numpy.ndarray]]:
"""Run augmented model and collect activations tensors.
Args:
augmented_model: Path to the augmented model created by modify_model_output_intermediate_tensors().
input_reader: Logic for reading inputs for the model; the augmented model has the
same inputs as the original model.
session_options: Optional OnnxRuntime session options for controlling model run.
By default graph optimization is turned off
execution_providers: Collection of execution providers for running the model.
Only CPU EP is used by default.
Returns:
A dictionary where each key is a tensor name and each value is a list of tensors, one from each batch
"""
if session_options is None:
session_options = onnxruntime.SessionOptions()
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
if execution_providers is None:
execution_providers = ["CPUExecutionProvider"]
inference_session = onnxruntime.InferenceSession(
augmented_model,
sess_options=session_options,
providers=execution_providers,
)
intermediate_outputs = []
for input_d in input_reader:
intermediate_outputs.append(inference_session.run(None, input_d))
if not intermediate_outputs:
raise RuntimeError("No data is collected while running augmented model!")
output_dict = {}
output_info = inference_session.get_outputs()
for batch in intermediate_outputs:
for output, output_data in zip(output_info, batch):
if output.name.endswith(_TENSOR_SAVE_POSTFIX):
output_name = output.name[:-_TENSOR_SAVE_POSTFIX_LEN]
output_dict.setdefault(output_name, []).append(output_data)
return output_dict
_POST_QDQ_POSTFIX1 = DEQUANT_OUTPUT_SUFFIX + "_1"
def _add_pre_post_qdq_pair(
qdq_cmp: Dict[str, Dict[str, Sequence[numpy.ndarray]]],
activation_name: str,
pre_qdq_tensors: Optional[Sequence[numpy.ndarray]],
post_qdq_tensors: Optional[Sequence[numpy.ndarray]],
) -> None:
if post_qdq_tensors is not None and pre_qdq_tensors is not None:
qdq_cmp[activation_name] = {}
qdq_cmp[activation_name]["pre_qdq"] = pre_qdq_tensors
qdq_cmp[activation_name]["post_qdq"] = post_qdq_tensors
def create_activation_matching(
qdq_activations: Dict[str, Sequence[numpy.ndarray]],
float_activations: Optional[Dict[str, Sequence[numpy.ndarray]]] = None,
) -> Dict[str, Dict[str, Sequence[numpy.ndarray]]]:
"""Comparing activation values to help debugging accuracy loss due to quantization.
This functions takes saved activations from the QDQ model and (optionally) the
float point model, and provides a data structure for comparing:
* from the qdq model, activation values before and after QDQ operation
* across both models, activations from the orignal model vs the corresponding
activations in the QDQ model
Arg:
qdq_activations: Output of `collect_activations`. This must be from a quantized
model with QDQ format.
float_activations: Output of `collect_activations`. This must be from the float
point model.
Returns:
Dict for comparing pre and post quantized activation tensors. E.g.
```
qdq_cmp = create_activation_matching(qdq_activations)
print(qdq_cmp['activation1']['pre_qdq'][0])
print(qdq_cmp['activation1']['post_qdq'][0])
qdq_cmp = create_activation_matching(qdq_activations, float_activations)
print(qdq_cmp['activation1']['float'][0])
print(qdq_cmp['activation1']['pre_qdq'][0])
print(qdq_cmp['activation1']['post_qdq'][0])
```
"""
qdq_cmp: Dict[str, Dict[str, Sequence[numpy.ndarray]]] = {}
for tensor_name, tensors in qdq_activations.items():
if tensor_name.endswith(QUANT_INPUT_SUFFIX):
pre_name = tensor_name[: -len(QUANT_INPUT_SUFFIX)]
post_qdq_tensors = qdq_activations.get(pre_name)
pre_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
elif tensor_name.endswith(DEQUANT_OUTPUT_SUFFIX):
pre_name = tensor_name[: -len(DEQUANT_OUTPUT_SUFFIX)]
pre_qdq_tensors = qdq_activations.get(pre_name)
post_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
elif tensor_name.endswith(_POST_QDQ_POSTFIX1):
pre_name = tensor_name[: -len(_POST_QDQ_POSTFIX1)]
pre_qdq_tensors = qdq_activations.get(pre_name)
post_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
if not float_activations:
return qdq_cmp
for act_name, act_values in qdq_cmp.items():
float_acts = float_activations.get(act_name)
if float_acts is not None:
act_values["float"] = float_acts
return qdq_cmp
def _run_dequantize_linear(
weight_tensor: numpy.ndarray, weight_scale: numpy.ndarray, weight_zp: numpy.ndarray, channel_axis: int
) -> Optional[numpy.ndarray]:
assert weight_scale.shape == weight_zp.shape
if weight_zp.size == 1:
return (weight_tensor - weight_zp) * weight_scale
assert weight_zp.ndim == 1
reshape_dims = list(weight_tensor.shape) # deep copy
reshape_dims[channel_axis] = 1 # only one per channel for reshape
channel_count = weight_tensor.shape[channel_axis]
dequantized_weights = None
for i in range(channel_count):
per_channel_data = weight_tensor.take(i, channel_axis)
dequantized_per_channel_data = (per_channel_data - weight_zp[i]) * weight_scale[i]
if i == 0:
dequantized_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
else:
channel_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
dequantized_weights = numpy.concatenate((dequantized_weights, channel_weights), channel_axis)
if dequantized_weights is None:
return None
dequantized_weights = dequantized_weights.reshape(weight_tensor.shape)
return dequantized_weights
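The per-channel loop above is equivalent to broadcasting the per-channel scale and zero point along the channel axis; a standalone numpy check for channel_axis = 0:

import numpy as np

qw = np.array([[10, 20], [30, 40]], dtype=np.int8)
scale = np.array([0.1, 0.01], dtype=np.float32)  # one scale per channel on axis 0
zp = np.array([0, 5], dtype=np.int8)
per_channel = np.stack([(qw.take(i, 0) - zp[i]) * scale[i] for i in range(2)])
broadcast = (qw.astype(np.float32) - zp.reshape(-1, 1)) * scale.reshape(-1, 1)
assert np.allclose(per_channel, broadcast)  # [[1.0, 2.0], [0.25, 0.35]]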
def create_weight_matching(float_model_path: str, qdq_model_path: str) -> Dict[str, Dict[str, numpy.ndarray]]:
"""Comparing weight values to help debugging accuracy loss due to quantization.
This functions takes the float model and the qdq model, and provides a data structure for comparing
their corresponding weights to locate quantization errors
Arg:
float_model_path: Path points to the float point model.
qdq_model_path: Path points to the qdq model.
Returns:
Dict for comparing weight tensors. E.g.
```
qdq_weight_cmp = create_weight_matching(float_model, qdq_model)
print(qdq_weight_cmp['activation1']['float'])
print(qdq_weight_cmp['activation1']['dequantized'])
```
"""
float_onnx_model = ONNXModel(load_model_with_shape_infer(Path(float_model_path)))
qdq_onnx_model = ONNXModel(load_model_with_shape_infer(Path(qdq_model_path)))
matched_weights: Dict[str, Dict[str, numpy.ndarray]] = {}
initializers = qdq_onnx_model.initializer()
for node in qdq_onnx_model.nodes():
if node.op_type != DEQUANT_OP_NAME:
continue # Only care about DQ node
weight_name: str = node.input[0]
weight_values = find_by_name(weight_name, initializers)
if not weight_values:
continue # Only care about DQ node with const inputs
if not weight_name.endswith(TENSOR_NAME_QUANT_SUFFIX):
logging.error(f"Model Error in '{qdq_model_path}': Dequantized tensor name '{weight_name}' not recognized!")
continue
axis = -1
for attr in node.attribute:
if attr.name == "axis":
axis = attr.i
weight_tensor = numpy_helper.to_array(weight_values)
weight_scale = numpy_helper.to_array(find_by_name(node.input[1], initializers))
if len(node.input) > 2:
weight_zp = numpy_helper.to_array(find_by_name(node.input[2], initializers))
else:
weight_zp = numpy.zeros(weight_scale.shape, dtype=numpy.int32)
# Perform dequantization:
if weight_scale.size == weight_zp.size == 1:
# Avoids confusion between a scalar and a tensor of one element.
weight_scale = weight_scale.reshape(tuple())
weight_zp = weight_zp.reshape(tuple())
if weight_scale.shape != weight_zp.shape:
raise RuntimeError(
f"scale and zero_point must have the same shape but {weight_scale.shape} != {weight_zp.shape}"
)
weight_quant = _run_dequantize_linear(weight_tensor, weight_scale, weight_zp, channel_axis=axis)
weight_name = weight_name[: -len(TENSOR_NAME_QUANT_SUFFIX)]
if weight_quant is None:
logging.error(f"Model Error in '{qdq_model_path}': '{weight_name}' per-channel quantization on 0 channel")
continue
float_values = find_by_name(weight_name, float_onnx_model.initializer())
if not float_values:
logging.error(f"Model Error in '{float_model_path}': weight tensor '{weight_name}' not found!")
continue
weight_float = numpy_helper.to_array(float_values)
matched_weights[weight_name] = {"float": weight_float, "dequantized": weight_quant}
return matched_weights
def compute_signal_to_quantization_noice_ratio(
x: Union[Sequence[numpy.ndarray], numpy.ndarray], y: Union[Sequence[numpy.ndarray], numpy.ndarray]
) -> float:
if isinstance(x, numpy.ndarray):
xlist = [x]
else:
xlist = x
if isinstance(y, numpy.ndarray):
ylist = [y]
else:
ylist = y
if len(xlist) != len(ylist):
raise RuntimeError("Unequal number of tensors to compare!")
left = numpy.concatenate(xlist).flatten()
right = numpy.concatenate(ylist).flatten()
epsilon = numpy.finfo("float").eps
tensor_norm = max(numpy.linalg.norm(left), epsilon)
diff_norm = max(numpy.linalg.norm(left - right), epsilon)
res = tensor_norm / diff_norm
return 20 * math.log10(res)
def compute_weight_error(
weights_match: Dict[str, Dict[str, numpy.ndarray]],
err_func: Callable[[numpy.ndarray, numpy.ndarray], float] = compute_signal_to_quantization_noice_ratio,
) -> Dict[str, float]:
result: Dict[str, float] = {}
for weight_name, weight_match in weights_match.items():
result[weight_name] = err_func(weight_match["float"], weight_match["dequantized"])
return result
def compute_activation_error(
activations_match: Dict[str, Dict[str, Sequence[numpy.ndarray]]],
err_func: Callable[
[Sequence[numpy.ndarray], Sequence[numpy.ndarray]], float
] = compute_signal_to_quantization_noice_ratio,
) -> Dict[str, Dict[str, float]]:
result: Dict[str, Dict[str, float]] = {}
for name, match in activations_match.items():
err_result: Dict[str, float] = {}
err_result["qdq_err"] = err_func(match["pre_qdq"], match["post_qdq"])
float_activation = match["float"]
if float_activation:
err_result["xmodel_err"] = err_func(float_activation, match["post_qdq"])
result[name] = err_result
return result
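A worked example of the SQNR metric used by compute_weight_error and compute_activation_error above, 20 * log10(||x|| / ||x - y||) in dB, computed directly with numpy for a toy signal and a slightly perturbed copy:

import math
import numpy as np

x = np.linspace(-1.0, 1.0, 1000).astype(np.float32)
y = x + np.float32(1e-3)  # stand-in for a dequantized copy of x
sqnr_db = 20 * math.log10(np.linalg.norm(x) / np.linalg.norm(x - y))
print(round(sqnr_db, 1))  # roughly 55 dB; higher means less quantization noise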

File diff suppressed because it is too large

View File

@ -0,0 +1,866 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
import os
import tempfile
from enum import Enum
from pathlib import Path
import numpy
import onnx
from onnx import ModelProto, TensorProto, external_data_helper
from onnx import onnx_pb as onnx_proto
from onnx.helper import make_graph, make_model, make_node, make_tensor_value_info
from onnx.reference import ReferenceEvaluator
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
try:
from onnx.reference.custom_element_types import float8e4m3fn
except ImportError:
float8e4m3fn = None
# INT4 np.dtypes added in ONNX 1.16. These map to np.int8/np.uint8 because numpy
# does not support sub-byte types.
try:
from onnx.reference.custom_element_types import int4, uint4
except ImportError:
int4 = None
uint4 = None
__producer__ = "onnx.quantize"
__version__ = "0.1.0"
onnx_domain = "ai.onnx"
ms_domain = "com.microsoft"
QUANT_OP_NAME = "QuantizeLinear"
QUANT_INPUT_SUFFIX = "_QuantizeLinear_Input"
DEQUANT_OP_NAME = "DequantizeLinear"
DEQUANT_OUTPUT_SUFFIX = "_DequantizeLinear_Output"
TENSOR_NAME_QUANT_SUFFIX = "_quantized"
FLOAT8_DISTRIBUTIONS = {}
type_to_name = {getattr(TensorProto, k): k for k in dir(TensorProto) if isinstance(getattr(TensorProto, k), int)}
# Quantization mode
# IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now.
# QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now.
class QuantizationMode(Enum):
IntegerOps = 0
QLinearOps = 1
def __str__(self):
return self.name
@staticmethod
def from_string(mode):
try:
return QuantizationMode[mode]
except KeyError:
raise ValueError() # noqa: B904
class QuantizedValueType(Enum):
Input = 0
Initializer = 1
def __str__(self):
return self.name
@staticmethod
def from_string(v):
try:
return QuantizedValueType[v]
except KeyError:
raise ValueError() # noqa: B904
class QuantType(Enum):
QInt8 = 0
QUInt8 = 1
QFLOAT8E4M3FN = 2
QInt16 = 3
QUInt16 = 4
QInt4 = 5
QUInt4 = 6
def __str__(self):
return self.name
@staticmethod
def from_string(t):
try:
return QuantType[t]
except KeyError:
raise ValueError() # noqa: B904
@property
def tensor_type(self):
if self == QuantType.QInt8:
return TensorProto.INT8
if self == QuantType.QUInt8:
return TensorProto.UINT8
if self == QuantType.QUInt16:
return TensorProto.UINT16
if self == QuantType.QInt16:
return TensorProto.INT16
if self == QuantType.QFLOAT8E4M3FN:
return TensorProto.FLOAT8E4M3FN
if self == QuantType.QUInt4:
return TensorProto.UINT4
if self == QuantType.QInt4:
return TensorProto.INT4
raise ValueError(f"Unexpected value qtype={self!r}.")
class QuantFormat(Enum):
QOperator = 0
QDQ = 1
def __str__(self):
return self.name
@staticmethod
def from_string(format):
try:
return QuantFormat[format]
except KeyError:
raise ValueError() # noqa: B904
ONNX_TYPE_TO_NP_TYPE = {
onnx_proto.TensorProto.INT8: numpy.dtype("int8"),
onnx_proto.TensorProto.UINT8: numpy.dtype("uint8"),
onnx_proto.TensorProto.INT16: numpy.dtype("int16"),
onnx_proto.TensorProto.UINT16: numpy.dtype("uint16"),
onnx_proto.TensorProto.FLOAT8E4M3FN: float8e4m3fn,
onnx_proto.TensorProto.INT4: int4, # base_dtype is np.int8
onnx_proto.TensorProto.UINT4: uint4, # base_dtype is np.uint8
}
ONNX_INT_TYPE_RANGE = {
onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(255, dtype=numpy.uint8)),
onnx_proto.TensorProto.INT8: (numpy.array(-128, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)),
onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(65535, dtype=numpy.uint16)),
onnx_proto.TensorProto.INT16: (numpy.array(-32768, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)),
onnx_proto.TensorProto.UINT4: (numpy.array(0, dtype=uint4), numpy.array(15, dtype=uint4)),
onnx_proto.TensorProto.INT4: (numpy.array(-8, dtype=int4), numpy.array(7, dtype=int4)),
}
ONNX_INT_TYPE_SYMMETRIC_RANGE = {
onnx_proto.TensorProto.INT8: (numpy.array(-127, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)),
onnx_proto.TensorProto.INT16: (numpy.array(-32767, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)),
}
ONNX_INT_TYPE_REDUCED_RANGE = {
onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(127, dtype=numpy.uint8)),
onnx_proto.TensorProto.INT8: (numpy.array(-64, dtype=numpy.int8), numpy.array(64, dtype=numpy.int8)),
onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(32767, dtype=numpy.uint16)),
onnx_proto.TensorProto.INT16: (numpy.array(-16384, dtype=numpy.int16), numpy.array(16384, dtype=numpy.int16)),
onnx_proto.TensorProto.UINT4: (numpy.array(0, dtype=int4), numpy.array(7, dtype=int4)),
onnx_proto.TensorProto.INT4: (numpy.array(-4, dtype=int4), numpy.array(3, dtype=int4)),
}
def _check_type(*args, zero_point_index=-1):
new_args = []
for i, a in enumerate(args):
if numpy.issubdtype(type(a), numpy.number):
new_args.append(numpy.array(a))
elif isinstance(a, numpy.ndarray):
new_args.append(a)
else:
raise TypeError(f"arg {i} is not an array: {a}")
if i == zero_point_index:
v = new_args[-1]
if v.dtype == numpy.float32 or v.dtype == numpy.float16:
raise TypeError(f"zero_point cannot be {v.dtype}")
return tuple(new_args) if len(new_args) > 1 else new_args[0]
def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
assert (
qType in ONNX_TYPE_TO_NP_TYPE
), f"Unexpected data type {qType} requested. Only INT8, UINT8, INT16, and UINT16 are supported."
if qType in (
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
):
if zero_point != 0:
raise NotImplementedError(f"zero_point is expected to be null for float 8 not {zero_point!r}.")
if arr.dtype == numpy.float32:
onnx_type = TensorProto.FLOAT
elif arr.dtype == numpy.float16:
onnx_type = TensorProto.FLOAT16
else:
raise ValueError(f"Unexpected dtype {arr.dtype}.")
onnx_model = make_model(
make_graph(
[
make_node(
"Constant", [], ["zero_point"], value=onnx.helper.make_tensor("zero_point", qType, [], [0])
),
make_node("QuantizeLinear", ["X", "scale", "zero_point"], ["Y"]),
],
"qu",
[
make_tensor_value_info("X", onnx_type, None),
make_tensor_value_info("scale", onnx_type, None),
],
[make_tensor_value_info("Y", qType, None)],
)
)
ref = ReferenceEvaluator(onnx_model)
return _check_type(ref.run(None, {"X": arr, "scale": scale})[0])
else:
# Quantizes data for all integer types.
#
# For int4 types, the quantized data is returned as either np.int8 or np.uint8,
# which matches the python reference ONNX implementation of QuantizeLinear.
# This data can be packed into 4-bit elements by using pack_bytes_to_4bit().
dtype = ONNX_TYPE_TO_NP_TYPE[qType]
(qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True)
cliplow = max(qmin, low) if low is not None else qmin
cliphigh = min(qmax, high) if high is not None else qmax
arr_fp32 = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
numpy.clip(arr_fp32, cliplow, cliphigh, out=arr_fp32)
return _check_type(arr_fp32.astype(dtype))
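# A minimal sketch of exercising quantize_nparray for UINT8 data; the array values and
# quantization parameters below are illustrative only.
def _example_quantize_nparray():  # hypothetical helper for illustration
    data = numpy.array([-1.0, 0.0, 0.5, 1.0], dtype=numpy.float32)
    scale = numpy.array(1.0 / 255.0, dtype=numpy.float32)
    zero_point = numpy.array(128, dtype=numpy.uint8)
    # Values are divided by the scale, rounded, shifted by the zero point, and clipped to [0, 255].
    return quantize_nparray(onnx_proto.TensorProto.UINT8, data, scale, zero_point)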
def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=None):
"""Calculate the scale s and zero point z for the quantization relation
r = s(q-z), where r are the original values and q are the corresponding
quantized values.
z and s are calculated such that every value within [rmin,rmax] has an
approximate representation within [qmin,qmax]. In addition, qmin <= z <=
qmax is enforced. If the symmetric flag is set to True, the interval
[rmin,rmax] is symmetrized to [-absmax, +absmax], where
absmax = max(abs(rmin), abs(rmax)).
:parameter rmin: minimum value of r
:parameter rmax: maximum value of r
:parameter qmin: minimum value representable by the target quantization data type
:parameter qmax: maximum value representable by the target quantization data type
:parameter symmetric: True if the floating-point range should be made symmetric. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:return: zero and scale [z, s]
"""
if qmin > 0 or qmax < 0:
raise ValueError(f"qmin and qmax must meet requirement: qmin <= 0 <= qmax while qmin:{qmin}, qmmax:{qmax}")
# Adjust rmin and rmax such that 0 is included in the range. This is
# required to make sure zero can be represented by the quantization data
# type (i.e. to make sure qmin <= zero_point <= qmax)
rmin = numpy.minimum(rmin, numpy.array(0, dtype=rmin.dtype))
rmax = numpy.maximum(rmax, numpy.array(0, dtype=rmax.dtype))
# Ensure a minimum floating-point range if specified.
if min_real_range is not None:
rmax = max(rmax, rmin + min_real_range)
if symmetric:
absmax = numpy.maximum(numpy.abs(rmin), numpy.abs(rmax))
rmin = -absmax
rmax = +absmax
assert qmin <= qmax, f"qmin={rmin} > qmax={rmax}"
dr = numpy.array(rmax - rmin, dtype=numpy.float64)
dq = numpy.array(qmax, dtype=numpy.float64) - numpy.array(qmin, dtype=numpy.float64)
scale = numpy.array(dr / dq)
assert scale >= 0, "scale isse"
if scale < numpy.finfo(rmax.dtype).tiny:
scale = numpy.array(1.0, dtype=rmax.dtype)
zero_point = numpy.array(0, dtype=qmin.dtype)
else:
if symmetric:
# When symmetric (i.e., rmax == -rmin), the zero_point formula reduces to round((qmax + qmin) / 2.0).
# This simpler formula doesn't depend on scale and guarantees that the zero point values
# for int8, uint8, int16, and uint16 are always 0, 128, 0, and 32768, respectively.
# This is important for per-channel/symmetric QLinearConv on CPU EP, which requires all channels to have
# the exact same zero_point values.
zero_point = numpy.array(
numpy.round((qmin + qmax) / numpy.array(2.0, dtype=numpy.float64)), dtype=qmin.dtype
)
else:
zero_point = numpy.array(numpy.round(qmin - rmin / scale), dtype=qmin.dtype)
scale = scale.astype(rmax.dtype)
return [zero_point, scale]
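# A small worked example for compute_scale_zp with an asymmetric UINT8 target: for
# rmin=-1.0 and rmax=3.0 mapped onto [0, 255], scale = 4.0 / 255 and
# zero_point = round(0 - (-1.0) / scale) = 64. The numbers are illustrative only.
def _example_compute_scale_zp():  # hypothetical helper for illustration
    rmin = numpy.array(-1.0, dtype=numpy.float32)
    rmax = numpy.array(3.0, dtype=numpy.float32)
    qmin = numpy.array(0, dtype=numpy.uint8)
    qmax = numpy.array(255, dtype=numpy.uint8)
    zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False)
    return zero_point, scale  # approximately (64, 0.0157)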
def compute_scale_zp_float8(element_type, std):
"""Calculate the scale s for a float8 type (E4M3FN).
The function assumes the coefficient distribution and the float 8
distribution are similar to two Gaussian distributions.
:return: zero and scale [z, s]
More details in notebook `quantization_fp8.ipynb
<https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/quantization_fp8.ipynb>`_.
"""
zp_dtype = None
if element_type not in FLOAT8_DISTRIBUTIONS:
if element_type == TensorProto.FLOAT8E4M3FN:
from onnx.numpy_helper import float8e4m3_to_float32
from onnx.reference.custom_element_types import float8e4m3fn
zp_dtype = float8e4m3fn
all_values = [float8e4m3_to_float32(i) for i in range(256)]
values = numpy.array(
[f for f in all_values if not numpy.isnan(f) and not numpy.isinf(f)], dtype=numpy.float32
)
else:
raise ValueError(f"Quantization to element_type={element_type} not implemented.")
FLOAT8_DISTRIBUTIONS[element_type] = values
elif element_type == TensorProto.FLOAT8E4M3FN:
from onnx.reference.custom_element_types import float8e4m3fn
zp_dtype = float8e4m3fn
if zp_dtype is None:
raise TypeError(f"Unexpected element_type {element_type}.")
std_f8 = numpy.std(FLOAT8_DISTRIBUTIONS[element_type])
zero = numpy.array(0, dtype=zp_dtype)
scale = numpy.array(std / std_f8, dtype=std.dtype)
return [zero, scale]
def quantize_data(
data, qType, symmetric, reduce_range=False, min_real_range=None, rmin_override=None, rmax_override=None
):
"""
:param data: data to quantize
:param qType: data type to quantize to. Supported types are UINT8, INT8, UINT16, INT16, UINT4, INT4, and FLOAT8E4M3FN
:param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
:parameter reduce_range: True if the quantization range should be reduced. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data).
:parameter rmax_override: The value of rmax to use if not None. Otherwise, uses max(data).
:return: minimum, maximum, zero point, scale, and quantized weights
To pack weights, we compute a linear transformation
- when data `type == uint8` mode, from `[rmin, rmax]` -> :math:`[0, 2^{b}-1]` and
- when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
`m = max(abs(rmin), abs(rmax))`
and add necessary intermediate nodes to transform quantized weight to full weight using the equation
:math:`r = S(q-z)`, where
- *r*: real original value
- *q*: quantized value
- *S*: scale
- *z*: zero point
"""
if not isinstance(data, numpy.ndarray):
raise TypeError(f"Weight must be given as an array not {type(data)}.")
if rmin_override is not None:
rmin = rmin_override
else:
rmin = data.min() if len(data) else 0.0
if rmax_override is not None:
rmax = rmax_override
else:
rmax = data.max() if len(data) else 0.0
rmin = numpy.array(rmin, dtype=data.dtype)
rmax = numpy.array(rmax, dtype=data.dtype)
zero_point = 0
scale = numpy.array(1.0, dtype=data.dtype)
if qType == TensorProto.FLOAT8E4M3FN:
if reduce_range:
raise RuntimeError("Unsupported option reduce_range=True for float 8.")
std = numpy.std(data)
zero_point, scale = compute_scale_zp_float8(qType, std)
quantized_data = quantize_nparray(qType, data, scale, zero_point)
if any((quantized_data.astype(numpy.uint8).ravel() & 127) == 127):
np_data = numpy.asarray(data)
raise RuntimeError(
f"One of the quantized value is NaN data in [{np_data.min()}, {np_data.max()}], "
f"quantized_data in [{quantized_data.min()}, {quantized_data.max()}]."
)
return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2)
if qType in (
TensorProto.INT8,
TensorProto.UINT8,
TensorProto.INT16,
TensorProto.UINT16,
TensorProto.INT4,
TensorProto.UINT4,
):
if len(data):
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range)
quantized_data = quantize_nparray(qType, data, scale, zero_point)
return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2)
raise ValueError(f"Unexpected value for qType={qType}.")
def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802
"""
Return qmin and qmax, the minimum and maximum value representable by the given qType
:parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.INT8
:return: qmin, qmax
"""
if qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise NotImplementedError("This function is not implemented for float 8 as not needed.")
qrange = None
if reduce_range:
qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType)
elif symmetric and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE:
qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType]
else:
qrange = ONNX_INT_TYPE_RANGE.get(qType)
if not qrange:
raise ValueError(f"Unexpected data type {qType} requested. Only INT8, UINT8, INT16, and UINT16 are supported.")
qmin, qmax = qrange
if qmin > 0 or qmax < 0:
raise ValueError(
f"qmin and qmax must meet requirement: qmin <= 0 <= qmax while "
f"qmin:{qmin}, qmmax:{qmax}, dtype={qmin.dtype}, reduce_range={reduce_range}, "
f"symmetric={symmetric}, qType={qType}"
)
return qrange
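# Illustrative sketch: the limits returned for INT8 depend on the reduce_range and
# symmetric flags, giving the full range (-128, 127), the symmetric range (-127, 127),
# and the reduced range (-64, 64) from the tables above.
def _example_get_qmin_qmax():  # hypothetical helper for illustration
    full_range = get_qmin_qmax_for_qType(onnx_proto.TensorProto.INT8)
    symmetric_range = get_qmin_qmax_for_qType(onnx_proto.TensorProto.INT8, symmetric=True)
    reduced_range = get_qmin_qmax_for_qType(onnx_proto.TensorProto.INT8, reduce_range=True)
    return full_range, symmetric_range, reduced_range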
def get_qrange_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802
"""
Helper function to get the quantization range for a type.
parameter qType: quantization type.
return: quantization range.
"""
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
return qmax - qmin
def normalize_axis(axis: int, rank: int) -> tuple[bool, int]:
"""
Helper function that tries to return a normalized axis in the range [0, rank - 1].
:parameter axis: The axis to normalize.
:parameter rank: The tensor rank (number of dimensions).
:return (is_valid, axis_norm)
"""
axis_norm = axis + rank if axis < 0 else axis
is_valid = axis_norm >= 0 and axis_norm < rank
return is_valid, axis_norm
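# Sketch: normalize_axis maps a negative axis into [0, rank - 1] and flags out-of-range
# values; for a rank-4 tensor, axis -1 becomes 3 while axis 4 is reported as invalid.
def _example_normalize_axis():  # hypothetical helper for illustration
    assert normalize_axis(-1, 4) == (True, 3)
    assert normalize_axis(2, 4) == (True, 2)
    assert normalize_axis(4, 4) == (False, 4)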
def pack_bytes_to_4bit(src_8bit: bytes) -> bytearray:
"""
Copies a source array of 8-bit values into a destination bytearray of packed 4-bit values.
Assumes that the source values are already in the appropriate int4 range.
:parameter src_8bit: The 8-bit element values to pack.
:return A bytearray with every two 8-bit src elements packed into a single byte.
"""
num_elems = len(src_8bit)
if num_elems == 0:
return bytearray()
dst_size = (num_elems + 1) // 2 # Ex: 5 8-bit elems packed into 3 bytes
dst = bytearray(dst_size)
src_i: int = 0
dst_i: int = 0
# Pack two 8-bit elements into a single byte in each iteration.
while src_i < num_elems - 1:
dst[dst_i] = ((src_8bit[src_i + 1] & 0xF) << 4) | (src_8bit[src_i] & 0xF)
dst_i += 1
src_i += 2
if src_i < num_elems:
# Odd number of elements.
dst[dst_i] = src_8bit[src_i] & 0xF
return dst
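# Sketch of the 4-bit packing: five 8-bit elements are packed into three bytes, with the
# second element of each pair stored in the high nibble and a lone trailing element
# occupying only the low nibble of the last byte. The input values are illustrative only.
def _example_pack_bytes_to_4bit():  # hypothetical helper for illustration
    packed = pack_bytes_to_4bit(bytes([1, 2, 3, 4, 5]))
    assert packed == bytearray([0x21, 0x43, 0x05])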
class QuantizedInitializer:
"""
Represents a linearly quantized weight input from ONNX operators
"""
def __init__(
self,
name,
initializer,
rmins,
rmaxs,
zero_points,
scales,
data=[], # noqa: B006
quantized_data=[], # noqa: B006
axis=None,
):
self.name = name
self.initializer = initializer # TensorProto initializer in ONNX graph
self.rmins = rmins # List of minimum range for each axis
self.rmaxs = rmaxs # List of maximum range for each axis
# 1D tensor of zero points computed for each axis. scalar if axis is empty
self.zero_points = zero_points
self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty
self.data = data # original data from initializer TensorProto
self.quantized_data = quantized_data # weight-packed data from data
# Scalar to specify which dimension in the initializer to weight pack.
self.axis = axis
# If empty, single zero point and scales computed from a single rmin and rmax
class QuantizedValue:
"""
Represents a linearly quantized value (input\\output\\initializer)
"""
def __init__(
self,
name,
new_quantized_name,
scale_name,
zero_point_name,
quantized_value_type,
axis=None,
node_type=None,
node_qtype=None,
scale_type=None,
):
self.original_name = name
self.q_name = new_quantized_name
self.scale_name = scale_name
self.zp_name = zero_point_name
self.value_type = quantized_value_type
self.axis = axis
self.node_type = node_type
self.node_qtype = node_qtype
self.scale_type = scale_type
class BiasToQuantize:
"""
Represents a bias to be quantized
"""
def __init__(self, bias_name, input_name, weight_name):
self.bias_name = bias_name
self.input_name = input_name
self.weight_name = weight_name
def attribute_to_kwarg(attribute):
"""
Convert attribute to kwarg format for use with onnx.helper.make_node.
:parameter attribute: attribute in AttributeProto format.
:return: attribute in {key: value} format.
"""
if attribute.type == 0:
raise ValueError(f"attribute {attribute.name} does not have type specified.")
# Based on attribute type definitions from AttributeProto
# definition in https://github.com/onnx/onnx/blob/main/onnx/onnx.proto
if attribute.type == 1:
value = attribute.f
elif attribute.type == 2:
value = attribute.i
elif attribute.type == 3:
value = attribute.s
elif attribute.type == 4:
value = attribute.t
elif attribute.type == 5:
value = attribute.g
elif attribute.type == 6:
value = attribute.floats
elif attribute.type == 7:
value = attribute.ints
elif attribute.type == 8:
value = attribute.strings
elif attribute.type == 9:
value = attribute.tensors
elif attribute.type == 10:
value = attribute.graphs
else:
raise ValueError(f"attribute {attribute.name} has unsupported type {attribute.type}.")
return {attribute.name: value}
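# Sketch: attribute_to_kwarg turns an AttributeProto back into the keyword form accepted
# by onnx.helper.make_node; here an INT attribute named "axis" round-trips to {"axis": 1}.
def _example_attribute_to_kwarg():  # hypothetical helper for illustration
    node = onnx.helper.make_node("Flatten", ["X"], ["Y"], axis=1)
    assert attribute_to_kwarg(node.attribute[0]) == {"axis": 1}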
def find_by_name(item_name, item_list):
"""
Helper function to find item by name in a list.
parameter item_name: name of the item.
parameter item_list: list of items.
return: item if found. None otherwise.
"""
items = [item for item in item_list if item.name == item_name]
return items[0] if len(items) > 0 else None
def get_elem_index(elem_name, elem_list):
"""
Helper function to return index of an item in a node list
"""
elem_idx = -1
for i in range(len(elem_list)):
if elem_list[i] == elem_name:
elem_idx = i
return elem_idx
def get_mul_node(inputs, output, name):
"""
Helper function to create a Mul node.
parameter inputs: list of input names.
parameter output: output name.
parameter name: name of the node.
return: Mul node in NodeProto format.
"""
return onnx.helper.make_node("Mul", inputs, [output], name)
def generate_identified_filename(filename: Path, identifier: str) -> Path:
"""
Helper function to generate an identifiable filepath by concatenating the given identifier as a suffix.
"""
return filename.parent.joinpath(filename.stem + identifier + filename.suffix)
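# Sketch: the identifier is inserted between the stem and the suffix, so "model.onnx"
# with identifier "-inferred" becomes "model-inferred.onnx".
def _example_generate_identified_filename():  # hypothetical helper for illustration
    assert generate_identified_filename(Path("model.onnx"), "-inferred").name == "model-inferred.onnx"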
def apply_plot(hist, hist_edges):
import sys
import matplotlib.pyplot as plt
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
print("Histogram:")
print(hist)
print("Histogram Edges:")
print(hist_edges)
plt.stairs(hist, hist_edges, fill=True)
plt.xlabel("Tensor value")
plt.ylabel("Counts")
plt.title("Tensor value V.S. Counts")
plt.show()
def write_calibration_table(calibration_cache, dir="."):
"""
Helper function to write calibration table to files.
"""
import json
import flatbuffers
import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
logging.info(f"calibration cache: {calibration_cache}")
with open(os.path.join(dir, "calibration.json"), "w") as file:
file.write(json.dumps(calibration_cache)) # use `json.loads` to do the reverse
# Serialize data using FlatBuffers
builder = flatbuffers.Builder(1024)
key_value_list = []
for key in sorted(calibration_cache.keys()):
values = calibration_cache[key]
value = str(max(abs(values[0]), abs(values[1])))
flat_key = builder.CreateString(key)
flat_value = builder.CreateString(value)
KeyValue.KeyValueStart(builder)
KeyValue.KeyValueAddKey(builder, flat_key)
KeyValue.KeyValueAddValue(builder, flat_value)
key_value = KeyValue.KeyValueEnd(builder)
key_value_list.append(key_value)
TrtTable.TrtTableStartDictVector(builder, len(key_value_list))
for key_value in key_value_list:
builder.PrependUOffsetTRelative(key_value)
main_dict = builder.EndVector()
TrtTable.TrtTableStart(builder)
TrtTable.TrtTableAddDict(builder, main_dict)
cal_table = TrtTable.TrtTableEnd(builder)
builder.Finish(cal_table)
buf = builder.Output()
with open(os.path.join(dir, "calibration.flatbuffers"), "wb") as file:
file.write(buf)
# Deserialize data (for validation)
if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"):
cal_table = TrtTable.TrtTable.GetRootAsTrtTable(buf, 0)
dict_len = cal_table.DictLength()
for i in range(dict_len):
key_value = cal_table.Dict(i)
logging.info(key_value.Key())
logging.info(key_value.Value())
# write plain text
with open(os.path.join(dir, "calibration.cache"), "w") as file:
for key in sorted(calibration_cache.keys()):
value = calibration_cache[key]
s = key + " " + str(max(abs(value[0]), abs(value[1])))
file.write(s)
file.write("\n")
def smooth_distribution(p, eps=0.0001):
"""Given a discrete distribution (may have not been normalized to 1),
smooth it by replacing zeros with eps multiplied by a scaling factor
and taking the corresponding amount off the non-zero values.
Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py
"""
is_zeros = (p == 0).astype(numpy.float32)
is_nonzeros = (p != 0).astype(numpy.float32)
n_zeros = is_zeros.sum()
n_nonzeros = p.size - n_zeros
if not n_nonzeros:
# raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
return None
eps1 = eps * float(n_zeros) / float(n_nonzeros)
assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
n_zeros,
n_nonzeros,
eps1,
)
hist = p.astype(numpy.float32)
hist += eps * is_zeros + (-eps1) * is_nonzeros
assert (hist <= 0).sum() == 0
return hist
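# Sketch: smoothing a histogram with empty bins moves a small amount of mass from the
# non-zero entries onto the zeros so that KL divergence stays finite; total mass is
# preserved up to floating-point error. The histogram values are illustrative only.
def _example_smooth_distribution():  # hypothetical helper for illustration
    hist = numpy.array([2.0, 0.0, 1.0, 0.0, 1.0], dtype=numpy.float32)
    smoothed = smooth_distribution(hist, eps=0.0001)
    assert smoothed is not None and abs(float(smoothed.sum()) - 4.0) < 1e-4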
def model_has_external_data(model_path: Path):
model = onnx.load(model_path.as_posix(), load_external_data=False)
for initializer in model.graph.initializer:
if external_data_helper.uses_external_data(initializer):
return True
return False
def optimize_model(model_path: Path, opt_model_path: Path):
"""
Generate model that applies graph optimization (constant folding, etc.)
parameter model_path: path to the original onnx model
parameter opt_model_path: path to the optimized onnx model
:return: optimized onnx model
"""
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
kwargs = {}
# This will rename constant initializer names, disable it to make test pass.
kwargs["disabled_optimizers"] = ["ConstantSharing"]
_ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"], **kwargs)
def add_pre_process_metadata(model: ModelProto):
"""Tag the model that it went through quantization pre-processing"""
metadata_props = {"onnx.quant.pre_process": "onnxruntime.quant"}
if model.metadata_props:
for prop in model.metadata_props:
metadata_props.update({prop.key: prop.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_pre_process_metadata(model: ModelProto) -> bool:
"""Check the model whether it went through quantization pre-processing"""
if model.metadata_props:
for prop in model.metadata_props:
if prop.key == "onnx.quant.pre_process" and prop.value == "onnxruntime.quant":
return True
return False
def add_infer_metadata(model: ModelProto):
metadata_props = {"onnx.infer": "onnxruntime.quant"}
if model.metadata_props:
for p in model.metadata_props:
metadata_props.update({p.key: p.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_infer_metadata(model: ModelProto) -> bool:
if model.metadata_props:
for p in model.metadata_props:
if p.key == "onnx.infer" and p.value == "onnxruntime.quant":
return True
return False
def load_model_with_shape_infer(model_path: Path) -> ModelProto:
inferred_model_path = generate_identified_filename(model_path, "-inferred")
onnx.shape_inference.infer_shapes_path(str(model_path), str(inferred_model_path))
model = onnx.load(inferred_model_path.as_posix())
add_infer_metadata(model)
inferred_model_path.unlink()
return model
def save_and_reload_model_with_shape_infer(model: ModelProto) -> ModelProto:
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
model_path = Path(quant_tmp_dir).joinpath("model.onnx")
onnx.save_model(model, model_path.as_posix(), save_as_external_data=True)
return load_model_with_shape_infer(model_path)
def tensor_proto_to_array(initializer: TensorProto) -> numpy.ndarray:
if initializer.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
return onnx.numpy_helper.to_array(initializer)
raise ValueError(
f"Only float type is supported. Weights {initializer.name} is {type_to_name[initializer.data_type]}"
)
def add_quant_suffix(tensor_name: str) -> str:
return tensor_name + "_QuantizeLinear"
def add_quant_input_suffix(tensor_name: str) -> str:
return tensor_name + QUANT_INPUT_SUFFIX
def add_quant_output_suffix(tensor_name) -> str:
return tensor_name + "_QuantizeLinear_Output"
def add_dequant_suffix(tensor_name) -> str:
return tensor_name + "_DequantizeLinear"
def add_dequant_input_suffix(tensor_name) -> str:
return tensor_name + "_DequantizeLinear_Input"
def add_dequant_output_suffix(tensor_name) -> str:
return tensor_name + DEQUANT_OUTPUT_SUFFIX

View File

@ -0,0 +1,737 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
import tempfile
from pathlib import Path
from typing import Union
import onnx
from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
QuantFormat,
QuantizationMode,
QuantType,
load_model_with_shape_infer,
model_has_pre_process_metadata,
save_and_reload_model_with_shape_infer,
)
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry
class QuantConfig:
def __init__(
self,
activation_type=QuantType.QUInt8,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
):
"""
This is the Base class for both Static and Dynamic Quantize Configuration
Args:
activation_type:
quantization data type of activation. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
"""
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
self.op_types_to_quantize = op_types_to_quantize
self.per_channel = per_channel
self.reduce_range = reduce_range
self.weight_type = weight_type
self.activation_type = activation_type
self.nodes_to_quantize = nodes_to_quantize
self.nodes_to_exclude = nodes_to_exclude
self.use_external_data_format = use_external_data_format
class StaticQuantConfig(QuantConfig):
def __init__(
self,
calibration_data_reader: CalibrationDataReader,
calibrate_method=CalibrationMethod.MinMax,
quant_format=QuantFormat.QDQ,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
extra_options=None,
):
"""
This is the derived class for static Quantize Configuration
Args:
calibration_data_reader:
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
calibrate_method:
Currently supported calibration methods are MinMax, Entropy, Percentile, and Distribution.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
Dynamic mode is currently supported. More modes will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False :
Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
DeQuantizeLinear node. If True, the weight remains in floating point and both
QuantizeLinear/DeQuantizeLinear nodes are inserted for it.
OpTypesToExcludeOutputQuantization = list of op type :
Default is []. If any op types are specified, the outputs of ops with those
op types are not quantized.
DedicatedQDQPair = True/False :
Default is False. When inserting a QDQ pair, multiple nodes can share a single QDQ pair as their
inputs. If True, an identical and dedicated QDQ pair is created for each node.
QDQOpTypePerChannelSupportToAxis = dictionary :
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
effective only when per channel quantization is supported and per_channel is True. If specific
op type supports per channel quantization but not explicitly specified with channel axis,
default channel axis will be used.
CalibTensorRangeSymmetric = True/False :
Default is False. If enabled, the final tensor range computed during calibration is explicitly
made symmetric around the central point 0.
CalibMovingAverage = True/False :
Default is False. If enabled, the moving average of the minimum and maximum values will be
computed when the calibration method selected is MinMax.
CalibMovingAverageConstant = float :
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
QuantizeBias = True/False :
Default is True, which quantizes floating-point biases and solely inserts
a DeQuantizeLinear node. If False, biases remain in floating point and no
quantization nodes associated with biases are inserted.
This extra option is only effective when quant_format is QuantFormat.QDQ.
SmoothQuant = True/False :
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
fake input channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
UseQDQContribOps = True/False :
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
TensorQuantOverrides = dictionary :
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
per-channel quantization, the list contains a dictionary for each channel in the tensor.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` is also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` is also set.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
"""
super().__init__(
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=op_types_to_quantize,
nodes_to_quantize=nodes_to_quantize,
nodes_to_exclude=nodes_to_exclude,
per_channel=per_channel,
reduce_range=reduce_range,
use_external_data_format=use_external_data_format,
)
self.calibration_data_reader = calibration_data_reader
self.calibrate_method = calibrate_method
self.quant_format = quant_format
self.extra_options = extra_options or {}
class DynamicQuantConfig(QuantConfig):
def __init__(
self,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
extra_options=None,
):
"""
This is a class for dynamic Quant Configuration
Args:
extra_options: key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraphs will be quantized. Dynamic mode is currently
supported; more modes will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
"""
super().__init__(
op_types_to_quantize=op_types_to_quantize,
per_channel=per_channel,
reduce_range=reduce_range,
weight_type=weight_type,
nodes_to_quantize=nodes_to_quantize,
nodes_to_exclude=nodes_to_exclude,
use_external_data_format=use_external_data_format,
)
self.extra_options = extra_options or {}
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
raise ValueError(
"ONNXRuntime quantization doesn't support data format:"
"activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
)
if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
raise ValueError(
f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
f"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
)
if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
raise ValueError(
"ONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, "
f"weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
)
q16_types = [QuantType.QInt16, QuantType.QUInt16]
if (activation_type in q16_types or weight_type in q16_types) and quant_format != QuantFormat.QDQ:
raise ValueError("Only QuantFormat.QDQ supports 16-bit quantization types.")
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
logging.warning(
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
"Or it will lead to bad performance on x64."
)
def quantize_static(
model_input: Union[str, Path, onnx.ModelProto],
model_output: Union[str, Path],
calibration_data_reader: CalibrationDataReader,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=None,
per_channel=False,
reduce_range=False,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
nodes_to_quantize=None,
nodes_to_exclude=None,
use_external_data_format=False,
calibrate_method=CalibrationMethod.MinMax,
extra_options=None,
):
"""
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and weight_type
= QuantType.QInt8. If model is targeted to GPU/TRT, symmetric activation and weight are required. If model is
targeted to CPU, asymmetric activation and symmetric weight are recommended for balance of performance and
accuracy.
Args:
model_input: file path of model or ModelProto to quantize
model_output: file path of quantized model
calibration_data_reader: a calibration data reader. It
enumerates calibration data and generates inputs for the
original model.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
activation_type:
quantization data type of activation. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
calibrate_method:
Currently supported calibration methods are MinMax, Entropy, Percentile, and Distribution.
Please use CalibrationMethod.MinMax, CalibrationMethod.Entropy, CalibrationMethod.Percentile,
or CalibrationMethod.Distribution as options.
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
Dynamic mode is currently supported. More modes will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False :
Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
DeQuantizeLinear node. If True, the weight remains in floating point and both
QuantizeLinear/DeQuantizeLinear nodes are inserted for it.
OpTypesToExcludeOutputQuantization = list of op type :
Default is []. If any op types are specified, the outputs of ops with those
op types are not quantized.
DedicatedQDQPair = True/False :
Default is False. When inserting a QDQ pair, multiple nodes can share a single QDQ pair as their
inputs. If True, an identical and dedicated QDQ pair is created for each node.
QDQOpTypePerChannelSupportToAxis = dictionary :
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
effective only when per channel quantization is supported and per_channel is True. If specific
op type supports per channel quantization but not explicitly specified with channel axis,
default channel axis will be used.
CalibTensorRangeSymmetric = True/False :
Default is False. If enabled, the final tensor range computed during calibration is explicitly
made symmetric around the central point 0.
CalibStridedMinMax = Optional[int] :
Default is None. If set to an integer, the min-max calculation uses only a stride's worth of
data at a time, and the partial results are merged at the end.
CalibMovingAverage = True/False :
Default is False. If enabled, the moving average of the minimum and maximum values will be
computed when the calibration method selected is MinMax.
CalibMovingAverageConstant = float :
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
CalibMaxIntermediateOutputs = Optional[int] :
Default is None. If set to an integer, at most that number of intermediate outputs is loaded
during calculation of the tensors' min-max ranges before the ranges are computed and merged.
This produces the same result as None, but is more memory efficient.
SmoothQuant = True/False :
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
fake input channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
UseQDQContribOps = True/False :
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
TensorQuantOverrides = dictionary :
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
per-channel quantization, the list contains a dictionary for each channel in the tensor.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` is also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` is also set.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
"""
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
if calibrate_method != CalibrationMethod.Distribution:
raise ValueError("Only Distribution calibration method is supported for float quantization.")
extra_options = extra_options or {}
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
mode = QuantizationMode.QLinearOps
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
q_linear_ops = list(QLinearOpsRegistry.keys())
qdq_ops = list(QDQRegistry.keys())
op_types_to_quantize = list(set(q_linear_ops + qdq_ops))
model = (
save_and_reload_model_with_shape_infer(model_input)
if isinstance(model_input, onnx.ModelProto)
else load_model_with_shape_infer(Path(model_input))
)
pre_processed: bool = model_has_pre_process_metadata(model)
if not pre_processed:
logging.warning(
"Please consider to run pre-processing before quantization. Refer to example: "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
calib_extra_options_keys = [
("CalibTensorRangeSymmetric", "symmetric"),
("CalibMovingAverage", "moving_average"),
("CalibMovingAverageConstant", "averaging_constant"),
("CalibMaxIntermediateOutputs", "max_intermediate_outputs"),
]
calib_extra_options = {
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
}
if extra_options.get("SmoothQuant", False):
import importlib
try:
importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
except Exception as e:
logging.error(f"{e}.")
raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
import copy
from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant
def inc_dataloader():
data_reader = copy.deepcopy(calibration_data_reader)
for data in data_reader:
yield data, None
orig_nodes = [i.name for i in model.graph.node]
dataloader = inc_dataloader()
sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
del dataloader
model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
model.save(model_input)
nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if isinstance(model_input, onnx.ModelProto):
output_path = str(Path(quant_tmp_dir) / "model_input.onnx")
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
)
model_input = output_path
calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
extra_options=calib_extra_options,
)
stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
calibrator.collect_data(calibration_data_reader)
else:
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
)
del calibrator
check_static_quant_arguments(quant_format, activation_type, weight_type)
if quant_format is QuantFormat.QOperator:
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
True, # static
weight_type,
activation_type,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
else:
quantizer = QDQQuantizer(
model,
per_channel,
reduce_range,
weight_type,
activation_type,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
if not pre_processed:
logging.warning(
"Please consider pre-processing before quantization. See "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
if extra_options.get("SmoothQuant", False):
sq_path.cleanup()
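# A minimal sketch of driving quantize_static end to end, assuming a float32 ONNX model
# at "model.onnx" with a single input named "input" of shape (1, 3, 224, 224); the file
# names, input name, and shapes are illustrative only.
class _RandomCalibrationDataReader(CalibrationDataReader):  # hypothetical reader for illustration
    def __init__(self, num_samples=8):
        import numpy as np
        self._data = iter(
            [{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)} for _ in range(num_samples)]
        )
    def get_next(self):
        return next(self._data, None)
def _example_quantize_static():  # hypothetical helper for illustration
    quantize_static(
        "model.onnx",
        "model.quant.onnx",
        _RandomCalibrationDataReader(),
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
    )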
def quantize_dynamic(
model_input: Union[str, Path, onnx.ModelProto],
model_output: Union[str, Path],
op_types_to_quantize=None,
per_channel=False,
reduce_range=False,
weight_type=QuantType.QInt8,
nodes_to_quantize=None,
nodes_to_exclude=None,
use_external_data_format=False,
extra_options=None,
):
"""Given an onnx model, create a quantized onnx model and save it into a file
Args:
model_input: file path of model or ModelProto to quantize
model_output: file path of quantized model
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
support more in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
"""
extra_options = extra_options or {}
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
mode = QuantizationMode.IntegerOps
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
op_types_to_quantize = list(IntegerOpsRegistry.keys())
model = (
save_and_reload_model_with_shape_infer(model_input)
if isinstance(model_input, onnx.ModelProto)
else load_model_with_shape_infer(Path(model_input))
)
pre_processed: bool = model_has_pre_process_metadata(model)
if not pre_processed:
logging.warning(
"Please consider to run pre-processing before quantization. Refer to example: "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
if "MatMulConstBOnly" not in extra_options:
extra_options["MatMulConstBOnly"] = True
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
False, # static
weight_type,
QuantType.QUInt8, # dynamic activation only supports uint8
None,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
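# A minimal sketch of dynamic quantization, which needs no calibration data reader; the
# model paths are illustrative only.
def _example_quantize_dynamic():  # hypothetical helper for illustration
    quantize_dynamic(
        "model.onnx",
        "model.dynamic.quant.onnx",
        weight_type=QuantType.QInt8,
        per_channel=False,
    )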
def quantize(
model_input: Union[str, Path, onnx.ModelProto],
model_output: Union[str, Path],
quant_config: QuantConfig,
):
"""Quantize a model with QuantConfig.
Args:
model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
model_output (str | Path): Path to save the quantized model.
quant_config (QuantConfig): Quantization Configuration.
"""
if isinstance(quant_config, StaticQuantConfig):
quantize_static(
model_input,
model_output,
quant_config.calibration_data_reader,
calibrate_method=quant_config.calibrate_method,
quant_format=quant_config.quant_format,
activation_type=quant_config.activation_type,
weight_type=quant_config.weight_type,
op_types_to_quantize=quant_config.op_types_to_quantize,
nodes_to_quantize=quant_config.nodes_to_quantize,
nodes_to_exclude=quant_config.nodes_to_exclude,
per_channel=quant_config.per_channel,
reduce_range=quant_config.reduce_range,
use_external_data_format=quant_config.use_external_data_format,
extra_options=quant_config.extra_options,
)
elif isinstance(quant_config, DynamicQuantConfig):
quantize_dynamic(
model_input,
model_output,
weight_type=quant_config.weight_type,
op_types_to_quantize=quant_config.op_types_to_quantize,
nodes_to_quantize=quant_config.nodes_to_quantize,
nodes_to_exclude=quant_config.nodes_to_exclude,
per_channel=quant_config.per_channel,
reduce_range=quant_config.reduce_range,
use_external_data_format=quant_config.use_external_data_format,
extra_options=quant_config.extra_options,
)
else:
raise TypeError("Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig.")

View File

@ -0,0 +1,105 @@
from .operators.activation import QDQRemovableActivation, QLinearActivation
from .operators.argmax import QArgMax
from .operators.attention import AttentionQuant
from .operators.base_operator import QuantOperatorBase
from .operators.binary_op import QLinearBinaryOp
from .operators.concat import QLinearConcat
from .operators.conv import ConvInteger, QDQConv, QLinearConv
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
from .operators.gather import GatherQuant, QDQGather
from .operators.gavgpool import QGlobalAveragePool
from .operators.gemm import QDQGemm, QLinearGemm
from .operators.lstm import LSTMQuant
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
from .operators.maxpool import QDQMaxPool, QMaxPool
from .operators.norm import QDQNormalization
from .operators.pad import QPad
from .operators.pooling import QLinearPool
from .operators.qdq_base_operator import QDQOperatorBase
from .operators.resize import QDQResize, QResize
from .operators.softmax import QLinearSoftmax
from .operators.split import QDQSplit, QSplit
from .operators.where import QDQWhere, QLinearWhere
from .quant_utils import QuantizationMode
CommonOpsRegistry = {
"Gather": GatherQuant,
"Transpose": Direct8BitOp,
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
}
IntegerOpsRegistry = {
"Conv": ConvInteger,
"MatMul": MatMulInteger,
"Attention": AttentionQuant,
"LSTM": LSTMQuant,
}
IntegerOpsRegistry.update(CommonOpsRegistry)
QLinearOpsRegistry = {
"ArgMax": QArgMax,
"Conv": QLinearConv,
"Gemm": QLinearGemm,
"MatMul": QLinearMatMul,
"Add": QLinearBinaryOp,
"Mul": QLinearBinaryOp,
"Relu": QLinearActivation,
"Clip": QLinearActivation,
"LeakyRelu": QLinearActivation,
"Sigmoid": QLinearActivation,
"MaxPool": QMaxPool,
"GlobalAveragePool": QGlobalAveragePool,
"Split": QSplit,
"Pad": QPad,
"Reshape": Direct8BitOp,
"Squeeze": Direct8BitOp,
"Unsqueeze": Direct8BitOp,
"Resize": QResize,
"AveragePool": QLinearPool,
"Concat": QLinearConcat,
"Softmax": QLinearSoftmax,
"Where": QLinearWhere,
}
QLinearOpsRegistry.update(CommonOpsRegistry)
QDQRegistry = {
"Conv": QDQConv,
"ConvTranspose": QDQConv,
"Gemm": QDQGemm,
"Clip": QDQRemovableActivation,
"Relu": QDQRemovableActivation,
"Reshape": QDQDirect8BitOp,
"Transpose": QDQDirect8BitOp,
"Squeeze": QDQDirect8BitOp,
"Unsqueeze": QDQDirect8BitOp,
"Resize": QDQResize,
"MaxPool": QDQMaxPool,
"AveragePool": QDQDirect8BitOp,
"MatMul": QDQMatMul,
"Split": QDQSplit,
"Gather": QDQGather,
"Where": QDQWhere,
"InstanceNormalization": QDQNormalization,
"LayerNormalization": QDQNormalization,
"BatchNormalization": QDQNormalization,
}
def CreateDefaultOpQuantizer(onnx_quantizer, node): # noqa: N802
return QuantOperatorBase(onnx_quantizer, node)
def CreateOpQuantizer(onnx_quantizer, node): # noqa: N802
registry = IntegerOpsRegistry if onnx_quantizer.mode == QuantizationMode.IntegerOps else QLinearOpsRegistry
if node.op_type in registry:
op_quantizer = registry[node.op_type](onnx_quantizer, node)
if op_quantizer.should_quantize():
return op_quantizer
return QuantOperatorBase(onnx_quantizer, node)
def CreateQDQQuantizer(onnx_quantizer, node): # noqa: N802
if node.op_type in QDQRegistry:
return QDQRegistry[node.op_type](onnx_quantizer, node)
return QDQOperatorBase(onnx_quantizer, node)
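# Sketch: the registries are plain dictionaries keyed by op type, so the quantizer class
# selected for a node can be inspected directly.
def _example_registry_lookup():  # hypothetical helper for illustration
    assert QLinearOpsRegistry["Conv"] is QLinearConv
    assert QDQRegistry["MatMul"] is QDQMatMul
    assert "Gather" in IntegerOpsRegistry  # merged in from CommonOpsRegistry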

Some files were not shown because too many files have changed in this diff.