I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
from .preprocess import qnn_preprocess_model # noqa: F401
from .quant_config import get_qnn_qdq_config # noqa: F401

View File

@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ...fusions import Fusion
from ...onnx_model import ONNXModel
class FusionLpNormalization(Fusion):
def __init__(self, model: ONNXModel, epsilon: float = 1e-12):
super().__init__(model, "LpNormalization", "ReduceL2")
self.epsilon = epsilon
def fuse(
self,
reduce_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceL2 node into a single
LpNormalization node.
Pattern 1:
[root] --> ReduceL2 -----> Clip --> Expand ----> Div -->
| (axis=-1) (min=epsilon) (shape=root) ^
| (keepdims=True) |
| |
+-----------------------------------------------+
Notes:
- ReduceL2 must use the last axis, and keepdims == True
- Clip must only have a min attribute that is ~1e-12
- Expand must restore the shape to root.shape
- The output of Expand must be the second input to Div.
"""
if reduce_node.output[0] not in input_name_to_nodes:
return
# ReduceL2 must have one Clip child
children = input_name_to_nodes[reduce_node.output[0]]
if len(children) != 1 or children[0].op_type != "Clip":
return
# ReduceL2 must have keepdims == True
keepdims = self.get_node_attribute(reduce_node, "keepdims")
if not keepdims:
return
# ReduceL2 axes must refer only to the last dimension.
# Axes became an input in opset 18. Before then, axes was an attribute
reduce_input_ttype = self.model.get_tensor_type(reduce_node.input[0])
if not reduce_input_ttype:
return
reduce_input_shape = self.tensor_shape_to_list(reduce_input_ttype)
if not reduce_input_shape:
return
axes = self.get_node_attribute(reduce_node, "axes")
if not axes and len(reduce_node.input) > 1:
axes = self.model.get_constant_value(reduce_node.input[1])
if not axes or len(axes) != 1:
return
last_dim = len(reduce_input_shape) - 1
if axes[0] != -1 and axes[0] != last_dim:
return
# Clip node must have a min attribute approximately equal to 1e-12
clip_node = children[0]
clip_min = self.get_node_attribute(clip_node, "min")
if clip_min is None and len(clip_node.input) > 1:
clip_min = self.model.get_constant_value(clip_node.input[1])
clip_max = self.get_node_attribute(clip_node, "max") # TODO: clip_max could be FLOAT_MAX
if clip_max is None and len(clip_node.input) > 2:
clip_max = self.model.get_constant_value(clip_node.input[2])
if not (clip_max is None and clip_min is not None and clip_min > 0 and abs(clip_min - self.epsilon) < 1e-13):
return
if clip_node.output[0] not in input_name_to_nodes:
return
# Clip must have a single Expand child.
children = input_name_to_nodes[clip_node.output[0]]
if len(children) != 1 or children[0].op_type != "Expand":
return
expand_node = children[0]
if expand_node.output[0] not in input_name_to_nodes:
return
# Expand must have a single Div child
children = input_name_to_nodes[expand_node.output[0]]
if len(children) != 1 or children[0].op_type != "Div":
return
div_node = children[0]
# The first input to Div must be the root of the subgraph (i.e., reduce_node.input[0])
# The second input to Div must be the output of the Expand.
# As long as these two inputs go to the same Div node, then ONNX validation will ensure that
# their shapes match.
if div_node.input[0] != reduce_node.input[0]:
return
if div_node.input[1] != expand_node.output[0]:
return
subgraph_input = reduce_node.input[0]
subgraph_output = div_node.output[0]
subgraph_nodes = [reduce_node, clip_node, expand_node, div_node]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
self.fused_op_type,
name=self.create_unique_node_name(),
inputs=[subgraph_input],
outputs=[subgraph_output],
p=2,
axis=-1,
)
self.nodes_to_add.append(fused_node)
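
For reference, a minimal sketch of running this fusion on its own, following the same pattern the qnn_preprocess_model helper later in this commit uses (the file paths are placeholders; ONNXModel, apply(), and topological_sort() usage mirror that helper):

```
import onnx

model = onnx.load_model("model.onnx")          # placeholder path
onnx_model = ONNXModel(model)

fusion_lpnorm = FusionLpNormalization(onnx_model)
if fusion_lpnorm.apply():                      # True if any subgraph was fused
    onnx_model.topological_sort()              # keep node ordering valid after the rewrite
    onnx.save_model(onnx_model.model, "model_fused.onnx")
```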

View File

@@ -0,0 +1,413 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from dataclasses import dataclass
import onnx
from ...quant_utils import QuantType
from ...tensor_quant_overrides import QuantTypeInfo, TensorQuantOverridesHelper
@dataclass
class TensorTypeRequest:
"""
Bundles desired quantization type requests for a tensor. A distinction is made between the
produced type and the consumed type.
"""
# The tensor's quant type at the producer end. If None, assumed to be the default activation quant type.
producer: QuantTypeInfo | None
# The tensor's quant type received by a set of consumer nodes.
# If None, assumed to be the default activation quant type for all consumers.
# consumers[1] is a set of consumer node names.
consumers: tuple[QuantTypeInfo, set[str]] | None
class MixedPrecisionTensorQuantOverridesFixer:
"""
Helper that generates tensor quantization overrides for mixed-precision QDQ models.
Specifically, this helper fixes an initial set of quantization overrides that assign a non-default
activation quantization type to one or more tensors by doing the following:
- Inferring which other tensors need to be overridden to the non-default activation quantization type.
- Inserting quantization data type conversions.
Example:
--------
Float model:
input_0 --> Op1 --> Op3 --> Op5 --> Op6 --> output_0
^
|
input_1 --> Op2 -+-> Op4 ----+
|
+-> Op7 --> output_1
|
+-> Op8 --> output_2
If we'd like to quantize this model to uint8 precision, but would like to make sure tensor "Op4_out"
is quantized to 16-bit, then we would specify the following initial tensor quantization overrides:
```
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
```
These initial overrides may not create a valid model because Op4 and Op5 may require both the input and output
to be the same type (e.g., uint16). This helper fixes the overrides so that input/output data types
are valid:
```
overrides = TensorQuantOverridesHelper(init_overrides)
fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
fixer.apply(
default_activation_qtype=QuantType.QUInt8,
default_activation_symmetric=False,
)
```
The above snippet generates the following "fixed" overrides (get via overrides.get_dict()):
{
"Op2_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op4"}}}],
"Op3_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op5"}}}],
"Op4_out": [{"quant_type": QUInt16}],
"Op5_out": [{"quant_type": QUInt16, "convert": {"quant_type": QUInt8, "recv_nodes": {"Op6"}}}]
}
How to interpret the fixed overrides:
- Op2's output is consumed by Op4, Op7, and Op8. Op4 consumes the converted u16 type,
but Op7 and Op8 consume the original u8 type.
- Op3's output is converted from u8 to u16. Op5 consumes the converted u16 type.
- Op4's output is just u16 (not converted). All consumers of Op4_out get the u16 type.
- Op5's output is converted from u16 to u8. Op6 consumes the u8 type.
"""
def __init__(
self,
overrides: TensorQuantOverridesHelper,
producers: dict[str, onnx.NodeProto],
consumers: dict[str, list[onnx.NodeProto]],
value_infos: dict[str, onnx.ValueInfoProto],
initializers: dict[str, onnx.TensorProto],
):
"""
Params:
overrides: The initial tensor quantization overrides to fix.
producers: Dictionary that maps a tensor name to the producer node that generates the tensor.
consumers: Dictionary that maps a tensor name to the consumer nodes that take the tensor as input.
value_infos: Dictionary that maps a tensor name to its onnx.ValueInfoProto.
initializers: Dictionary that maps an initializer name to its onnx.TensorProto.
"""
self.overrides = overrides
self.consumers = consumers
self.producers = producers
self.value_infos = value_infos
self.initializers = initializers
@staticmethod
def create_from_model(
overrides: TensorQuantOverridesHelper, model: onnx.ModelProto, default_activation_qtype: QuantType
) -> MixedPrecisionTensorQuantOverridesFixer:
"""
Helper function that creates an instance of this class from a loaded ONNX model.
Params:
overrides: The initial tensor quantization overrides to fix.
model: Loaded ONNX model
default_activation_qtype: The intended default activation quantization type.
Used to validate the initial overrides.
Returns:
Initialized MixedPrecisionTensorQuantOverridesFixer object
"""
model = onnx.shape_inference.infer_shapes(model) # Need to infer shapes to get value_infos
# Build dictionaries that enable convenient lookups of initializers and value_infos by name.
initializers = {initializer.name: initializer for initializer in model.graph.initializer}
value_infos = {vi.name: vi for vi in model.graph.value_info}
value_infos.update({ot.name: ot for ot in model.graph.output})
value_infos.update({it.name: it for it in model.graph.input})
# Ensure that the user-provided initial overrides are actually valid.
valid, err = overrides.is_valid(initializers, set(value_infos), default_activation_qtype)
if not valid:
pprint_overrides = overrides.pprint_str(indent=4)
logging.error(f"Provided invalid tensor quantization overrides:\n{pprint_overrides}")
raise ValueError(err)
consumers = {}
producers = {}
# Build dictionaries that map a tensor name to the consumer or producer nodes.
for node in model.graph.node:
for input_name in node.input:
if input_name:
if input_name not in consumers:
consumers[input_name] = []
consumers[input_name].append(node)
for output_name in node.output:
producers[output_name] = node
return MixedPrecisionTensorQuantOverridesFixer(overrides, producers, consumers, value_infos, initializers)
def apply(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
):
"""
Fixes the initial tensor quantization overrides (in-place) for use in mixed-precision QDQ models.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
"""
type_requests = self.get_desired_tensor_types(default_activation_qtype, default_activation_symmetric)
# Use type requests to "fix" tensor quantization overrides by adding
# quantization type conversions where necessary.
for tensor_name, type_req in type_requests.items():
all_consumers = set([node.name for node in self.consumers.get(tensor_name, [])])
has_producer_req = type_req.producer is not None
has_consumer_req = bool(type_req.consumers)
# Only producer type: Add conversion back to default activation type
if has_producer_req and not has_consumer_req:
self._update_converted_tensor(
tensor_name, type_req.producer, QuantTypeInfo(default_activation_qtype), all_consumers
)
# Only consumers
elif not has_producer_req and has_consumer_req:
prod_type_info = self.overrides.get_node_output_qtype_info(tensor_name, default_activation_qtype)
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
if not self._check_nodes_are_not_convert_consumers(tensor_name, type_req.consumers[1]):
raise ValueError(
f"Tensor override for '{tensor_name}' converts the type for consumers that need the original type."
)
# Both producer and consumers
elif has_producer_req and has_consumer_req:
prod_type_info = type_req.producer
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
consumers_for_original_type = all_consumers.difference(type_req.consumers[1])
if len(consumers_for_original_type) == 0:
# All consumers want the overridden type, so no need for convert nodes!
# Just add the override entry if not already present.
if tensor_name not in self.overrides:
self.overrides[tensor_name] = [{}]
prod_type_info.save_to_dict(self.overrides[tensor_name][0])
assert "convert" not in self.overrides[tensor_name][0]
else:
# Some consumers don't want the overridden type.
self._update_converted_tensor(
tensor_name,
prod_type_info,
QuantTypeInfo(default_activation_qtype),
consumers_for_original_type,
)
else:
raise ValueError(f"TypeRequest for tensor {tensor_name} has no producer or consumers.")
# Done. Check if the overrides are valid.
valid, err = self.overrides.is_valid(self.initializers, set(self.value_infos), default_activation_qtype)
if not valid:
pprint_overrides = self.overrides.pprint_str(indent=4)
logging.error(
f"Generated invalid tensor quantization overrides for mixed-precision QDQ model:\n{pprint_overrides}"
)
raise ValueError(err)
def get_desired_tensor_types(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
) -> dict[str, TensorTypeRequest]:
"""
Iterates through the initial tensor quantization overrides and builds a set of TensorTypeRequest objects
that describe the quantization types required at each tensor. These TensorTypeRequest objects are ultimately
used to generate the "fixed" overrides.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
Returns:
TensorTypeRequest objects as a dict that maps a tensor name to its requested types.
"""
type_requests = {}
default_activation_type_info = QuantTypeInfo(default_activation_qtype, default_activation_symmetric)
# Scan tensor overrides for type conversion requests.
for tensor_name, override_list in self.overrides.items():
if not self.__is_tensor_quantizable(tensor_name):
continue # Skip non-quantizable tensors (e.g., not a float)
if tensor_name in self.initializers:
continue # Skip initializers
if not override_list or len(override_list) > 1:
continue # Skip per-channel stuff
override_dict = override_list[0]
quant_type_info = QuantTypeInfo.load_from_dict(override_dict, default_activation_type_info.quant_type)
producer_node = self.producers.get(tensor_name) # None if this is a model input
if quant_type_info != default_activation_type_info and "convert" not in override_dict:
if producer_node is not None:
self._add_type_requests_for_node(type_requests, quant_type_info, producer_node)
# Find all consumer nodes of `tensor_name` and update their inputs/outputs to the new type.
for consumer_node in self.consumers.get(tensor_name, []):
self._add_type_requests_for_node(type_requests, quant_type_info, consumer_node)
return type_requests
def _add_type_requests_for_node(
self,
type_requests: dict[str, TensorTypeRequest],
quant_type_info: QuantTypeInfo,
node: onnx.NodeProto,
):
"""
Adds TensorTypeRequest objects for a given node, assuming that we want all its inputs and outputs
to have the same quantization type (as specified by the `quant_type_info` parameter).
Params:
type_requests: Dictionary of type requests to append to for this node.
quant_type_info: The quantization type to use for inputs and outputs.
node: The node for which the TensorTypeRequest objects are created and added to type_requests.
"""
# Add output side
for output_name in node.output:
if not self.__is_tensor_quantizable(output_name):
continue
if output_name not in type_requests:
type_requests[output_name] = TensorTypeRequest(quant_type_info, None)
else:
if (
type_requests[output_name].producer is not None
and type_requests[output_name].producer != quant_type_info
):
raise ValueError(f"Tensor {output_name} has multiple types.")
type_requests[output_name].producer = quant_type_info
# Add the consumer side
for input_name in node.input:
if input_name and input_name not in self.initializers and self.__is_tensor_quantizable(input_name):
if input_name not in type_requests:
type_requests[input_name] = TensorTypeRequest(None, None)
if type_requests[input_name].consumers is None:
type_requests[input_name].consumers = (quant_type_info, set())
if type_requests[input_name].consumers[0] != quant_type_info:
raise ValueError(f"Tensor {input_name} has consumers requesting different types.")
if not node.name:
raise ValueError(
f"Node of type {node.op_type} with output 0 {node.output[0]} does not have a name!"
)
type_requests[input_name].consumers[1].add(node.name)
def _update_converted_tensor(
self,
tensor_name: str,
producer_type_info: QuantTypeInfo,
consumer_type_info: QuantTypeInfo,
consumer_names: set[str],
):
"""
Updates the tensor quantization overrides for a tensor that is converted from one type to another.
Params:
tensor_name: The name of the tensor for which to update overrides.
producer_type_info: Info for the tensor's produced type.
consumer_type_info: Info for the tensor's consumed (i.e., converted) type.
consumer_names: Nodes names of consumers that consume the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
self.overrides[tensor_name] = [{}]
producer_type_info.save_to_dict(self.overrides[tensor_name][0])
overrides = self.overrides[tensor_name][0]
if producer_type_info != QuantTypeInfo.load_from_dict(overrides):
raise ValueError(f"Desired producer quant_type for {tensor_name} doesn't match existing type.")
if consumer_names:
if "convert" not in overrides:
overrides["convert"] = {}
consumer_type_info.save_to_dict(overrides["convert"])
convert_dict = overrides["convert"]
if consumer_type_info != QuantTypeInfo.load_from_dict(convert_dict):
raise ValueError(f"Desired consumer quant_type for {tensor_name} doesn't match existing type.")
if "recv_nodes" not in convert_dict:
convert_dict["recv_nodes"] = set()
convert_dict["recv_nodes"].update(consumer_names)
def _check_nodes_are_not_convert_consumers(self, tensor_name: str, node_names: set[str]):
"""
Returns true if the given nodes do not consume/receive a converted quantization type.
Params:
tensor_name: The name of the tensor to check.
node_names: Set of node names that should not be consumers of the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
return True
overrides = self.overrides[tensor_name][0]
if "convert" not in overrides:
return True
convert_dict = overrides["convert"]
if "recv_nodes" not in convert_dict:
return False
return not convert_dict["recv_nodes"].intersection(node_names)
def __is_tensor_quantizable(self, tensor_name):
weight = self.initializers.get(tensor_name)
if weight is not None:
if weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
return True
elif tensor_name in self.value_infos:
vi = self.value_infos[tensor_name]
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
onnx.TensorProto.FLOAT,
onnx.TensorProto.FLOAT16,
):
return True
return False
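
To tie the above together, here is a minimal sketch of the fixing workflow described in the class docstring (placeholder model path; the override values mirror the docstring's example):

```
import onnx

model = onnx.load_model("model.onnx")  # placeholder path

init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
overrides = TensorQuantOverridesHelper(init_overrides)

fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
fixer.apply(default_activation_qtype=QuantType.QUInt8, default_activation_symmetric=False)

# The helper now contains the "fixed" overrides, including any "convert" entries.
print(overrides.get_dict())
```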

View File

@@ -0,0 +1,307 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from pathlib import Path
import onnx
from ...fusions import FusionGelu, FusionLayerNormalization
from ...onnx_model import ONNXModel
from .fusion_lpnorm import FusionLpNormalization
def qnn_preprocess_model(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
fuse_layernorm: bool = False,
save_as_external_data: bool = False,
all_tensors_to_one_file: bool = False,
external_data_location: str | None = None,
external_data_size_threshold: int = 1024,
external_data_convert_attribute: bool = False,
inputs_to_make_channel_last: list[str] | None = None,
outputs_to_make_channel_last: list[str] | None = None,
) -> bool:
"""
If necessary, this method creates a new "pre-processed" model in preparation for
quantization of a model to be used in QNN EP. Returns true if a new model was created.
This method performs the following operations:
- Fuse Erf sequence into a single Gelu node.
- Fuse ReduceL2 sequence into a single LpNormalization node (p == 2).
- (Optional) Fuse ReduceMean sequence into a single LayerNormalization node.
Args:
model_input: Path to the input model file or ModelProto.
model_output: Path to the output model file, which is only created if this method returns True.
fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes.
Defaults to False.
save_as_external_data: True if output model should be saved with external data. Defaults to false.
all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false.
If true, save all tensors to one external file specified by external_data_location.
If false, save each tensor to a file named with the tensor name.
external_data_location: Effective only if save_as_external_data is true. Defaults to None.
Specify the external file to which all tensors are saved. Path is relative
to the model path. If not specified, the model's name is used.
external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024.
Tensors with a data size >= external_data_size_threshold are converted to external data.
To convert every tensor with raw data to external data, set to 0.
external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false.
If true, convert all tensors to external data.
If false, convert only non-attribute tensors to external data.
inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example,
if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it.
Original:
input0 (N, C, D1, D2, ..., Dn) --> <Nodes>
Updated:
input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> <Nodes>
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example,
if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it.
Original:
<Nodes> --> output0 (N, C, D1, D2, ..., Dn)
Updated:
<Nodes> --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C)
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
"""
modified = False
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
onnx_model = ONNXModel(model)
# Fuse Erf sequence into a single Gelu
fusion_gelu = FusionGelu(onnx_model)
if fusion_gelu.apply():
modified = True
# Fuse ReduceL2 sequence into a single LpNormalization node with p == 2.
fusion_lpnorm = FusionLpNormalization(onnx_model)
if fusion_lpnorm.apply():
modified = True
# Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
if fuse_layernorm:
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
# Need opset >= 17 to use LayerNormalization.
if onnx_opset.version < 17:
logging.warning(
"Unable to fuse ReduceMean sequence into a LayerNormalization node. "
"ONNX model must use an opset >= 17 in order to use LayerNormalization, "
f"but found version {onnx_opset.version}. Please use onnx.version_converter to update your model."
)
else:
fusion_layernorm = FusionLayerNormalization(onnx_model)
if fusion_layernorm.apply():
modified = True
# Optionally, transpose inputs and/or outputs to make them "channel-last".
if inputs_to_make_channel_last or outputs_to_make_channel_last:
transpose_node_prefix = "Transpose_channel_"
transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1
update_io_to_channel_last(
onnx_model.model,
inputs_to_make_channel_last,
outputs_to_make_channel_last,
transpose_node_name_prefix=transpose_node_prefix,
transpose_node_name_start_suffix=transpose_node_suffix,
)
modified = True
# Make sure all nodes have a name.
unnamed_node_prefix = "qnn_preproc_node_"
available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1
for node in onnx_model.model.graph.node:
if node.op_type != "Constant" and not node.name:
new_node_name = f"{unnamed_node_prefix}{available_suffix!s}"
available_suffix += 1
node.name = new_node_name
modified = True
logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.")
if modified:
onnx_model.topological_sort()
onnx.save_model(
model,
model_output,
save_as_external_data=save_as_external_data,
all_tensors_to_one_file=all_tensors_to_one_file,
location=external_data_location,
size_threshold=external_data_size_threshold,
convert_attribute=external_data_convert_attribute,
)
return modified
class InputOutputNameMap:
def __init__(
self,
orig_tensor_names: set[str],
orig_graph_inputs: dict[str, onnx.ValueInfoProto],
orig_graph_outputs: dict[str, onnx.ValueInfoProto],
):
self.orig_tensor_names = orig_tensor_names
self.orig_graph_inputs = orig_graph_inputs
self.orig_graph_outputs = orig_graph_outputs
self.updated_io_names = {}
self.new_value_infos = []
def get_new_name(self, orig_name: str):
if orig_name in self.updated_io_names:
return self.updated_io_names[orig_name]
# Make a new tensor name that is unique among all tensors in the graph.
prefix: str = f"{orig_name}_channel_first_"
suffix: int = -1
for tensor_name in self.orig_tensor_names:
if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit():
index = int(tensor_name[len(prefix) :])
suffix = max(suffix, index)
suffix += 1 # This is the first available suffix.
new_name = f"{prefix}{suffix!s}"
# Add new value_info objects for these new tensors.
orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name]
value_info_proto = onnx.ValueInfoProto()
value_info_proto.CopyFrom(orig_value_info)
value_info_proto.name = new_name
self.new_value_infos.append(value_info_proto)
self.updated_io_names[orig_name] = new_name
return self.updated_io_names[orig_name]
def update_io_to_channel_last(
model: onnx.ModelProto,
inputs_to_update: list[str] | None,
outputs_to_update: list[str] | None,
transpose_node_name_prefix: str = "Transpose_channel_",
transpose_node_name_start_suffix: int = 0,
):
inputs_to_update = set(inputs_to_update or [])
outputs_to_update = set(outputs_to_update or [])
if not inputs_to_update and not outputs_to_update:
return
graph = model.graph
orig_graph_inputs = {ginput.name: ginput for ginput in graph.input}
orig_graph_outputs = {goutput.name: goutput for goutput in graph.output}
# Check that the user passed in actual input and output names.
for input_name in inputs_to_update:
if input_name not in orig_graph_inputs:
raise ValueError(f"{input_name} is not a graph input")
for output_name in outputs_to_update:
if output_name not in orig_graph_outputs:
raise ValueError(f"{output_name} is not a graph output")
orig_tensor_names = set()
orig_tensor_names.update(set(orig_graph_inputs))
orig_tensor_names.update(set(orig_graph_outputs))
orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
# Maps original input (or output) name to its updated name used within the graph.
io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs)
# Update each node's inputs/outputs to use the transposed versions.
for node in graph.node:
for i in range(len(node.input)):
if node.input[i] and node.input[i] in inputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
elif node.input[i] and node.input[i] in outputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
for i in range(len(node.output)):
if node.output[i] in outputs_to_update:
node.output[i] = io_map.get_new_name(node.output[i])
# Update graph inputs to channel-last and add a Transpose (to channel-first) after each.
for g_input_name in inputs_to_update:
g_input = orig_graph_inputs[g_input_name]
if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape")
input_shape = g_input.type.tensor_type.shape
input_rank = len(input_shape.dim)
if input_rank < 3:
raise ValueError(f"Expected input {g_input.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(input_shape.dim[1])
for i in range(1, input_rank - 1):
input_shape.dim[i].CopyFrom(input_shape.dim[i + 1])
input_shape.dim[input_rank - 1].CopyFrom(channel_dim)
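# Build the permutation that maps the (now channel-last) graph input back to channel-first
# for the rest of the graph: [0, rank-1, 1, 2, ..., rank-2].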
transpose_perm = list(range(input_rank))
for i in range(input_rank):
transpose_perm[i] = i if i < 1 else i - 1
transpose_perm[1] = input_rank - 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[g_input.name],
outputs=[io_map.get_new_name(g_input.name)],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
# Update graph outputs to channel-last and add a Transpose (from channel-first) before each.
for g_output_name in outputs_to_update:
g_output = orig_graph_outputs[g_output_name]
if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape")
output_shape = g_output.type.tensor_type.shape
output_rank = len(output_shape.dim)
if output_rank < 3:
raise ValueError(f"Expected output {g_output.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(output_shape.dim[1])
for i in range(1, output_rank - 1):
output_shape.dim[i].CopyFrom(output_shape.dim[i + 1])
output_shape.dim[output_rank - 1].CopyFrom(channel_dim)
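# Build the permutation that maps the internal channel-first tensor to the channel-last
# graph output: [0, 2, 3, ..., rank-1, 1].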
transpose_perm = list(range(output_rank))
for i in range(output_rank):
transpose_perm[i] = i if i == 0 else i + 1
transpose_perm[output_rank - 1] = 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[io_map.get_new_name(g_output.name)],
outputs=[g_output.name],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
graph.value_info.extend(io_map.new_value_infos)
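
A minimal sketch of calling this preprocessor ahead of quantization (the file paths and the channel-last input name are placeholders):

```
modified = qnn_preprocess_model(
    "model.onnx",
    "model_preproc.onnx",
    fuse_layernorm=True,                     # requires ONNX opset >= 17
    inputs_to_make_channel_last=["input0"],  # optional layout change
)

# The output file is only written when something changed, so fall back to the
# original model otherwise.
model_to_quantize = "model_preproc.onnx" if modified else "model.onnx"
```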

View File

@@ -0,0 +1,387 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import copy
import logging
from pathlib import Path
from typing import Any
import numpy as np
import onnx
from ...calibrate import CalibrationDataReader, CalibrationMethod
from ...quant_utils import QuantType
from ...quantize import StaticQuantConfig
from ...tensor_quant_overrides import TensorQuantOverridesHelper
from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
OP_TYPES_TO_EXCLUDE = {"Cast"}
MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB
def warn_unable_to_override(
node: onnx.NodeProto,
what_str: str,
tensor_name: str,
io_kind: str,
):
logging.warning(
f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
"because it has already been overridden! Check the initial quantization overrides provided "
"to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
f"Node name: {node.name}, {io_kind} name: {tensor_name}"
)
def get_qnn_qdq_config(
model_input: str | Path | onnx.ModelProto,
calibration_data_reader: CalibrationDataReader,
calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
activation_type: QuantType = QuantType.QUInt8,
weight_type: QuantType = QuantType.QUInt8,
per_channel: bool = False,
init_overrides: dict[str, list[dict[str, Any]]] | None = None,
add_qtype_converts: bool = True,
activation_symmetric: bool = False,
weight_symmetric: bool | None = None,
keep_removable_activations: bool = False,
stride: int | None = None,
) -> StaticQuantConfig:
"""
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
This is done primarily by setting tensor-level quantization overrides.
Params:
model_input: Path to the input model file or ModelProto.
calibration_data_reader: Calibration data reader.
calibrate_method: The calibration method. Defaults to MinMax.
activation_type: The default activation quantization type. Defaults to QUInt8.
weight_type: The default weight quantization type. Defaults to QUInt8.
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
and their quantization axes.
If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
- Conv:
- input[1] on axis 0
- input[2] (bias) on axis 0
- ConvTranspose:
- input[1] on axis 1
- input[2] (bias) on axis 0
init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates a copy
of these overrides with any necessary adjustments and includes them in the returned
configuration object (i.e., config.extra_options['TensorQuantOverrides']).
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
key must be present in the first dictionary for per-channel quantization.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if `scale`
or `zero_point` are also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if `scale`
or `zero_point` are also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'convert' = Dict : A nested dictionary with the same keys for an activation
tensor that should be converted to another quantization type.
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
other nodes get the original type. If not specified,
assume all consumer nodes get the converted type.
add_qtype_converts: True if this function should automatically add "convert" entries to the provided
`init_overrides` to ensure that operators use valid input/output types (activations only).
Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
the zero-point values are 128 and 32,768, respectively.
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
be removed, and will be explicitly represented in the QDQ model. If false, these activations
are automatically removed if activations are asymmetrically quantized. Keeping these activations
is necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
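stride: Optional calibration stride, forwarded to the quantizer as the 'CalibStridedMinMax' extra option.
Defaults to None.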
Returns:
A StaticQuantConfig object
"""
if weight_symmetric is None:
weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
model = (
model_input
if isinstance(model_input, onnx.ModelProto)
else onnx.load_model(model_input, load_external_data=False)
)
op_types = set()
model_has_external_data = False
name_to_initializer = {}
# Build map of initializers (name -> initializer) and
# check if the model has external data.
for initializer in model.graph.initializer:
name_to_initializer[initializer.name] = initializer
if onnx.external_data_helper.uses_external_data(initializer):
model_has_external_data = True
overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
if not overrides_helper.empty() and add_qtype_converts:
# Fix mixed-precision overrides.
overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
overrides_helper, model, activation_type
)
overrides_fixer.apply(activation_type, activation_symmetric)
# Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
qnn_compat = QnnCompatibilityOverrides(
activation_type,
weight_type,
activation_symmetric,
weight_symmetric,
per_channel,
overrides_helper,
name_to_initializer,
)
for node in model.graph.node:
op_types.add(node.op_type)
qnn_compat.process_node(node)
extra_options = {
"MinimumRealRange": 0.0001,
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
"QDQKeepRemovableActivations": keep_removable_activations,
"TensorQuantOverrides": overrides_helper.get_dict(),
"ActivationSymmetric": activation_symmetric,
"WeightSymmetric": weight_symmetric,
"CalibStridedMinMax": stride,
}
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
# on Q/DQ operators if using 16-bit or 4-bit quantization.
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
if onnx_opset.version < 21:
opset21_types = Q16_TYPES.union(Q4_TYPES)
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
extra_options["UseQDQContribOps"] = True
return StaticQuantConfig(
calibration_data_reader,
calibrate_method=calibrate_method,
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=list(op_types.difference(OP_TYPES_TO_EXCLUDE)),
per_channel=per_channel,
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
extra_options=extra_options,
)
class QnnCompatibilityOverrides:
"""
Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
compatible with QNN EP.
"""
def __init__(
self,
default_activation_qtype: QuantType,
default_weight_qtype: QuantType,
activation_symmetric: bool,
weight_symmetric: bool,
per_channel: bool,
overrides: TensorQuantOverridesHelper,
initializers: dict[str, onnx.TensorProto],
):
self.default_activation_qtype = default_activation_qtype
self.default_weight_qtype = default_weight_qtype
self.activation_symmetric = activation_symmetric
self.weight_symmetric = weight_symmetric
self.per_channel = per_channel
self.overrides = overrides
self.initializers = initializers
self.process_fns = {
"MatMul": self._process_matmul,
"LayerNormalization": self._process_layernorm,
"Sigmoid": self._process_sigmoid,
"Tanh": self._process_tanh,
}
def process_node(self, node: onnx.NodeProto):
process_fn = self.process_fns.get(node.op_type)
if process_fn is not None:
process_fn(node)
def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
"""
Overrides initializer input(s) to use the default weight type if:
- The default weight type is 8-bit
- One of the inputs is a 16-bit activation
- The other input is an initializer (per-tensor quantized)
This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
inputs the default weight type. Instead, it assigns the default activation type.
"""
if self.default_weight_qtype not in Q8_TYPES:
return
input_16bit_act_name = None
input_weight_name = None
# Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
for i in range(2):
input_name = node.input[i]
if not input_name:
continue
is_weight = input_name in self.initializers
qtype_info = self.overrides.get_node_input_qtype_info(
input_name,
node.name,
default_qtype=None if is_weight else self.default_activation_qtype,
)
if qtype_info.axis is not None:
return # Don't process MatMul with a per-channel quantized input.
if (
is_weight
and qtype_info.quant_type == self.default_weight_qtype
and qtype_info.symmetric == self.weight_symmetric
):
return  # Weight is already overridden to use the desired weight type.
if is_weight:
input_weight_name = input_name
elif qtype_info.quant_type in Q16_TYPES:
input_16bit_act_name = input_name
# Override initializer input to use the default weight type.
if input_16bit_act_name and input_weight_name:
did_update = self.overrides.update_tensor_overrides(
input_weight_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
overwrite=False,
)
if not did_update:
warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
def _process_matmul(self, node: onnx.NodeProto):
assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
# QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
# quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
# provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
# the user did not provide any other overrides.
for input_name in node.input:
is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
if is_weight_no_overrides:
self.overrides.update_tensor_overrides(
input_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
)
def _process_layernorm(self, node: onnx.NodeProto):
assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
has_bias_no_overrides = (
len(node.input) > 2
and node.input[2]
and node.input[2] in self.initializers
and node.input[2] not in self.overrides
)
if has_weight_no_overrides or has_bias_no_overrides:
# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
# tries to make it per-channel if the weight is also per-channel.
raise ValueError(
"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
" Please try using custom overrides that make bias per-tensor quantized."
)
def _process_sigmoid(self, node: onnx.NodeProto):
"""
Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 65536.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
def _process_tanh(self, node: onnx.NodeProto):
"""
Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(32768, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
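
Finally, a minimal end-to-end sketch of using this helper. The data reader below is a placeholder that feeds random tensors, the input name and shape are assumptions, and the `quantize(model_input, model_output, quant_config)` entry point is assumed to be available from onnxruntime.quantization:

```
import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantType, quantize


class RandomDataReader(CalibrationDataReader):
    """Placeholder reader that yields a few random samples for calibration."""

    def __init__(self, num_samples=8):
        self._data = iter(
            [{"input0": np.random.rand(1, 3, 224, 224).astype(np.float32)} for _ in range(num_samples)]
        )

    def get_next(self):
        return next(self._data, None)


qnn_config = get_qnn_qdq_config(
    "model_preproc.onnx",               # placeholder path (e.g., output of qnn_preprocess_model)
    RandomDataReader(),
    activation_type=QuantType.QUInt16,  # 16-bit activations
    weight_type=QuantType.QUInt8,       # 8-bit weights
)

quantize("model_preproc.onnx", "model_qdq.onnx", qnn_config)
```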