I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger

View File

@@ -0,0 +1,119 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearActivation(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def QuantizeClipRelu(self): # noqa: N802
node = self.node
assert node.op_type == "Relu" or node.op_type == "Clip"
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
# If input to this node is not quantized then keep this node
# If activation is symmetric, do not quantize the op and simply return
if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
return super().quantize()
quantized_value = self.quantizer.quantized_value_map[node.input[0]]
self.quantizer.quantized_value_map[node.output[0]] = quantized_value
def quantize(self):
node = self.node
if node.op_type == "Relu" or node.op_type == "Clip":
self.QuantizeClipRelu()
return
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
sigmoid_nnapi_mode = (
node.op_type == "Sigmoid"
and nnapi_sigmoid_option in self.quantizer.extra_options
and self.quantizer.extra_options[nnapi_sigmoid_option]
)
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
use_zeropoint = 0 if sigmoid_nnapi_mode else None
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_activation_name = ""
if node.name:
qlinear_activation_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_activation_inputs = [
quantized_input_names[0],
scale_names[0],
zero_point_names[0],
output_scale_name,
output_zp_name,
]
qlinear_activation_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_activation_inputs,
[qlinear_activation_output],
qlinear_activation_name,
**kwargs,
)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_activation_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
nodes.append(qlinear_activation_node)
self.quantizer.new_nodes += nodes
class QDQRemovableActivation(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# If input to this node is not quantized then keep this node
if not self.quantizer.is_tensor_quantized(node.input[0]):
return
if (
not self.quantizer.is_activation_symmetric
and not self.quantizer.qdq_keep_removable_activations
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
):
self.quantizer.remove_node(self.node)
else:
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
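
A side note on the Relu/Clip removal above, as a hedged illustration rather than the quantizer's own code: once the output quantization parameters are calibrated on post-activation data, the uint8 clamp itself performs the Relu, so the node becomes redundant in the quantized graph.

import numpy as np

# Illustrative sketch (assumed values, not part of this file): quantize the Relu *input*
# with parameters calibrated on the Relu *output* range [0, 1].
scale, zero_point = 1.0 / 255.0, 0
x = np.array([-1.0, -0.2, 0.0, 0.4, 1.0], dtype=np.float32)
q = np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)
q_relu = np.clip(np.round(np.maximum(x, 0.0) / scale) + zero_point, 0, 255).astype(np.uint8)
assert np.array_equal(q, q_relu)  # the clamp already applied the Relu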

View File

@@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase
# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
node.input[0] = quantized_input_value.q_name
self.quantizer.new_nodes += [node]

View File

@@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantize Attention
"""
class AttentionQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
"""
parameter node: Attention node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
"""
node = self.node
assert node.op_type == "Attention"
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
# is implemented
for attr in node.attribute:
if attr.name == "qkv_hidden_sizes":
return super().quantize()
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if quantized_input_names is None:
return super().quantize()
qattention_name = "" if not node.name else node.name + "_quant"
inputs = []
inputs.extend(quantized_input_names)
inputs.extend([node.input[2]])
inputs.extend(scale_names)
inputs.extend([node.input[3] if len(node.input) > 3 else ""])
inputs.extend(zero_point_names)
inputs.extend([node.input[4] if len(node.input) > 4 else ""])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
nodes.append(qattention_node)
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,26 @@
class QuantOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_float_tensor(self.node.input[0])
def quantize(self):
"""
Given a node which does not support quantization, this method checks whether the input to
this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
parameter node: Current node
parameter new_nodes_list: List of new nodes created before processing current node
return: List of new nodes created
"""
for _, node_input in enumerate(self.node.input):
dequantize_node = self.quantizer._dequantize_value(node_input)
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)
# Append the original node
self.quantizer.new_nodes.append(self.node)
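
For reference, the DequantizeLinear nodes returned by `_dequantize_value` follow the standard ONNX three-input signature (quantized tensor, scale, zero point). A minimal sketch with hypothetical tensor names:

import onnx

# Hypothetical names; the quantizer derives the real ones from its quantized_value_map.
dequantize_node = onnx.helper.make_node(
    "DequantizeLinear",
    ["X_quantized", "X_scale", "X_zero_point"],
    ["X_dequantized"],
    name="X_DequantizeLinear",
)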

View File

@@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearBinaryOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0, 1])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_binary_math_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_binary_math_inputs = []
# Input 0
qlinear_binary_math_inputs.append(quantized_input_names[0])
qlinear_binary_math_inputs.append(scale_names[0])
qlinear_binary_math_inputs.append(zero_point_names[0])
# Input 1
qlinear_binary_math_inputs.append(quantized_input_names[1])
qlinear_binary_math_inputs.append(scale_names[1])
qlinear_binary_math_inputs.append(zero_point_names[1])
# Output
qlinear_binary_math_inputs.append(output_scale_name)
qlinear_binary_math_inputs.append(output_zp_name)
qlinear_binary_math_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_binary_math_inputs,
[qlinear_binary_math_output],
qlinear_binary_math_name,
**kwargs,
)
nodes.append(qlinear_binary_math_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_binary_math_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
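
Assuming the com.microsoft QLinearAdd/QLinearMul contract, the eight inputs assembled above land in this fixed order; a hedged sketch with placeholder names:

import onnx

# Placeholder tensor names, in the order assembled above:
# A, A_scale, A_zero_point, B, B_scale, B_zero_point, C_scale, C_zero_point.
qlinear_add = onnx.helper.make_node(
    "QLinearAdd",
    ["A_q", "A_scale", "A_zp", "B_q", "B_scale", "B_zp", "C_scale", "C_zp"],
    ["C_q"],
    name="add_node_quant",
    domain="com.microsoft",
)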

View File

@@ -0,0 +1,62 @@
import onnx
from ..quant_utils import ( # noqa: F401
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase # noqa: F401
class QLinearConcat(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
if not data_found or q_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qnode_name = node.name + "_quant" if node.name else ""
qlconcat_inputs = [output_scale_name, output_zp_name]
for i in range(len(q_input_names)):
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
qlconcat_node = onnx.helper.make_node(
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlconcat_node]

View File

@@ -0,0 +1,258 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
find_by_name,
get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class ConvInteger(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def add_bias(self, nodes, scaled_output):
"""
Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node
parameter nodes: new nodes would be appended into nodes
parameter node: current node (Conv)
parameter scaled_output: output of quant conv without bias
parameter output: output of Conv
parameter bias_name: bias of Conv
return: the name of output
"""
node = self.node
model = self.quantizer.model
# Add tensors for the shape to be reshaped to
weight = find_by_name(node.input[1], model.initializer())
if weight is None:
raise ValueError(f"Expected {node.input[1]} to be an initializer")
# Add reshape for correct broadcast
output = node.output[0]
reshape_input_data = node.input[2] # bias of Conv
reshape_input_shape = output + "_bias_reshape_shape"
reshape_output = output + "_bias_reshape_output"
shape = np.ones((len(weight.dims)), dtype=np.int64)
shape[1] = -1
init_shape = onnx.helper.make_tensor(
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
)
model.add_initializer(init_shape)
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
nodes.append(reshape_node)
# Add an Add operation for bias
add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
nodes.append(add_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv"
# Get quantized values for both the activation (input[0]) and the weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
conv_integer_output = node.output[0] + "_output_quantized"
conv_integer_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
conv_integer_node = onnx.helper.make_node(
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
)
nodes.append(conv_integer_node)
# Add cast operation to cast convInteger output to float.
onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_op_output = conv_integer_output + "_cast_output"
cast_node = onnx.helper.make_node(
"Cast",
[conv_integer_output],
[cast_op_output],
conv_integer_output + "_cast",
to=onnx_type,  # TODO: FLOAT or FLOAT16
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
if conv_integer_name:
scales_mul_op = conv_integer_name + "_scales_mul"
else:
scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
has_bias = len(node.input) == 3
scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
# and make the output of this node the same as output of original conv node.
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
scaled_output_name,
output_scale_mul_op,
)
)
if has_bias:
self.add_bias(nodes, scaled_output_name)
self.quantizer.new_nodes += nodes
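
To make the shape logic in ConvInteger.add_bias above concrete: for a 2-D Conv weight with four dims, the reshape target is [1, -1, 1, 1], so the per-output-channel bias broadcasts against the NCHW output. A small standalone check with assumed shapes:

import numpy as np

weight_dims = (16, 3, 3, 3)                 # (M, C/group, kH, kW) for a 2-D Conv
shape = np.ones(len(weight_dims), dtype=np.int64)
shape[1] = -1
bias = np.zeros(16, dtype=np.float32)       # one bias value per output channel M
print(shape, bias.reshape(shape).shape)     # [ 1 -1  1  1] (1, 16, 1, 1)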
class QLinearConv(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
bias_present = False
if len(node.input) == 3:
if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
bias_present = True
qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_conv_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
qlinear_conv_inputs = []
# Input 0
qlinear_conv_inputs.append(quantized_input_names[0])
qlinear_conv_inputs.append(scale_names[0])
qlinear_conv_inputs.append(zero_point_names[0])
# Input 1
qlinear_conv_inputs.append(quantized_input_names[1])
qlinear_conv_inputs.append(scale_names[1])
qlinear_conv_inputs.append(zero_point_names[1])
# Output
qlinear_conv_inputs.append(output_scale_name)
qlinear_conv_inputs.append(output_zp_name)
if bias_present:
qlinear_conv_inputs.append(quantized_bias_name)
qlinear_conv_node = onnx.helper.make_node(
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
)
nodes.append(qlinear_conv_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_conv_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQConv(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv" or node.op_type == "ConvTranspose"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if node.op_type == "Conv" else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

View File

@@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
# For operators that support 8-bit operations directly and whose output can
# reuse input[0]'s type, zero point and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
if not self.quantizer.force_quantize_no_input_check:
# Keep backward compatibility
# Quantize when input[0] is quantized already. Otherwise keep it.
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
else:
# Force quantize those ops if possible; use the exclude-node list if this is not what you want
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
super().quantize()
return
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_names[0]
node.output[0] = quantized_output_value.q_name
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQDirect8BitOp(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
if self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(self.node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
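
The reason these data-movement ops can reuse input[0]'s scale and zero point is that they only rearrange or select values, so quantization commutes with the op. A tiny standalone check for Transpose:

import numpy as np

scale, zero_point = 0.1, 128

def quantize_uint8(x):
    return np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)

x = np.array([[0.3, -1.2], [2.5, 0.0]], dtype=np.float32)
# Quantize-then-transpose equals transpose-then-quantize, so the output keeps input[0]'s params.
assert np.array_equal(quantize_uint8(x).T, quantize_uint8(x.T))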

View File

@@ -0,0 +1,121 @@
import logging
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""
class EmbedLayerNormalizationQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
node = self.node
assert node.op_type == "EmbedLayerNormalization"
if len(node.output) > 2:
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
return super().quantize()
"""
Pre-quantization EmbedLayerNorm inputs:
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (float32)
[3] position_embedding (float32)
[4] segment_embedding (float32)
[5] gamma (float32)
[6] beta (float32)
[7] mask (int32) (optional)
"""
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
if quantized_input_names is None:
return super().quantize()
qembed_layer_norm_name = "" if not node.name else node.name + "_quant"
"""
Quantized Input Tensor List
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (uint8)
[3] position_embedding (uint8)
[4] segment_embedding (uint8)
[5] gamma (uint8)
[6] beta (uint8)
[7] mask (int32) (optional)
[8] word_embedding_scale (float)
[9] position_embedding_scale (float)
[10] segment_embedding_scale (float)
[11] gamma_scale (float)
[12] beta_scale (float)
[13] word_embedding_zero_point (uint8)
[14] position_embedding_zero_point (uint8)
[15] segment_embedding_zero_point (uint8)
[16] gamma_zero_point (uint8)
[17] beta_zero_point (uint8)
"""
inputs = []
# 'input_ids'
inputs.extend([node.input[0]])
# 'segment_ids'
inputs.extend([node.input[1]])
# 'word_embedding_quant'
inputs.extend([quantized_input_names[0]])
# 'position_embedding_quant'
inputs.extend([quantized_input_names[1]])
# 'segment_embedding_quant'
inputs.extend([quantized_input_names[2]])
# 'gamma_quant'
inputs.extend([quantized_input_names[3]])
# 'beta_quant'
inputs.extend([quantized_input_names[4]])
# 'mask' (optional)
inputs.extend([node.input[7] if len(node.input) > 7 else ""])
# Add all scales:
inputs.extend([scale_names[0]])
inputs.extend([scale_names[1]])
inputs.extend([scale_names[2]])
inputs.extend([scale_names[3]])
inputs.extend([scale_names[4]])
# Add all zero points:
inputs.extend([zero_point_names[0]])
inputs.extend([zero_point_names[1]])
inputs.extend([zero_point_names[2]])
inputs.extend([zero_point_names[3]])
inputs.extend([zero_point_names[4]])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qembed_layer_norm_node = onnx.helper.make_node(
"QEmbedLayerNormalization",
inputs,
node.output,
qembed_layer_norm_name,
**kwargs,
)
nodes.append(qembed_layer_norm_node)
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
"""
Quantize Gather
"""
class GatherQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_valid_quantize_weight(self.node.input[0])
def quantize(self):
node = self.node
assert node.op_type == "Gather"
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
gather_new_output,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
node.output[0] = gather_new_output
node.input[0] = quantized_input_names[0]
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQGather(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gather"
if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(node.input[0])
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
elif self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)

View File

@@ -0,0 +1,62 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QGlobalAveragePool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "GlobalAveragePool"
# If input to this node is not quantized then keep this node.
if node.input[0] not in self.quantizer.quantized_value_map:
return super().quantize()
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
# Create an entry for the output quantized value.
(
data_found,
output_scale_name_from_parameter,
output_zp_name_from_parameter,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# Just use the input scale and zero point if parameters for the output are not specified.
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
kwargs["channels_last"] = 0
qnode_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_value.q_name,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
output_scale_name,
output_zp_name,
],
[quantized_output_value.q_name],
qnode_name,
**kwargs,
)
self.quantizer.new_nodes += [qnode]

View File

@@ -0,0 +1,166 @@
import logging
import numpy as np # noqa: F401
import onnx
from ..quant_utils import find_by_name # noqa: F401
from ..quant_utils import get_mul_node # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase
def is_B_transposed(gemm_node): # noqa: N802
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806
if len(transB_attribute):
return onnx.helper.get_attribute_value(transB_attribute[0]) > 0
return False
def get_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
return onnx.helper.get_attribute_value(beta_attribute[0])
return 1.0
def set_default_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
beta_attribute[0].f = 1.0
return 1.0
class QLinearGemm(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
self.quantizer.weight_qType,
0 if is_B_transposed(node) else 1,
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
# Get quantized values for both the activation (input[0]) and the weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
if len(node.input) == 3:
if not self.quantizer.is_input_a_initializer(node.input[2]):
return super().quantize()
# Note: if the quantized type is float 8, the bias is converted into float 16.
# cublasLtMatMul only supports (b)float16 or float32 bias.
quantized_bias_name = self.quantizer.quantize_bias_static(
node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qgemm_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
if attribute.name != "beta":
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# generate input
qgemm_inputs = []
for i in range(2):
qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
nodes.append(qgemm_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qgemm_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
node_type=node.op_type,
node_qtype=self.quantizer.weight_qType,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQGemm(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if is_B_transposed(node) else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
if self.quantizer.is_input_a_initializer(node.input[2]):
self.quantizer.quantize_bias_tensor(
node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
set_default_beta(self.node)
else:
logging.warning(
f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
)
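
The transB-dependent axis above comes from Gemm's weight layout: B is (K, N) when transB=0, so the per-output-channel axis is 1, and (N, K) when transB=1, so it is 0. A short sketch with assumed sizes:

import numpy as np

K, N = 64, 32                        # N output channels
B_not_transposed = np.zeros((K, N))  # transB = 0 -> quantize per channel along axis 1
B_transposed = np.zeros((N, K))      # transB = 1 -> quantize per channel along axis 0
assert B_not_transposed.shape[1] == N and B_transposed.shape[0] == N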

View File

@@ -0,0 +1,117 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401
from .base_operator import QuantOperatorBase
"""
Quantize LSTM
"""
class LSTMQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
"""
parameter node: LSTM node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
"""
node = self.node
assert node.op_type == "LSTM"
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
node.input[2]
):
super().quantize()
return
model = self.quantizer.model
W = model.get_initializer(node.input[1]) # noqa: N806
R = model.get_initializer(node.input[2]) # noqa: N806
if len(W.dims) != 3 or len(R.dims) != 3:
super().quantize()
return
[W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806
[R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806
if self.quantizer.is_per_channel():
del W.dims[0]
del R.dims[0]
W.dims[0] = W_num_dir * W_4_hidden_size
R.dims[0] = R_num_dir * R_4_hidden_size
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[2], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806
W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806
R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806
W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806
R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806
W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806
R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806
W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806
model.remove_initializers([W_quant_weight, R_quant_weight])
model.add_initializer(W_quant_transposed)
model.add_initializer(R_quant_transposed)
W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806
R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806
W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806
R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806
if self.quantizer.is_per_channel():
W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]
inputs = []
input_len = len(node.input)
inputs.extend([node.input[0]])
inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
inputs.extend([node.input[3] if input_len > 3 else ""])
inputs.extend([node.input[4] if input_len > 4 else ""])
inputs.extend([node.input[5] if input_len > 5 else ""])
inputs.extend([node.input[6] if input_len > 6 else ""])
inputs.extend([node.input[7] if input_len > 7 else ""])
inputs.extend(
[
quant_input_weight_tuple[2],
quant_input_weight_tuple[1],
quant_recurrent_weight_tuple[2],
quant_recurrent_weight_tuple[1],
]
)
kwargs = {}
for attribute in node.attribute:
if attribute.name == "layout":
continue
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
quant_lstm_name = "" if not node.name else node.name + "_quant"
quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
self.quantizer.new_nodes.append(quant_lstm_node)
dequantize_node = self.quantizer._dequantize_value(node.input[0])
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)

View File

@@ -0,0 +1,228 @@
import itertools
import logging
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QOpMatMul(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
logging.debug(f"Ignore MatMul {self.node.name}")
return False
if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
not self.quantizer.is_float_tensor(self.node.input[0])
):
logging.info(f"Ignore MatMul due to non float inputs {self.node.name}]")
return False
# do not quantize non-constant B matrices for matmul
if self.quantizer.q_matmul_const_b_only:
if not self.quantizer.find_initializer_in_path(self.node.input[1]):
logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
return False
return True
"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""
class MatMulInteger(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get quantized values for both the activation (input[0]) and the weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
matmul_integer_output = node.output[0] + "_output_quantized"
matmul_integer_name = node.name + "_quant" if node.name else ""
matmul_integer_node = onnx.helper.make_node(
"MatMulInteger",
quantized_input_names + zero_point_names,
[matmul_integer_output],
matmul_integer_name,
)
nodes.append(matmul_integer_node)
# Add cast operation to cast matmulInteger output to float.
cast_op_output = matmul_integer_output + "_cast_output"
otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_node = onnx.helper.make_node(
"Cast",
[matmul_integer_output],
[cast_op_output],
matmul_integer_output + "_cast",
to=otype,
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
scales_mul_op = (
matmul_integer_name + "_scales_mul"
if matmul_integer_name
else scale_names[0] + "_" + scale_names[1] + "_mul"
)
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
# Add mul operation to multiply mul_scales_op result with output of MatMulInteger
# and make the output of this node the same as output of original matmul node.
output_scale_mul_op = ""
if matmul_integer_name:
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
node.output[0],
output_scale_mul_op,
)
)
self.quantizer.new_nodes += nodes
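
The Cast + Mul pair above implements the usual dynamic-quantization identity: the float result is approximately the int32 MatMulInteger output times a_scale * b_scale (the same rescaling ConvInteger uses). A small numeric check, independent of the quantizer:

import numpy as np

rng = np.random.default_rng(0)
a = rng.uniform(-1.0, 1.0, (2, 4)).astype(np.float32)
b = rng.uniform(-1.0, 1.0, (4, 3)).astype(np.float32)

a_scale = b_scale = 1.0 / 127.0                            # symmetric int8, zero points of 0
aq = np.clip(np.round(a / a_scale), -127, 127).astype(np.int32)
bq = np.clip(np.round(b / b_scale), -127, 127).astype(np.int32)

y_int = aq @ bq                                            # what MatMulInteger accumulates (int32)
y_float = y_int.astype(np.float32) * (a_scale * b_scale)   # Cast, then Mul by the scale product
print(np.max(np.abs(y_float - a @ b)))                     # small quantization error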
"""
Used when quantize mode is QuantizationMode.QLinearOps
"""
class QLinearMatMul(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get quantized values for both the activation (input[0]) and the weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_matmul_name = node.name + "_quant" if node.name else ""
qlinear_matmul_inputs = []
# Input 0
qlinear_matmul_inputs.append(quantized_input_names[0])
qlinear_matmul_inputs.append(scale_names[0])
qlinear_matmul_inputs.append(zero_point_names[0])
# Input 1
qlinear_matmul_inputs.append(quantized_input_names[1])
qlinear_matmul_inputs.append(scale_names[1])
qlinear_matmul_inputs.append(zero_point_names[1])
# Output quantization parameter
qlinear_matmul_inputs.append(output_scale_name)
qlinear_matmul_inputs.append(output_zp_name)
domain = (
"com.microsoft"
if self.quantizer.weight_qType
in {
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
}
else ""
)
qlinear_matmul_node = onnx.helper.make_node(
"QLinearMatMul",
qlinear_matmul_inputs,
[qlinear_matmul_output],
qlinear_matmul_name,
domain=domain,
)
nodes.append(qlinear_matmul_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_matmul_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQMatMul(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
if self.disable_qdq_for_node_output:
nodes_to_iterate = node.input
else:
nodes_to_iterate = itertools.chain(node.input, node.output)
for tensor_name in nodes_to_iterate:
is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
tensor_name, default_axis=1, op_type=node.op_type
)
if is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
else:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QMaxPool(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if version is less than 12, go to normal quantize.
if self.quantizer.opset_version < 12:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQMaxPool(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if version is less than 12, just no change
if self.quantizer.opset_version < 12:
return
# Direct 8bits op
return super().quantize()

View File

@@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from .qdq_base_operator import QDQOperatorBase
class QDQNormalization(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}
# Input
self.quantizer.quantize_activation_tensor(node.input[0])
# Scale
scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=1, op_type=node.op_type
)
if scale_is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
elif scale_is_initializer:
self.quantizer.quantize_weight_tensor(node.input[1])
else:
self.quantizer.quantize_activation_tensor(node.input[1])
# Bias
if len(node.input) > 2 and node.input[2]:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
# Output
if not self.disable_qdq_for_node_output:
for output_name in node.output:
self.quantizer.quantize_activation_tensor(output_name)

View File

@@ -0,0 +1,100 @@
import onnx
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
quantize_nparray,
)
from .base_operator import QuantOperatorBase
class QPad(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Pad"
# Only from opset 11 does Pad have the optional 'constant_value' input
# If input[0] is not quantized, do not quantize this node
if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
super().quantize()
return
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
kwargs = {}
for attribute in node.attribute:
kv = attribute_to_kwarg(attribute)
kwargs.update(kv)
if "mode" not in kwargs or kwargs["mode"] == b"constant":
if len(node.input) > 2 and node.input[2] != "": # There is 3rd input 'constant_value'
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
if zp_tensor is None or scale_tensor is None:
super().quantize()
return
padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
if padding_constant_initializer is not None:
zp_array = onnx.numpy_helper.to_array(zp_tensor)
zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
scale_array = onnx.numpy_helper.to_array(scale_tensor)
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
quantized_padding_constant_array = quantize_nparray(
self.quantizer.activation_qType,
padding_constant_array,
scale_value,
zp_value,
)
quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
quantized_padding_constant_array,
quantized_padding_constant_name,
)
# Assume this padding constant initializer is only used by this Pad node
self.quantizer.model.remove_initializer(padding_constant_initializer)
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
node.input[2] = quantized_padding_constant_name
else:
# TODO: check quantize_inputs after sub graph is supported
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
node,
2,
self.quantizer.activation_qType,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
initial_type=scale_tensor.data_type,
)
self.quantizer.new_nodes.extend(pad_value_qnodes)
node.input[2] = pad_value_qnodes[0].output[0]
else:
# In quantized format, the `zero` before quantization is mapped
# to quantized_input_value.zp_name. Thus, padding 0 to
# original tensor should become padding zero point to quantized
# tensor.
if len(node.input) == 2:
# Feed quantization's zero point to padding node.
node.input.append(quantized_input_value.zp_name)
else:
# Assign quantization's zero point to padding node.
assert node.input[2] == ""
node.input[2] = quantized_input_value.zp_name
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
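
The zero-point substitution above is just the affine mapping applied to zero: q(0) = round(0 / scale) + zero_point = zero_point, so padding the quantized tensor with the zero point matches padding the original tensor with 0.0. For example:

scale, zero_point = 0.05, 117
padded_float_value = 0.0
padded_quantized_value = int(round(padded_float_value / scale)) + zero_point
assert padded_quantized_value == zero_point  # 117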

View File

@@ -0,0 +1,67 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearPool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear pool node for given type (AveragePool, etc)
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,22 @@
import itertools
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401
from .base_operator import QuantOperatorBase # noqa: F401
class QDQOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization
def quantize(self):
node = self.node
if self.disable_qdq_for_node_output:
tensors_to_quantize = node.input
else:
tensors_to_quantize = itertools.chain(node.input, node.output)
for tensor_name in tensors_to_quantize:
self.quantizer.quantize_activation_tensor(tensor_name)
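
In QDQ mode, each tensor registered via `quantize_activation_tensor` ultimately gets a QuantizeLinear/DequantizeLinear pair inserted around it. A hedged sketch of such a pair, with hypothetical tensor names:

import onnx

# Hypothetical names; the quantizer creates the real scale/zero-point initializers.
quantize_node = onnx.helper.make_node(
    "QuantizeLinear",
    ["X", "X_scale", "X_zero_point"],
    ["X_QuantizeLinear_Output"],
    name="X_QuantizeLinear",
)
dequantize_node = onnx.helper.make_node(
    "DequantizeLinear",
    ["X_QuantizeLinear_Output", "X_scale", "X_zero_point"],
    ["X_DequantizeLinear_Output"],
    name="X_DequantizeLinear",
)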

View File

@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QResize(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, go to normal quantize.
if self.quantizer.opset_version < 11:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQResize(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, just keep this node
if self.quantizer.opset_version < 11:
return
# Direct 8bits op
return super().quantize()

View File

@@ -0,0 +1,74 @@
import onnx
import onnx.helper
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearSoftmax(QuantOperatorBase):
def quantize(self):
node = self.node
# Set fixed output scale and zero point for Softmax, because its output always lies in [0, 1]
if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
out_scale = 1 / 256.0
out_zero_point = 0
else:
out_scale = 1 / 256.0
out_zero_point = -128
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear softmax node for given type
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# make QLinearSoftmax carry the real opset_version; its default SinceVersion would otherwise be 1
kwargs["opset"] = self.quantizer.opset_version
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes
return None
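
The fixed output parameters above follow from Softmax's range: every output lies in [0, 1), so a scale of 1/256 spreads that interval over the 256 quantization levels (zero point 0 for uint8, -128 for int8). A quick worked check:

import numpy as np

probs = np.array([0.0, 0.5, 0.9961], dtype=np.float32)  # Softmax outputs lie in [0, 1)
scale = 1.0 / 256.0
print(np.round(probs / scale) + 0)     # uint8 codes:  [  0. 128. 255.]
print(np.round(probs / scale) - 128)   # int8 codes:   [-128.   0. 127.]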

View File

@@ -0,0 +1,63 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QSplit(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
quantized_node_name = ""
if node.name:
quantized_node_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
# Outputs just derive their scale/zero point from the input
quantized_output_names = []
for output_name in node.output:
quantized_output_name = output_name + "quantized"
quantized_output_names.append(quantized_output_name)
q_output = QuantizedValue(
output_name,
quantized_output_name,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[output_name] = q_output
if len(node.input) > 1:
quantized_input_names.extend(node.input[1:])
quantized_node = onnx.helper.make_node(
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
)
nodes.append(quantized_node)
self.quantizer.new_nodes += nodes
class QDQSplit(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Split"
if not self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)

View File

@@ -0,0 +1,87 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearWhere(QuantOperatorBase):
def should_quantize(self):
return True
def quantize(self):
node = self.node
assert node.op_type == "Where"
if not self.quantizer.force_quantize_no_input_check:
self.quantizer.new_nodes += [node]
return
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [1, 2])
if not data_found or q_input_names is None:
return super().quantize()
qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_output_name = node.name + "_quant" if node.name else ""
q_output = QuantizedValue(
node.output[0],
qlinear_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlwhere_inputs = [
node.input[0],
q_input_names[0],
scale_names[0],
zero_point_names[0],
q_input_names[1],
scale_names[1],
zero_point_names[1],
output_scale_name,
output_zp_name,
]
qlwhere_node = onnx.helper.make_node(
"QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlwhere_node]
class QDQWhere(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Where"
if self.quantizer.force_quantize_no_input_check:
if not self.quantizer.is_tensor_quantized(node.input[1]):
self.quantizer.quantize_activation_tensor(node.input[1])
if not self.quantizer.is_tensor_quantized(node.input[2]):
self.quantizer.quantize_activation_tensor(node.input[2])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_activation_tensor(output)
elif (
self.quantizer.is_tensor_quantized(node.input[1])
and self.quantizer.is_tensor_quantized(node.input[2])
and not self.disable_qdq_for_node_output
):
for output in node.output:
self.quantizer.quantize_activation_tensor(output)