I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

@@ -0,0 +1,866 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
import os
import tempfile
from enum import Enum
from pathlib import Path
import numpy
import onnx
from onnx import ModelProto, TensorProto, external_data_helper
from onnx import onnx_pb as onnx_proto
from onnx.helper import make_graph, make_model, make_node, make_tensor_value_info
from onnx.reference import ReferenceEvaluator
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
try:
from onnx.reference.custom_element_types import float8e4m3fn
except ImportError:
float8e4m3fn = None
# INT4 np.dtypes added in ONNX 1.16. These map to np.int8/np.uint8 because numpy
# does not support sub-byte types.
try:
from onnx.reference.custom_element_types import int4, uint4
except ImportError:
int4 = None
uint4 = None
__producer__ = "onnx.quantize"
__version__ = "0.1.0"
onnx_domain = "ai.onnx"
ms_domain = "com.microsoft"
QUANT_OP_NAME = "QuantizeLinear"
QUANT_INPUT_SUFFIX = "_QuantizeLinear_Input"
DEQUANT_OP_NAME = "DequantizeLinear"
DEQUANT_OUTPUT_SUFFIX = "_DequantizeLinear_Output"
TENSOR_NAME_QUANT_SUFFIX = "_quantized"
FLOAT8_DISTRIBUTIONS = {}
type_to_name = {getattr(TensorProto, k): k for k in dir(TensorProto) if isinstance(getattr(TensorProto, k), int)}
# Quantization mode
# IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now.
# QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now.
class QuantizationMode(Enum):
IntegerOps = 0
QLinearOps = 1
def __str__(self):
return self.name
@staticmethod
def from_string(mode):
try:
return QuantizationMode[mode]
except KeyError:
raise ValueError() # noqa: B904
class QuantizedValueType(Enum):
Input = 0
Initializer = 1
def __str__(self):
return self.name
@staticmethod
def from_string(v):
try:
return QuantizedValueType[v]
except KeyError:
raise ValueError() # noqa: B904
class QuantType(Enum):
QInt8 = 0
QUInt8 = 1
QFLOAT8E4M3FN = 2
QInt16 = 3
QUInt16 = 4
QInt4 = 5
QUInt4 = 6
def __str__(self):
return self.name
@staticmethod
def from_string(t):
try:
return QuantType[t]
except KeyError:
raise ValueError() # noqa: B904
@property
def tensor_type(self):
if self == QuantType.QInt8:
return TensorProto.INT8
if self == QuantType.QUInt8:
return TensorProto.UINT8
if self == QuantType.QUInt16:
return TensorProto.UINT16
if self == QuantType.QInt16:
return TensorProto.INT16
if self == QuantType.QFLOAT8E4M3FN:
return TensorProto.FLOAT8E4M3FN
if self == QuantType.QUInt4:
return TensorProto.UINT4
if self == QuantType.QInt4:
return TensorProto.INT4
raise ValueError(f"Unexpected value qtype={self!r}.")
class QuantFormat(Enum):
QOperator = 0
QDQ = 1
def __str__(self):
return self.name
@staticmethod
def from_string(format):
try:
return QuantFormat[format]
except KeyError:
raise ValueError() # noqa: B904
ONNX_TYPE_TO_NP_TYPE = {
onnx_proto.TensorProto.INT8: numpy.dtype("int8"),
onnx_proto.TensorProto.UINT8: numpy.dtype("uint8"),
onnx_proto.TensorProto.INT16: numpy.dtype("int16"),
onnx_proto.TensorProto.UINT16: numpy.dtype("uint16"),
onnx_proto.TensorProto.FLOAT8E4M3FN: float8e4m3fn,
onnx_proto.TensorProto.INT4: int4, # base_dtype is np.int8
onnx_proto.TensorProto.UINT4: uint4, # base_dtype is np.uint8
}
ONNX_INT_TYPE_RANGE = {
onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(255, dtype=numpy.uint8)),
onnx_proto.TensorProto.INT8: (numpy.array(-128, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)),
onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(65535, dtype=numpy.uint16)),
onnx_proto.TensorProto.INT16: (numpy.array(-32768, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)),
onnx_proto.TensorProto.UINT4: (numpy.array(0, dtype=uint4), numpy.array(15, dtype=uint4)),
onnx_proto.TensorProto.INT4: (numpy.array(-8, dtype=int4), numpy.array(7, dtype=int4)),
}
ONNX_INT_TYPE_SYMMETRIC_RANGE = {
onnx_proto.TensorProto.INT8: (numpy.array(-127, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)),
onnx_proto.TensorProto.INT16: (numpy.array(-32767, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)),
}
ONNX_INT_TYPE_REDUCED_RANGE = {
onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(127, dtype=numpy.uint8)),
onnx_proto.TensorProto.INT8: (numpy.array(-64, dtype=numpy.int8), numpy.array(64, dtype=numpy.int8)),
onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(32767, dtype=numpy.uint16)),
onnx_proto.TensorProto.INT16: (numpy.array(-16384, dtype=numpy.int16), numpy.array(16384, dtype=numpy.int16)),
onnx_proto.TensorProto.UINT4: (numpy.array(0, dtype=uint4), numpy.array(7, dtype=uint4)),
onnx_proto.TensorProto.INT4: (numpy.array(-4, dtype=int4), numpy.array(3, dtype=int4)),
}
def _check_type(*args, zero_point_index=-1):
new_args = []
for i, a in enumerate(args):
if numpy.issubdtype(type(a), numpy.number):
new_args.append(numpy.array(a))
elif isinstance(a, numpy.ndarray):
new_args.append(a)
else:
raise TypeError(f"arg {i} is not an array: {a}")
if i == zero_point_index:
v = new_args[-1]
if v.dtype == numpy.float32 or v.dtype == numpy.float16:
raise TypeError(f"zero_point cannot be {v.dtype}")
return tuple(new_args) if len(new_args) > 1 else new_args[0]
def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
assert (
qType in ONNX_TYPE_TO_NP_TYPE
), f"Unexpected data type {qType} requested. Only INT8, UINT8, INT16, and UINT16 are supported."
if qType in (
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
):
if zero_point != 0:
raise NotImplementedError(f"zero_point is expected to be null for float 8 not {zero_point!r}.")
if arr.dtype == numpy.float32:
onnx_type = TensorProto.FLOAT
elif arr.dtype == numpy.float16:
onnx_type = TensorProto.FLOAT16
else:
raise ValueError(f"Unexpected dtype {arr.dtype}.")
onnx_model = make_model(
make_graph(
[
make_node(
"Constant", [], ["zero_point"], value=onnx.helper.make_tensor("zero_point", qType, [], [0])
),
make_node("QuantizeLinear", ["X", "scale", "zero_point"], ["Y"]),
],
"qu",
[
make_tensor_value_info("X", onnx_type, None),
make_tensor_value_info("scale", onnx_type, None),
],
[make_tensor_value_info("Y", qType, None)],
)
)
ref = ReferenceEvaluator(onnx_model)
return _check_type(ref.run(None, {"X": arr, "scale": scale})[0])
else:
# Quantizes data for all integer types.
#
# For int4 types, the quantized data is returned as either np.int8 or np.uint8,
# which matches the python reference ONNX implementation of QuantizeLinear.
# This data can be packed into 4-bit elements by using pack_bytes_to_4bit().
dtype = ONNX_TYPE_TO_NP_TYPE[qType]
(qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True)
cliplow = max(qmin, low) if low is not None else qmin
cliphigh = min(qmax, high) if high is not None else qmax
arr_fp32 = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
numpy.clip(arr_fp32, cliplow, cliphigh, out=arr_fp32)
return _check_type(arr_fp32.astype(dtype))
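# Example (illustrative): quantizing three float32 values to UINT8 with scale 1/255 and zero point 0,
#     quantize_nparray(TensorProto.UINT8, numpy.array([0.0, 0.5, 1.0], dtype=numpy.float32),
#                      numpy.float32(1.0 / 255.0), 0)
# returns array([0, 128, 255], dtype=uint8); 0.5 / (1/255) = 127.5 rounds to 128 (round half to even).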
def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=None):
"""Calculate the scale s and zero point z for the quantization relation
r = s(q-z), where r are the original values and q are the corresponding
quantized values.
s and z are calculated such that every value within [rmin,rmax] has an
approximate representation within [qmin,qmax]. In addition, qmin <= z <=
qmax is enforced. If the symmetric flag is set to True, the interval
[rmin,rmax] is symmetrized to [-absmax, +absmax], where
absmax = max(abs(rmin), abs(rmax)).
:parameter rmin: minimum value of r
:parameter rmax: maximum value of r
:parameter qmin: minimum value representable by the target quantization data type
:parameter qmax: maximum value representable by the target quantization data type
:parameter symmetric: True if the floating-point range should be made symmetric. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:return: zero and scale [z, s]
"""
if qmin > 0 or qmax < 0:
raise ValueError(f"qmin and qmax must meet requirement: qmin <= 0 <= qmax while qmin:{qmin}, qmmax:{qmax}")
# Adjust rmin and rmax such that 0 is included in the range. This is
# required to make sure zero can be represented by the quantization data
# type (i.e. to make sure qmin <= zero_point <= qmax)
rmin = numpy.minimum(rmin, numpy.array(0, dtype=rmin.dtype))
rmax = numpy.maximum(rmax, numpy.array(0, dtype=rmax.dtype))
# Ensure a minimum float-point range if specified.
if min_real_range is not None:
rmax = max(rmax, rmin + min_real_range)
if symmetric:
absmax = numpy.maximum(numpy.abs(rmin), numpy.abs(rmax))
rmin = -absmax
rmax = +absmax
assert qmin <= qmax, f"qmin={qmin} > qmax={qmax}"
dr = numpy.array(rmax - rmin, dtype=numpy.float64)
dq = numpy.array(qmax, dtype=numpy.float64) - numpy.array(qmin, dtype=numpy.float64)
scale = numpy.array(dr / dq)
assert scale >= 0, "scale must be non-negative"
if scale < numpy.finfo(rmax.dtype).tiny:
scale = numpy.array(1.0, dtype=rmax.dtype)
zero_point = numpy.array(0, dtype=qmin.dtype)
else:
if symmetric:
# When symmetric (i.e., rmax == -rmin), the zero_point formula reduces to round((qmax + qmin) / 2.0).
# This simpler formula doesn't depend on scale and guarantees that the zero point values
# for int8, uint8, int16, and uint16 are always 0, 128, 0, and 32768, respectively.
# This is important for per-channel/symmetric QLinearConv on CPU EP, which requires all channels to have
# the exact same zero_point values.
zero_point = numpy.array(
numpy.round((qmin + qmax) / numpy.array(2.0, dtype=numpy.float64)), dtype=qmin.dtype
)
else:
zero_point = numpy.array(numpy.round(qmin - rmin / scale), dtype=qmin.dtype)
scale = scale.astype(rmax.dtype)
return [zero_point, scale]
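# Example (illustrative): an asymmetric INT8 mapping for values in [-1.5, 3.0],
#     qmin, qmax = get_qmin_qmax_for_qType(TensorProto.INT8)   # (-128, 127)
#     zp, scale = compute_scale_zp(numpy.array(-1.5, dtype=numpy.float32),
#                                  numpy.array(3.0, dtype=numpy.float32), qmin, qmax)
# gives scale = 4.5 / 255 ~= 0.0176 and zero_point = round(-128 - (-1.5 / scale)) = -43.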
def compute_scale_zp_float8(element_type, std):
"""Calculate the scale s for a float8 type (E4M3FN).
The function assumes the coefficient distribution and the float 8
distribution are both approximately Gaussian.
:return: zero and scale [z, s]
More details in notebook `quantization_fp8.ipynb
<https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/quantization_fp8.ipynb>`_.
"""
zp_dtype = None
if element_type not in FLOAT8_DISTRIBUTIONS:
if element_type == TensorProto.FLOAT8E4M3FN:
from onnx.numpy_helper import float8e4m3_to_float32
from onnx.reference.custom_element_types import float8e4m3fn
zp_dtype = float8e4m3fn
all_values = [float8e4m3_to_float32(i) for i in range(256)]
values = numpy.array(
[f for f in all_values if not numpy.isnan(f) and not numpy.isinf(f)], dtype=numpy.float32
)
else:
raise ValueError(f"Quantization to element_type={element_type} not implemented.")
FLOAT8_DISTRIBUTIONS[element_type] = values
elif element_type == TensorProto.FLOAT8E4M3FN:
from onnx.reference.custom_element_types import float8e4m3fn
zp_dtype = float8e4m3fn
if zp_dtype is None:
raise TypeError(f"Unexpected element_type {element_type}.")
std_f8 = numpy.std(FLOAT8_DISTRIBUTIONS[element_type])
zero = numpy.array(0, dtype=zp_dtype)
scale = numpy.array(std / std_f8, dtype=std.dtype)
return [zero, scale]
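# Example (illustrative): for a float32 weight tensor `w`, the scale is chosen so that the standard
# deviation of w / scale matches the standard deviation of the finite float8e4m3fn value grid:
#     zp, scale = compute_scale_zp_float8(TensorProto.FLOAT8E4M3FN, numpy.std(w))
# The zero point is always 0 for float 8.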
def quantize_data(
data, qType, symmetric, reduce_range=False, min_real_range=None, rmin_override=None, rmax_override=None
):
"""
:param data: data to quantize
:param qType: data type to quantize to. Supported types are UINT8, INT8, UINT16, INT16, UINT4, INT4, and FLOAT8E4M3FN
:param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
:parameter reduce_range: True if the quantization range should be reduced. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data).
:parameter rmax_override: The value of rmax to use if not None. Otherwise, uses max(data).
:return: minimum, maximum, zero point, scale, and quantized weights
To pack weights, we compute a linear transformation
- when data `type == uint8`, from `[rmin, rmax]` -> :math:`[0, 2^{b}-1]` and
- when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
`m = max(abs(rmin), abs(rmax))`
and add necessary intermediate nodes to transform quantized weight to full weight using the equation
:math:`r = S(q-z)`, where
- *r*: real original value
- *q*: quantized value
- *S*: scale
- *z*: zero point
"""
if not isinstance(data, numpy.ndarray):
raise TypeError(f"Weight must be given as an array not {type(data)}.")
if rmin_override is not None:
rmin = rmin_override
else:
rmin = data.min() if len(data) else 0.0
if rmax_override is not None:
rmax = rmax_override
else:
rmax = data.max() if len(data) else 0.0
rmin = numpy.array(rmin, dtype=data.dtype)
rmax = numpy.array(rmax, dtype=data.dtype)
zero_point = 0
scale = numpy.array(1.0, dtype=data.dtype)
if qType == TensorProto.FLOAT8E4M3FN:
if reduce_range:
raise RuntimeError("Unsupported option reduce_range=True for float 8.")
std = numpy.std(data)
zero_point, scale = compute_scale_zp_float8(qType, std)
quantized_data = quantize_nparray(qType, data, scale, zero_point)
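# In float8e4m3fn, the two encodings with all exponent and mantissa bits set (0x7F and 0xFF)
# represent NaN, so any quantized byte whose low 7 bits are all ones indicates an overflow to NaN.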
if any((quantized_data.astype(numpy.uint8).ravel() & 127) == 127):
np_data = numpy.asarray(data)
raise RuntimeError(
f"One of the quantized value is NaN data in [{np_data.min()}, {np_data.max()}], "
f"quantized_data in [{quantized_data.min()}, {quantized_data.max()}]."
)
return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2)
if qType in (
TensorProto.INT8,
TensorProto.UINT8,
TensorProto.INT16,
TensorProto.UINT16,
TensorProto.INT4,
TensorProto.UINT4,
):
if len(data):
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range)
quantized_data = quantize_nparray(qType, data, scale, zero_point)
return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2)
raise ValueError(f"Unexpected value for qType={qType}.")
def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802
"""
Return qmin and qmax, the minimum and maximum value representable by the given qType
:parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.INT8
:return: qmin, qmax
"""
if qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise NotImplementedError("This function is not implemented for float 8 as not needed.")
qrange = None
if reduce_range:
qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType)
elif symmetric and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE:
qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType]
else:
qrange = ONNX_INT_TYPE_RANGE.get(qType)
if not qrange:
raise ValueError(f"Unexpected data type {qType} requested. Only INT8, UINT8, INT16, and UINT16 are supported.")
qmin, qmax = qrange
if qmin > 0 or qmax < 0:
raise ValueError(
f"qmin and qmax must meet requirement: qmin <= 0 <= qmax while "
f"qmin:{qmin}, qmmax:{qmax}, dtype={qmin.dtype}, reduce_range={reduce_range}, "
f"symmetric={symmetric}, qType={qType}"
)
return qrange
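# Example: get_qmin_qmax_for_qType(TensorProto.UINT8) returns the pair (0, 255),
# get_qmin_qmax_for_qType(TensorProto.INT8, symmetric=True) returns (-127, 127), and
# get_qmin_qmax_for_qType(TensorProto.INT8, reduce_range=True) returns (-64, 64).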
def get_qrange_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802
"""
Helper function to get the quantization range for a type.
parameter qType: quantization type.
return: quantization range.
"""
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
return qmax - qmin
def normalize_axis(axis: int, rank: int) -> tuple[bool, int]:
"""
Helper function that tries to return a normalized axis in the range [0, rank - 1].
:parameter axis: The axis to normalize.
:parameter rank: The tensor rank (number of dimensions).
:return (is_valid, axis_norm)
"""
axis_norm = axis + rank if axis < 0 else axis
is_valid = axis_norm >= 0 and axis_norm < rank
return is_valid, axis_norm
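# Example: normalize_axis(-1, 4) returns (True, 3), while normalize_axis(4, 4) returns (False, 4).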
def pack_bytes_to_4bit(src_8bit: bytes) -> bytearray:
"""
Copies a source array of 8-bit values into a destination bytearray of packed 4-bit values.
Assumes that the source values are already in the appropriate int4 range.
:parameter src_8bit: The 8-bit element values to pack.
:return A bytearray with every two 8-bit src elements packed into a single byte.
"""
num_elems = len(src_8bit)
if num_elems == 0:
return bytearray()
dst_size = (num_elems + 1) // 2 # Ex: 5 8-bit elems packed into 3 bytes
dst = bytearray(dst_size)
src_i: int = 0
dst_i: int = 0
# Pack two 8-bit elements into a single byte in each iteration.
while src_i < num_elems - 1:
dst[dst_i] = ((src_8bit[src_i + 1] & 0xF) << 4) | (src_8bit[src_i] & 0xF)
dst_i += 1
src_i += 2
if src_i < num_elems:
# Odd number of elements.
dst[dst_i] = src_8bit[src_i] & 0xF
return dst
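# Example: pack_bytes_to_4bit(bytes([1, 2, 3])) packs each pair with the earlier element in the low
# nibble, giving the bytes 0x21 (elements 1 and 2) and 0x03 (the odd trailing element),
# i.e. bytearray(b"!\x03").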
class QuantizedInitializer:
"""
Represents a linearly quantized weight input from ONNX operators
"""
def __init__(
self,
name,
initializer,
rmins,
rmaxs,
zero_points,
scales,
data=[], # noqa: B006
quantized_data=[], # noqa: B006
axis=None,
):
self.name = name
self.initializer = initializer # TensorProto initializer in ONNX graph
self.rmins = rmins # List of minimum range for each axis
self.rmaxs = rmaxs # List of maximum range for each axis
# 1D tensor of zero points computed for each axis. scalar if axis is empty
self.zero_points = zero_points
self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty
self.data = data # original data from initializer TensorProto
self.quantized_data = quantized_data # weight-packed data from data
# Scalar to specify which dimension in the initializer to weight pack.
self.axis = axis
# If empty, single zero point and scales computed from a single rmin and rmax
class QuantizedValue:
"""
Represents a linearly quantized value (input/output/initializer)
"""
def __init__(
self,
name,
new_quantized_name,
scale_name,
zero_point_name,
quantized_value_type,
axis=None,
node_type=None,
node_qtype=None,
scale_type=None,
):
self.original_name = name
self.q_name = new_quantized_name
self.scale_name = scale_name
self.zp_name = zero_point_name
self.value_type = quantized_value_type
self.axis = axis
self.node_type = node_type
self.node_qtype = node_qtype
self.scale_type = scale_type
class BiasToQuantize:
"""
Represents a bias to be quantized
"""
def __init__(self, bias_name, input_name, weight_name):
self.bias_name = bias_name
self.input_name = input_name
self.weight_name = weight_name
def attribute_to_kwarg(attribute):
"""
Convert attribute to kwarg format for use with onnx.helper.make_node.
:parameter attribute: attribute in AttributeProto format.
:return: attribute in {key: value} format.
"""
if attribute.type == 0:
raise ValueError(f"attribute {attribute.name} does not have type specified.")
# Based on attribute type definitions from AttributeProto
# definition in https://github.com/onnx/onnx/blob/main/onnx/onnx.proto
if attribute.type == 1:
value = attribute.f
elif attribute.type == 2:
value = attribute.i
elif attribute.type == 3:
value = attribute.s
elif attribute.type == 4:
value = attribute.t
elif attribute.type == 5:
value = attribute.g
elif attribute.type == 6:
value = attribute.floats
elif attribute.type == 7:
value = attribute.ints
elif attribute.type == 8:
value = attribute.strings
elif attribute.type == 9:
value = attribute.tensors
elif attribute.type == 10:
value = attribute.graphs
else:
raise ValueError(f"attribute {attribute.name} has unsupported type {attribute.type}.")
return {attribute.name: value}
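# Example (illustrative): converting an attribute into make_node kwargs,
#     attr = onnx.helper.make_attribute("alpha", 0.01)
#     attribute_to_kwarg(attr)   # -> {"alpha": 0.01} (stored as a 32-bit float)
# which can be splatted back into onnx.helper.make_node(..., **attribute_to_kwarg(attr)).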
def find_by_name(item_name, item_list):
"""
Helper function to find item by name in a list.
parameter item_name: name of the item.
parameter item_list: list of items.
return: item if found. None otherwise.
"""
items = [item for item in item_list if item.name == item_name]
return items[0] if len(items) > 0 else None
def get_elem_index(elem_name, elem_list):
"""
Helper function to return index of an item in a node list
"""
elem_idx = -1
for i in range(len(elem_list)):
if elem_list[i] == elem_name:
elem_idx = i
return elem_idx
def get_mul_node(inputs, output, name):
"""
Helper function to create a Mul node.
parameter inputs: list of input names.
parameter output: output name.
parameter name: name of the node.
return: Mul node in NodeProto format.
"""
return onnx.helper.make_node("Mul", inputs, [output], name)
def generate_identified_filename(filename: Path, identifier: str) -> Path:
"""
Helper function to generate an identifiable filepath by appending the given identifier to the filename stem (before the extension).
"""
return filename.parent.joinpath(filename.stem + identifier + filename.suffix)
def apply_plot(hist, hist_edges):
import sys
import matplotlib.pyplot as plt
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
print("Histogram:")
print(hist)
print("Histogram Edges:")
print(hist_edges)
plt.stairs(hist, hist_edges, fill=True)
plt.xlabel("Tensor value")
plt.ylabel("Counts")
plt.title("Tensor value V.S. Counts")
plt.show()
def write_calibration_table(calibration_cache, dir="."):
"""
Helper function to write calibration table to files.
"""
import json
import flatbuffers
import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
logging.info(f"calibration cache: {calibration_cache}")
with open(os.path.join(dir, "calibration.json"), "w") as file:
file.write(json.dumps(calibration_cache)) # use `json.loads` to do the reverse
# Serialize data using FlatBuffers
builder = flatbuffers.Builder(1024)
key_value_list = []
for key in sorted(calibration_cache.keys()):
values = calibration_cache[key]
value = str(max(abs(values[0]), abs(values[1])))
flat_key = builder.CreateString(key)
flat_value = builder.CreateString(value)
KeyValue.KeyValueStart(builder)
KeyValue.KeyValueAddKey(builder, flat_key)
KeyValue.KeyValueAddValue(builder, flat_value)
key_value = KeyValue.KeyValueEnd(builder)
key_value_list.append(key_value)
TrtTable.TrtTableStartDictVector(builder, len(key_value_list))
for key_value in key_value_list:
builder.PrependUOffsetTRelative(key_value)
main_dict = builder.EndVector()
TrtTable.TrtTableStart(builder)
TrtTable.TrtTableAddDict(builder, main_dict)
cal_table = TrtTable.TrtTableEnd(builder)
builder.Finish(cal_table)
buf = builder.Output()
with open(os.path.join(dir, "calibration.flatbuffers"), "wb") as file:
file.write(buf)
# Deserialize data (for validation)
if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"):
cal_table = TrtTable.TrtTable.GetRootAsTrtTable(buf, 0)
dict_len = cal_table.DictLength()
for i in range(dict_len):
key_value = cal_table.Dict(i)
logging.info(key_value.Key())
logging.info(key_value.Value())
# write plain text
with open(os.path.join(dir, "calibration.cache"), "w") as file:
for key in sorted(calibration_cache.keys()):
value = calibration_cache[key]
s = key + " " + str(max(abs(value[0]), abs(value[1])))
file.write(s)
file.write("\n")
def smooth_distribution(p, eps=0.0001):
"""Given a discrete distribution (may have not been normalized to 1),
smooth it by replacing zeros with eps multiplied by a scaling factor
and taking the corresponding amount off the non-zero values.
Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf
https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py
"""
is_zeros = (p == 0).astype(numpy.float32)
is_nonzeros = (p != 0).astype(numpy.float32)
n_zeros = is_zeros.sum()
n_nonzeros = p.size - n_zeros
if not n_nonzeros:
# raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
return None
eps1 = eps * float(n_zeros) / float(n_nonzeros)
assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
n_zeros,
n_nonzeros,
eps1,
)
hist = p.astype(numpy.float32)
hist += eps * is_zeros + (-eps1) * is_nonzeros
assert (hist <= 0).sum() == 0
return hist
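# Example: smooth_distribution(numpy.array([0.0, 2.0, 0.0, 2.0])) adds eps=0.0001 to each zero bin
# and takes eps1 = 0.0001 * 2 / 2 off each non-zero bin, giving [0.0001, 1.9999, 0.0001, 1.9999].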
def model_has_external_data(model_path: Path):
model = onnx.load(model_path.as_posix(), load_external_data=False)
for initializer in model.graph.initializer:
if external_data_helper.uses_external_data(initializer):
return True
return False
def optimize_model(model_path: Path, opt_model_path: Path):
"""
Generate model that applies graph optimization (constant folding, etc.)
parameter model_path: path to the original onnx model
parameter opt_model_path: path to the optimized onnx model
:return: None; the optimized model is written to opt_model_path
"""
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
kwargs = {}
# This will rename constant initializer names, disable it to make test pass.
kwargs["disabled_optimizers"] = ["ConstantSharing"]
_ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"], **kwargs)
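# Example (illustrative): optimize_model(Path("model.onnx"), Path("model.opt.onnx")) runs ORT's
# basic-level graph optimizations (constant folding, redundant node elimination, ...) and serializes
# the optimized graph to model.opt.onnx via SessionOptions.optimized_model_filepath; nothing is returned.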
def add_pre_process_metadata(model: ModelProto):
"""Tag the model that it went through quantization pre-processing"""
metadata_props = {"onnx.quant.pre_process": "onnxruntime.quant"}
if model.metadata_props:
for prop in model.metadata_props:
metadata_props.update({prop.key: prop.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_pre_process_metadata(model: ModelProto) -> bool:
"""Check the model whether it went through quantization pre-processing"""
if model.metadata_props:
for prop in model.metadata_props:
if prop.key == "onnx.quant.pre_process" and prop.value == "onnxruntime.quant":
return True
return False
def add_infer_metadata(model: ModelProto):
metadata_props = {"onnx.infer": "onnxruntime.quant"}
if model.metadata_props:
for p in model.metadata_props:
metadata_props.update({p.key: p.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_infer_metadata(model: ModelProto) -> bool:
if model.metadata_props:
for p in model.metadata_props:
if p.key == "onnx.infer" and p.value == "onnxruntime.quant":
return True
return False
def load_model_with_shape_infer(model_path: Path) -> ModelProto:
inferred_model_path = generate_identified_filename(model_path, "-inferred")
onnx.shape_inference.infer_shapes_path(str(model_path), str(inferred_model_path))
model = onnx.load(inferred_model_path.as_posix())
add_infer_metadata(model)
inferred_model_path.unlink()
return model
def save_and_reload_model_with_shape_infer(model: ModelProto) -> ModelProto:
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
model_path = Path(quant_tmp_dir).joinpath("model.onnx")
onnx.save_model(model, model_path.as_posix(), save_as_external_data=True)
return load_model_with_shape_infer(model_path)
def tensor_proto_to_array(initializer: TensorProto) -> numpy.ndarray:
if initializer.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
return onnx.numpy_helper.to_array(initializer)
raise ValueError(
f"Only float type is supported. Weights {initializer.name} is {type_to_name[initializer.data_type]}"
)
def add_quant_suffix(tensor_name: str) -> str:
return tensor_name + "_QuantizeLinear"
def add_quant_input_suffix(tensor_name: str) -> str:
return tensor_name + QUANT_INPUT_SUFFIX
def add_quant_output_suffix(tensor_name) -> str:
return tensor_name + "_QuantizeLinear_Output"
def add_dequant_suffix(tensor_name) -> str:
return tensor_name + "_DequantizeLinear"
def add_dequant_input_suffix(tensor_name) -> str:
return tensor_name + "_DequantizeLinear_Input"
def add_dequant_output_suffix(tensor_name) -> str:
return tensor_name + DEQUANT_OUTPUT_SUFFIX