I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,78 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: CalTableFlatBuffers
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class KeyValue:
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0): # noqa: N802
n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
x = KeyValue()
x.Init(buf, n + offset)
return x
@classmethod
def GetRootAsKeyValue(cls, buf, offset=0): # noqa: N802
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# KeyValue
def Init(self, buf, pos): # noqa: N802
self._tab = flatbuffers.table.Table(buf, pos)
# KeyValue
def Key(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
return self._tab.String(o + self._tab.Pos)
return None
# KeyValue
def Value(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
if o != 0:
return self._tab.String(o + self._tab.Pos)
return None
def Start(builder): # noqa: N802
builder.StartObject(2)
def KeyValueStart(builder): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddKey(builder, key): # noqa: N802
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
def KeyValueAddKey(builder, key): # noqa: N802
"""This method is deprecated. Please switch to AddKey."""
return AddKey(builder, key)
def AddValue(builder, value): # noqa: N802
builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
def KeyValueAddValue(builder, value): # noqa: N802
"""This method is deprecated. Please switch to AddValue."""
return AddValue(builder, value)
def End(builder): # noqa: N802
return builder.EndObject()
def KeyValueEnd(builder): # noqa: N802
"""This method is deprecated. Please switch to End."""
return End(builder)
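
A minimal writer-side sketch (not part of the commit) of how the generated builder helpers above are typically driven with the standard flatbuffers Python API; the key and value strings are placeholders:

```
import flatbuffers

# Import path taken from the generated code's own namespace; the module-level
# builder helpers (Start/AddKey/AddValue/End) live next to the KeyValue class.
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import (
    AddKey,
    AddValue,
    End,
    KeyValue,
    Start,
)

builder = flatbuffers.Builder(0)
key_off = builder.CreateString("conv1_output")  # placeholder tensor name
val_off = builder.CreateString("0.0123")        # placeholder calibration value

Start(builder)
AddKey(builder, key_off)
AddValue(builder, val_off)
builder.Finish(End(builder))

buf = builder.Output()              # serialized bytes
kv = KeyValue.GetRootAs(buf, 0)
assert kv.Key() == b"conv1_output"  # accessors return raw bytes
```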


@@ -0,0 +1,90 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: CalTableFlatBuffers
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class TrtTable:
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0): # noqa: N802
n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
x = TrtTable()
x.Init(buf, n + offset)
return x
@classmethod
def GetRootAsTrtTable(cls, buf, offset=0): # noqa: N802
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# TrtTable
def Init(self, buf, pos): # noqa: N802
self._tab = flatbuffers.table.Table(buf, pos)
# TrtTable
def Dict(self, j): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
x = self._tab.Vector(o)
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
x = self._tab.Indirect(x)
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue
obj = KeyValue()
obj.Init(self._tab.Bytes, x)
return obj
return None
# TrtTable
def DictLength(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
return self._tab.VectorLen(o)
return 0
# TrtTable
def DictIsNone(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
return o == 0
def Start(builder): # noqa: N802
builder.StartObject(1)
def TrtTableStart(builder): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddDict(builder, dict): # noqa: N802
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
def TrtTableAddDict(builder, dict): # noqa: N802
"""This method is deprecated. Please switch to AddDict."""
return AddDict(builder, dict)
def StartDictVector(builder, numElems): # noqa: N802
return builder.StartVector(4, numElems, 4)
def TrtTableStartDictVector(builder, numElems): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return StartDictVector(builder, numElems)
def End(builder): # noqa: N802
return builder.EndObject()
def TrtTableEnd(builder): # noqa: N802
"""This method is deprecated. Please switch to End."""
return End(builder)
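
A matching reader-side sketch (also not part of the commit) that walks a serialized table with the accessors above; `buf` is assumed to hold the bytes of a TrtTable written elsewhere (e.g., by write_calibration_table):

```
from onnxruntime.quantization.CalTableFlatBuffers.TrtTable import TrtTable


def decode_calibration_table(buf: bytes) -> dict[str, str]:
    """Decode a serialized TrtTable into a plain dict of tensor name -> value string."""
    table = TrtTable.GetRootAs(buf, 0)
    result = {}
    for i in range(table.DictLength()):
        kv = table.Dict(i)  # KeyValue entry at index i
        result[kv.Key().decode()] = kv.Value().decode()
    return result
```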


@@ -0,0 +1,16 @@
from .calibrate import ( # noqa: F401
CalibraterBase,
CalibrationDataReader,
CalibrationMethod,
MinMaxCalibrater,
create_calibrator,
)
from .qdq_quantizer import QDQQuantizer # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401
from .quantize import DynamicQuantConfig # noqa: F401
from .quantize import QuantizationMode # noqa: F401
from .quantize import StaticQuantConfig # noqa: F401
from .quantize import quantize # noqa: F401
from .quantize import quantize_dynamic # noqa: F401
from .quantize import quantize_static # noqa: F401
from .shape_inference import quant_pre_process # noqa: F401
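
For orientation, a minimal static-quantization sketch using only the names re-exported above; the model paths, input name, and shape are placeholders, and the random reader stands in for a real calibration data set:

```
import numpy as np

from onnxruntime.quantization import (
    CalibrationDataReader,
    QuantFormat,
    QuantType,
    quantize_static,
)


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random batches; a real reader yields representative calibration data."""

    def __init__(self, input_name: str, shape: tuple, num_batches: int = 8):
        self._batches = iter(
            [{input_name: np.random.rand(*shape).astype(np.float32)} for _ in range(num_batches)]
        )

    def get_next(self):
        return next(self._batches, None)  # None signals the end of the calibration data


quantize_static(
    "model_fp32.onnx",  # placeholder input path
    "model_int8.onnx",  # placeholder output path
    RandomDataReader("input", (1, 3, 224, 224)),  # placeholder input name/shape
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)
```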


@@ -0,0 +1,536 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from typing import Any, Dict
import numpy as np
import onnx
import onnx.numpy_helper
try:
from onnx.reference.op_run import to_array_extended
except ImportError:
# old version of onnx.
to_array_extended = None
from .calibrate import TensorData
from .onnx_model import ONNXModel
from .quant_utils import (
ONNX_TYPE_TO_NP_TYPE,
TENSOR_NAME_QUANT_SUFFIX,
QuantType,
find_by_name,
model_has_infer_metadata,
normalize_axis,
pack_bytes_to_4bit,
quantize_data,
quantize_nparray,
save_and_reload_model_with_shape_infer,
tensor_proto_to_array,
)
from .tensor_quant_overrides import TensorQuantOverridesHelper
class QuantizationParams:
def __init__(self, **data: Dict[str, Any]):
self.data = {}
for k, v in data.items():
if not isinstance(k, str):
raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
if not isinstance(v, (int, str, np.ndarray)):
raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
if k == "scale" and v.dtype not in (np.float32, np.float16):
raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
self.data[k] = v
def __iter__(self):
yield from self.data
def __getitem__(self, key):
return self.data[key]
def __len__(self):
return len(self.data)
class BaseQuantizer:
def __init__(
self,
model,
per_channel,
reduce_range,
weight_qType,
activation_qType,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options=None,
):
if not model_has_infer_metadata(model):
model = save_and_reload_model_with_shape_infer(model)
self.value_infos = {vi.name: vi for vi in model.graph.value_info}
self.value_infos.update({ot.name: ot for ot in model.graph.output})
self.value_infos.update({it.name: it for it in model.graph.input})
self.model = ONNXModel(model)
self.per_channel = per_channel # weight-pack per channel
self.reduce_range = reduce_range
self.extra_options = extra_options if extra_options else {}
self.enable_subgraph_quantization = (
"EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
)
self.parent = None
self.force_quantize_no_input_check = (
"ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
)
self.is_weight_symmetric = self.extra_options.get(
"WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN)
)
self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
self.min_real_range = self.extra_options.get("MinimumRealRange")
self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)
"""
Dictionary specifying the min and max values for tensors. It has the following format:
{
"param_name": [min, max]
}
example:
{
'Conv_3:0': [np.float32(0), np.float32(0.5)],
'Conv_4:0': [np.float32(1), np.float32(3.5)]
}
"""
if tensors_range is not None and any(map(lambda t: not isinstance(t, TensorData), tensors_range.values())):
raise TypeError(
f"tensors_range contains unexpected types {set(type(v) for v in tensors_range.values())}, not TensorData."
)
self.tensors_range = tensors_range
self.nodes_to_quantize = nodes_to_quantize # specific nodes to quantize
self.nodes_to_exclude = nodes_to_exclude # specific nodes to exclude
self.op_types_to_quantize = op_types_to_quantize
self.opset_version = self.check_opset_version()
# Get tensor-level quantization overrides and ensure they are valid.
self.tensor_quant_overrides = TensorQuantOverridesHelper(self.extra_options.get("TensorQuantOverrides", {}))
self.initializers = {initzer.name: initzer for initzer in self.model.initializer()}
overrides_valid, overrides_err = self.tensor_quant_overrides.is_valid(
self.initializers, self.value_infos.keys(), activation_qType
)
if not overrides_valid:
raise ValueError(overrides_err)
self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()
def quantize_model(self):
raise NotImplementedError
def is_input_a_initializer(self, input_name):
initializer = find_by_name(input_name, self.model.initializer())
return initializer is not None
def is_per_channel(self):
return self.per_channel
def is_valid_quantize_weight(self, weight_name):
weight = find_by_name(weight_name, self.model.initializer())
if weight is not None:
return weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16)
if (not self.enable_subgraph_quantization) or (self.parent is None):
return False
return self.parent.is_valid_quantize_weight(weight_name)
def should_quantize_node(self, node):
if (
self.nodes_to_quantize is not None
and len(self.nodes_to_quantize) != 0
and node.name not in self.nodes_to_quantize
):
return False
if node.op_type not in self.op_types_to_quantize:
return False
if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
return False
return True
def check_opset_version(self):
ai_onnx_domain = [
opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx"
]
if len(ai_onnx_domain) != 1:
raise ValueError("Failed to find proper ai.onnx domain")
opset_version = ai_onnx_domain[0].version
if opset_version == 10:
logging.warning(
f"The original model opset version is {opset_version}, which does not support node fusions. Please update the model to opset >= 11 for better performance."
)
return 10
if opset_version < 10:
logging.warning(
f"The original model opset version is {opset_version}, which does not support quantization. Please update the model to opset >= 11. Updating the model automatically to opset 11. Please verify the quantized model."
)
self.model.model.opset_import.remove(ai_onnx_domain[0])
self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)])
opset_version = 11
if opset_version < 19 and self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
logging.warning(
f"The original model opset version is {opset_version}, which does not support quantization to float 8. "
"Please update the model to opset >= 19. Updating the model automatically to opset 19. "
"Please verify the quantized model."
)
self.model.model.opset_import.remove(ai_onnx_domain[0])
self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 19)])
self.model.model.ir_version = 9
opset_version = 19
return opset_version
def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1.0):
"""
Quantizes the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
"""
# get bias
bias_initializer = find_by_name(bias_name, self.model.initializer())
bias_data = tensor_proto_to_array(bias_initializer)
quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX
# quantize bias
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
data = np.asarray(bias_data)
if data.dtype == np.float16:
node_qtype = onnx.TensorProto.FLOAT16
elif data.dtype == np.float32:
node_qtype = onnx.TensorProto.FLOAT
else:
raise TypeError(f"Only float16 or float32 are supported with float 8 but bias dtype is {data.dtype}.")
quantized_data = data.astype(np.float32)
bias_scale = np.array([1], dtype=quantized_data.dtype)
bias_scale_data = bias_scale.reshape(-1)
packed_bias_initializer = onnx.numpy_helper.from_array(quantized_data, quantized_bias_name)
self.model.initializer_extend([packed_bias_initializer])
node_type = "Cast"
else:
# calculate scale for bias
# TODO: This formula should be explained including why the scale is not estimated for the bias as well.
bias_scale = input_scale * weight_scale * beta
quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
# update bias initializer
bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
self.model.initializer_extend([packed_bias_initializer])
# Bias's scale dtype should match the original bias data's unquantized type (float32 or float16).
bias_scale_data = np.asarray(bias_scale, dtype=bias_data.dtype).reshape(-1)
node_type = "DequantizeLinear"
node_qtype = self.weight_qType
# update scale initializer
quantized_bias_scale_name = quantized_bias_name + "_scale"
packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
self.model.initializer_extend([packed_bias_scale_initializer])
# update zero initializer
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
tensor_type = self.weight_qType
else:
tensor_type = onnx.TensorProto.INT32
quantized_bias_zp_name = quantized_bias_name + "_zero_point"
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, self.weight_qType, [1], [0.0])
elif bias_scale.size > 1:
bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
else:
packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, tensor_type, [], [0])
self.model.initializer_extend([packed_bias_zp_initializer])
return (
quantized_bias_name,
quantized_bias_scale_name,
quantized_bias_zp_name,
bias_scale_data,
node_type,
node_qtype,
)
def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_float_weight=False):
"""
:param weight: TensorProto initializer
:param qType: type to quantize to
:param keep_float_weight: Whether to keep the original float weight. In some cases we only want to quantize the
scale and zero point; if keep_float_weight is False, the weight itself is quantized as well.
:return: quantized weight name, zero point name, scale name
"""
q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
zp_name = weight.name + "_zero_point"
scale_name = weight.name + "_scale"
# Quantize weight data. Use quantization overrides if provided by the user.
weight_data = tensor_proto_to_array(weight)
quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(weight.name, default_val={})
if "quant_type" in quant_overrides:
qType = quant_overrides["quant_type"].tensor_type # noqa: N806
if "scale" in quant_overrides and "zero_point" in quant_overrides:
zero_point = np.array(quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[qType])
scale = np.array(quant_overrides["scale"])
q_weight_data = quantize_nparray(qType, weight_data.flatten(), scale, zero_point)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
else:
_, _, zero_point, scale, q_weight_data = quantize_data(
weight_data.flatten(),
qType,
quant_overrides.get("symmetric", self.is_weight_symmetric),
reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
min_real_range=self.min_real_range,
rmin_override=quant_overrides.get("rmin"),
rmax_override=quant_overrides.get("rmax"),
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
scale_dtype = weight.data_type
scale_initializer = onnx.helper.make_tensor(scale_name, scale_dtype, [], scale.reshape((-1,)).tolist())
zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], zero_point.reshape((-1,)).tolist())
self.model.initializer_extend([scale_initializer, zero_initializer])
if not keep_float_weight:
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
q_weight_initializer = onnx.TensorProto()
q_weight_initializer.data_type = self.weight_qType
q_weight_initializer.dims.extend(weight.dims)
q_weight_initializer.name = q_weight_name
# Do not remove .flatten().copy(); numpy is not clear about data persistence.
q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes()
if to_array_extended is not None:
# This test should not be needed but it helped catch some issues
# with data persistence and tobytes.
check = to_array_extended(q_weight_initializer)
if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes():
raise RuntimeError(
f"The initializer of shape {weight_data.shape} could not be created, expecting "
f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}"
f"\nraw={str(q_weight_initializer)[:200]}."
)
elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
if q_weight_data.dtype not in (np.int8, np.uint8):
raise RuntimeError(
f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
)
# We do not use onnx.helper.pack_float32_to_4bit() due to performance.
# This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))
# We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
else:
q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
weight.dims
)
q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
self.model.initializer_extend([q_weight_initializer])
return q_weight_name, zp_name, scale_name
def quantize_weight_per_channel_impl(
self,
weight_name,
weight_qType,
channel_axis,
reduce_range=True,
keep_float_weight=False,
):
initializer = find_by_name(weight_name, self.model.initializer())
if initializer is None:
raise ValueError("{} is not an initializer", weight_name)
weights = tensor_proto_to_array(initializer)
weights_rank = len(weights.shape)
is_axis_valid, axis_norm = normalize_axis(channel_axis, weights_rank)
if not is_axis_valid:
raise ValueError(
f"Weight {weight_name} has a per-channel axis with value {channel_axis} that is "
f"out-of-bounds for rank {weights_rank}"
)
channel_axis = axis_norm
channel_count = weights.shape[channel_axis]
quant_overrides_for_channels = self.tensor_quant_overrides.get_per_channel_overrides(
weight_name, default_val=[{"axis": channel_axis}]
)
num_channel_overrides = len(quant_overrides_for_channels)
if num_channel_overrides != 1 and num_channel_overrides != channel_count:
raise ValueError(
f"Per-channel tensor quantization overrides for {weight_name} must have "
f"either 1 or {channel_count} elements in the list of dictionaries."
)
is_axis_override_valid, axis_override = normalize_axis(quant_overrides_for_channels[0]["axis"], weights_rank)
if not is_axis_override_valid or axis_override != channel_axis:
raise ValueError(
f"Tensor quantization overrides for {weight_name} specify an unexpected axis. "
f"Expected {channel_axis}, but got {quant_overrides_for_channels[0]['axis']}."
)
# If user provides per-channel quantization overrides, all channels must use the same quant_type,
# axis, symmetric, and reduce_range values. So, just use the first channel's values.
if "quant_type" in quant_overrides_for_channels[0]:
weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type # noqa: N806
symmetric = quant_overrides_for_channels[0].get(
"symmetric",
(
self.is_weight_symmetric
or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4)
),
)
reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
zero_point_list = []
scale_list = []
quantized_per_channel_data_list = []
for i in range(channel_count):
per_channel_data = weights.take(i, channel_axis)
channel_override_index = i if i < num_channel_overrides else 0
channel_quant_overrides = quant_overrides_for_channels[channel_override_index]
if "scale" in channel_quant_overrides and "zero_point" in channel_quant_overrides:
zero_point = np.array(channel_quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[weight_qType])
scale = np.array(channel_quant_overrides["scale"])
quantized_per_channel_data = quantize_nparray(
weight_qType, per_channel_data.flatten(), scale, zero_point
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
assert isinstance(
quantized_per_channel_data, np.ndarray
), f"Unexpected type {type(quantized_per_channel_data)}"
else:
_, _, zero_point, scale, quantized_per_channel_data = quantize_data(
per_channel_data.flatten(),
weight_qType,
symmetric,
reduce_range=reduce_range,
min_real_range=self.min_real_range,
rmin_override=channel_quant_overrides.get("rmin"),
rmax_override=channel_quant_overrides.get("rmax"),
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert (
zero_point.dtype != np.float32 and zero_point.dtype != np.float16
), f"Unexpected dtype {zero_point.dtype}"
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
assert isinstance(
quantized_per_channel_data, np.ndarray
), f"Unexpected type {type(quantized_per_channel_data)}"
zero_point_list.append(zero_point)
scale_list.append(scale)
quantized_per_channel_data_list.append(quantized_per_channel_data)
# combine per_channel_data into one
weights_shape = list(weights.shape)
reshape_dims = list(weights_shape) # deep copy
reshape_dims[channel_axis] = 1 # only one per channel for reshape
quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
for i in range(1, len(quantized_per_channel_data_list)):
channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
quantized_weights = np.concatenate((quantized_weights, channel_weights), channel_axis)
q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
zp_name = weight_name + "_zero_point"
scale_name = weight_name + "_scale"
# Update packed weight, zero point, and scale initializers
zero_scale_shape = [initializer.dims[channel_axis]]
scale_initializer = onnx.helper.make_tensor(
scale_name, initializer.data_type, zero_scale_shape, np.hstack(scale_list).tolist()
)
zero_initializer = onnx.helper.make_tensor(
zp_name, weight_qType, zero_scale_shape, np.hstack(zero_point_list).tolist()
)
self.model.initializer_extend([scale_initializer, zero_initializer])
if not keep_float_weight:
if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
if quantized_weights.dtype not in (np.int8, np.uint8):
raise RuntimeError(
f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
)
# We do not use onnx.helper.pack_float32_to_4bit() due to performance.
# This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))
# We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
q_weight_initializer = onnx.helper.make_tensor(
q_weight_name, weight_qType, weights_shape, packed_data, raw=True
)
self.model.initializer_extend([q_weight_initializer])
else:
quantized_weights = np.asarray(
quantized_weights,
dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
).reshape(initializer.dims)
q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
self.model.initializer_extend([q_weight_initializer])
return q_weight_name, zp_name, scale_name
def adjust_tensor_ranges(self):
if self.tensors_range is None:
return
for node in self.model.nodes():
# adjust tensor_ranges for input of Clip and Relu node
if node.op_type in ["Clip", "Relu"]:
if self.is_activation_symmetric:
continue
if not self.should_quantize_node(node):
continue
if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
continue
if node.input[0] not in self.tensors_range or node.output[0] not in self.tensors_range:
continue
td = self.tensors_range[node.output[0]]
if not isinstance(td, TensorData):
raise TypeError(f"Unexpected type {type(td)} for {node.output[0]!r}.")
self.tensors_range[node.input[0]] = td
# Adjust Softmax to range from 0.0 to 1.0
elif node.op_type == "Softmax":
self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))

File diff suppressed because it is too large


@@ -0,0 +1,2 @@
from .preprocess import qnn_preprocess_model # noqa: F401
from .quant_config import get_qnn_qdq_config # noqa: F401


@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ...fusions import Fusion
from ...onnx_model import ONNXModel
class FusionLpNormalization(Fusion):
def __init__(self, model: ONNXModel, epsilon: float = 1e-12):
super().__init__(model, "LpNormalization", "ReduceL2")
self.epsilon = epsilon
def fuse(
self,
reduce_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceL2 node into a single
LpNormalization node.
Pattern 1:
[root] --> ReduceL2 -----> Clip --> Expand ----> Div -->
| (axis=-1) (min=epsilon) (shape=root) ^
| (keepdims=True) |
| |
+-----------------------------------------------+
Notes:
- ReduceL2 must use the last axis, and keepdims == True
- Clip must only have a min attribute that is ~1e-12
- Expand must restore the shape to root.shape
- The output of Expand must be the second input to Div.
"""
if reduce_node.output[0] not in input_name_to_nodes:
return
# ReduceL2 must have one Clip child
children = input_name_to_nodes[reduce_node.output[0]]
if len(children) != 1 or children[0].op_type != "Clip":
return
# ReduceL2 must have keepdims == True
keepdims = self.get_node_attribute(reduce_node, "keepdims")
if not keepdims:
return
# ReduceL2 axes must refer only to the last dimension.
# Axes became an input in opset 18. Before then, axes was an attribute
reduce_input_ttype = self.model.get_tensor_type(reduce_node.input[0])
if not reduce_input_ttype:
return
reduce_input_shape = self.tensor_shape_to_list(reduce_input_ttype)
if not reduce_input_shape:
return
axes = self.get_node_attribute(reduce_node, "axes")
if not axes and len(reduce_node.input) > 1:
axes = self.model.get_constant_value(reduce_node.input[1])
if not axes or len(axes) != 1:
return
last_dim = len(reduce_input_shape) - 1
if axes[0] != -1 and axes[0] != last_dim:
return
# Clip node must have a min attribute approximately equal to 1e-12
clip_node = children[0]
clip_min = self.get_node_attribute(clip_node, "min")
if clip_min is None and len(clip_node.input) > 1:
clip_min = self.model.get_constant_value(clip_node.input[1])
clip_max = self.get_node_attribute(clip_node, "max") # TODO: clip_max could be FLOAT_MAX
if clip_max is None and len(clip_node.input) > 2:
clip_max = self.model.get_constant_value(clip_node.input[2])
if not (clip_max is None and clip_min is not None and clip_min > 0 and abs(clip_min - self.epsilon) < 1e-13):
return
if clip_node.output[0] not in input_name_to_nodes:
return
# Clip must have a single Expand child.
children = input_name_to_nodes[clip_node.output[0]]
if len(children) != 1 or children[0].op_type != "Expand":
return
expand_node = children[0]
if expand_node.output[0] not in input_name_to_nodes:
return
# Expand must have a single Div child
children = input_name_to_nodes[expand_node.output[0]]
if len(children) != 1 or children[0].op_type != "Div":
return
div_node = children[0]
# The first input to Div must be the root of the subgraph (i.e., reduce_node.input[0])
# The second input to Div must be the output of the Expand.
# As long as these two inputs go to the same Div node, then ONNX validation will ensure that
# their shapes match.
if div_node.input[0] != reduce_node.input[0]:
return
if div_node.input[1] != expand_node.output[0]:
return
subgraph_input = reduce_node.input[0]
subgraph_output = div_node.output[0]
subgraph_nodes = [reduce_node, clip_node, expand_node, div_node]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
self.fused_op_type,
name=self.create_unique_node_name(),
inputs=[subgraph_input],
outputs=[subgraph_output],
p=2,
axis=-1,
)
self.nodes_to_add.append(fused_node)
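
A short sketch of how this fusion is driven on its own (mirroring its use in qnn_preprocess_model later in this commit); the model path is a placeholder and the import paths assume this commit's package layout:

```
import onnx

from onnxruntime.quantization.onnx_model import ONNXModel
# Assumed module path for the class defined above.
from onnxruntime.quantization.execution_providers.qnn.fusion_lpnorm import FusionLpNormalization

model = onnx.load("model.onnx")  # placeholder path
onnx_model = ONNXModel(model)

fusion = FusionLpNormalization(onnx_model)
if fusion.apply():  # True when a ReduceL2 sequence was replaced by LpNormalization
    onnx_model.topological_sort()
    onnx.save(onnx_model.model, "model_fused.onnx")
```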


@@ -0,0 +1,413 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from dataclasses import dataclass
import onnx
from ...quant_utils import QuantType
from ...tensor_quant_overrides import QuantTypeInfo, TensorQuantOverridesHelper
@dataclass
class TensorTypeRequest:
"""
Bundles desired quantization type requests for a tensor. A distinction is made between the
produced type and the consumed type.
"""
# The tensor's quant type at the producer end. If None, assumed to be the default activation quant type.
producer: QuantTypeInfo | None
# The tensor's quant type received by a set of consumer nodes.
# If None, assumed to be the default activation quant type for all consumers.
# consumers[1] is a set of consumer node names.
consumers: tuple[QuantTypeInfo, set[str]] | None
class MixedPrecisionTensorQuantOverridesFixer:
"""
Helper that generates tensor quantization overrides for mixed-precision QDQ models.
Specifically, this helper fixes an initial set of quantization overrides that assign a non-default
activation quantization type to one or more tensors by doing the following:
- Inferring which other tensors need to be overridden to the non-default activation quantization type.
- Inserting quantization data type conversions.
Example:
--------
Float model:
input_0 --> Op1 --> Op3 --> Op5 --> Op6 --> output_0
^
|
input_1 --> Op2 -+-> Op4 ----+
|
+-> Op7 --> output_1
|
+-> Op8 --> output_2
If we'd like to quantize this model to uint8 precision, but would like to make sure tensor "Op4_out"
is quantized to 16-bit, then we would specify the following initial tensor quantization overrides:
```
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
```
These initial overrides may not create a valid model because Op4 and Op5 may require both the input and output
to be the same type (e.g., uint16). This helper fixes the overrides so that input/output data types
are valid:
```
overrides = TensorQuantOverridesHelper(init_overrides)
fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
fixer.apply(
default_activation_qtype=QuantType.QUInt8,
default_activation_symmetric=False,
)
```
The above snippet generates the following "fixed" overrides (get via overrides.get_dict()):
{
"Op2_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op4"}}}],
"Op3_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op5"}}}],
"Op4_out": [{"quant_type": QUInt16}],
"Op5_out": [{"quant_type": QUInt16, "convert": {"quant_type": QUInt8, "recv_nodes": {"Op6"}}}]
}
How to interpret the fixed overrides:
- Op2's output is consumed by Op4, Op7, and Op8. Op4 consumes the converted u16 type,
but Op7 and Op8 consume the original u8 type.
- Op3's output is converted from u8 to u16. Op5 consumes the converted u16 type.
- Op4's output is just u16 (not converted). All consumers of Op4_out get the u16 type.
- Op5's output is converted from u16 to u8. Op6 consumes the u8 type.
"""
def __init__(
self,
overrides: TensorQuantOverridesHelper,
producers: dict[str, onnx.NodeProto],
consumers: dict[str, list[onnx.NodeProto]],
value_infos: dict[str, onnx.ValueInfoProto],
initializers: dict[str, onnx.TensorProto],
):
"""
Params:
overrides: The initial tensor quantization overrides to fix.
producers: Dictionary that maps a tensor name to the producer node that generates the tensor.
consumers: Dictionary that maps a tensor name to the consumer nodes that take the tensor as input.
value_infos: Dictionary that maps a tensor name to its onnx.ValueInfoProto.
initializers: Dictionary that maps an initializer name to its onnx.TensorProto.
"""
self.overrides = overrides
self.consumers = consumers
self.producers = producers
self.value_infos = value_infos
self.initializers = initializers
@staticmethod
def create_from_model(
overrides: TensorQuantOverridesHelper, model: onnx.ModelProto, default_activation_qtype: QuantType
) -> MixedPrecisionTensorQuantOverridesFixer:
"""
Helper function that creates an instance of this class from a loaded ONNX model.
Params:
overrides: The initial tensor quantization overrides to fix.
model: Loaded ONNX model
default_activation_qtype: The intended default activation quantization type.
Used to validate the initial overrides.
Returns:
Initialized MixedPrecisionTensorQuantOverridesFixer object
"""
model = onnx.shape_inference.infer_shapes(model) # Need to infer shapes to get value_infos
# Build dictionaries that enable convenient lookups of initializers and value_infos by name.
initializers = {initializer.name: initializer for initializer in model.graph.initializer}
value_infos = {vi.name: vi for vi in model.graph.value_info}
value_infos.update({ot.name: ot for ot in model.graph.output})
value_infos.update({it.name: it for it in model.graph.input})
# Ensure that the user-provided initial overrides are actually valid.
valid, err = overrides.is_valid(initializers, set(value_infos), default_activation_qtype)
if not valid:
pprint_overrides = overrides.pprint_str(indent=4)
logging.error(f"Provided invalid tensor quantization overrides:\n{pprint_overrides}")
raise ValueError(err)
consumers = {}
producers = {}
# Build dictionaries that map a tensor name to the consumer or producer nodes.
for node in model.graph.node:
for input_name in node.input:
if input_name:
if input_name not in consumers:
consumers[input_name] = []
consumers[input_name].append(node)
for output_name in node.output:
producers[output_name] = node
return MixedPrecisionTensorQuantOverridesFixer(overrides, producers, consumers, value_infos, initializers)
def apply(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
):
"""
Fixes the initial tensor quantization overrides (in-place) for use in mixed-precision QDQ models.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
"""
type_requests = self.get_desired_tensor_types(default_activation_qtype, default_activation_symmetric)
# Use type requests to "fix" tensor quantization overrides by adding
# quantization type conversions where necessary.
for tensor_name, type_req in type_requests.items():
all_consumers = set([node.name for node in self.consumers.get(tensor_name, [])])
has_producer_req = type_req.producer is not None
has_consumer_req = bool(type_req.consumers)
# Only producer type: Add conversion back to default activation type
if has_producer_req and not has_consumer_req:
self._update_converted_tensor(
tensor_name, type_req.producer, QuantTypeInfo(default_activation_qtype), all_consumers
)
# Only consumers
elif not has_producer_req and has_consumer_req:
prod_type_info = self.overrides.get_node_output_qtype_info(tensor_name, default_activation_qtype)
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
if not self._check_nodes_are_not_convert_consumers(tensor_name, type_req.consumers[1]):
raise ValueError(
f"Tensor override for '{tensor_name}' converts the type for consumers that need the original type."
)
# Both producer and consumers
elif has_producer_req and has_consumer_req:
prod_type_info = type_req.producer
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
consumers_for_original_type = all_consumers.difference(type_req.consumers[1])
if len(consumers_for_original_type) == 0:
# All consumers want the overridden type, so no need for convert nodes!
# Just add the override if not already present.
if tensor_name not in self.overrides:
self.overrides[tensor_name] = [{}]
prod_type_info.save_to_dict(self.overrides[tensor_name][0])
assert "convert" not in self.overrides[tensor_name][0]
else:
# Some consumers don't want the overridden type.
self._update_converted_tensor(
tensor_name,
prod_type_info,
QuantTypeInfo(default_activation_qtype),
consumers_for_original_type,
)
else:
raise ValueError(f"TypeRequest for tensor {tensor_name} has no producer or consumers.")
# Done. Check if the overrides are valid.
valid, err = self.overrides.is_valid(self.initializers, set(self.value_infos), default_activation_qtype)
if not valid:
pprint_overrides = self.overrides.pprint_str(indent=4)
logging.error(
f"Generated invalid tensor quantization overrides for mixed-precision QDQ model:\n{pprint_overrides}"
)
raise ValueError(err)
def get_desired_tensor_types(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
) -> dict[str, TensorTypeRequest]:
"""
Iterates through the initial tensor quantization overrides and builds a set of TensorTypeRequest objects
that describe the quantization types required at each tensor. These TensorTypeRequest objects are ultimately
used to generate the "fixed" overrides.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
Returns:
TensorTypeRequest objects as a dict that maps a tensor name to its requested types.
"""
type_requests = {}
default_activation_type_info = QuantTypeInfo(default_activation_qtype, default_activation_symmetric)
# Scan tensor overrides for type conversion requests.
for tensor_name, override_list in self.overrides.items():
if not self.__is_tensor_quantizable(tensor_name):
continue # Skip non-quantizable tensors (e.g., not a float)
if tensor_name in self.initializers:
continue # Skip initializers
if not override_list or len(override_list) > 1:
continue # Skip per-channel stuff
override_dict = override_list[0]
quant_type_info = QuantTypeInfo.load_from_dict(override_dict, default_activation_type_info.quant_type)
producer_node = self.producers.get(tensor_name) # None if this is a model input
if quant_type_info != default_activation_type_info and "convert" not in override_dict:
if producer_node is not None:
self._add_type_requests_for_node(type_requests, quant_type_info, producer_node)
# Find all consumer nodes of `tensor_name` and update their inputs/outputs to the new type.
for consumer_node in self.consumers.get(tensor_name, []):
self._add_type_requests_for_node(type_requests, quant_type_info, consumer_node)
return type_requests
def _add_type_requests_for_node(
self,
type_requests: dict[str, TensorTypeRequest],
quant_type_info: QuantTypeInfo,
node: onnx.NodeProto,
):
"""
Adds TensorTypeRequest objects for a given node, assuming that we want all its inputs and outputs
to have the same quantization type (as specified by the `quant_type_info` parameter).
Params:
type_requests: Dictionary of type requests to append to for this node.
quant_type_info: The quantization type to use for inputs and outputs.
node: The node for which the TensorTypeRequest objects are created and added to type_requests.
"""
# Add output side
for output_name in node.output:
if not self.__is_tensor_quantizable(output_name):
continue
if output_name not in type_requests:
type_requests[output_name] = TensorTypeRequest(quant_type_info, None)
else:
if (
type_requests[output_name].producer is not None
and type_requests[output_name].producer != quant_type_info
):
raise ValueError(f"Tensor {output_name} has multiple types.")
type_requests[output_name].producer = quant_type_info
# Add the consumer side
for input_name in node.input:
if input_name and input_name not in self.initializers and self.__is_tensor_quantizable(input_name):
if input_name not in type_requests:
type_requests[input_name] = TensorTypeRequest(None, None)
if type_requests[input_name].consumers is None:
type_requests[input_name].consumers = (quant_type_info, set())
if type_requests[input_name].consumers[0] != quant_type_info:
raise ValueError(f"Tensor {input_name} has consumers requesting different types.")
if not node.name:
raise ValueError(
f"Node of type {node.op_type} with output 0 {node.output[0]} does not have a name!"
)
type_requests[input_name].consumers[1].add(node.name)
def _update_converted_tensor(
self,
tensor_name: str,
producer_type_info: QuantTypeInfo,
consumer_type_info: QuantTypeInfo,
consumer_names: set[str],
):
"""
Updates the tensor quantization overrides for a tensor that is converted from one type to another.
Params:
tensor_name: The name of the tensor for which to update overrides.
producer_type_info: Info for the tensor's produced type.
consumer_type_info: Info for the tensor's consumed (i.e., converted) type.
consumer_names: Nodes names of consumers that consume the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
self.overrides[tensor_name] = [{}]
producer_type_info.save_to_dict(self.overrides[tensor_name][0])
overrides = self.overrides[tensor_name][0]
if producer_type_info != QuantTypeInfo.load_from_dict(overrides):
raise ValueError(f"Desired producer quant_type for {tensor_name} doesn't match existing type.")
if consumer_names:
if "convert" not in overrides:
overrides["convert"] = {}
consumer_type_info.save_to_dict(overrides["convert"])
convert_dict = overrides["convert"]
if consumer_type_info != QuantTypeInfo.load_from_dict(convert_dict):
raise ValueError(f"Desired consumer quant_type for {tensor_name} doesn't match existing type.")
if "recv_nodes" not in convert_dict:
convert_dict["recv_nodes"] = set()
convert_dict["recv_nodes"].update(consumer_names)
def _check_nodes_are_not_convert_consumers(self, tensor_name: str, node_names: set[str]):
"""
Returns true if the given nodes do not consume/receive a converted quantization type.
Params:
tensor_name: The name of the tensor to check.
node_names: Set of node names that should not be consumers of the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
return True
overrides = self.overrides[tensor_name][0]
if "convert" not in overrides:
return True
convert_dict = overrides["convert"]
if "recv_nodes" not in convert_dict:
return False
return not convert_dict["recv_nodes"].intersection(node_names)
def __is_tensor_quantizable(self, tensor_name):
weight = self.initializers.get(tensor_name)
if weight is not None:
if weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
return True
elif tensor_name in self.value_infos:
vi = self.value_infos[tensor_name]
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
onnx.TensorProto.FLOAT,
onnx.TensorProto.FLOAT16,
):
return True
return False


@@ -0,0 +1,307 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from pathlib import Path
import onnx
from ...fusions import FusionGelu, FusionLayerNormalization
from ...onnx_model import ONNXModel
from .fusion_lpnorm import FusionLpNormalization
def qnn_preprocess_model(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
fuse_layernorm: bool = False,
save_as_external_data: bool = False,
all_tensors_to_one_file: bool = False,
external_data_location: str | None = None,
external_data_size_threshold: int = 1024,
external_data_convert_attribute: bool = False,
inputs_to_make_channel_last: list[str] | None = None,
outputs_to_make_channel_last: list[str] | None = None,
) -> bool:
"""
If necessary, this method creates a new "pre-processed" model in preparation for
quantization of a model to be used in QNN EP. Returns true if a new model was created.
This method performs the following operations:
- Fuse Erf sequence into a single Gelu node.
- Fuse ReduceL2 sequence into a single LpNormalization node (p == 2).
- (Optional) Fuse ReduceMean sequence into a single LayerNormalization node.
Args:
model_input: Path to the input model file or ModelProto.
model_output: Path to the output model file, which is only created if this method returns True.
fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes.
Defaults to False.
save_as_external_data: True if output model should be saved with external data. Defaults to false.
all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false.
If true, save all tensors to one external file specified by external_data_location.
If false, save each tensor to a file named with the tensor name.
external_data_location: Effective only if save_as_external_data is true. Defaults to None.
Specify the external file to which all tensors are saved. Path is relative
to the model path. If not specified, the model's name is used.
external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024.
Tensors with a data size >= external_data_size_threshold are converted to external data.
To convert every tensor with raw data to external data, set to 0.
external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false.
If true, convert all tensors to external data.
If false, convert only non-attribute tensors to external data.
inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example,
if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it.
Original:
input0 (N, C, D1, D2, ..., Dn) --> <Nodes>
Updated:
input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> <Nodes>
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example,
if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it.
Original:
<Nodes> --> output0 (N, C, D1, D2, ..., Dn)
Updated:
<Nodes> --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C)
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
"""
modified = False
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
onnx_model = ONNXModel(model)
# Fuse Erf sequence into a single Gelu
fusion_gelu = FusionGelu(onnx_model)
if fusion_gelu.apply():
modified = True
# Fuse ReduceL2 sequence into a single LpNormalization node with p == 2.
fusion_lpnorm = FusionLpNormalization(onnx_model)
if fusion_lpnorm.apply():
modified = True
# Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
if fuse_layernorm:
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
# Need opset >= 17 to use LayerNormalization.
if onnx_opset.version < 17:
logging.warning(
"Unable to fuse ReduceMean sequence into a LayerNormalization node. "
"ONNX model must use an opset >= 17 in order to use LayerNormalization, "
f"but found version {onnx_opset.version}. Please use onnx.version_converter to update your model."
)
else:
fusion_layernorm = FusionLayerNormalization(onnx_model)
if fusion_layernorm.apply():
modified = True
# Optionally, transpose inputs and/or outputs to make them "channel-last".
if inputs_to_make_channel_last or outputs_to_make_channel_last:
transpose_node_prefix = "Transpose_channel_"
transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1
update_io_to_channel_last(
onnx_model.model,
inputs_to_make_channel_last,
outputs_to_make_channel_last,
transpose_node_name_prefix=transpose_node_prefix,
transpose_node_name_start_suffix=transpose_node_suffix,
)
modified = True
# Make sure all nodes have a name.
unnamed_node_prefix = "qnn_preproc_node_"
available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1
for node in onnx_model.model.graph.node:
if node.op_type != "Constant" and not node.name:
new_node_name = f"{unnamed_node_prefix}{available_suffix!s}"
available_suffix += 1
node.name = new_node_name
modified = True
logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.")
if modified:
onnx_model.topological_sort()
onnx.save_model(
model,
model_output,
save_as_external_data=save_as_external_data,
all_tensors_to_one_file=all_tensors_to_one_file,
location=external_data_location,
size_threshold=external_data_size_threshold,
convert_attribute=external_data_convert_attribute,
)
return modified
class InputOutputNameMap:
def __init__(
self,
orig_tensor_names: set[str],
orig_graph_inputs: dict[str, onnx.ValueInfoProto],
orig_graph_outputs: dict[str, onnx.ValueInfoProto],
):
self.orig_tensor_names = orig_tensor_names
self.orig_graph_inputs = orig_graph_inputs
self.orig_graph_outputs = orig_graph_outputs
self.updated_io_names = {}
self.new_value_infos = []
def get_new_name(self, orig_name: str):
if orig_name in self.updated_io_names:
return self.updated_io_names[orig_name]
# Make a new tensor name that is unique among all tensors in the graph.
prefix: str = f"{orig_name}_channel_first_"
suffix: int = -1
for tensor_name in self.orig_tensor_names:
if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit():
index = int(tensor_name[len(prefix) :])
suffix = max(suffix, index)
suffix += 1 # This is the first available suffix.
new_name = f"{prefix}{suffix!s}"
# Add new value_info objects for these new tensors.
orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name]
value_info_proto = onnx.ValueInfoProto()
value_info_proto.CopyFrom(orig_value_info)
value_info_proto.name = new_name
self.new_value_infos.append(value_info_proto)
self.updated_io_names[orig_name] = new_name
return self.updated_io_names[orig_name]
def update_io_to_channel_last(
model: onnx.ModelProto,
inputs_to_update: list[str] | None,
outputs_to_update: list[str] | None,
transpose_node_name_prefix: str = "Transpose_channel_",
transpose_node_name_start_suffix: int = 0,
):
inputs_to_update = set(inputs_to_update or [])
outputs_to_update = set(outputs_to_update or [])
if not inputs_to_update and not outputs_to_update:
return
graph = model.graph
orig_graph_inputs = {ginput.name: ginput for ginput in graph.input}
orig_graph_outputs = {goutput.name: goutput for goutput in graph.output}
# Check that the user passed in actual input and output names.
for input_name in inputs_to_update:
if input_name not in orig_graph_inputs:
raise ValueError(f"{input_name} is not a graph input")
for output_name in outputs_to_update:
if output_name not in orig_graph_outputs:
raise ValueError(f"{output_name} is not a graph output")
orig_tensor_names = set()
orig_tensor_names.update(set(orig_graph_inputs))
orig_tensor_names.update(set(orig_graph_outputs))
orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
# Maps original input (or output) name to its updated name used within the graph.
io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs)
# Update each node's inputs/outputs to use the transposed versions.
for node in graph.node:
for i in range(len(node.input)):
if node.input[i] and node.input[i] in inputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
elif node.input[i] and node.input[i] in outputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
for i in range(len(node.output)):
if node.output[i] in outputs_to_update:
node.output[i] = io_map.get_new_name(node.output[i])
# Update graph inputs to channel-last and a Transpose (to channel-first) after each.
for g_input_name in inputs_to_update:
g_input = orig_graph_inputs[g_input_name]
if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape")
input_shape = g_input.type.tensor_type.shape
input_rank = len(input_shape.dim)
if input_rank < 3:
raise ValueError(f"Expected input {g_input.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(input_shape.dim[1])
for i in range(1, input_rank - 1):
input_shape.dim[i].CopyFrom(input_shape.dim[i + 1])
input_shape.dim[input_rank - 1].CopyFrom(channel_dim)
transpose_perm = list(range(input_rank))
for i in range(input_rank):
transpose_perm[i] = i if i < 1 else i - 1
transpose_perm[1] = input_rank - 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[g_input.name],
outputs=[io_map.get_new_name(g_input.name)],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
# Update graph outputs to channel-last and a Transpose (from channel-first) before each.
for g_output_name in outputs_to_update:
g_output = orig_graph_outputs[g_output_name]
if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape")
output_shape = g_output.type.tensor_type.shape
output_rank = len(output_shape.dim)
if output_rank < 3:
raise ValueError(f"Expected output {g_output.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(output_shape.dim[1])
for i in range(1, output_rank - 1):
output_shape.dim[i].CopyFrom(output_shape.dim[i + 1])
output_shape.dim[output_rank - 1].CopyFrom(channel_dim)
transpose_perm = list(range(output_rank))
for i in range(output_rank):
transpose_perm[i] = i if i == 0 else i + 1
transpose_perm[output_rank - 1] = 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[io_map.get_new_name(g_output.name)],
outputs=[g_output.name],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
graph.value_info.extend(io_map.new_value_infos)
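
Putting the two QNN helpers together, a hedged end-to-end sketch; paths are placeholders, the one-batch reader is a stand-in for a real CalibrationDataReader, and the qnn package path is assumed from this commit's layout:

```
import numpy as np

from onnxruntime.quantization import CalibrationDataReader, QuantType, quantize
from onnxruntime.quantization.execution_providers.qnn import (  # assumed package path
    get_qnn_qdq_config,
    qnn_preprocess_model,
)


class OneBatchReader(CalibrationDataReader):
    """Stand-in reader: a real one yields representative calibration batches."""

    def __init__(self):
        self._data = iter([{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}])

    def get_next(self):
        return next(self._data, None)


fp32_path, prep_path, qdq_path = "model_fp32.onnx", "model_prep.onnx", "model_qdq.onnx"  # placeholders

# Fuse Gelu / LpNormalization (and optionally LayerNormalization) patterns before quantizing.
preprocessed = qnn_preprocess_model(fp32_path, prep_path, fuse_layernorm=True)
model_to_quantize = prep_path if preprocessed else fp32_path

qnn_config = get_qnn_qdq_config(
    model_to_quantize,
    OneBatchReader(),
    activation_type=QuantType.QUInt16,
    weight_type=QuantType.QUInt8,
)
quantize(model_to_quantize, qdq_path, qnn_config)
```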


@@ -0,0 +1,387 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import copy
import logging
from pathlib import Path
from typing import Any
import numpy as np
import onnx
from ...calibrate import CalibrationDataReader, CalibrationMethod
from ...quant_utils import QuantType
from ...quantize import StaticQuantConfig
from ...tensor_quant_overrides import TensorQuantOverridesHelper
from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
OP_TYPES_TO_EXCLUDE = {"Cast"}
MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB
def warn_unable_to_override(
node: onnx.NodeProto,
what_str: str,
tensor_name: str,
io_kind: str,
):
logging.warning(
f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
"because it has already been overridden! Check the initial quantization overrides provided "
"to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
f"Node name: {node.name}, {io_kind} name: {tensor_name}"
)
def get_qnn_qdq_config(
model_input: str | Path | onnx.ModelProto,
calibration_data_reader: CalibrationDataReader,
calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
activation_type: QuantType = QuantType.QUInt8,
weight_type: QuantType = QuantType.QUInt8,
per_channel: bool = False,
init_overrides: dict[str, list[dict[str, Any]]] | None = None,
add_qtype_converts: bool = True,
activation_symmetric: bool = False,
weight_symmetric: bool | None = None,
keep_removable_activations: bool = False,
stride: int | None = None,
) -> StaticQuantConfig:
"""
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
This is done primarily by setting tensor-level quantization overrides.
Params:
model_input: Path to the input model file or ModelProto.
calibration_data_reader: Calibration data reader.
calibrate_method: The calibration method. Defaults to MinMax.
activation_type: The default activation quantization type. Defaults to QUInt8.
weight_type: The default weight quantization type. Defaults to QUInt8.
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
and their quantization axes.
If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
- Conv:
- input[1] on axis 0
- input[2] (bias) on axis 0
- ConvTranspose:
- input[1] on axis 1
- input[2] (bias) on axis 0
init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates a copy
of these overrides with any necessary adjustments and includes them in the returned
configuration object (i.e., config.extra_options['TensorQuantOverrides']).
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
key must be present in the first dictionary for per-channel quantization.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` are also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` are also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'convert' = Dict : A nested dictionary with the same keys for an activation
tensor that should be converted to another quantization type.
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
other nodes get the original type. If not specified,
assume all consumer nodes get the converted type.
add_qtype_converts: True if this function should automatically add "convert" entries to the provided
`init_overrides` to ensure that operators use valid input/output types (activations only).
Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
the zero-point values are 128 and 32,768, respectively.
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
be removed, and will be explicitly represented in the QDQ model. If false, these activations
are automatically removed if activations are asymmetrically quantized. Keeping these activations
is necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
Returns:
A StaticQuantConfig object
"""
if weight_symmetric is None:
weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
model = (
model_input
if isinstance(model_input, onnx.ModelProto)
else onnx.load_model(model_input, load_external_data=False)
)
op_types = set()
model_has_external_data = False
name_to_initializer = {}
# Build map of initializers (name -> initializer) and
# check if the model has external data.
for initializer in model.graph.initializer:
name_to_initializer[initializer.name] = initializer
if onnx.external_data_helper.uses_external_data(initializer):
model_has_external_data = True
overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
if not overrides_helper.empty() and add_qtype_converts:
# Fix mixed-precision overrides.
overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
overrides_helper, model, activation_type
)
overrides_fixer.apply(activation_type, activation_symmetric)
# Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
qnn_compat = QnnCompatibilityOverrides(
activation_type,
weight_type,
activation_symmetric,
weight_symmetric,
per_channel,
overrides_helper,
name_to_initializer,
)
for node in model.graph.node:
op_types.add(node.op_type)
qnn_compat.process_node(node)
extra_options = {
"MinimumRealRange": 0.0001,
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
"QDQKeepRemovableActivations": keep_removable_activations,
"TensorQuantOverrides": overrides_helper.get_dict(),
"ActivationSymmetric": activation_symmetric,
"WeightSymmetric": weight_symmetric,
"CalibStridedMinMax": stride,
}
# ONNX opset < 21 does not support 16-bit or 4-bit quantization types, so the 'com.microsoft' domain
# must be used on Q/DQ operators if 16-bit or 4-bit quantization is used.
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
if onnx_opset.version < 21:
opset21_types = Q16_TYPES.union(Q4_TYPES)
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
extra_options["UseQDQContribOps"] = True
return StaticQuantConfig(
calibration_data_reader,
calibrate_method=calibrate_method,
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=list(op_types.difference(OP_TYPES_TO_EXCLUDE)),
per_channel=per_channel,
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
extra_options=extra_options,
)
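# Example (hypothetical usage sketch): "MyDataReader" and the file names below are placeholders,
# not part of this module. The returned StaticQuantConfig is passed to the regular quantize() API.
#
#   from onnxruntime.quantization import QuantType, quantize
#
#   qnn_config = get_qnn_qdq_config(
#       "model.onnx",
#       MyDataReader("model.onnx"),
#       activation_type=QuantType.QUInt16,
#       weight_type=QuantType.QUInt8,
#   )
#   quantize("model.onnx", "model.qdq.onnx", qnn_config)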
class QnnCompatibilityOverrides:
"""
Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
compatible with QNN EP.
"""
def __init__(
self,
default_activation_qtype: QuantType,
default_weight_qtype: QuantType,
activation_symmetric: bool,
weight_symmetric: bool,
per_channel: bool,
overrides: TensorQuantOverridesHelper,
initializers: dict[str, onnx.TensorProto],
):
self.default_activation_qtype = default_activation_qtype
self.default_weight_qtype = default_weight_qtype
self.activation_symmetric = activation_symmetric
self.weight_symmetric = weight_symmetric
self.per_channel = per_channel
self.overrides = overrides
self.initializers = initializers
self.process_fns = {
"MatMul": self._process_matmul,
"LayerNormalization": self._process_layernorm,
"Sigmoid": self._process_sigmoid,
"Tanh": self._process_tanh,
}
def process_node(self, node: onnx.NodeProto):
process_fn = self.process_fns.get(node.op_type)
if process_fn is not None:
process_fn(node)
def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
"""
Overrides initializer input(s) to use the default weight type if:
- The default weight type is 8-bit
- One of the inputs is a 16-bit activation
- The other input is an initializer (per-tensor quantized)
This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
inputs the default weight type. Instead, it assigns the default activation type.
"""
if self.default_weight_qtype not in Q8_TYPES:
return
input_16bit_act_name = None
input_weight_name = None
# Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
for i in range(2):
input_name = node.input[i]
if not input_name:
continue
is_weight = input_name in self.initializers
qtype_info = self.overrides.get_node_input_qtype_info(
input_name,
node.name,
default_qtype=None if is_weight else self.default_activation_qtype,
)
if qtype_info.axis is not None:
return # Don't process MatMul with a per-channel quantized input.
if (
is_weight
and qtype_info.quant_type == self.default_weight_qtype
and qtype_info.symmetric == self.weight_symmetric
):
return # Return. Weight is already overridden to use the desired weight type.
if is_weight:
input_weight_name = input_name
elif qtype_info.quant_type in Q16_TYPES:
input_16bit_act_name = input_name
# Override initializer input to use the default weight type.
if input_16bit_act_name and input_weight_name:
did_update = self.overrides.update_tensor_overrides(
input_weight_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
overwrite=False,
)
if not did_update:
warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
def _process_matmul(self, node: onnx.NodeProto):
assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
# QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
# quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
# provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
# the user did not provide any other overrides.
for input_name in node.input:
is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
if is_weight_no_overrides:
self.overrides.update_tensor_overrides(
input_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
)
def _process_layernorm(self, node: onnx.NodeProto):
assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
has_bias_no_overrides = (
len(node.input) > 2
and node.input[2]
and node.input[2] in self.initializers
and node.input[2] not in self.overrides
)
if has_weight_no_overrides or has_bias_no_overrides:
# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
# tries to make it per-channel if the weight is also per-channel.
raise ValueError(
"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
" Please try using custom overrides that make bias per-tensor quantized."
)
def _process_sigmoid(self, node: onnx.NodeProto):
"""
Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 65536.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
def _process_tanh(self, node: onnx.NodeProto):
"""
Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(32768, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
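# Note on the fixed 16-bit overrides above (dequantized value = scale * (q - zero_point)):
#   Sigmoid, QUInt16: scale = 1/65536, zero_point = 0     -> range [0, 65535/65536] ~ [0, 1)
#   Tanh,    QUInt16: scale = 1/32768, zero_point = 32768 -> range [-1, 32767/32768] ~ [-1, 1)
#   Both,    QInt16:  scale = 1/32768, zero_point = 0     -> range [-1, 32767/32768] ~ [-1, 1)
# i.e., the quantization grid exactly covers each op's output range, which is what QNN expects
# for 16-bit Sigmoid/Tanh outputs.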

View File

@ -0,0 +1,3 @@
from .fusion import Fusion # noqa: F401
from .fusion_gelu import FusionGelu # noqa: F401
from .fusion_layernorm import FusionLayerNormalization # noqa: F401

View File

@ -0,0 +1,311 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
from collections import deque
import onnx
from ..onnx_model import ONNXModel
class Fusion:
"""
Base class for fusions.
"""
def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str):
self.search_op_type: str = search_op_type
self.fused_op_type: str = fused_op_type
self.model: ONNXModel = model
self.nodes_to_remove: list = []
self.nodes_to_add: list = []
self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_"
self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops.
def fuse(
self,
node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function for derived fusion classes. Tries to fuse a node sequence containing
the specified node.
"""
raise NotImplementedError
def apply(self) -> bool:
"""
Apply graph fusion on the entire model graph.
"""
input_name_to_nodes = self.model.input_name_to_nodes()
output_name_to_node = self.model.output_name_to_node()
for node in self.model.nodes():
if node.op_type == self.search_op_type:
self.fuse(node, input_name_to_nodes, output_name_to_node)
self.model.remove_nodes(self.nodes_to_remove)
self.model.add_nodes(self.nodes_to_add)
graph_updated = bool(self.nodes_to_remove or self.nodes_to_add)
if graph_updated:
self.model.remove_unused_constant()
return graph_updated
def create_unique_node_name(self):
prefix = self._new_node_name_prefix
if self._new_node_name_suffix is None:
largest_suffix: int = self.model.get_largest_node_name_suffix(prefix)
self._new_node_name_suffix = largest_suffix + 1
new_name = f"{prefix}{self._new_node_name_suffix!s}"
self._new_node_name_suffix += 1
return new_name
@staticmethod
def is_safe_to_fuse_nodes(
nodes_to_remove: list[onnx.NodeProto],
keep_outputs: list[str],
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
for node_to_remove in nodes_to_remove:
for output_to_remove in node_to_remove.output:
if output_to_remove in keep_outputs:
continue
if output_to_remove in input_name_to_nodes:
for impacted_node in input_name_to_nodes[output_to_remove]:
if impacted_node not in nodes_to_remove:
# Not safe to remove nodes since output is used by impacted_node
return False
return True
@staticmethod
def get_node_attribute(node: onnx.NodeProto, attribute_name: str):
for attr in node.attribute:
if attr.name == attribute_name:
value = onnx.helper.get_attribute_value(attr)
return value
return None
@staticmethod
def input_index(node_output: str, child_node: onnx.NodeProto) -> int:
for index, input_name in enumerate(child_node.input):
if input_name == node_output:
return index
return -1
@staticmethod
def tensor_shape_to_list(tensor_type) -> list[int]:
shape_list = []
for d in tensor_type.shape.dim:
if d.HasField("dim_value"):
shape_list.append(d.dim_value) # known dimension
elif d.HasField("dim_param"):
shape_list.append(d.dim_param) # unknown dimension with symbolic name
else:
shape_list.append("?") # shall not happen
return shape_list
def get_constant_input(self, node: onnx.NodeProto):
for i, inp in enumerate(node.input):
value = self.model.get_constant_value(inp)
if value is not None:
return i, value
return None, None
def find_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> int:
i, value = self.get_constant_input(node)
if value is not None and value.size == 1 and abs(value - expected_value) < delta:
return i
return -1
def has_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> bool:
return self.find_constant_input(node, expected_value, delta) >= 0
def is_constant_with_specified_rank(self, output_name: str, rank: int) -> bool:
value = self.model.get_constant_value(output_name)
if value is None:
return False # Not an initializer
if len(value.shape) != rank:
return False # Wrong dimensions
return True
def match_first_parent(
self,
node: onnx.NodeProto,
parent_op_type: str,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
exclude: list[onnx.NodeProto] = [], # noqa: B006
) -> tuple[onnx.NodeProto | None, int | None]:
"""
Find parent node based on constraints on op_type.
Args:
node: current node.
parent_op_type (str): constraint of parent node op_type.
output_name_to_node (dict): dictionary with output name as key, and node as value.
exclude (list): list of nodes that are excluded (not allowed to match as parent).
Returns:
parent: The matched parent node. None if not found.
index: The input index of matched parent node. None if not found.
"""
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
for i, inp in enumerate(node.input):
if inp in output_name_to_node:
parent = output_name_to_node[inp]
if parent.op_type == parent_op_type and parent not in exclude:
return parent, i
return None, None
def match_parent(
self,
node: onnx.NodeProto,
parent_op_type: str,
input_index: int | None = None,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
exclude: list[onnx.NodeProto] = [], # noqa: B006
return_indice: list[int] | None = None,
) -> onnx.NodeProto | None:
"""
Find parent node based on constraints on op_type and index.
When input_index is None, we will find the first parent node based on constraints,
and the corresponding input index will be appended to return_indice.
Args:
node (onnx.NodeProto): current node.
parent_op_type (str): constraint of parent node op_type.
input_index (int or None): only check the parent given input index of current node.
output_name_to_node (dict): dictionary with output name as key, and node as value.
exclude (list): list of nodes that are excluded (not allowed to match as parent).
return_indice (list): a list to append the input index when input_index is None.
Returns:
parent: The matched parent node.
"""
assert node is not None
assert input_index is None or input_index >= 0
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
if input_index is None:
parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node, exclude)
if return_indice is not None:
return_indice.append(index)
return parent
if input_index >= len(node.input):
# Input index out of bounds.
return None
parent = self.model.get_parent(node, input_index, output_name_to_node)
if parent is not None and parent.op_type == parent_op_type and parent not in exclude:
return parent
return None
def match_parent_path(
self,
node: onnx.NodeProto,
parent_op_types: list[str],
parent_input_index: list[int] | None = None,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
return_indice: list[int] | None = None,
) -> list[onnx.NodeProto] | None:
"""
Find a sequence of input edges based on constraints on parent op_type and index.
When input_index is None, we will find the first parent node based on constraints,
and the corresponding input index will be appended to return_indice.
Args:
node (onnx.NodeProto): current node.
parent_op_types (list): constraint on the parent node op_type of each input edge.
parent_input_index (list): constraint on the input index of each input edge. None means no constraint.
output_name_to_node (dict): dictionary with output name as key, and node as value.
return_indice (list): a list to which the matched input index is appended when there is no
constraint on the input index of an edge.
Returns:
parents: a list of matched parent nodes. None if no match is found.
"""
if parent_input_index is not None:
assert len(parent_input_index) == len(parent_op_types)
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
current_node = node
matched_parents = []
for i, op_type in enumerate(parent_op_types):
matched_parent = self.match_parent(
current_node,
op_type,
parent_input_index[i] if parent_input_index is not None else None,
output_name_to_node,
exclude=[],
return_indice=return_indice,
)
if matched_parent is None:
return None
matched_parents.append(matched_parent)
current_node = matched_parent
return matched_parents
def match_parent_paths(
self,
node: onnx.NodeProto,
paths: list[tuple[list[str], list[int]]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> tuple[int, list[onnx.NodeProto] | None, list[int] | None]:
"""
Find a matching parent path to the given node.
"""
for i, path in enumerate(paths):
return_indice = []
matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice)
if matched:
return i, matched, return_indice
return -1, None, None
def find_first_child_by_type(
self,
node: onnx.NodeProto,
child_type: str,
input_name_to_nodes: dict[str, list[onnx.NodeProto]] | None = None,
recursive: bool = True,
) -> onnx.NodeProto | None:
children = self.model.get_children(node, input_name_to_nodes)
dq = deque(children)
while len(dq) > 0:
current_node = dq.pop()
if current_node.op_type == child_type:
return current_node
if recursive:
children = self.model.get_children(current_node, input_name_to_nodes)
for child in children:
dq.appendleft(child)
return None
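# Example (illustrative sketch; variable names are hypothetical): matching the parent chain
# Sqrt <- Add <- ReduceMean <- Pow <- Sub feeding input 1 of a Div node, as the LayerNormalization
# fusion does:
#
#   parents = fusion.match_parent_path(
#       div_node,
#       ["Sqrt", "Add", "ReduceMean", "Pow", "Sub"],
#       [1, 0, 0, 0, 0],
#   )
#   # `parents` is [sqrt, add, reduce_mean, pow, sub] on success, or None if any edge fails to match.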

View File

@ -0,0 +1,272 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class FusionGelu(Fusion):
def __init__(self, model: ONNXModel):
super().__init__(model, "Gelu", "Erf")
def fuse(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing an Erf node into a single
Gelu node.
"""
if (
self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node)
or self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node)
or self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node)
):
self.model.set_opset_import("com.microsoft", 1)
def fuse_1(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a PyTorch model.
Fuse Gelu with Erf into one node:
Pattern 1:
+-------Mul(0.5)---------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->
(B=1.4142...) (1)
Pattern 2:
+------------------------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->Mul -->
(B=1.4142...) (1) (0.5)
Note that the constant input for Add and Mul could be the first or second input: either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_after_erf = children[0]
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return False
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
return False
subgraph_input = div.input[0]
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
if subgraph_input == mul_after_erf.input[another]: # pattern 2
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_half = children[0]
if not self.has_constant_input(mul_half, 0.5):
return False
subgraph_output = mul_half.output[0]
else: # pattern 1
mul_half = self.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
if mul_half is None:
return False
if not self.has_constant_input(mul_half, 0.5):
return False
if subgraph_input not in mul_half.input:
return False
subgraph_output = mul_after_erf.output[0]
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
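# The patterns above are instances of Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))): the Div constant
# 1.4142... is sqrt(2), the Add constant is 1, and the factor 0.5 is applied either before the final
# multiplication by x (pattern 1) or after it (pattern 2). The fused com.microsoft Gelu node computes
# the same expression in a single op.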
def fuse_2(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a Keras model.
Fuse Gelu with Erf into one node:
+------------------------------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->Mul
(B=1.4142...) (A=1) (A=0.5)
Note that the constant input for Add and Mul could be the first or second input: either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_after_erf = children[0]
if not self.has_constant_input(mul_after_erf, 0.5):
return False
if mul_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul = children[0]
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return False
sqrt_node = None
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
sqrt_node = self.match_parent(div, "Sqrt", 1, output_name_to_node)
if sqrt_node is None:
return False
if not self.has_constant_input(sqrt_node, 2.0):
return False
subgraph_input = div.input[0]
if subgraph_input not in mul.input:
return False
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
if sqrt_node:
subgraph_nodes.append(sqrt_node)
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
def fuse_3(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a TensorFlow model.
Fuse Gelu with Erf into one node:
+----------------------------------------------+
| |
| v
[root] --> Mul -----> Erf --> Add --> Mul -->Mul
(A=0.7071067690849304) (B=1) (B=0.5)
Note that the constant input for Add and Mul could be the first or second input: either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_half = children[0]
if not self.has_constant_input(mul_half, 0.5):
return False
first_mul = self.match_parent(erf_node, "Mul", 0, output_name_to_node)
if first_mul is None:
return False
i = self.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)
if i < 0:
return False
root_input_index = 1 - i
subgraph_input = first_mul.input[root_input_index]
if mul_half.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[mul_half.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
last_mul = children[0]
if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input):
return False
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
if not self.is_safe_to_fuse_nodes(
subgraph_nodes,
[last_mul.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True

View File

@ -0,0 +1,135 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class FusionLayerNormalization(Fusion):
def __init__(self, model: ONNXModel):
super().__init__(model, "LayerNormalization", "ReduceMean")
def fuse(
self,
reduce_mean_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceMean node into a single
LayerNormalization node.
+----------------------+
| |
| v
[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
(axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^
| |
+-------------------------------------------------+
It also handles cases of duplicated Sub nodes exported from older versions of PyTorch:
+----------------------+
| v
| +-------> Sub-----------------------------------------------+
| | |
| | v
[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
| ^
| |
+----------------------+
"""
children = self.model.get_children(reduce_mean_node, input_name_to_nodes)
if len(children) == 0 or len(children) > 2:
return
root_input = reduce_mean_node.input[0]
if children[0].op_type != "Sub" or children[0].input[0] != root_input:
return
if len(children) == 2:
if children[1].op_type != "Sub" or children[1].input[0] != root_input:
return
div_node = None
for child in children:
div_node = self.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
if div_node is not None:
break
if div_node is None:
return
path_id, parent_nodes, _ = self.match_parent_paths(
div_node,
[
(["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
(
["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
[1, 0, 0, 0, 0, 0],
),
],
output_name_to_node,
)
if path_id < 0:
return
sub_node = parent_nodes[-1]
if sub_node not in children:
return
second_add_node = parent_nodes[1]
i, add_weight = self.get_constant_input(second_add_node)
if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
# Skip fusion since epsilon value is not expected.
return
pow_node = parent_nodes[3]
if self.find_constant_input(pow_node, 2.0) != 1:
return
mul_node = input_name_to_nodes[div_node.output[0]][0]
if mul_node.op_type != "Mul":
return
last_add_node = input_name_to_nodes[mul_node.output[0]][0]
if last_add_node.op_type != "Add":
return
subgraph_nodes = [reduce_mean_node]
subgraph_nodes.extend(children)
subgraph_nodes.extend(parent_nodes[:-1])
subgraph_nodes.extend([last_add_node, mul_node, div_node])
if not self.is_safe_to_fuse_nodes(
subgraph_nodes,
last_add_node.output,
input_name_to_nodes,
output_name_to_node,
):
return
weight_input = mul_node.input[1 - self.input_index(div_node.output[0], mul_node)]
if not self.is_constant_with_specified_rank(weight_input, 1):
return
bias_input = last_add_node.input[1 - self.input_index(mul_node.output[0], last_add_node)]
if not self.is_constant_with_specified_rank(bias_input, 1):
return
self.nodes_to_remove.extend(subgraph_nodes)
normalize_node = onnx.helper.make_node(
"LayerNormalization",
name=self.create_unique_node_name(),
inputs=[reduce_mean_node.input[0], weight_input, bias_input],
outputs=[last_add_node.output[0]],
)
normalize_node.attribute.extend([onnx.helper.make_attribute("epsilon", float(add_weight))])
self.nodes_to_add.append(normalize_node)
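# The matched subgraph evaluates
#   y = (x - mean(x)) / sqrt(mean((x - mean(x))^2) + epsilon) * weight + bias,
# which is exactly LayerNormalization. The Add constant becomes the fused node's "epsilon"
# attribute, and the Mul/Add constants become the weight and bias inputs.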

View File

@ -0,0 +1,857 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import argparse
import copy
import importlib
import logging
import os
import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from packaging import version
from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits
from .calibrate import CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import QuantFormat, attribute_to_kwarg
logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
class WeightOnlyQuantConfig:
def __init__(self, algorithm, quant_format):
"""This is the Base class for Weight Only Quant Configuration.
Args:
algorithm:
weight only quantize algorithm name.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
"""
self.algorithm = algorithm
self.quant_format = quant_format
class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
ratios=None,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
RTN is the most straightforward way to quantize weights using scale maps.
Args:
ratios:
percentile of clip. Defaults to {}.
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format"
if ratios is None:
ratios = {}
super().__init__(
algorithm="RTN",
quant_format=quant_format,
)
self.ratios = ratios
class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
calibration_data_reader: CalibrationDataReader,
percdamp=0.01,
block_size=128,
actorder=False,
mse=False,
perchannel=True,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for GPTQ algorithm Weight Only Quant Configuration.
GPTQ algorithm provides more accurate quantization but requires more computational resources.
Args:
calibration_data_reader:
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
percdamp:
percent of the average Hessian diagonal to use for dampening.
block_size (int, optional):
channel number in one block to execute a GPTQ quantization iteration.
actorder (bool, optional):
whether to rearrange the Hessian matrix considering the diagonal's values.
mse (bool, optional):
whether to compute scale and zero point with MSE error.
perchannel (bool, optional):
whether to quantize weights per-channel.
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format"
super().__init__(
algorithm="GPTQ",
quant_format=quant_format,
)
self.calibration_data_reader = calibration_data_reader
self.percdamp = percdamp
self.block_size = block_size
self.actorder = actorder
self.mse = mse
self.perchannel = perchannel
class HQQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
block_size=128,
bits=4,
axis=1,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for HQQ algorithm Weight Only Quant Configuration.
The HQQ algorithm quantizes weights without needing calibration data.
Args:
block_size (int, optional):
channel number in one block to execute a HQQ quantization iteration.
bits (int, optional):
how many bits to represent weight.
axis (int, optional):
0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format"
super().__init__(
algorithm="HQQ",
quant_format=quant_format,
)
self.block_size = block_size
self.bits = bits
self.axis = axis
class DefaultWeightOnlyQuantConfig(WeightOnlyQuantConfig):
def __init__(
self,
block_size: int = 128,
is_symmetric: bool = False,
accuracy_level: int | None = None,
quant_format=QuantFormat.QOperator,
):
"""
This is a class for weight only affine quantization configuration.
Args:
block_size (int, optional):
channel number in one block to execute an affine quantization iteration.
is_symmetric (bool, optional):
whether quantize weight symmetrically.
accuracy_level (int, optional):
Accuracy level of the 4-bit quantized MatMul computation.
Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
quant_format (QuantFormat{QOperator, QDQ}, optional):
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DequantizeLinear on the tensor.
Defaults to QuantFormat.QOperator.
"""
super().__init__(algorithm="DEFAULT", quant_format=quant_format)
self.block_size = block_size
self.is_symmetric = is_symmetric
self.bits = 4
self.accuracy_level = accuracy_level
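# Example (hypothetical sketch): constructing an algorithm configuration for MatMul4BitsQuantizer
# below. Only the DEFAULT and HQQ configs are handled in-process; RTN and GPTQ delegate to
# Intel(R) Neural Compressor.
#
#   default_cfg = DefaultWeightOnlyQuantConfig(block_size=32, is_symmetric=True)
#   hqq_cfg = HQQWeightOnlyQuantConfig(block_size=64, bits=4)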
def is_divisible(val1, val2):
return int(val2 * np.ceil(val1 / val2)) == val1
class HQQWeightOnlyQuantizer:
def __init__(
self,
config: HQQWeightOnlyQuantConfig,
):
self.config = config
# Proximal solver || weight - dequantize(quantize(weight))||_p^p
@staticmethod
def optimize_weights(
tensor,
scale,
zero,
min_max: list[int],
axis: int = 0,
opt_params: dict = None, # noqa: RUF013
verbose=False,
):
import torch
opt_params = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20} if opt_params is None else opt_params
lp_norm, beta, kappa, iters = (
opt_params["lp_norm"],
opt_params["beta"],
opt_params["kappa"],
opt_params["iters"],
)
dtype = torch.float16 if tensor.is_cuda else torch.float32
w_f = tensor.to(dtype)
scale = scale.to(dtype)
zero = zero.to(dtype)
if lp_norm == 1:
def shrink_op(x, beta):
return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta)
else:
def shrink_op(x, beta, p=lp_norm):
return torch.sign(x) * torch.nn.functional.relu(
torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x) + 1e-8, p - 1)
)
best_error = 1e4
for i in range(iters):
w_q = torch.round(w_f * scale + zero).clamp(min_max[0], min_max[1])
w_r = (w_q - zero) / scale
w_e = shrink_op(w_f - w_r, beta)
zero = torch.mean(w_q - (w_f - w_e) * scale, axis=axis, keepdim=True)
beta *= kappa
current_error = float(torch.abs(w_f - w_r).mean())
if verbose:
print(i, np.round(current_error, 6))
if current_error < best_error:
best_error = current_error
else:
break
del w_f, w_q, w_r, w_e
return scale, zero
@staticmethod
def pack_on_row_fast_248bit(pack_tensor, ori_int_tensor, bits):
if pack_tensor.shape[0] == ori_int_tensor.shape[0]:
ori_int_tensor = ori_int_tensor.T
pack_tensor = pack_tensor.T
if bits in [2, 4, 8]:
compress_ratio = pack_tensor.element_size() * 8 // bits
for j in range(compress_ratio):
pack_tensor[0:] |= ori_int_tensor[j::compress_ratio] << (bits * (j))
else:
raise NotImplementedError("Only 2,4,8 bits are supported.")
# from Official implementation of Half-Quadratic Quantization (HQQ)
def quantize_internal(
self, tensor, bits=4, channel_wise=True, group_size=64, optimize=True, round_zero=True, axis=1
):
import torch
weight = tensor.float()
ori_shape = weight.shape
pad_len = (group_size - ori_shape[axis] % group_size) % group_size
if axis == 1:
weight = torch.nn.functional.pad(weight, (0, pad_len), "constant", 0)
else:
weight = torch.nn.functional.pad(weight, (0, 0, 0, pad_len), "constant", 0)
shape = weight.shape
# Reshape for grouping
if (group_size is not None) and channel_wise:
weight = weight.reshape([-1, group_size]) if (axis == 1) else weight.reshape([group_size, -1])
# Get min/max values
if channel_wise is False:
_min, _max = weight.min(), weight.max()
optimize = False
else:
_min = weight.min(axis=axis, keepdim=True)[0]
_max = weight.max(axis=axis, keepdim=True)[0]
max_v = 2**bits - 1
min_v = 0
min_max = [min_v, max_v]
# Note: here we work with the inverse of the scale to avoid division and quantize instead via weight*scale + zero; the scale is inverted back later on.
# clamp to avoid half-precision problems
scale = (max_v / (_max - _min)).clamp(max=2e4)
#!!!!!!!!!!!!!!!
min_max_axis = _max - _min
if (min_max_axis == 0).sum().item() > 0:
min_max_axis[min_max_axis == 0] = max_v
scale = (max_v / min_max_axis).clamp(max=2e4)
zero = -_min * scale
if round_zero:
zero = torch.round(zero)
# Fine-tune weights
if optimize:
scale, zero = self.optimize_weights(tensor=weight, scale=scale, zero=zero, min_max=min_max, axis=axis)
# Quantize
# Necessary for fake quantization backprop
w_q = torch.round(weight * scale + zero).clamp(min_max[0], min_max[1])
w_q = w_q.reshape(shape).int()
scale = 1.0 / scale
if axis == 1:
scale = scale.reshape(shape[0], -1)
zero = zero.reshape(shape[0], -1)
else:
scale = scale.reshape(-1, shape[-1])
zero = zero.reshape(-1, shape[-1])
# cleanup
del weight, _min, _max
return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype)
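# Note: quantize_internal() quantizes with the *inverse* scale (q = round(w * scale + zero)) and
# inverts it just before returning, so dequantization with the returned values is
# w ~= (q - zero) * scale.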
def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
"""
If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
"""
if node.op_type != "MatMul":
return [node] # only care about MatMul for now
import torch
logger.info(f"start to quantize {node.name} ...")
input_b = node.input[1]
b_pb, bs_graph = get_initializer(input_b, graph_stack)
if b_pb is None:
logger.info("MatMul doesn't have const weight. Skip to quantize")
return [node] # only care about constant weight
b_array = onnx.numpy_helper.to_array(b_pb)
if len(b_array.shape) != 2:
logger.info("MatMul weight is not 2D. Skip to quantize")
return [node] # can only process 2-D matrix
b_array_torch = torch.from_numpy(b_array)
if torch.cuda.is_available():
b_array_torch = b_array_torch.cuda()
quant_weight_torch, scales_torch, zero_points_torch = self.quantize_internal(
b_array_torch.T, bits=self.config.bits, group_size=self.config.block_size
)
quant_weight_torch = quant_weight_torch.contiguous()
scales_torch = scales_torch.contiguous()
zero_points_torch = zero_points_torch.contiguous()
packed_torch = torch.zeros(
(quant_weight_torch.shape[0], quant_weight_torch.shape[1] // 2),
dtype=torch.uint8,
device=quant_weight_torch.device,
)
self.pack_on_row_fast_248bit(packed_torch, quant_weight_torch, self.config.bits)
scales = scales_torch.cpu().numpy()
zero_points = zero_points_torch.cpu().numpy()
# reshape to the predefined shape expected by MatMulNBits
scales = scales.reshape(-1)
zero_points = zero_points.reshape(-1)
rows, cols = b_array_torch.shape
block_size = self.config.block_size
blob_size = block_size // 2
k_blocks = (rows + block_size - 1) // block_size
packed_torch = packed_torch.reshape(cols, k_blocks, blob_size)
b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy())
b_quant.name = b_pb.name + "_Q4"
for input in bs_graph.input:
if input.name == input_b:
bs_graph.input.remove(input)
break
scales_tensor = onnx.numpy_helper.from_array(scales)
scales_tensor.name = b_pb.name + "_scales"
bs_graph.initializer.extend([b_quant, scales_tensor])
input_names = [node.input[0], b_quant.name, scales_tensor.name]
zp_tensor = onnx.numpy_helper.from_array(zero_points)
zp_tensor.name = b_pb.name + "_zero_points"
bs_graph.initializer.extend([zp_tensor])
input_names.append(zp_tensor.name)
kwargs = {}
rows, cols = b_array.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["bits"] = self.config.bits
kwargs["block_size"] = self.config.block_size
matmul_q4_node = onnx.helper.make_node(
"MatMulNBits",
inputs=input_names,
outputs=[node.output[0]],
name=node.name + "_Q4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
logger.info(f"complete quantization of {node.name} ...")
return [matmul_q4_node]
def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
class DefaultWeightOnlyQuantizer:
def __init__(self, config: DefaultWeightOnlyQuantConfig):
self.config = config
def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""4b quantize fp32 weight to a blob"""
if len(fp32weight.shape) != 2:
raise ValueError("Current int4 block quantization only supports 2D tensors!")
rows, cols = fp32weight.shape
block_size = self.config.block_size
k_blocks = (rows + block_size - 1) // block_size
if self.config.quant_format == QuantFormat.QOperator:
blob_size = block_size // 2
padded_rows = k_blocks * block_size
pad_len = padded_rows - rows
if pad_len > 0:
fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")
# block wise quantization, each block comes from a single column
packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8")
scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
quantize_matmul_4bits(
packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
)
else:
packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype)
quantize_qdq_matmul_4bits(
packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
)
return (packed, scales, zero_point)
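# Shape sketch (assumed example, QOperator path): for a weight with rows = K = 4096,
# cols = N = 4096 and block_size = 32, k_blocks = 128 and blob_size = 16, so `packed` is a
# (4096, 128, 16) uint8 array, `scales` holds 4096 * 128 values, and `zero_point` packs two
# 4-bit zero points per byte into 4096 * 64 bytes.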
def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
"""
If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
"""
if node.op_type != "MatMul":
return [node] # only care about MatMul for now
logger.info(f"start to quantize {node.name} ...")
qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4
input_b = node.input[1]
b_tensor, b_graph = get_initializer(input_b, graph_stack)
if b_tensor is None:
logger.info("MatMul doesn't have const weight. Skip to quantize")
return [node] # only care about constant weight
b_ndarray = onnx.numpy_helper.to_array(b_tensor)
if len(b_ndarray.shape) != 2:
logger.info("MatMul weight is not 2D. Skip to quantize")
return [node] # can only process 2-D matrix
packed, scales, zero_points = self.int4_block_quant(b_ndarray)
if self.config.quant_format == QuantFormat.QOperator:
b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4")
scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales")
else:
b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True)
scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales")
for input in b_graph.input:
if input.name == input_b:
b_graph.input.remove(input)
break
b_graph.initializer.extend([b_quant, scales_tensor])
output_nodes = []
if self.config.quant_format == QuantFormat.QOperator:
input_names = [node.input[0], b_quant.name, scales_tensor.name]
if not self.config.is_symmetric:
zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points")
input_names.append(zp_tensor.name)
b_graph.initializer.extend([zp_tensor])
kwargs = {}
rows, cols = b_ndarray.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["bits"] = 4
kwargs["block_size"] = self.config.block_size
if self.config.accuracy_level is not None:
kwargs["accuracy_level"] = self.config.accuracy_level
matmul_q4_node = onnx.helper.make_node(
"MatMulNBits",
inputs=input_names,
outputs=[node.output[0]],
name=node.name + "_Q4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
output_nodes.append(matmul_q4_node)
else:
dq_input_names = [b_quant.name, scales_tensor.name]
dq_output_names = [b_quant.name + "_output"]
matmul_input_names = [node.input[0], dq_output_names[0]]
matmul_output_names = [node.output[0]]
if not self.config.is_symmetric:
zp_tensor = onnx.helper.make_tensor(
b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True
)
dq_input_names.append(zp_tensor.name)
b_graph.initializer.extend([zp_tensor])
dq_kwargs = {"axis": 0, "block_size": self.config.block_size}
dq_node = onnx.helper.make_node(
"DequantizeLinear",
inputs=dq_input_names,
outputs=dq_output_names,
name=node.name + "_DQ_Q4" if node.name else "",
**dq_kwargs,
)
matmul_node = onnx.helper.make_node(
"MatMul",
inputs=matmul_input_names,
outputs=matmul_output_names,
name=node.name + "_matmul_Q4" if node.name else "",
)
output_nodes.extend([dq_node, matmul_node])
logger.info(f"complete quantization of {node.name} ...")
return output_nodes
class MatMul4BitsQuantizer:
"""
Perform 4b quantization of constant MatMul weights.
If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which replaces the
MatMul node.
If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is
replaced by the DequantizeLinear + MatMul nodes.
"""
def __init__(
self,
model: ModelProto | str,
block_size: int = 128,
is_symmetric: bool = False,
accuracy_level: int | None = None,
nodes_to_exclude=None,
quant_format=QuantFormat.QOperator,
algo_config: WeightOnlyQuantConfig | None = None,
):
if nodes_to_exclude is None:
nodes_to_exclude = []
self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model)
self.model_path = model if isinstance(model, str) else None
self.block_size = block_size
self.is_symmetric = is_symmetric
self.accuracy_level = accuracy_level
self.nodes_to_exclude = set(nodes_to_exclude)
self.node_quantizer = None
if algo_config is None:
algo_config = DefaultWeightOnlyQuantConfig(
block_size=block_size,
is_symmetric=is_symmetric,
accuracy_level=accuracy_level,
quant_format=quant_format,
)
self.algo_config = algo_config
if algo_config.algorithm == "HQQ":
self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
elif algo_config.algorithm == "DEFAULT":
self.node_quantizer = DefaultWeightOnlyQuantizer(self.algo_config)
def _process_subgraph(self, graph_stack: list[GraphProto]):
new_nodes = []
graph = graph_stack[-1]
for node in graph.node:
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if len(graph_attrs):
kwargs = {}
for attr in node.attribute:
if attr.type == onnx.AttributeProto.GRAPH:
# recursive call to take care of sub-graph
graph_stack.append(attr.g)
kv = {attr.name: self._process_subgraph(graph_stack)}
elif attr.type == onnx.AttributeProto.GRAPHS:
value = []
for subgraph in attr.graphs:
# recursive call to take care of sub-graph
graph_stack.append(subgraph)
value.extend([self._process_subgraph(graph_stack)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx.helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
out_nodes = []
if node.name in self.nodes_to_exclude:
logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
out_nodes = [node]
elif self.algo_config is not None and self.algo_config.algorithm == "HQQ":
out_nodes = self.node_quantizer.quantize(node, graph_stack)
else:
out_nodes = self.node_quantizer.quantize(node, graph_stack)
new_nodes.extend(out_nodes)
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_stack.pop()
return graph
def _generate_q4_node_config(self):
"""Generate weight only quant configuration for nodes."""
q4_node_config = {}
template_config_q4 = {
"bits": 4,
"group_size": self.block_size,
"scheme": "sym" if self.is_symmetric else "asym",
}
for node in self.model.model.graph.node:
if node.op_type in ["MatMul"]:
if not all([self.model.get_initializer(i) is None for i in node.input]):
q4_node_config[node.name] = template_config_q4
return q4_node_config
def int4_quant_algo(self):
"""4b quantize a model with RTN or GPTQ algorithm. Please refer to
https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
for more details on weight only quantization using Intel® Neural Compressor.
"""
def inc_dataloader():
data_reader = copy.deepcopy(self.algo_config.calibration_data_reader)
for data in data_reader:
yield data, None
kwargs = {}
if self.accuracy_level is not None:
kwargs["accuracy_level"] = self.accuracy_level
weight_only_node_config = self._generate_q4_node_config()
algorithm = self.algo_config.algorithm
logger.info(f"start to quantize model with {algorithm} algorithm...")
if algorithm == "RTN":
from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize
kwargs["ratios"] = self.algo_config.ratios
self.model = rtn_quantize(
model=self.model_path if self.model_path is not None else self.model.model,
weight_config=weight_only_node_config,
**kwargs,
)
elif algorithm == "GPTQ":
from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize
kwargs["percdamp"] = self.algo_config.percdamp
kwargs["blocksize"] = self.algo_config.block_size
kwargs["actorder"] = self.algo_config.actorder
kwargs["mse"] = self.algo_config.mse
kwargs["perchannel"] = self.algo_config.perchannel
kwargs["n_samples"] = -1
dataloader = inc_dataloader()
self.model = gptq_quantize(
model=self.model_path if self.model_path is not None else self.model.model,
weight_config=weight_only_node_config,
dataloader=dataloader,
**kwargs,
)
logger.info(f"complete quantization of model with {algorithm} algorithm.")
def process(self):
if self.algo_config.algorithm in ["HQQ", "DEFAULT"]:
# use a stack to keep track of sub-graphs
graph_stack = [self.model.graph()]
# Update domain opset
if self.algo_config.quant_format == QuantFormat.QOperator:
self.model.set_opset_import("com.microsoft", 1)
else:
opset_import = self.model.opset_import()
for opset in opset_import:
if opset.domain in [None, "ai.onnx", ""] and opset.version < 21:
logger.warning(
"The opset of the input model is under 21 and doesn't support int4 data type. "
"Force to update it to opset 21, but the generated model may not be a valid model."
)
self.model.set_opset_import(opset.domain, 21)
self._process_subgraph(graph_stack)
self.model.clean_initializers()
else:
# use Intel® Neural Compressor for RTN or GPTQ weight-only quantize algorithm
try:
importlib.import_module("neural_compressor")
except Exception as e:
logging.error(f"{e}.")
raise RuntimeError(
"neural-compressor is not correctly installed. Please check your environment."
) from e
import neural_compressor
assert version.parse(neural_compressor.__version__) >= version.parse(
"2.3.2"
), "Require neural-compressor >= 2.3.2 to support weight only quantization!"
self.int4_quant_algo()
def ort_convert_str_to_bool(value):
return value.lower() in ("true", "1")
def parse_args():
parser = argparse.ArgumentParser(
description="""Blockwise int4 quantization for MatMul 2D weight matrices.
A weight matrix is partitioned into blocks, where each block is a
contiguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
"""
)
parser.add_argument("--input_model", required=True, help="Path to the input model file")
parser.add_argument("--output_model", required=True, help="Path to the output model file")
parser.add_argument("--block_size", required=False, default=32, type=int, help="Block size for quantization")
parser.add_argument(
"--quant_method",
default="default",
type=str,
choices=["default", "hqq", "rtn", "gptq"],
help="the algorithm used to quantize weight, \nrtn and gptq leverage Intel® Neural Compressor",
)
parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight")
parser.add_argument(
"--symmetric",
required=False,
default=True,
const=True,
nargs="?",
type=ort_convert_str_to_bool,
choices=[True, False],
help="Indicate whether to quantize the model symmetrically, symmetric is not supported by hqq",
)
parser.add_argument(
"--accuracy_level",
required=False,
type=int,
help="Accuracy level of the 4-bit quantized MatMul computation. "
"Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details "
"(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
type=str,
required=False,
default=[],
help="Specify the nodes to be excluded from quantization with node names",
)
parser.add_argument(
"--quant_format",
default="QOperator",
type=str,
choices=["QOperator", "QDQ"],
help="QuantFormat {QOperator, QDQ}"
"QOperator format quantizes the model with quantized operators directly."
"QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.verbose:
logger.setLevel(logging.DEBUG)
input_model_path = args.input_model
output_model_path = args.output_model
quant_format = QuantFormat[args.quant_format]
if os.path.exists(output_model_path):
logger.error(f"file {output_model_path} already exists")
raise Exception(f"file {output_model_path} already exists")
if args.symmetric and args.quant_method == "hqq":
logger.warning("symmetric is not supportted by hqq, will force to symmetric=False")
args.symmetric = False
model = onnx.load(input_model_path)
if args.quant_method == "hqq":
quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits)
elif args.quant_method == "default":
quant_config = DefaultWeightOnlyQuantConfig(
block_size=args.block_size,
is_symmetric=args.symmetric,
accuracy_level=args.accuracy_level,
quant_format=quant_format,
)
elif args.quant_method == "rtn":
quant_config = RTNWeightOnlyQuantConfig()
elif args.quant_method == "gptq":
quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size)
else:
raise ValueError(f"Unsupported quantization method: {args.quant_method}")
quant = MatMul4BitsQuantizer(
model=model,
accuracy_level=args.accuracy_level,
nodes_to_exclude=args.nodes_to_exclude,
algo_config=quant_config,
)
quant.process()
quant.model.save_model_to_file(output_model_path, True)
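# Illustrative usage sketch: a minimal programmatic equivalent of the CLI flow above. It assumes
# the names defined in this module (onnx, QuantFormat, DefaultWeightOnlyQuantConfig,
# MatMul4BitsQuantizer) are in scope; the file paths are hypothetical placeholders.
def _example_quantize_to_int4():
    fp_model = onnx.load("model_fp32.onnx")  # hypothetical input path
    config = DefaultWeightOnlyQuantConfig(
        block_size=32,
        is_symmetric=True,
        quant_format=QuantFormat.QDQ,  # weights end up in DequantizeLinear + MatMul
    )
    quantizer = MatMul4BitsQuantizer(fp_model, algo_config=config)
    quantizer.process()
    quantizer.model.save_model_to_file("model_int4.onnx", True)  # True: use external data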

View File

@ -0,0 +1,240 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import logging
import os
from typing import List, Tuple
import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from onnxruntime.capi._pybind_state import quantize_matmul_bnb4
from .onnx_model import ONNXModel
from .quant_utils import attribute_to_kwarg
logger = logging.getLogger(__name__)
class MatMulBnb4Quantizer:
"""Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type"""
##################
# quantization types, must be consistent with native code type
# Bnb_DataType_t defined in blockwise_quant_block_bnb4.h
# 4b floating point with bias of 3
FP4 = 0
# 4b NormalFloat
NF4 = 1
def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None):
nodes_to_exclude = nodes_to_exclude or []
assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4]
self.model = ONNXModel(model)
self.quant_type = quant_type
self.block_size = block_size
self.nodes_to_exclude = set(nodes_to_exclude)
@staticmethod
def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]:
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray:
"""4b quantize fp32/fp16 weight"""
if len(fpweight.shape) != 2:
raise ValueError("Current bnb4 block quantization only supports 2D tensors!")
# need to copy since the transposed weight still has the original memory layout
# Linear4bit quantizes its weight data which is the transposed weight
fpweight_t = fpweight.transpose().copy()
rows, cols = fpweight.shape
numel = rows * cols
block_size = self.block_size
num_blocks = (numel + block_size - 1) // block_size
quantized_numel = (numel + 1) // 2
packed = np.zeros(quantized_numel, dtype="uint8")
absmax = np.zeros(num_blocks, dtype=fpweight.dtype)
# block wise quantization, fpweight_t is flattened and divided into blocks
quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows)
return (packed, absmax)
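# Worked example of the buffer sizes above (weight shape is hypothetical): a 4096 x 4096 fp16
# weight with block_size=64 has numel = 16,777,216, so absmax holds num_blocks = 262,144 scale
# values and packed holds (numel + 1) // 2 = 8,388,608 uint8 bytes (two 4-bit codes per byte).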
def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto:
"""If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
if node.op_type != "MatMul":
return node # only care about MatMul for now
logger.debug(f"start to quantize {node.name} ...")
if node.name in self.nodes_to_exclude:
logger.debug(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
return node
inputB = node.input[1] # noqa: N806
B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack) # noqa: N806
if B is None:
logger.debug("MatMul doesn't have const weight. Skip to quantize")
return node # only care about constant weight
B_array = onnx.numpy_helper.to_array(B) # noqa: N806
if len(B_array.shape) != 2:
logger.debug("MatMul weight is not 2D. Skip to quantize")
return node # can only process 2-D matrix
packed, absmax = self.bnb4_block_quant(B_array)
B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806
B_quant.name = B.name + "_Bnb4"
for input in Bs_graph.input:
if input.name == inputB:
Bs_graph.input.remove(input)
break
absmax_tensor = onnx.numpy_helper.from_array(absmax)
absmax_tensor.name = B.name + "_absmax"
Bs_graph.initializer.extend([B_quant, absmax_tensor])
kwargs = {}
rows, cols = B_array.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["block_size"] = self.block_size
kwargs["quant_type"] = self.quant_type
matmul_bnb4_node = onnx.helper.make_node(
"MatMulBnb4",
inputs=[node.input[0], B_quant.name, absmax_tensor.name],
outputs=[node.output[0]],
name=node.name + "_Bnb4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
logger.debug(f"complete quantization of {node.name} ...")
return matmul_bnb4_node
def _process_subgraph(self, graph_stack: List[GraphProto]):
new_nodes = []
graph = graph_stack[-1]
for node in graph.node:
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if len(graph_attrs):
kwargs = {}
for attr in node.attribute:
if attr.type == onnx.AttributeProto.GRAPH:
# recursive call to take care of sub-graph
graph_stack.append(attr.g)
kv = {attr.name: self._process_subgraph(graph_stack)}
elif attr.type == onnx.AttributeProto.GRAPHS:
value = []
for subgraph in attr.graphs:
# recursive call to take care of sub-graph
graph_stack.append(subgraph)
value.extend([self._process_subgraph(graph_stack)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx.helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack))
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_stack.pop()
return graph
def process(self):
# use a stack to keep track of sub-graphs
graph_stack = [self.model.graph()]
opset_import = self.model.opset_import()
has_ms_domain = False
for opset in opset_import:
if opset.domain == "com.microsoft":
has_ms_domain = True
if not has_ms_domain:
opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])
self._process_subgraph(graph_stack)
self.model.clean_initializers()
def parse_args():
parser = argparse.ArgumentParser(
description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.
A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
"""
)
parser.add_argument("--input_model", required=True, help="Path to the input model file")
parser.add_argument("--output_model", required=True, help="Path to the output model file")
parser.add_argument(
"--quant_type",
required=False,
default=1,
choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],
help="Quantization data type. 0: FP4, 1: NF4",
)
parser.add_argument(
"--block_size",
required=False,
default=64,
help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
type=str,
required=False,
default=[],
help="Specify the nodes to be excluded from quantization with node names",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.verbose:
logger.setLevel(logging.DEBUG)
input_model_path = args.input_model
output_model_path = args.output_model
if os.path.exists(output_model_path):
logger.error(f"file {output_model_path} already exists")
raise Exception(f"file {output_model_path} already exists")
model = onnx.load(input_model_path)
quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude)
quant.process()
quant.model.save_model_to_file(output_model_path, True)
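# Illustrative usage sketch: programmatic NF4 quantization with the class defined above. It
# assumes MatMulBnb4Quantizer and onnx are in scope (as in this module); paths are hypothetical.
def _example_quantize_to_nf4():
    fp_model = onnx.load("model_fp32.onnx")  # hypothetical input path
    quantizer = MatMulBnb4Quantizer(fp_model, MatMulBnb4Quantizer.NF4, block_size=64)
    quantizer.process()
    quantizer.model.save_model_to_file("model_nf4.onnx", True)  # True: use external data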

View File

@ -0,0 +1,580 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from pathlib import Path
import onnx
import onnx.helper as onnx_helper
import onnx.numpy_helper as onnx_numpy_helper
from onnx.onnx_pb import ModelProto
from .quant_utils import attribute_to_kwarg, find_by_name
def _clean_initializers_helper(graph, model):
"""Clean unused initializers from graph.
Returns:
A cleaned graph without unused initializers
A list of tensor names, which are not produced by this graph and its subgraphs
"""
requesting_tensor_names = set()
requesting_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
requesting_tensor_names.update(g_out.name for g_out in graph.output if g_out.name)
new_nodes = []
for node in graph.node:
new_node = node
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if graph_attrs:
kwargs = {}
for attr in node.attribute:
new_attribute = {}
if attr.type == onnx.AttributeProto.GRAPH:
(
cleaned_sub_graph,
sub_requesting_tensor_names,
) = _clean_initializers_helper(attr.g, model)
new_attribute = {attr.name: cleaned_sub_graph}
requesting_tensor_names.update(sub_requesting_tensor_names)
elif attr.type == onnx.AttributeProto.GRAPHS:
cleaned_graphs = []
for subgraph in attr.graphs:
(
cleaned_sub_graph,
sub_requesting_tensor_names,
) = _clean_initializers_helper(subgraph, model)
cleaned_graphs.append(cleaned_sub_graph)
requesting_tensor_names.update(sub_requesting_tensor_names)
new_attribute = {attr.name: cleaned_graphs}
else:
new_attribute = attribute_to_kwarg(attr)
kwargs.update(new_attribute)
new_node = onnx_helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
new_nodes.append(new_node)
graph.ClearField("node")
graph.node.extend(new_nodes)
requesting_tensor_names.difference_update(output for node in graph.node for output in node.output)
unused_initializer = []
for initializer in graph.initializer:
if initializer.name in requesting_tensor_names:
requesting_tensor_names.remove(initializer.name)
else:
# mark it for removal; removing it here directly would cause misbehavior
unused_initializer.append(initializer)
name_to_input = {input.name: input for input in graph.input}
for initializer in unused_initializer:
graph.initializer.remove(initializer)
if initializer.name in name_to_input:
try:
graph.input.remove(name_to_input[initializer.name])
except StopIteration:
if model.ir_version < 4:
print(f"Warning: invalid weight name {initializer.name} found in the graph (not a graph input)")
requesting_tensor_names.difference_update(input.name for input in graph.input)
return graph, requesting_tensor_names
class ONNXModel:
def __init__(self, model: ModelProto):
self.model = model
def nodes(self):
return self.model.graph.node
def initializer(self):
return self.model.graph.initializer
def initializer_extend(self, inits):
if len(inits) == 0:
raise ValueError("Can add an empty list.")
for init in self.initializer():
self._check_init(init, "gain")
for init in inits:
self._check_init(init)
self.model.graph.initializer.append(init)
def graph(self):
return self.model.graph
def ir_version(self):
return self.model.ir_version
def opset_import(self):
return self.model.opset_import
def set_opset_import(self, domain, version):
for opset in self.model.opset_import:
if opset.domain == domain:
opset.version = version
return
self.model.opset_import.extend([onnx_helper.make_opsetid(domain, version)])
def remove_node(self, node):
if node in self.model.graph.node:
self.model.graph.node.remove(node)
def remove_nodes(self, nodes_to_remove):
for node in nodes_to_remove:
self.remove_node(node)
def add_node(self, node):
self.model.graph.node.extend([self._check_node(node)])
def add_nodes(self, nodes_to_add):
for node in nodes_to_add:
self.add_node(node)
def add_initializer(self, tensor):
if find_by_name(tensor.name, self.model.graph.initializer) is None:
self._check_init(tensor)
self.model.graph.initializer.extend([tensor])
def get_initializer(self, name):
for tensor in self.model.graph.initializer:
if tensor.name == name:
return tensor
return None
def find_graph_input(self, input_name):
for input in self.model.graph.input:
if input.name == input_name:
return input
return None
def find_graph_output(self, output_name):
for output in self.model.graph.output:
if output.name == output_name:
return output
return None
def get_tensor_type(self, tensor_name: str):
tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
if tensor_name in tensor_type_map:
return tensor_type_map[tensor_name].tensor_type
g_input = self.find_graph_input(tensor_name)
if g_input:
return g_input.type.tensor_type
g_output = self.find_graph_output(tensor_name)
if g_output:
return g_output.type.tensor_type
return None
def get_constant_value(self, output_name):
for node in self.model.graph.node:
if node.op_type == "Constant":
if node.output[0] == output_name:
for attr in node.attribute:
if attr.name == "value":
return onnx_numpy_helper.to_array(attr.t)
# Fallback to initializer since constant folding may have been applied.
initializer = self.get_initializer(output_name)
if initializer is not None:
return onnx_numpy_helper.to_array(initializer)
return None
def get_initializer_name_set(self):
return {initializer.name for initializer in self.model.graph.initializer}
def remove_initializer(self, tensor):
if tensor in self.model.graph.initializer:
self.model.graph.initializer.remove(tensor)
for input in self.model.graph.input:
if input.name == tensor.name:
self.model.graph.input.remove(input)
break
def remove_initializers(self, init_to_remove):
for initializer in init_to_remove:
self.remove_initializer(initializer)
def get_non_initializer_inputs(self):
initializer_names = self.get_initializer_name_set()
non_initializer_inputs = set()
for input in self.model.graph.input:
if input.name not in initializer_names:
non_initializer_inputs.add(input.name)
return non_initializer_inputs
def input_name_to_nodes(self):
input_name_to_nodes = {}
for node in self.model.graph.node:
for input_name in node.input:
if input_name: # Could be empty when it is optional
if input_name not in input_name_to_nodes:
input_name_to_nodes[input_name] = [node]
else:
input_name_to_nodes[input_name].append(node)
return input_name_to_nodes
def output_name_to_node(self):
output_name_to_node = {}
for node in self.model.graph.node:
for output_name in node.output:
if output_name: # Could be empty when it is optional
output_name_to_node[output_name] = node
return output_name_to_node
def get_children(self, node, input_name_to_nodes=None):
if input_name_to_nodes is None:
input_name_to_nodes = self.input_name_to_nodes()
children = []
for output in node.output:
if output in input_name_to_nodes:
for node in input_name_to_nodes[output]:
children.append(node) # noqa: PERF402
return children
def get_parents(self, node, output_name_to_node=None):
if output_name_to_node is None:
output_name_to_node = self.output_name_to_node()
parents = []
for input in node.input:
if input in output_name_to_node:
parents.append(output_name_to_node[input])
return parents
def get_parent(self, node, idx, output_name_to_node=None):
if output_name_to_node is None:
output_name_to_node = self.output_name_to_node()
if len(node.input) <= idx:
return None
input = node.input[idx]
if input not in output_name_to_node:
return None
return output_name_to_node[input]
def find_node_by_name(self, node_name, new_nodes_list, graph):
"""Find out if a node exists in a graph or a node is in the
new set of nodes created during quantization.
Returns:
The node found or None.
"""
graph_nodes_list = list(graph.node)  # shallow copy of the node list
graph_nodes_list.extend(new_nodes_list)
node = find_by_name(node_name, graph_nodes_list)
return node
def get_largest_node_name_suffix(self, node_name_prefix):
"""
Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`.
Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3.
"""
suffix = -1
for node in self.model.graph.node:
if node.name and node.name.startswith(node_name_prefix):
try:
index = int(node.name[len(node_name_prefix) :])
suffix = max(index, suffix)
except ValueError:
continue
return suffix
def find_nodes_by_initializer(self, graph, initializer):
"""
Find all nodes with given initializer as an input.
"""
nodes = []
for node in graph.node:
for node_input in node.input:
if node_input == initializer.name:
nodes.append(node)
return nodes
@staticmethod
def __get_initializer(name, graph_path):
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
@staticmethod
def __replace_gemm_with_matmul(graph_path):
new_nodes = []
graph = graph_path[-1]
for node in graph.node:
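# attr.type 5 is onnx.AttributeProto.GRAPH and 10 is onnx.AttributeProto.GRAPHS.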
graph_attrs = [attr for attr in node.attribute if attr.type == 5 or attr.type == 10]
if len(graph_attrs):
kwargs = {}
for attr in node.attribute:
if attr.type == 5:
graph_path.append(attr.g)
kv = {attr.name: ONNXModel.__replace_gemm_with_matmul(graph_path)}
elif attr.type == 10:
value = []
for subgraph in attr.graphs:
graph_path.append(subgraph)
value.extend([ONNXModel.__replace_gemm_with_matmul(graph_path)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx_helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
if node.op_type == "Gemm":
alpha = 1.0
beta = 1.0
transA = 0 # noqa: N806
transB = 0 # noqa: N806
for attr in node.attribute:
if attr.name == "alpha":
alpha = onnx_helper.get_attribute_value(attr)
elif attr.name == "beta":
beta = onnx_helper.get_attribute_value(attr)
elif attr.name == "transA":
transA = onnx_helper.get_attribute_value(attr) # noqa: N806
elif attr.name == "transB":
transB = onnx_helper.get_attribute_value(attr) # noqa: N806
if alpha == 1.0 and beta == 1.0 and transA == 0:
inputB = node.input[1] # noqa: N806
if transB == 1:
B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) # noqa: N806
if B:
# assume B is not used by any other node
B_array = onnx_numpy_helper.to_array(B) # noqa: N806
B_trans = onnx_numpy_helper.from_array(B_array.T) # noqa: N806
B_trans.name = B.name
Bs_graph.initializer.remove(B)
for input in Bs_graph.input:
if input.name == inputB:
Bs_graph.input.remove(input)
break
Bs_graph.initializer.extend([B_trans])
else:
inputB += "_Transposed" # noqa: N806
transpose_node = onnx_helper.make_node(
"Transpose",
inputs=[node.input[1]],
outputs=[inputB],
name=node.name + "_Transpose" if node.name else "",
)
new_nodes.append(transpose_node)
matmul_node = onnx_helper.make_node(
"MatMul",
inputs=[node.input[0], inputB],
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
name=node.name + "_MatMul" if node.name else "",
)
new_nodes.append(matmul_node)
if len(node.input) > 2:
add_node = onnx_helper.make_node(
"Add",
inputs=[node.output[0] + "_MatMul", node.input[2]],
outputs=node.output,
name=node.name + "_Add" if node.name else "",
)
new_nodes.append(add_node)
# unsupported
else:
new_nodes.append(node)
# not GEMM
else:
new_nodes.append(node)
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_path.pop()
return graph
def replace_gemm_with_matmul(self):
graph_path = [self.graph()]
ONNXModel.__replace_gemm_with_matmul(graph_path)
def save_model_to_file(self, output_path, use_external_data_format=False):
"""
Save the model to a file, optionally using the external data format, which is needed for model size > 2GB
"""
self.topological_sort()
if use_external_data_format:
onnx.external_data_helper.convert_model_to_external_data(
self.model,
all_tensors_to_one_file=True,
location=Path(output_path).name + ".data",
convert_attribute=True,
)
for init in self.model.graph.initializer:
self._check_init(init, "end")
onnx.save_model(self.model, output_path)
@staticmethod
def replace_node_input(node, old_input_name, new_input_name):
assert isinstance(old_input_name, str) and isinstance(new_input_name, str)
for j in range(len(node.input)):
if node.input[j] == old_input_name:
node.input[j] = new_input_name
def replace_input_of_all_nodes(self, old_input_name, new_input_name):
for node in self.model.graph.node:
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
def replace_input_of_nodes(self, old_input_name, new_input_name, node_names_set):
for node in self.model.graph.node:
if node.name in node_names_set:
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
@staticmethod
def replace_node_output(node, old_output_name, new_output_name):
assert isinstance(old_output_name, str) and isinstance(new_output_name, str)
for j in range(len(node.output)):
if node.output[j] == old_output_name:
node.output[j] = new_output_name
def replace_output_of_all_nodes(self, old_output_name, new_output_name):
for node in self.model.graph.node:
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
def replace_output_of_nodes(self, old_output_name, new_output_name, node_names_set):
for node in self.model.graph.node:
if node.name in node_names_set:
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
def remove_unused_constant(self):
input_name_to_nodes = self.input_name_to_nodes()
# remove unused constant
unused_nodes = []
nodes = self.nodes()
for node in nodes:
if (
node.op_type == "Constant"
and not self.is_graph_output(node.output[0])
and node.output[0] not in input_name_to_nodes
):
unused_nodes.append(node)
self.remove_nodes(unused_nodes)
unused_weights = []
for w in self.initializer():
if w.name not in input_name_to_nodes and not self.is_graph_output(w.name):
unused_weights.append(w)
# Remove from graph.input
for graph_input in self.graph().input:
if graph_input.name == w.name:
self.graph().input.remove(graph_input)
self.remove_initializers(unused_weights)
def is_graph_output(self, output_name):
return any(output.name == output_name for output in self.model.graph.output)
def is_graph_input(self, tensor_name: str) -> bool:
return any(input.name == tensor_name for input in self.model.graph.input)
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
def topological_sort(self):
deps_count = [0] * len(self.nodes()) # dependency count of each node
deps_to_nodes = {}  # input name to node indices
sorted_nodes = [] # initialize sorted_nodes
for node_idx, node in enumerate(self.nodes()):
# CANNOT use len(node.input) directly because input can be optional
deps_count[node_idx] = sum(1 for _ in node.input if _)
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
sorted_nodes.append(self.nodes()[node_idx])
continue
for input_name in node.input:
if not input_name:
continue
if input_name not in deps_to_nodes:
deps_to_nodes[input_name] = [node_idx]
else:
deps_to_nodes[input_name].append(node_idx)
initializer_names = [init.name for init in self.initializer()]
graph_input_names = [input.name for input in self.model.graph.input]
input_names = initializer_names + graph_input_names
input_names.sort()
prev_input_name = None
for input_name in input_names:
if prev_input_name == input_name:
continue
prev_input_name = input_name
if input_name in deps_to_nodes:
for node_idx in deps_to_nodes[input_name]:
deps_count[node_idx] = deps_count[node_idx] - 1
if deps_count[node_idx] == 0:
sorted_nodes.append(self.nodes()[node_idx])
start = 0
end = len(sorted_nodes)
while start < end:
for output in sorted_nodes[start].output:
if output in deps_to_nodes:
for node_idx in deps_to_nodes[output]:
deps_count[node_idx] = deps_count[node_idx] - 1
if deps_count[node_idx] == 0:
sorted_nodes.append(self.nodes()[node_idx])
end = end + 1
start = start + 1
assert end == len(self.graph().node), "Graph is not a DAG"
self.graph().ClearField("node")
self.graph().node.extend(sorted_nodes)
def clean_initializers(self):
return _clean_initializers_helper(self.graph(), self.model)
def _check_init(self, init, test=None):
if init.data_type == onnx.TensorProto.FLOAT8E4M3FN:
if init.HasField("raw_data"):
b = list(init.raw_data)
if any(map(lambda i: (i & 127) == 127, b)):
raise ValueError(f"Initializer {init.name!r} has nan.")
return init
def _check_node(self, node):
"""
A quantization to float 8 does not use quantized bias but float 16 bias.
This function checks that DequantizeLinear is not used to
dequantize from float 16.
"""
if node.op_type == "DequantizeLinear":
zero_point = node.input[2]
init = self.get_initializer(zero_point)
dtype = init.data_type
if dtype in {
onnx.TensorProto.FLOAT16,
onnx.TensorProto.FLOAT,
onnx.TensorProto.DOUBLE,
onnx.TensorProto.BFLOAT16,
}:
raise RuntimeError(f"Unsupported DequantizeLinear operator, dequantization from {dtype}.")
return node
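# Illustrative usage sketch of the wrapper above, assuming ONNXModel and onnx are in scope (as in
# this module); one possible pre-processing flow before quantization. File names are hypothetical.
def _example_prepare_model():
    wrapper = ONNXModel(onnx.load("model.onnx"))  # hypothetical path
    wrapper.replace_gemm_with_matmul()  # rewrite eligible Gemm nodes into MatMul (+ Add for bias)
    wrapper.clean_initializers()  # drop initializers that are no longer referenced
    wrapper.save_model_to_file("model_prepared.onnx", use_external_data_format=True)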

File diff suppressed because it is too large

View File

@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger

View File

@ -0,0 +1,119 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearActivation(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def QuantizeClipRelu(self): # noqa: N802
node = self.node
assert node.op_type == "Relu" or node.op_type == "Clip"
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
# If input to this node is not quantized then keep this node
# If activation is symmetric, do not quantize the op and simply return
if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
return super().quantize()
quantized_value = self.quantizer.quantized_value_map[node.input[0]]
self.quantizer.quantized_value_map[node.output[0]] = quantized_value
def quantize(self):
node = self.node
if node.op_type == "Relu" or node.op_type == "Clip":
self.QuantizeClipRelu()
return
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
sigmoid_nnapi_mode = (
node.op_type == "Sigmoid"
and nnapi_sigmoid_option in self.quantizer.extra_options
and self.quantizer.extra_options[nnapi_sigmoid_option]
)
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
use_zeropoint = 0 if sigmoid_nnapi_mode else None
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_activation_name = ""
if node.name:
qlinear_activation_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_activation_inputs = [
quantized_input_names[0],
scale_names[0],
zero_point_names[0],
output_scale_name,
output_zp_name,
]
qlinear_activation_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_activation_inputs,
[qlinear_activation_output],
qlinear_activation_name,
**kwargs,
)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_activation_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
nodes.append(qlinear_activation_node)
self.quantizer.new_nodes += nodes
class QDQRemovableActivation(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# If input to this node is not quantized then keep this node
if not self.quantizer.is_tensor_quantized(node.input[0]):
return
if (
not self.quantizer.is_activation_symmetric
and not self.quantizer.qdq_keep_removable_activations
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
):
self.quantizer.remove_node(self.node)
else:
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])

View File

@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase
# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
node.input[0] = quantized_input_value.q_name
self.quantizer.new_nodes += [node]
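# Rationale: affine quantization with a positive scale is monotonically non-decreasing, so (up to
# rounding ties) the index of the maximum is unchanged and ArgMax can read the quantized tensor
# directly, without an intermediate DequantizeLinear.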

View File

@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantize Attention
"""
class AttentionQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
"""
parameter node: Attention node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
"""
node = self.node
assert node.op_type == "Attention"
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
# is implemented
for attr in node.attribute:
if attr.name == "qkv_hidden_sizes":
return super().quantize()
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if quantized_input_names is None:
return super().quantize()
qattention_name = "" if not node.name else node.name + "_quant"
inputs = []
inputs.extend(quantized_input_names)
inputs.extend([node.input[2]])
inputs.extend(scale_names)
inputs.extend([node.input[3] if len(node.input) > 3 else ""])
inputs.extend(zero_point_names)
inputs.extend([node.input[4] if len(node.input) > 4 else ""])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
nodes.append(qattention_node)
self.quantizer.new_nodes += nodes
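# The QAttention inputs assembled above are, in order: quantized input, quantized weight, bias,
# input scale, weight scale, mask_index (optional), input zero point, weight zero point, and
# past (optional), following how quantized_input_names, scale_names and zero_point_names are
# interleaved with the original inputs.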

View File

@ -0,0 +1,26 @@
class QuantOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_float_tensor(self.node.input[0])
def quantize(self):
"""
Given a node which does not support quantization, this method checks whether the input to
this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
parameter node: Current node
parameter new_nodes_list: List of new nodes created before processing current node
return: List of new nodes created
"""
for _, node_input in enumerate(self.node.input):
dequantize_node = self.quantizer._dequantize_value(node_input)
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)
# Append the original node
self.quantizer.new_nodes.append(self.node)

View File

@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearBinaryOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0, 1])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_binary_math_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_binary_math_inputs = []
# Input 0
qlinear_binary_math_inputs.append(quantized_input_names[0])
qlinear_binary_math_inputs.append(scale_names[0])
qlinear_binary_math_inputs.append(zero_point_names[0])
# Input 1
qlinear_binary_math_inputs.append(quantized_input_names[1])
qlinear_binary_math_inputs.append(scale_names[1])
qlinear_binary_math_inputs.append(zero_point_names[1])
# Output
qlinear_binary_math_inputs.append(output_scale_name)
qlinear_binary_math_inputs.append(output_zp_name)
qlinear_binary_math_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_binary_math_inputs,
[qlinear_binary_math_output],
qlinear_binary_math_name,
**kwargs,
)
nodes.append(qlinear_binary_math_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_binary_math_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes

View File

@ -0,0 +1,62 @@
import onnx
from ..quant_utils import ( # noqa: F401
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase # noqa: F401
class QLinearConcat(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
if not data_found or q_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qnode_name = node.name + "_quant" if node.name else ""
qlconcat_inputs = [output_scale_name, output_zp_name]
for i in range(len(q_input_names)):
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
qlconcat_node = onnx.helper.make_node(
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlconcat_node]

View File

@ -0,0 +1,258 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
find_by_name,
get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class ConvInteger(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def add_bias(self, nodes, scaled_output):
"""
Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node
parameter nodes: list to which the new nodes are appended
parameter node: current node (Conv)
parameter scaled_output: output of quant conv without bias
parameter output: output of Conv
parameter bias_name: bias of Conv
return: None (the final Add node writes directly to the original Conv output)
"""
node = self.node
model = self.quantizer.model
# Add tensors for the shape to be reshaped to
weight = find_by_name(node.input[1], model.initializer())
if weight is None:
raise ValueError(f"Expected {node.input[1]} to be an initializer")
# Add reshape for correct broadcast
output = node.output[0]
reshape_input_data = node.input[2] # bias of Conv
reshape_input_shape = output + "_bias_reshape_shape"
reshape_output = output + "_bias_reshape_output"
shape = np.ones((len(weight.dims)), dtype=np.int64)
shape[1] = -1
init_shape = onnx.helper.make_tensor(
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
)
model.add_initializer(init_shape)
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
nodes.append(reshape_node)
# Add an Add operation for bias
add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
nodes.append(add_node)
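# Example of the reshape target built above (assuming a rank-4 Conv weight): shape becomes
# [1, -1, 1, 1], so the per-output-channel bias broadcasts across N, H and W when added to the
# scaled ConvInteger output.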
def quantize(self):
node = self.node
assert node.op_type == "Conv"
# Get quantized inputs from both activation (input[0]) and weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
conv_integer_output = node.output[0] + "_output_quantized"
conv_integer_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
conv_integer_node = onnx.helper.make_node(
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
)
nodes.append(conv_integer_node)
# Add cast operation to cast convInteger output to float.
onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_op_output = conv_integer_output + "_cast_output"
cast_node = onnx.helper.make_node(
"Cast",
[conv_integer_output],
[cast_op_output],
conv_integer_output + "_cast",
to=onnx_type,  # TODO: FLOAT or FLOAT16
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
if conv_integer_name:
scales_mul_op = conv_integer_name + "_scales_mul"
else:
scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
has_bias = len(node.input) == 3
scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
# and make the output of this node the same as output of original conv node.
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
scaled_output_name,
output_scale_mul_op,
)
)
if has_bias:
self.add_bias(nodes, scaled_output_name)
self.quantizer.new_nodes += nodes
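# Summary of the rewrite above: ConvInteger yields an int32 accumulator, which is cast to float
# and multiplied by input_scale * weight_scale to recover the real-valued output; the float bias,
# if present, is reshaped and added afterwards by add_bias().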
class QLinearConv(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
bias_present = False
if len(node.input) == 3:
if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
bias_present = True
qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_conv_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
qlinear_conv_inputs = []
# Input 0
qlinear_conv_inputs.append(quantized_input_names[0])
qlinear_conv_inputs.append(scale_names[0])
qlinear_conv_inputs.append(zero_point_names[0])
# Input 1
qlinear_conv_inputs.append(quantized_input_names[1])
qlinear_conv_inputs.append(scale_names[1])
qlinear_conv_inputs.append(zero_point_names[1])
# Output
qlinear_conv_inputs.append(output_scale_name)
qlinear_conv_inputs.append(output_zp_name)
if bias_present:
qlinear_conv_inputs.append(quantized_bias_name)
qlinear_conv_node = onnx.helper.make_node(
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
)
nodes.append(qlinear_conv_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_conv_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQConv(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv" or node.op_type == "ConvTranspose"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if node.op_type == "Conv" else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

View File

@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
# For operators that support 8-bit operations directly, where the output can
# reuse input[0]'s type, zero point, and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
if not self.quantizer.force_quantize_no_input_check:
# Keep backward compatibility
# Quantize when input[0] is quantized already. Otherwise keep it.
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
else:
# Force quantize those ops if possible; use the exclude node list if this is not what you want
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
super().quantize()
return
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_names[0]
node.output[0] = quantized_output_value.q_name
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQDirect8BitOp(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
if self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(self.node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)

View File

@ -0,0 +1,121 @@
import logging
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""
class EmbedLayerNormalizationQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
node = self.node
assert node.op_type == "EmbedLayerNormalization"
if len(node.output) > 2:
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
return super().quantize()
"""
Pre-quantization EmbedLayerNorm inputs:
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (float32)
[3] position_embedding (float32)
[4] segment_embedding (float32)
[5] gamma (float32)
[6] beta (float32)
[7] mask (int32) (optional)
"""
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
if quantized_input_names is None:
return super().quantize()
qembed_layer_norm_name = "" if not node.name else node.name + "_quant"
"""
Quantized Input Tensor List
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (uint8)
[3] position_embedding (uint8)
[4] segment_embedding (uint8)
[5] gamma (uint8)
[6] beta (uint8)
[7] mask (int32) (optional)
[8] word_embedding_scale (float)
[9] position_embedding_scale (float)
[10] segment_embedding_scale (float)
[11] gamma_scale (float)
[12] beta_scale (float)
[13] word_embedding_zero_point (uint8)
[14] position_embedding_zero_point (uint8)
[15] segment_embedding_zero_point (uint8)
[16] gamma_zero_point (uint8)
[17] beta_zero_point (uint8)
"""
inputs = []
# 'input_ids'
inputs.extend([node.input[0]])
# 'segment_ids'
inputs.extend([node.input[1]])
# 'word_embedding_quant'
inputs.extend([quantized_input_names[0]])
# 'position_embedding_quant'
inputs.extend([quantized_input_names[1]])
# 'segment_embedding_quant'
inputs.extend([quantized_input_names[2]])
# 'gamma_quant'
inputs.extend([quantized_input_names[3]])
# 'beta_quant'
inputs.extend([quantized_input_names[4]])
# 'mask' (optional)
inputs.extend([node.input[7] if len(node.input) > 7 else ""])
# Add all scales:
inputs.extend([scale_names[0]])
inputs.extend([scale_names[1]])
inputs.extend([scale_names[2]])
inputs.extend([scale_names[3]])
inputs.extend([scale_names[4]])
# Add all zero points:
inputs.extend([zero_point_names[0]])
inputs.extend([zero_point_names[1]])
inputs.extend([zero_point_names[2]])
inputs.extend([zero_point_names[3]])
inputs.extend([zero_point_names[4]])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qembed_layer_norm_node = onnx.helper.make_node(
"QEmbedLayerNormalization",
inputs,
node.output,
qembed_layer_norm_name,
**kwargs,
)
nodes.append(qembed_layer_norm_node)
self.quantizer.new_nodes += nodes

View File

@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
"""
Quantize Gather
"""
class GatherQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_valid_quantize_weight(self.node.input[0])
def quantize(self):
node = self.node
assert node.op_type == "Gather"
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
gather_new_output,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
node.output[0] = gather_new_output
node.input[0] = quantized_input_names[0]
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQGather(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gather"
if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(node.input[0])
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
elif self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)

View File

@ -0,0 +1,62 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QGlobalAveragePool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "GlobalAveragePool"
# If input to this node is not quantized then keep this node.
if node.input[0] not in self.quantizer.quantized_value_map:
return super().quantize()
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
# Create an entry for output quantized value.
(
data_found,
output_scale_name_from_parameter,
output_zp_name_from_parameter,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# Just use the input scale and zero point if parameters for the output are not specified.
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
kwargs["channels_last"] = 0
qnode_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_value.q_name,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
output_scale_name,
output_zp_name,
],
[quantized_output_value.q_name],
qnode_name,
**kwargs,
)
self.quantizer.new_nodes += [qnode]

View File

@ -0,0 +1,166 @@
import logging
import numpy as np # noqa: F401
import onnx
from ..quant_utils import find_by_name # noqa: F401
from ..quant_utils import get_mul_node # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase
def is_B_transposed(gemm_node): # noqa: N802
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806
if len(transB_attribute):
return onnx.helper.get_attribute_value(transB_attribute[0]) > 0
return False
def get_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
return onnx.helper.get_attribute_value(beta_attribute[0])
return 1.0
def set_default_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if len(beta_attribute):
beta_attribute[0].f = 1.0
return 1.0
class QLinearGemm(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
self.quantizer.weight_qType,
0 if is_B_transposed(node) else 1,
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
if len(node.input) == 3:
if not self.quantizer.is_input_a_initializer(node.input[2]):
return super().quantize()
# Note: if the quantized type is float 8, the bias is converted into float 16.
# cublasLtMatMul only supports (b)float16 or float32 bias.
quantized_bias_name = self.quantizer.quantize_bias_static(
node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qgemm_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
if attribute.name != "beta":
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# generate input
qgemm_inputs = []
for i in range(2):
qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
nodes.append(qgemm_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qgemm_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
node_type=node.op_type,
node_qtype=self.quantizer.weight_qType,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQGemm(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if is_B_transposed(node) else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
if self.quantizer.is_input_a_initializer(node.input[2]):
self.quantizer.quantize_bias_tensor(
node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
set_default_beta(self.node)
else:
logging.warning(
f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
)
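The per-channel axis chosen above follows the Gemm transB attribute: when B is transposed its rows are the output channels (axis 0), otherwise its columns are (axis 1). A standalone sketch of reading that attribute with the onnx helper (the node here is built only for illustration):

import onnx

gemm = onnx.helper.make_node("Gemm", ["A", "B", "C"], ["Y"], transB=1)
trans_b = [attr for attr in gemm.attribute if attr.name == "transB"]
b_is_transposed = bool(trans_b) and onnx.helper.get_attribute_value(trans_b[0]) > 0
print(0 if b_is_transposed else 1)  # 0: quantize B per channel along axis 0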

View File

@ -0,0 +1,117 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401
from .base_operator import QuantOperatorBase
"""
Quantize LSTM
"""
class LSTMQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
"""
parameter node: LSTM node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents the quantized LSTM node.
"""
node = self.node
assert node.op_type == "LSTM"
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
node.input[2]
):
super().quantize()
return
model = self.quantizer.model
W = model.get_initializer(node.input[1]) # noqa: N806
R = model.get_initializer(node.input[2]) # noqa: N806
if len(W.dims) != 3 or len(R.dims) != 3:
super().quantize()
return
[W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806
[R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806
if self.quantizer.is_per_channel():
del W.dims[0]
del R.dims[0]
W.dims[0] = W_num_dir * W_4_hidden_size
R.dims[0] = R_num_dir * R_4_hidden_size
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[2], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806
W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806
R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806
W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806
R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806
W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806
R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806
W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806
model.remove_initializers([W_quant_weight, R_quant_weight])
model.add_initializer(W_quant_transposed)
model.add_initializer(R_quant_transposed)
W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806
R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806
W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806
R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806
if self.quantizer.is_per_channel():
W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]
inputs = []
input_len = len(node.input)
inputs.extend([node.input[0]])
inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
inputs.extend([node.input[3] if input_len > 3 else ""])
inputs.extend([node.input[4] if input_len > 4 else ""])
inputs.extend([node.input[5] if input_len > 5 else ""])
inputs.extend([node.input[6] if input_len > 6 else ""])
inputs.extend([node.input[7] if input_len > 7 else ""])
inputs.extend(
[
quant_input_weight_tuple[2],
quant_input_weight_tuple[1],
quant_recurrent_weight_tuple[2],
quant_recurrent_weight_tuple[1],
]
)
kwargs = {}
for attribute in node.attribute:
if attribute.name == "layout":
continue
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
quant_lstm_name = "" if not node.name else node.name + "_quant"
quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
self.quantizer.new_nodes.append(quant_lstm_node)
dequantize_node = self.quantizer._dequantize_value(node.input[0])
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)

View File

@ -0,0 +1,228 @@
import itertools
import logging
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QOpMatMul(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
logging.debug(f"Ignore MatMul {self.node.name}]")
return False
if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
not self.quantizer.is_float_tensor(self.node.input[0])
):
logging.info(f"Ignore MatMul due to non float inputs {self.node.name}]")
return False
# do not quantize non-constant B matrices for matmul
if self.quantizer.q_matmul_const_b_only:
if not self.quantizer.find_initializer_in_path(self.node.input[1]):
logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
return False
return True
"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""
class MatMulInteger(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
matmul_integer_output = node.output[0] + "_output_quantized"
matmul_integer_name = node.name + "_quant" if node.name else ""
matmul_integer_node = onnx.helper.make_node(
"MatMulInteger",
quantized_input_names + zero_point_names,
[matmul_integer_output],
matmul_integer_name,
)
nodes.append(matmul_integer_node)
# Add cast operation to cast matmulInteger output to float.
cast_op_output = matmul_integer_output + "_cast_output"
otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_node = onnx.helper.make_node(
"Cast",
[matmul_integer_output],
[cast_op_output],
matmul_integer_output + "_cast",
to=otype,
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
scales_mul_op = (
matmul_integer_name + "_scales_mul"
if matmul_integer_name
else scale_names[0] + "_" + scale_names[1] + "_mul"
)
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
# Add mul operation to multiply mul_scales_op result with output of MatMulInteger
# and make the output of this node the same as output of original matmul node.
output_scale_mul_op = ""
if matmul_integer_name:
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
node.output[0],
output_scale_mul_op,
)
)
self.quantizer.new_nodes += nodes
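The MatMulInteger / Cast / Mul decomposition above approximates the float matmul because (q_a - z_a) @ (q_b - z_b) * s_a * s_b ≈ A @ B. A standalone numpy sketch of the same arithmetic:

import numpy as np

a = np.random.rand(2, 4).astype(np.float32)
b = np.random.rand(4, 3).astype(np.float32)
scale_a, zp_a = np.float32(a.max() / 255), 0   # uint8 activation, zero point 0 for simplicity
scale_b, zp_b = np.float32(b.max() / 127), 0   # int8 weight, symmetric
qa = np.clip(np.round(a / scale_a) + zp_a, 0, 255).astype(np.uint8)
qb = np.clip(np.round(b / scale_b) + zp_b, -127, 127).astype(np.int8)
acc = (qa.astype(np.int32) - zp_a) @ (qb.astype(np.int32) - zp_b)  # what MatMulInteger computes
approx = acc.astype(np.float32) * (scale_a * scale_b)              # Cast + scale Mul
print(np.abs(approx - a @ b).max())  # small quantization error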
"""
Used when quantize mode is QuantizationMode.QLinearOps
"""
class QLinearMatMul(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_matmul_name = node.name + "_quant" if node.name else ""
qlinear_matmul_inputs = []
# Input 0
qlinear_matmul_inputs.append(quantized_input_names[0])
qlinear_matmul_inputs.append(scale_names[0])
qlinear_matmul_inputs.append(zero_point_names[0])
# Input 1
qlinear_matmul_inputs.append(quantized_input_names[1])
qlinear_matmul_inputs.append(scale_names[1])
qlinear_matmul_inputs.append(zero_point_names[1])
# Output quantization parameter
qlinear_matmul_inputs.append(output_scale_name)
qlinear_matmul_inputs.append(output_zp_name)
domain = (
"com.microsoft"
if self.quantizer.weight_qType
in {
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
}
else ""
)
qlinear_matmul_node = onnx.helper.make_node(
"QLinearMatMul",
qlinear_matmul_inputs,
[qlinear_matmul_output],
qlinear_matmul_name,
domain=domain,
)
nodes.append(qlinear_matmul_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_matmul_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQMatMul(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
if self.disable_qdq_for_node_output:
nodes_to_iterate = node.input
else:
nodes_to_iterate = itertools.chain(node.input, node.output)
for tensor_name in nodes_to_iterate:
is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
tensor_name, default_axis=1, op_type=node.op_type
)
if is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
else:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QMaxPool(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if version is less than 12, go to normal quantize.
if self.quantizer.opset_version < 12:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQMaxPool(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if opset version is less than 12, leave this node unchanged
if self.quantizer.opset_version < 12:
return
# Direct 8bits op
return super().quantize()

View File

@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from .qdq_base_operator import QDQOperatorBase
class QDQNormalization(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}
# Input
self.quantizer.quantize_activation_tensor(node.input[0])
# Scale
scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=1, op_type=node.op_type
)
if scale_is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
elif scale_is_initializer:
self.quantizer.quantize_weight_tensor(node.input[1])
else:
self.quantizer.quantize_activation_tensor(node.input[1])
# Bias
if len(node.input) > 2 and node.input[2]:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
# Output
if not self.disable_qdq_for_node_output:
for output_name in node.output:
self.quantizer.quantize_activation_tensor(output_name)

View File

@ -0,0 +1,100 @@
import onnx
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
quantize_nparray,
)
from .base_operator import QuantOperatorBase
class QPad(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Pad"
# The optional 'constant_value' input is only available since opset 11.
# If input[0] is not quantized, do not quantize this node.
if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
super().quantize()
return
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
kwargs = {}
for attribute in node.attribute:
kv = attribute_to_kwarg(attribute)
kwargs.update(kv)
if "mode" not in kwargs or kwargs["mode"] == b"constant":
if len(node.input) > 2 and node.input[2] != "": # There is 3rd input 'constant_value'
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
if zp_tensor is None or scale_tensor is None:
super().quantize()
return
padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
if padding_constant_initializer is not None:
zp_array = onnx.numpy_helper.to_array(zp_tensor)
zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
scale_array = onnx.numpy_helper.to_array(scale_tensor)
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
quantized_padding_constant_array = quantize_nparray(
self.quantizer.activation_qType,
padding_constant_array,
scale_value,
zp_value,
)
quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
quantized_padding_constant_array,
quantized_padding_constant_name,
)
# Assume this padding constant initializer is only used by this node.
self.quantizer.model.remove_initializer(padding_constant_initializer)
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
node.input[2] = quantized_padding_constant_name
else:
# TODO: check quantize_inputs after sub graph is supported
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
node,
2,
self.quantizer.activation_qType,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
initial_type=scale_tensor.data_type,
)
self.quantizer.new_nodes.extend(pad_value_qnodes)
node.input[2] = pad_value_qnodes[0].output[0]
else:
# In the quantized domain, the real value `zero` maps to
# quantized_input_value.zp_name. Thus, padding the original tensor
# with 0 becomes padding the quantized tensor with the zero point.
if len(node.input) == 2:
# Feed quantization's zero point to padding node.
node.input.append(quantized_input_value.zp_name)
else:
# Assign quantization's zero point to padding node.
assert node.input[2] == ""
node.input[2] = quantized_input_value.zp_name
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
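The zero-point padding above is correct because in the affine scheme q = round(x / s) + zp the real value 0.0 maps exactly to zp, so padding the quantized tensor with zp dequantizes back to zero padding. A quick standalone check:

import numpy as np

scale, zp = np.float32(0.1), 128
x = np.array([0.5, -0.3], dtype=np.float32)
q = np.clip(np.round(x / scale) + zp, 0, 255).astype(np.uint8)
q_padded = np.pad(q, (1, 1), constant_values=zp)      # pad with the zero point
print((q_padded.astype(np.float32) - zp) * scale)     # ~[ 0.   0.5 -0.3  0. ]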

View File

@ -0,0 +1,67 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearPool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear pool node for given type (AveragePool, etc)
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes

View File

@ -0,0 +1,22 @@
import itertools
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401
from .base_operator import QuantOperatorBase # noqa: F401
class QDQOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization
def quantize(self):
node = self.node
if self.disable_qdq_for_node_output:
tensors_to_quantize = node.input
else:
tensors_to_quantize = itertools.chain(node.input, node.output)
for tensor_name in tensors_to_quantize:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QResize(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, go to normal quantize.
if self.quantizer.opset_version < 11:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQResize(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, just keep this node
if self.quantizer.opset_version < 11:
return
# Direct 8bits op
return super().quantize()

View File

@ -0,0 +1,74 @@
import onnx
import onnx.helper
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearSoftmax(QuantOperatorBase):
def quantize(self):
node = self.node
# Constrain the softmax output scale and zero point, since softmax outputs always lie in [0, 1].
if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
out_scale = 1 / 256.0
out_zero_point = 0
else:
out_scale = 1 / 256.0
out_zero_point = -128
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear softmax node for given type
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# Give QLinearSoftmax the real opset_version; otherwise its default SinceVersion would be 1.
kwargs["opset"] = self.quantizer.opset_version
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes
return None
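The fixed output parameters above work because softmax outputs lie in [0, 1): with scale 1/256, uint8 with zero point 0 and int8 with zero point -128 cover exactly the same real range [0, 255/256]. A quick check:

import numpy as np

scale = 1 / 256.0
print((np.array([0, 255]) - 0) * scale)          # [0.         0.99609375]
print((np.array([-128, 127]) - (-128)) * scale)  # [0.         0.99609375]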

View File

@ -0,0 +1,63 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QSplit(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
quantized_node_name = ""
if node.name:
quantized_node_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
# Outputs simply derive their scale/zero point from the input.
quantized_output_names = []
for output_name in node.output:
quantized_output_name = output_name + "quantized"
quantized_output_names.append(quantized_output_name)
q_output = QuantizedValue(
output_name,
quantized_output_name,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[output_name] = q_output
if len(node.input) > 1:
quantized_input_names.extend(node.input[1:])
quantized_node = onnx.helper.make_node(
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
)
nodes.append(quantized_node)
self.quantizer.new_nodes += nodes
class QDQSplit(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Split"
if not self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)

View File

@ -0,0 +1,87 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearWhere(QuantOperatorBase):
def should_quantize(self):
return True
def quantize(self):
node = self.node
assert node.op_type == "Where"
if not self.quantizer.force_quantize_no_input_check:
self.quantizer.new_nodes += [node]
return
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [1, 2])
if not data_found or q_input_names is None:
return super().quantize()
qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_output_name = node.name + "_quant" if node.name else ""
q_output = QuantizedValue(
node.output[0],
qlinear_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlwhere_inputs = [
node.input[0],
q_input_names[0],
scale_names[0],
zero_point_names[0],
q_input_names[1],
scale_names[1],
zero_point_names[1],
output_scale_name,
output_zp_name,
]
qlwhere_node = onnx.helper.make_node(
"QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlwhere_node]
class QDQWhere(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Where"
if self.quantizer.force_quantize_no_input_check:
if not self.quantizer.is_tensor_quantized(node.input[1]):
self.quantizer.quantize_activation_tensor(node.input[1])
if not self.quantizer.is_tensor_quantized(node.input[2]):
self.quantizer.quantize_activation_tensor(node.input[2])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_activation_tensor(output)
elif (
self.quantizer.is_tensor_quantized(node.input[1])
and self.quantizer.is_tensor_quantized(node.input[2])
and not self.disable_qdq_for_node_output
):
for output in node.output:
self.quantizer.quantize_activation_tensor(output)

View File

@ -0,0 +1,141 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import logging
import sys
from .shape_inference import quant_pre_process
logger = logging.getLogger(__name__)
def parse_arguments():
parser = argparse.ArgumentParser(
description="""Model optimizer and shape inferencer, in preparation for quantization,
Consists of three optional steps:
1. Symbolic shape inference (best for transformer models).
2. Model optimization.
3. ONNX shape inference.
Model quantization with QDQ format, i.e. inserting QuantizeLinear/DeQuantizeLinear on
the tensor, requires tensor shape information to perform its best. Currently, shape inferencing
works best on an optimized model. As a result, it is highly recommended to run quantization
on an optimized model with shape information. This is the tool for optimization and shape
inferencing.
Essentially this tool performs the following three (skippable) steps:
1. Symbolic shape inference.
2. Model optimization
3. ONNX shape inference"""
)
parser.add_argument("--input", required=True, help="Path to the input model file")
parser.add_argument("--output", required=True, help="Path to the output model file")
parser.add_argument(
"--skip_optimization",
type=bool,
default=False,
help="Skip model optimization step if true. It's a known issue that ORT"
" optimization has difficulty with model size greater than 2GB, rerun with"
" this option to get around this issue.",
)
parser.add_argument(
"--skip_onnx_shape",
type=bool,
default=False,
help="Skip ONNX shape inference. Symbolic shape inference is most effective"
" with transformer based models. Skipping all shape inferences may"
" reduce the effectiveness of quantization, as a tensor with unknown"
" shape can not be quantized.",
)
parser.add_argument(
"--skip_symbolic_shape",
type=bool,
default=False,
help="Skip symbolic shape inference. Symbolic shape inference is most"
" effective with transformer based models. Skipping all shape"
" inferences may reduce the effectiveness of quantization, as a tensor"
" with unknown shape can not be quantized.",
)
parser.add_argument(
"--auto_merge",
help="Automatically merge symbolic dims when confliction happens",
action="store_true",
default=False,
)
parser.add_argument(
"--int_max",
help="maximum value for integer to be treated as boundless for ops like slice",
type=int,
default=2**31 - 1,
)
parser.add_argument(
"--guess_output_rank",
help="guess output rank to be the same as input 0 for unknown ops",
action="store_true",
default=False,
)
parser.add_argument(
"--verbose",
help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
type=int,
default=0,
)
parser.add_argument(
"--save_as_external_data",
help="Saving an ONNX model to external data",
action="store_true",
default=False,
)
parser.add_argument(
"--all_tensors_to_one_file",
help="Saving all the external data to one file",
action="store_true",
default=False,
)
parser.add_argument(
"--external_data_location",
help="The file location to save the external file",
default=None,
)
parser.add_argument(
"--external_data_size_threshold",
help="The size threshold for external data",
type=int,
default=1024,
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
if args.skip_optimization and args.skip_onnx_shape and args.skip_symbolic_shape:
logger.error("Skipping all three steps, nothing to be done. Quitting...")
sys.exit()
if (not args.skip_optimization) and args.save_as_external_data:
logger.error("ORT model optimization does not support external data yet!")
sys.exit()
logger.info("input model: %s", args.input)
logger.info("output model: %s", args.output)
quant_pre_process(
args.input,
args.output,
args.skip_optimization,
args.skip_onnx_shape,
args.skip_symbolic_shape,
args.auto_merge,
args.int_max,
args.guess_output_rank,
args.verbose,
args.save_as_external_data,
args.all_tensors_to_one_file,
args.external_data_location,
args.external_data_size_threshold,
)
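The same preprocessing can be driven from Python; a sketch mirroring the positional call in the __main__ block above, with the argument parser's defaults spelled out. The two model paths are placeholders.

from onnxruntime.quantization.shape_inference import quant_pre_process

quant_pre_process(
    "model.onnx",               # input model path (placeholder)
    "model.preprocessed.onnx",  # output model path (placeholder)
    False,                      # skip_optimization
    False,                      # skip_onnx_shape
    False,                      # skip_symbolic_shape
    False,                      # auto_merge
    2**31 - 1,                  # int_max
    False,                      # guess_output_rank
    0,                          # verbose
    False,                      # save_as_external_data
    False,                      # all_tensors_to_one_file
    None,                       # external_data_location
    1024,                       # external_data_size_threshold
)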

View File

@ -0,0 +1,389 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Utilities to run a given ONNX model, while saving input/output tensors of
eligible operator nodes.
A use case is to debug quantization induced accuracy drop. An AI engineer can
run the original float32 model and the quantized model with the same inputs,
then compare the corresponding activations between the two models to find
where the divergence is.
Example Usage:
```python
class ExampleDataReader(CalibrationDataReader):
def __init__(self):
...
def get_next(self):
...
input_data_reader = ExampleDataReader()
augmented_model_path = str(Path(self._tmp_model_dir.name).joinpath("augmented_model.onnx"))
modify_model_output_intermediate_tensors(path_to_onnx_model, augmented_model_path)
tensor_dict = collect_activations(augmented_model_path, input_data_reader)
```
`tensor_dict` points to a dictionary where the keys are tensor names and each value
is a list of tensors, one from each model run
"""
import logging
import math
import time
from pathlib import Path
from typing import Callable, Dict, List, Optional, Sequence, Union
import numpy
import onnx
from onnx import helper, numpy_helper
import onnxruntime
from .calibrate import CalibraterBase, CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import (
DEQUANT_OP_NAME,
DEQUANT_OUTPUT_SUFFIX,
QUANT_INPUT_SUFFIX,
TENSOR_NAME_QUANT_SUFFIX,
find_by_name,
load_model_with_shape_infer,
)
_TENSOR_SAVE_POSTFIX = "_ReshapedSavedOutput"
_TENSOR_SAVE_POSTFIX_LEN = len(_TENSOR_SAVE_POSTFIX)
def modify_model_output_intermediate_tensors(
input_model_path: Union[str, Path],
output_model_path: Union[str, Path],
op_types_for_saving: Optional[Sequence[str]] = None,
save_as_external_data: bool = False,
) -> None:
"""Augment a given ONNX model to save node input/output tensors.
Add all input/output tensors of operator nodes to model outputs
so that their values can be retrieved for debugging purposes.
Args:
input_model_path: the path to load the model from.
output_model_path: the path to save the augmented model to.
op_types_for_saving: Operator types for which the
input/output should be saved. By default, all the
float32/float16 tensors are saved.
save_as_external_data: whether to save the augmented model with external data.
Returns:
None. The augmented model is saved to output_model_path.
"""
if op_types_for_saving is None:
op_types_for_saving = []
saver = CalibraterBase(input_model_path, op_types_to_calibrate=op_types_for_saving)
model_to_augment = saver.model
tensors, value_infos = saver.select_tensors_to_calibrate(model_to_augment)
reshape_shape_name = "LinearReshape_" + str(time.time())
reshape_shape = numpy_helper.from_array(numpy.array([-1], dtype=numpy.int64), reshape_shape_name)
model_to_augment.graph.initializer.append(reshape_shape)
for tensor_name in tensors:
reshape_output = tensor_name + _TENSOR_SAVE_POSTFIX
reshape_node = onnx.helper.make_node(
"Reshape",
inputs=[tensor_name, reshape_shape_name],
outputs=[reshape_output],
name=reshape_output,
)
model_to_augment.graph.node.append(reshape_node)
reshape_output_value_info = helper.make_tensor_value_info(
reshape_output, value_infos[tensor_name].type.tensor_type.elem_type, [-1]
)
model_to_augment.graph.output.append(reshape_output_value_info)
onnx.save(
model_to_augment,
output_model_path,
save_as_external_data=save_as_external_data,
)
def collect_activations(
augmented_model: str,
input_reader: CalibrationDataReader,
session_options=None,
execution_providers: Optional[Sequence[str]] = None,
) -> Dict[str, List[numpy.ndarray]]:
"""Run augmented model and collect activations tensors.
Args:
augmented_model: Path to the augmented model created by modify_model_output_intermediate_tensors().
input_reader: Logic for reading inputs for the model; the augmented model has the
same inputs as the original model.
session_options: Optional OnnxRuntime session options for controlling model run.
By default graph optimization is turned off
execution_providers: Collection of execution providers for running the model.
Only CPU EP is used by default.
Returns:
A dictionary where each key is a tensor name and each value is a list of tensors, one from each batch
"""
if session_options is None:
session_options = onnxruntime.SessionOptions()
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
if execution_providers is None:
execution_providers = ["CPUExecutionProvider"]
inference_session = onnxruntime.InferenceSession(
augmented_model,
sess_options=session_options,
providers=execution_providers,
)
intermediate_outputs = []
for input_d in input_reader:
intermediate_outputs.append(inference_session.run(None, input_d))
if not intermediate_outputs:
raise RuntimeError("No data is collected while running augmented model!")
output_dict = {}
output_info = inference_session.get_outputs()
for batch in intermediate_outputs:
for output, output_data in zip(output_info, batch):
if output.name.endswith(_TENSOR_SAVE_POSTFIX):
output_name = output.name[:-_TENSOR_SAVE_POSTFIX_LEN]
output_dict.setdefault(output_name, []).append(output_data)
return output_dict
_POST_QDQ_POSTFIX1 = DEQUANT_OUTPUT_SUFFIX + "_1"
def _add_pre_post_qdq_pair(
qdq_cmp: Dict[str, Dict[str, Sequence[numpy.ndarray]]],
activation_name: str,
pre_qdq_tensors: Optional[Sequence[numpy.ndarray]],
post_qdq_tensors: Optional[Sequence[numpy.ndarray]],
) -> None:
if post_qdq_tensors is not None and pre_qdq_tensors is not None:
qdq_cmp[activation_name] = {}
qdq_cmp[activation_name]["pre_qdq"] = pre_qdq_tensors
qdq_cmp[activation_name]["post_qdq"] = post_qdq_tensors
def create_activation_matching(
qdq_activations: Dict[str, Sequence[numpy.ndarray]],
float_activations: Optional[Dict[str, Sequence[numpy.ndarray]]] = None,
) -> Dict[str, Dict[str, Sequence[numpy.ndarray]]]:
"""Comparing activation values to help debugging accuracy loss due to quantization.
This functions takes saved activations from the QDQ model and (optionally) the
float point model, and provides a data structure for comparing:
* from the qdq model, activation values before and after QDQ operation
* across both models, activations from the orignal model vs the corresponding
activations in the QDQ model
Arg:
qdq_activations: Output of `collect_activations`. This must be from a quantized
model with QDQ format.
float_activations: Output of `collect_activations`. This must be from the float
point model.
Returns:
Dict for comparing pre and post quantized activation tensors. E.g.
```
qdq_cmp = create_activation_matching(qdq_activations)
print(qdq_cmp['activation1']['pre_qdq'][0])
print(qdq_cmp['activation1']['post_qdq'][0])
qdq_cmp = create_activation_matching(qdq_activations, float_activations)
print(qdq_cmp['activation1']['float'][0])
print(qdq_cmp['activation1']['pre_qdq'][0])
print(qdq_cmp['activation1']['post_qdq'][0])
```
"""
qdq_cmp: Dict[str, Dict[str, Sequence[numpy.ndarray]]] = {}
for tensor_name, tensors in qdq_activations.items():
if tensor_name.endswith(QUANT_INPUT_SUFFIX):
pre_name = tensor_name[: -len(QUANT_INPUT_SUFFIX)]
post_qdq_tensors = qdq_activations.get(pre_name)
pre_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
elif tensor_name.endswith(DEQUANT_OUTPUT_SUFFIX):
pre_name = tensor_name[: -len(DEQUANT_OUTPUT_SUFFIX)]
pre_qdq_tensors = qdq_activations.get(pre_name)
post_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
elif tensor_name.endswith(_POST_QDQ_POSTFIX1):
pre_name = tensor_name[: -len(_POST_QDQ_POSTFIX1)]
pre_qdq_tensors = qdq_activations.get(pre_name)
post_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
if not float_activations:
return qdq_cmp
for act_name, act_values in qdq_cmp.items():
float_acts = float_activations.get(act_name)
if float_acts is not None:
act_values["float"] = float_acts
return qdq_cmp
def _run_dequantize_linear(
weight_tensor: numpy.ndarray, weight_scale: numpy.ndarray, weight_zp: numpy.ndarray, channel_axis: int
) -> Optional[numpy.ndarray]:
assert weight_scale.shape == weight_zp.shape
if weight_zp.size == 1:
return (weight_tensor - weight_zp) * weight_scale
assert weight_zp.ndim == 1
reshape_dims = list(weight_tensor.shape) # deep copy
reshape_dims[channel_axis] = 1 # only one per channel for reshape
channel_count = weight_tensor.shape[channel_axis]
dequantized_weights = None
for i in range(channel_count):
per_channel_data = weight_tensor.take(i, channel_axis)
dequantized_per_channel_data = (per_channel_data - weight_zp[i]) * weight_scale[i]
if i == 0:
dequantized_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
else:
channel_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
dequantized_weights = numpy.concatenate((dequantized_weights, channel_weights), channel_axis)
if dequantized_weights is None:
return None
dequantized_weights = dequantized_weights.reshape(weight_tensor.shape)
return dequantized_weights
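The per-channel loop above is equivalent to broadcasting the per-channel scale and zero point along the channel axis; a standalone numpy check for channel_axis = 0:

import numpy as np

qw = np.array([[10, 20], [30, 40]], dtype=np.int8)
scale = np.array([0.1, 0.01], dtype=np.float32)  # one scale per channel on axis 0
zp = np.array([0, 5], dtype=np.int8)
per_channel = np.stack([(qw.take(i, 0) - zp[i]) * scale[i] for i in range(2)])
broadcast = (qw.astype(np.float32) - zp.reshape(-1, 1)) * scale.reshape(-1, 1)
assert np.allclose(per_channel, broadcast)  # [[1.0, 2.0], [0.25, 0.35]]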
def create_weight_matching(float_model_path: str, qdq_model_path: str) -> Dict[str, Dict[str, numpy.ndarray]]:
"""Comparing weight values to help debugging accuracy loss due to quantization.
This functions takes the float model and the qdq model, and provides a data structure for comparing
their corresponding weights to locate quantization errors
Arg:
float_model_path: Path points to the float point model.
qdq_model_path: Path points to the qdq model.
Returns:
Dict for comparing weight tensors. E.g.
```
qdq_weight_cmp = create_weight_matching(float_model, qdq_model)
print(qdq_weight_cmp['activation1']['float'])
print(qdq_weight_cmp['activation1']['dequantized'])
```
"""
float_onnx_model = ONNXModel(load_model_with_shape_infer(Path(float_model_path)))
qdq_onnx_model = ONNXModel(load_model_with_shape_infer(Path(qdq_model_path)))
matched_weights: Dict[str, Dict[str, numpy.ndarray]] = {}
initializers = qdq_onnx_model.initializer()
for node in qdq_onnx_model.nodes():
if node.op_type != DEQUANT_OP_NAME:
continue # Only care about DQ node
weight_name: str = node.input[0]
weight_values = find_by_name(weight_name, initializers)
if not weight_values:
continue # Only care about DQ node with const inputs
if not weight_name.endswith(TENSOR_NAME_QUANT_SUFFIX):
logging.error(f"Model Error in '{qdq_model_path}': Dequantized tensor name '{weight_name}' not recognized!")
continue
axis = -1
for attr in node.attribute:
if attr.name == "axis":
axis = attr.i
weight_tensor = numpy_helper.to_array(weight_values)
weight_scale = numpy_helper.to_array(find_by_name(node.input[1], initializers))
if len(node.input) > 2:
weight_zp = numpy_helper.to_array(find_by_name(node.input[2], initializers))
else:
weight_zp = numpy.zeros(weight_scale.shape, dtype=numpy.int32)
# Perform dequantization:
if weight_scale.size == weight_zp.size == 1:
# Avoids confusion between a scalar and a tensor of one element.
weight_scale = weight_scale.reshape(tuple())
weight_zp = weight_zp.reshape(tuple())
if weight_scale.shape != weight_zp.shape:
raise RuntimeError(
f"scale and zero_point must have the same shape but {weight_scale.shape} != {weight_zp.shape}"
)
weight_quant = _run_dequantize_linear(weight_tensor, weight_scale, weight_zp, channel_axis=axis)
weight_name = weight_name[: -len(TENSOR_NAME_QUANT_SUFFIX)]
if weight_quant is None:
logging.error(f"Model Error in '{qdq_model_path}': '{weight_name}' per-channel quantization on 0 channel")
continue
float_values = find_by_name(weight_name, float_onnx_model.initializer())
if not float_values:
logging.error(f"Model Error in '{float_model_path}': weight tensor '{weight_name}' not found!")
continue
weight_float = numpy_helper.to_array(float_values)
matched_weights[weight_name] = {"float": weight_float, "dequantized": weight_quant}
return matched_weights
def compute_signal_to_quantization_noice_ratio(
x: Union[Sequence[numpy.ndarray], numpy.ndarray], y: Union[Sequence[numpy.ndarray], numpy.ndarray]
) -> float:
if isinstance(x, numpy.ndarray):
xlist = [x]
else:
xlist = x
if isinstance(y, numpy.ndarray):
ylist = [y]
else:
ylist = y
if len(xlist) != len(ylist):
raise RuntimeError("Unequal number of tensors to compare!")
left = numpy.concatenate(xlist).flatten()
right = numpy.concatenate(ylist).flatten()
epsilon = numpy.finfo("float").eps
tensor_norm = max(numpy.linalg.norm(left), epsilon)
diff_norm = max(numpy.linalg.norm(left - right), epsilon)
res = tensor_norm / diff_norm
return 20 * math.log10(res)
def compute_weight_error(
weights_match: Dict[str, Dict[str, numpy.ndarray]],
err_func: Callable[[numpy.ndarray, numpy.ndarray], float] = compute_signal_to_quantization_noice_ratio,
) -> Dict[str, float]:
result: Dict[str, float] = {}
for weight_name, weight_match in weights_match.items():
result[weight_name] = err_func(weight_match["float"], weight_match["dequantized"])
return result
def compute_activation_error(
activations_match: Dict[str, Dict[str, Sequence[numpy.ndarray]]],
err_func: Callable[
[Sequence[numpy.ndarray], Sequence[numpy.ndarray]], float
] = compute_signal_to_quantization_noice_ratio,
) -> Dict[str, Dict[str, float]]:
result: Dict[str, Dict[str, float]] = {}
for name, match in activations_match.items():
err_result: Dict[str, float] = {}
err_result["qdq_err"] = err_func(match["pre_qdq"], match["post_qdq"])
float_activation = match["float"]
if float_activation:
err_result["xmodel_err"] = err_func(float_activation, match["post_qdq"])
result[name] = err_result
return result
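A worked example of the SQNR metric used by compute_weight_error and compute_activation_error above, 20 * log10(||x|| / ||x - y||) in dB, computed directly with numpy for a toy signal and a slightly perturbed copy:

import math
import numpy as np

x = np.linspace(-1.0, 1.0, 1000).astype(np.float32)
y = x + np.float32(1e-3)  # stand-in for a dequantized copy of x
sqnr_db = 20 * math.log10(np.linalg.norm(x) / np.linalg.norm(x - y))
print(round(sqnr_db, 1))  # roughly 55 dB; higher means less quantization noise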

File diff suppressed because it is too large

View File

@ -0,0 +1,866 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
import os
import tempfile
from enum import Enum
from pathlib import Path
import numpy
import onnx
from onnx import ModelProto, TensorProto, external_data_helper
from onnx import onnx_pb as onnx_proto
from onnx.helper import make_graph, make_model, make_node, make_tensor_value_info
from onnx.reference import ReferenceEvaluator
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
try:
from onnx.reference.custom_element_types import float8e4m3fn
except ImportError:
float8e4m3fn = None
# INT4 np.dtypes added in ONNX 1.16. These map to np.int8/np.uint8 because numpy
# does not support sub-byte types.
try:
from onnx.reference.custom_element_types import int4, uint4
except ImportError:
int4 = None
uint4 = None
__producer__ = "onnx.quantize"
__version__ = "0.1.0"
onnx_domain = "ai.onnx"
ms_domain = "com.microsoft"
QUANT_OP_NAME = "QuantizeLinear"
QUANT_INPUT_SUFFIX = "_QuantizeLinear_Input"
DEQUANT_OP_NAME = "DequantizeLinear"
DEQUANT_OUTPUT_SUFFIX = "_DequantizeLinear_Output"
TENSOR_NAME_QUANT_SUFFIX = "_quantized"
FLOAT8_DISTRIBUTIONS = {}
type_to_name = {getattr(TensorProto, k): k for k in dir(TensorProto) if isinstance(getattr(TensorProto, k), int)}
# Quantization mode
# IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now.
# QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now.
class QuantizationMode(Enum):
IntegerOps = 0
QLinearOps = 1
def __str__(self):
return self.name
@staticmethod
def from_string(mode):
try:
return QuantizationMode[mode]
except KeyError:
raise ValueError() # noqa: B904
class QuantizedValueType(Enum):
Input = 0
Initializer = 1
def __str__(self):
return self.name
@staticmethod
def from_string(v):
try:
return QuantizedValueType[v]
except KeyError:
raise ValueError() # noqa: B904
class QuantType(Enum):
QInt8 = 0
QUInt8 = 1
QFLOAT8E4M3FN = 2
QInt16 = 3
QUInt16 = 4
QInt4 = 5
QUInt4 = 6
def __str__(self):
return self.name
@staticmethod
def from_string(t):
try:
return QuantType[t]
except KeyError:
raise ValueError() # noqa: B904
@property
def tensor_type(self):
if self == QuantType.QInt8:
return TensorProto.INT8
if self == QuantType.QUInt8:
return TensorProto.UINT8
if self == QuantType.QUInt16:
return TensorProto.UINT16
if self == QuantType.QInt16:
return TensorProto.INT16
if self == QuantType.QFLOAT8E4M3FN:
return TensorProto.FLOAT8E4M3FN
if self == QuantType.QUInt4:
return TensorProto.UINT4
if self == QuantType.QInt4:
return TensorProto.INT4
raise ValueError(f"Unexpected value qtype={self!r}.")
class QuantFormat(Enum):
QOperator = 0
QDQ = 1
def __str__(self):
return self.name
@staticmethod
def from_string(format):
try:
return QuantFormat[format]
except KeyError:
raise ValueError() # noqa: B904
ONNX_TYPE_TO_NP_TYPE = {
onnx_proto.TensorProto.INT8: numpy.dtype("int8"),
onnx_proto.TensorProto.UINT8: numpy.dtype("uint8"),
onnx_proto.TensorProto.INT16: numpy.dtype("int16"),
onnx_proto.TensorProto.UINT16: numpy.dtype("uint16"),
onnx_proto.TensorProto.FLOAT8E4M3FN: float8e4m3fn,
onnx_proto.TensorProto.INT4: int4, # base_dtype is np.int8
onnx_proto.TensorProto.UINT4: uint4, # base_dtype is np.uint8
}
ONNX_INT_TYPE_RANGE = {
onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(255, dtype=numpy.uint8)),
onnx_proto.TensorProto.INT8: (numpy.array(-128, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)),
onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(65535, dtype=numpy.uint16)),
onnx_proto.TensorProto.INT16: (numpy.array(-32768, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)),
onnx_proto.TensorProto.UINT4: (numpy.array(0, dtype=uint4), numpy.array(15, dtype=uint4)),
onnx_proto.TensorProto.INT4: (numpy.array(-8, dtype=int4), numpy.array(7, dtype=int4)),
}
ONNX_INT_TYPE_SYMMETRIC_RANGE = {
onnx_proto.TensorProto.INT8: (numpy.array(-127, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)),
onnx_proto.TensorProto.INT16: (numpy.array(-32767, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)),
}
ONNX_INT_TYPE_REDUCED_RANGE = {
onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(127, dtype=numpy.uint8)),
onnx_proto.TensorProto.INT8: (numpy.array(-64, dtype=numpy.int8), numpy.array(64, dtype=numpy.int8)),
onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(32767, dtype=numpy.uint16)),
onnx_proto.TensorProto.INT16: (numpy.array(-16384, dtype=numpy.int16), numpy.array(16384, dtype=numpy.int16)),
onnx_proto.TensorProto.UINT4: (numpy.array(0, dtype=int4), numpy.array(7, dtype=int4)),
onnx_proto.TensorProto.INT4: (numpy.array(-4, dtype=int4), numpy.array(3, dtype=int4)),
}
def _check_type(*args, zero_point_index=-1):
new_args = []
for i, a in enumerate(args):
if numpy.issubdtype(type(a), numpy.number):
new_args.append(numpy.array(a))
elif isinstance(a, numpy.ndarray):
new_args.append(a)
else:
raise TypeError(f"arg {i} is not an array: {a}")
if i == zero_point_index:
v = new_args[-1]
if v.dtype == numpy.float32 or v.dtype == numpy.float16:
raise TypeError(f"zero_point cannot be {v.dtype}")
return tuple(new_args) if len(new_args) > 1 else new_args[0]
def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None):
assert (
qType in ONNX_TYPE_TO_NP_TYPE
), f"Unexpected data type {qType} requested. Only INT8, UINT8, INT16, and UINT16 are supported."
if qType in (
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
):
if zero_point != 0:
raise NotImplementedError(f"zero_point is expected to be null for float 8 not {zero_point!r}.")
if arr.dtype == numpy.float32:
onnx_type = TensorProto.FLOAT
elif arr.dtype == numpy.float16:
onnx_type = TensorProto.FLOAT16
else:
raise ValueError(f"Unexpected dtype {arr.dtype}.")
onnx_model = make_model(
make_graph(
[
make_node(
"Constant", [], ["zero_point"], value=onnx.helper.make_tensor("zero_point", qType, [], [0])
),
make_node("QuantizeLinear", ["X", "scale", "zero_point"], ["Y"]),
],
"qu",
[
make_tensor_value_info("X", onnx_type, None),
make_tensor_value_info("scale", onnx_type, None),
],
[make_tensor_value_info("Y", qType, None)],
)
)
ref = ReferenceEvaluator(onnx_model)
return _check_type(ref.run(None, {"X": arr, "scale": scale})[0])
else:
# Quantizes data for all integer types.
#
# For int4 types, the quantized data is returned as either np.int8 or np.uint8,
# which matches the python reference ONNX implementation of QuantizeLinear.
# This data can be packed into 4-bit elements by using pack_bytes_to_4bit().
dtype = ONNX_TYPE_TO_NP_TYPE[qType]
(qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True)
cliplow = max(qmin, low) if low is not None else qmin
cliphigh = min(qmax, high) if high is not None else qmax
arr_fp32 = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
numpy.clip(arr_fp32, cliplow, cliphigh, out=arr_fp32)
return _check_type(arr_fp32.astype(dtype))
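# A minimal sketch of exercising quantize_nparray for UINT8 data; the array values and
# quantization parameters below are illustrative only.
def _example_quantize_nparray():  # hypothetical helper for illustration
    data = numpy.array([-1.0, 0.0, 0.5, 1.0], dtype=numpy.float32)
    scale = numpy.array(1.0 / 255.0, dtype=numpy.float32)
    zero_point = numpy.array(128, dtype=numpy.uint8)
    # Values are divided by the scale, rounded, shifted by the zero point, and clipped to [0, 255].
    return quantize_nparray(onnx_proto.TensorProto.UINT8, data, scale, zero_point)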
def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=None):
"""Calculate the scale s and zero point z for the quantization relation
r = s(q-z), where r are the original values and q are the corresponding
quantized values.
z and s are calculated such that every value within [rmin,rmax] has an
approximate representation within [qmin,qmax]. In addition, qmin <= z <=
qmax is enforced. If the symmetric flag is set to True, the interval
[rmin,rmax] is symmetrized to [-absmax, +absmax], where
absmax = max(abs(rmin), abs(rmax)).
:parameter rmin: minimum value of r
:parameter rmax: maximum value of r
:parameter qmin: minimum value representable by the target quantization data type
:parameter qmax: maximum value representable by the target quantization data type
:parameter symmetric: True if the floating-point range should be made symmetric. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:return: zero and scale [z, s]
"""
if qmin > 0 or qmax < 0:
raise ValueError(f"qmin and qmax must meet requirement: qmin <= 0 <= qmax while qmin:{qmin}, qmmax:{qmax}")
# Adjust rmin and rmax such that 0 is included in the range. This is
# required to make sure zero can be represented by the quantization data
# type (i.e. to make sure qmin <= zero_point <= qmax)
rmin = numpy.minimum(rmin, numpy.array(0, dtype=rmin.dtype))
rmax = numpy.maximum(rmax, numpy.array(0, dtype=rmax.dtype))
# Ensure a minimum floating-point range if specified.
if min_real_range is not None:
rmax = max(rmax, rmin + min_real_range)
if symmetric:
absmax = numpy.maximum(numpy.abs(rmin), numpy.abs(rmax))
rmin = -absmax
rmax = +absmax
assert qmin <= qmax, f"qmin={rmin} > qmax={rmax}"
dr = numpy.array(rmax - rmin, dtype=numpy.float64)
dq = numpy.array(qmax, dtype=numpy.float64) - numpy.array(qmin, dtype=numpy.float64)
scale = numpy.array(dr / dq)
assert scale >= 0, "scale isse"
if scale < numpy.finfo(rmax.dtype).tiny:
scale = numpy.array(1.0, dtype=rmax.dtype)
zero_point = numpy.array(0, dtype=qmin.dtype)
else:
if symmetric:
# When symmetric (i.e., rmax == -rmin), the zero_point formula reduces to round((qmax + qmin) / 2.0).
# This simpler formula doesn't depend on scale and guarantees that the zero point values
# for int8, uint8, int16, and uint16 are always 0, 128, 0, and 32768, respectively.
# This is important for per-channel/symmetric QLinearConv on CPU EP, which requires all channels to have
# the exact same zero_point values.
zero_point = numpy.array(
numpy.round((qmin + qmax) / numpy.array(2.0, dtype=numpy.float64)), dtype=qmin.dtype
)
else:
zero_point = numpy.array(numpy.round(qmin - rmin / scale), dtype=qmin.dtype)
scale = scale.astype(rmax.dtype)
return [zero_point, scale]
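# A small worked example for compute_scale_zp with an asymmetric UINT8 target: for
# rmin=-1.0 and rmax=3.0 mapped onto [0, 255], scale = 4.0 / 255 and
# zero_point = round(0 - (-1.0) / scale) = 64. The numbers are illustrative only.
def _example_compute_scale_zp():  # hypothetical helper for illustration
    rmin = numpy.array(-1.0, dtype=numpy.float32)
    rmax = numpy.array(3.0, dtype=numpy.float32)
    qmin = numpy.array(0, dtype=numpy.uint8)
    qmax = numpy.array(255, dtype=numpy.uint8)
    zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False)
    return zero_point, scale  # approximately (64, 0.0157)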
def compute_scale_zp_float8(element_type, std):
"""Calculate the scale s for a float8 type (E4M3FN).
The function assumes the coefficient distribution and the float 8
distribution are similar to two Gaussian distributions.
:return: zero and scale [z, s]
More details in notebook `quantization_fp8.ipynb
<https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/quantization_fp8.ipynb>`_.
"""
zp_dtype = None
if element_type not in FLOAT8_DISTRIBUTIONS:
if element_type == TensorProto.FLOAT8E4M3FN:
from onnx.numpy_helper import float8e4m3_to_float32
from onnx.reference.custom_element_types import float8e4m3fn
zp_dtype = float8e4m3fn
all_values = [float8e4m3_to_float32(i) for i in range(256)]
values = numpy.array(
[f for f in all_values if not numpy.isnan(f) and not numpy.isinf(f)], dtype=numpy.float32
)
else:
raise ValueError(f"Quantization to element_type={element_type} not implemented.")
FLOAT8_DISTRIBUTIONS[element_type] = values
elif element_type == TensorProto.FLOAT8E4M3FN:
from onnx.reference.custom_element_types import float8e4m3fn
zp_dtype = float8e4m3fn
if zp_dtype is None:
raise TypeError(f"Unexpected element_type {element_type}.")
std_f8 = numpy.std(FLOAT8_DISTRIBUTIONS[element_type])
zero = numpy.array(0, dtype=zp_dtype)
scale = numpy.array(std / std_f8, dtype=std.dtype)
return [zero, scale]
def quantize_data(
data, qType, symmetric, reduce_range=False, min_real_range=None, rmin_override=None, rmax_override=None
):
"""
:param data: data to quantize
:param qType: data type to quantize to. Supported types are UINT8, INT8, UINT16, INT16, UINT4, INT4, and FLOAT8E4M3FN
:param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
:parameter reduce_range: True if the quantization range should be reduced. Defaults to False.
:parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None.
:parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data).
:parameter rmax_override: The value of rmax to use if not None. Otherwise, uses max(data).
:return: minimum, maximum, zero point, scale, and quantized weights
To pack weights, we compute a linear transformation
- when data `type == uint8` mode, from `[rmin, rmax]` -> :math:`[0, 2^{b}-1]` and
- when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
`m = max(abs(rmin), abs(rmax))`
and add necessary intermediate nodes to transform quantized weight to full weight using the equation
:math:`r = S(q-z)`, where
- *r*: real original value
- *q*: quantized value
- *S*: scale
- *z*: zero point
"""
if not isinstance(data, numpy.ndarray):
raise TypeError(f"Weight must be given as an array not {type(data)}.")
if rmin_override is not None:
rmin = rmin_override
else:
rmin = data.min() if len(data) else 0.0
if rmax_override is not None:
rmax = rmax_override
else:
rmax = data.max() if len(data) else 0.0
rmin = numpy.array(rmin, dtype=data.dtype)
rmax = numpy.array(rmax, dtype=data.dtype)
zero_point = 0
scale = numpy.array(1.0, dtype=data.dtype)
if qType == TensorProto.FLOAT8E4M3FN:
if reduce_range:
raise RuntimeError("Unsupported option reduce_range=True for float 8.")
std = numpy.std(data)
zero_point, scale = compute_scale_zp_float8(qType, std)
quantized_data = quantize_nparray(qType, data, scale, zero_point)
if any((quantized_data.astype(numpy.uint8).ravel() & 127) == 127):
np_data = numpy.asarray(data)
raise RuntimeError(
f"One of the quantized value is NaN data in [{np_data.min()}, {np_data.max()}], "
f"quantized_data in [{quantized_data.min()}, {quantized_data.max()}]."
)
return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2)
if qType in (
TensorProto.INT8,
TensorProto.UINT8,
TensorProto.INT16,
TensorProto.UINT16,
TensorProto.INT4,
TensorProto.UINT4,
):
if len(data):
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range)
quantized_data = quantize_nparray(qType, data, scale, zero_point)
return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2)
raise ValueError(f"Unexpected value for qType={qType}.")
def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802
"""
Return qmin and qmax, the minimum and maximum value representable by the given qType
:parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.INT8
:return: qmin, qmax
"""
if qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise NotImplementedError("This function is not implemented for float 8 as not needed.")
qrange = None
if reduce_range:
qrange = ONNX_INT_TYPE_REDUCED_RANGE.get(qType)
elif symmetric and qType in ONNX_INT_TYPE_SYMMETRIC_RANGE:
qrange = ONNX_INT_TYPE_SYMMETRIC_RANGE[qType]
else:
qrange = ONNX_INT_TYPE_RANGE.get(qType)
if not qrange:
raise ValueError(f"Unexpected data type {qType} requested. Only INT8, UINT8, INT16, and UINT16 are supported.")
qmin, qmax = qrange
if qmin > 0 or qmax < 0:
raise ValueError(
f"qmin and qmax must meet requirement: qmin <= 0 <= qmax while "
f"qmin:{qmin}, qmmax:{qmax}, dtype={qmin.dtype}, reduce_range={reduce_range}, "
f"symmetric={symmetric}, qType={qType}"
)
return qrange
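# Illustrative sketch: the limits returned for INT8 depend on the reduce_range and
# symmetric flags, giving the full range (-128, 127), the symmetric range (-127, 127),
# and the reduced range (-64, 64) from the tables above.
def _example_get_qmin_qmax():  # hypothetical helper for illustration
    full_range = get_qmin_qmax_for_qType(onnx_proto.TensorProto.INT8)
    symmetric_range = get_qmin_qmax_for_qType(onnx_proto.TensorProto.INT8, symmetric=True)
    reduced_range = get_qmin_qmax_for_qType(onnx_proto.TensorProto.INT8, reduce_range=True)
    return full_range, symmetric_range, reduced_range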
def get_qrange_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802
"""
Helper function to get the quantization range for a type.
parameter qType: quantization type.
return: quantization range.
"""
qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric)
return qmax - qmin
def normalize_axis(axis: int, rank: int) -> tuple[bool, int]:
"""
Helper function that tries to return a normalized axis in the range [0, rank - 1].
:parameter axis: The axis to normalize.
:parameter rank: The tensor rank (number of dimensions).
:return (is_valid, axis_norm)
"""
axis_norm = axis + rank if axis < 0 else axis
is_valid = axis_norm >= 0 and axis_norm < rank
return is_valid, axis_norm
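# Sketch: normalize_axis maps a negative axis into [0, rank - 1] and flags out-of-range
# values; for a rank-4 tensor, axis -1 becomes 3 while axis 4 is reported as invalid.
def _example_normalize_axis():  # hypothetical helper for illustration
    assert normalize_axis(-1, 4) == (True, 3)
    assert normalize_axis(2, 4) == (True, 2)
    assert normalize_axis(4, 4) == (False, 4)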
def pack_bytes_to_4bit(src_8bit: bytes) -> bytearray:
"""
Copies a source array of 8-bit values into a destination bytearray of packed 4-bit values.
Assumes that the source values are already in the appropriate int4 range.
:parameter src_8bit: The 8-bit element values to pack.
:return A bytearray with every two 8-bit src elements packed into a single byte.
"""
num_elems = len(src_8bit)
if num_elems == 0:
return bytearray()
dst_size = (num_elems + 1) // 2 # Ex: 5 8-bit elems packed into 3 bytes
dst = bytearray(dst_size)
src_i: int = 0
dst_i: int = 0
# Pack two 8-bit elements into a single byte in each iteration.
while src_i < num_elems - 1:
dst[dst_i] = ((src_8bit[src_i + 1] & 0xF) << 4) | (src_8bit[src_i] & 0xF)
dst_i += 1
src_i += 2
if src_i < num_elems:
# Odd number of elements.
dst[dst_i] = src_8bit[src_i] & 0xF
return dst
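# Sketch of the 4-bit packing: five 8-bit elements are packed into three bytes, with the
# second element of each pair stored in the high nibble and a lone trailing element
# occupying only the low nibble of the last byte. The input values are illustrative only.
def _example_pack_bytes_to_4bit():  # hypothetical helper for illustration
    packed = pack_bytes_to_4bit(bytes([1, 2, 3, 4, 5]))
    assert packed == bytearray([0x21, 0x43, 0x05])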
class QuantizedInitializer:
"""
Represents a linearly quantized weight input from ONNX operators
"""
def __init__(
self,
name,
initializer,
rmins,
rmaxs,
zero_points,
scales,
data=[], # noqa: B006
quantized_data=[], # noqa: B006
axis=None,
):
self.name = name
self.initializer = initializer # TensorProto initializer in ONNX graph
self.rmins = rmins # List of minimum range for each axis
self.rmaxs = rmaxs # List of maximum range for each axis
# 1D tensor of zero points computed for each axis. scalar if axis is empty
self.zero_points = zero_points
self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty
self.data = data # original data from initializer TensorProto
self.quantized_data = quantized_data # weight-packed data from data
# Scalar to specify which dimension in the initializer to weight pack.
self.axis = axis
# If empty, single zero point and scales computed from a single rmin and rmax
class QuantizedValue:
"""
Represents a linearly quantized value (input\\output\\initializer)
"""
def __init__(
self,
name,
new_quantized_name,
scale_name,
zero_point_name,
quantized_value_type,
axis=None,
node_type=None,
node_qtype=None,
scale_type=None,
):
self.original_name = name
self.q_name = new_quantized_name
self.scale_name = scale_name
self.zp_name = zero_point_name
self.value_type = quantized_value_type
self.axis = axis
self.node_type = node_type
self.node_qtype = node_qtype
self.scale_type = scale_type
class BiasToQuantize:
"""
Represents a bias to be quantized
"""
def __init__(self, bias_name, input_name, weight_name):
self.bias_name = bias_name
self.input_name = input_name
self.weight_name = weight_name
def attribute_to_kwarg(attribute):
"""
Convert attribute to kwarg format for use with onnx.helper.make_node.
:parameter attribute: attribute in AttributeProto format.
:return: attribute in {key: value} format.
"""
if attribute.type == 0:
raise ValueError(f"attribute {attribute.name} does not have type specified.")
# Based on attribute type definitions from AttributeProto
# definition in https://github.com/onnx/onnx/blob/main/onnx/onnx.proto
if attribute.type == 1:
value = attribute.f
elif attribute.type == 2:
value = attribute.i
elif attribute.type == 3:
value = attribute.s
elif attribute.type == 4:
value = attribute.t
elif attribute.type == 5:
value = attribute.g
elif attribute.type == 6:
value = attribute.floats
elif attribute.type == 7:
value = attribute.ints
elif attribute.type == 8:
value = attribute.strings
elif attribute.type == 9:
value = attribute.tensors
elif attribute.type == 10:
value = attribute.graphs
else:
raise ValueError(f"attribute {attribute.name} has unsupported type {attribute.type}.")
return {attribute.name: value}
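# Sketch: attribute_to_kwarg turns an AttributeProto back into the keyword form accepted
# by onnx.helper.make_node; here an INT attribute named "axis" round-trips to {"axis": 1}.
def _example_attribute_to_kwarg():  # hypothetical helper for illustration
    node = onnx.helper.make_node("Flatten", ["X"], ["Y"], axis=1)
    assert attribute_to_kwarg(node.attribute[0]) == {"axis": 1}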
def find_by_name(item_name, item_list):
"""
Helper function to find item by name in a list.
parameter item_name: name of the item.
parameter item_list: list of items.
return: item if found. None otherwise.
"""
items = [item for item in item_list if item.name == item_name]
return items[0] if len(items) > 0 else None
def get_elem_index(elem_name, elem_list):
"""
Helper function to return index of an item in a node list
"""
elem_idx = -1
for i in range(len(elem_list)):
if elem_list[i] == elem_name:
elem_idx = i
return elem_idx
def get_mul_node(inputs, output, name):
"""
Helper function to create a Mul node.
parameter inputs: list of input names.
parameter output: output name.
parameter name: name of the node.
return: Mul node in NodeProto format.
"""
return onnx.helper.make_node("Mul", inputs, [output], name)
def generate_identified_filename(filename: Path, identifier: str) -> Path:
"""
Helper function to generate an identifiable filepath by concatenating the given identifier as a suffix.
"""
return filename.parent.joinpath(filename.stem + identifier + filename.suffix)
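# Sketch: the identifier is inserted between the stem and the suffix, so "model.onnx"
# with identifier "-inferred" becomes "model-inferred.onnx".
def _example_generate_identified_filename():  # hypothetical helper for illustration
    assert generate_identified_filename(Path("model.onnx"), "-inferred").name == "model-inferred.onnx"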
def apply_plot(hist, hist_edges):
import sys
import matplotlib.pyplot as plt
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
print("Histogram:")
print(hist)
print("Histogram Edges:")
print(hist_edges)
plt.stairs(hist, hist_edges, fill=True)
plt.xlabel("Tensor value")
plt.ylabel("Counts")
plt.title("Tensor value V.S. Counts")
plt.show()
def write_calibration_table(calibration_cache, dir="."):
"""
Helper function to write calibration table to files.
"""
import json
import flatbuffers
import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable
logging.info(f"calibration cache: {calibration_cache}")
with open(os.path.join(dir, "calibration.json"), "w") as file:
file.write(json.dumps(calibration_cache)) # use `json.loads` to do the reverse
# Serialize data using FlatBuffers
builder = flatbuffers.Builder(1024)
key_value_list = []
for key in sorted(calibration_cache.keys()):
values = calibration_cache[key]
value = str(max(abs(values[0]), abs(values[1])))
flat_key = builder.CreateString(key)
flat_value = builder.CreateString(value)
KeyValue.KeyValueStart(builder)
KeyValue.KeyValueAddKey(builder, flat_key)
KeyValue.KeyValueAddValue(builder, flat_value)
key_value = KeyValue.KeyValueEnd(builder)
key_value_list.append(key_value)
TrtTable.TrtTableStartDictVector(builder, len(key_value_list))
for key_value in key_value_list:
builder.PrependUOffsetTRelative(key_value)
main_dict = builder.EndVector()
TrtTable.TrtTableStart(builder)
TrtTable.TrtTableAddDict(builder, main_dict)
cal_table = TrtTable.TrtTableEnd(builder)
builder.Finish(cal_table)
buf = builder.Output()
with open(os.path.join(dir, "calibration.flatbuffers"), "wb") as file:
file.write(buf)
# Deserialize data (for validation)
if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"):
cal_table = TrtTable.TrtTable.GetRootAsTrtTable(buf, 0)
dict_len = cal_table.DictLength()
for i in range(dict_len):
key_value = cal_table.Dict(i)
logging.info(key_value.Key())
logging.info(key_value.Value())
# write plain text
with open(os.path.join(dir, "calibration.cache"), "w") as file:
for key in sorted(calibration_cache.keys()):
value = calibration_cache[key]
s = key + " " + str(max(abs(value[0]), abs(value[1])))
file.write(s)
file.write("\n")
def smooth_distribution(p, eps=0.0001):
"""Given a discrete distribution (may have not been normalized to 1),
smooth it by replacing zeros with eps multiplied by a scaling factor
and taking the corresponding amount off the non-zero values.
Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py
"""
is_zeros = (p == 0).astype(numpy.float32)
is_nonzeros = (p != 0).astype(numpy.float32)
n_zeros = is_zeros.sum()
n_nonzeros = p.size - n_zeros
if not n_nonzeros:
# raise ValueError('The discrete probability distribution is malformed. All entries are 0.')
return None
eps1 = eps * float(n_zeros) / float(n_nonzeros)
assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % (
n_zeros,
n_nonzeros,
eps1,
)
hist = p.astype(numpy.float32)
hist += eps * is_zeros + (-eps1) * is_nonzeros
assert (hist <= 0).sum() == 0
return hist
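# Sketch: smoothing a histogram with empty bins moves a small amount of mass from the
# non-zero entries onto the zeros so that KL divergence stays finite; total mass is
# preserved up to floating-point error. The histogram values are illustrative only.
def _example_smooth_distribution():  # hypothetical helper for illustration
    hist = numpy.array([2.0, 0.0, 1.0, 0.0, 1.0], dtype=numpy.float32)
    smoothed = smooth_distribution(hist, eps=0.0001)
    assert smoothed is not None and abs(float(smoothed.sum()) - 4.0) < 1e-4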
def model_has_external_data(model_path: Path):
model = onnx.load(model_path.as_posix(), load_external_data=False)
for initializer in model.graph.initializer:
if external_data_helper.uses_external_data(initializer):
return True
return False
def optimize_model(model_path: Path, opt_model_path: Path):
"""
Generate model that applies graph optimization (constant folding, etc.)
parameter model_path: path to the original onnx model
parameter opt_model_path: path to the optimized onnx model
:return: optimized onnx model
"""
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
kwargs = {}
# This will rename constant initializer names, disable it to make test pass.
kwargs["disabled_optimizers"] = ["ConstantSharing"]
_ = InferenceSession(model_path.as_posix(), sess_option, providers=["CPUExecutionProvider"], **kwargs)
def add_pre_process_metadata(model: ModelProto):
"""Tag the model that it went through quantization pre-processing"""
metadata_props = {"onnx.quant.pre_process": "onnxruntime.quant"}
if model.metadata_props:
for prop in model.metadata_props:
metadata_props.update({prop.key: prop.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_pre_process_metadata(model: ModelProto) -> bool:
"""Check the model whether it went through quantization pre-processing"""
if model.metadata_props:
for prop in model.metadata_props:
if prop.key == "onnx.quant.pre_process" and prop.value == "onnxruntime.quant":
return True
return False
def add_infer_metadata(model: ModelProto):
metadata_props = {"onnx.infer": "onnxruntime.quant"}
if model.metadata_props:
for p in model.metadata_props:
metadata_props.update({p.key: p.value})
onnx.helper.set_model_props(model, metadata_props)
def model_has_infer_metadata(model: ModelProto) -> bool:
if model.metadata_props:
for p in model.metadata_props:
if p.key == "onnx.infer" and p.value == "onnxruntime.quant":
return True
return False
def load_model_with_shape_infer(model_path: Path) -> ModelProto:
inferred_model_path = generate_identified_filename(model_path, "-inferred")
onnx.shape_inference.infer_shapes_path(str(model_path), str(inferred_model_path))
model = onnx.load(inferred_model_path.as_posix())
add_infer_metadata(model)
inferred_model_path.unlink()
return model
def save_and_reload_model_with_shape_infer(model: ModelProto) -> ModelProto:
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
model_path = Path(quant_tmp_dir).joinpath("model.onnx")
onnx.save_model(model, model_path.as_posix(), save_as_external_data=True)
return load_model_with_shape_infer(model_path)
def tensor_proto_to_array(initializer: TensorProto) -> numpy.ndarray:
if initializer.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
return onnx.numpy_helper.to_array(initializer)
raise ValueError(
f"Only float type is supported. Weights {initializer.name} is {type_to_name[initializer.data_type]}"
)
def add_quant_suffix(tensor_name: str) -> str:
return tensor_name + "_QuantizeLinear"
def add_quant_input_suffix(tensor_name: str) -> str:
return tensor_name + QUANT_INPUT_SUFFIX
def add_quant_output_suffix(tensor_name) -> str:
return tensor_name + "_QuantizeLinear_Output"
def add_dequant_suffix(tensor_name) -> str:
return tensor_name + "_DequantizeLinear"
def add_dequant_input_suffix(tensor_name) -> str:
return tensor_name + "_DequantizeLinear_Input"
def add_dequant_output_suffix(tensor_name) -> str:
return tensor_name + DEQUANT_OUTPUT_SUFFIX

View File

@ -0,0 +1,737 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
import tempfile
from pathlib import Path
from typing import Union
import onnx
from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
QuantFormat,
QuantizationMode,
QuantType,
load_model_with_shape_infer,
model_has_pre_process_metadata,
save_and_reload_model_with_shape_infer,
)
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry
class QuantConfig:
def __init__(
self,
activation_type=QuantType.QUInt8,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
):
"""
This is the Base class for both Static and Dynamic Quantize Configuration
Args:
activation_type:
quantization data type of activation. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
"""
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
self.op_types_to_quantize = op_types_to_quantize
self.per_channel = per_channel
self.reduce_range = reduce_range
self.weight_type = weight_type
self.activation_type = activation_type
self.nodes_to_quantize = nodes_to_quantize
self.nodes_to_exclude = nodes_to_exclude
self.use_external_data_format = use_external_data_format
class StaticQuantConfig(QuantConfig):
def __init__(
self,
calibration_data_reader: CalibrationDataReader,
calibrate_method=CalibrationMethod.MinMax,
quant_format=QuantFormat.QDQ,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
extra_options=None,
):
"""
This is the derived class for static Quantize Configuration
Args:
calibration_data_reader:
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
calibrate_method:
Currently supported calibration methods are MinMax, Entropy, Percentile, and Distribution.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
Dynamic mode is currently supported. More modes will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False :
Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
DeQuantizeLinear node. If True, the weight remains in floating point and both
QuantizeLinear/DeQuantizeLinear nodes are inserted for it.
OpTypesToExcludeOutputQuantization = list of op type :
Default is []. If any op types are specified, the outputs of ops with those
op types are not quantized.
DedicatedQDQPair = True/False :
Default is False. When inserting a QDQ pair, multiple nodes can share a single QDQ pair as their
inputs. If True, an identical and dedicated QDQ pair is created for each node.
QDQOpTypePerChannelSupportToAxis = dictionary :
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
effective only when per channel quantization is supported and per_channel is True. If specific
op type supports per channel quantization but not explicitly specified with channel axis,
default channel axis will be used.
CalibTensorRangeSymmetric = True/False :
Default is False. If enabled, the final tensor range computed during calibration is explicitly
made symmetric around the central point 0.
CalibMovingAverage = True/False :
Default is False. If enabled, the moving average of the minimum and maximum values will be
computed when the calibration method selected is MinMax.
CalibMovingAverageConstant = float :
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
QuantizeBias = True/False :
Default is True, which quantizes floating-point biases and solely inserts
a DeQuantizeLinear node. If False, biases remain in floating point and no
quantization nodes associated with biases are inserted.
This extra option is only effective when quant_format is QuantFormat.QDQ.
SmoothQuant = True/False :
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
fake input channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
UseQDQContribOps = True/False :
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
TensorQuantOverrides = dictionary :
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
per-channel quantization, the list contains a dictionary for each channel in the tensor.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` is also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` is also set.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
"""
super().__init__(
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=op_types_to_quantize,
nodes_to_quantize=nodes_to_quantize,
nodes_to_exclude=nodes_to_exclude,
per_channel=per_channel,
reduce_range=reduce_range,
use_external_data_format=use_external_data_format,
)
self.calibration_data_reader = calibration_data_reader
self.calibrate_method = calibrate_method
self.quant_format = quant_format
self.extra_options = extra_options or {}
class DynamicQuantConfig(QuantConfig):
def __init__(
self,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
extra_options=None,
):
"""
This is a class for dynamic Quant Configuration
Args:
extra_options: key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraphs will be quantized. Dynamic mode is currently
supported; more modes will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
"""
super().__init__(
op_types_to_quantize=op_types_to_quantize,
per_channel=per_channel,
reduce_range=reduce_range,
weight_type=weight_type,
nodes_to_quantize=nodes_to_quantize,
nodes_to_exclude=nodes_to_exclude,
use_external_data_format=use_external_data_format,
)
self.extra_options = extra_options or {}
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
raise ValueError(
"ONNXRuntime quantization doesn't support data format:"
"activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
)
if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
raise ValueError(
f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
f"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
)
if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
raise ValueError(
"ONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, "
f"weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
)
q16_types = [QuantType.QInt16, QuantType.QUInt16]
if (activation_type in q16_types or weight_type in q16_types) and quant_format != QuantFormat.QDQ:
raise ValueError("Only QuantFormat.QDQ supports 16-bit quantization types.")
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
logging.warning(
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
"Or it will lead to bad performance on x64."
)
def quantize_static(
model_input: Union[str, Path, onnx.ModelProto],
model_output: Union[str, Path],
calibration_data_reader: CalibrationDataReader,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=None,
per_channel=False,
reduce_range=False,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
nodes_to_quantize=None,
nodes_to_exclude=None,
use_external_data_format=False,
calibrate_method=CalibrationMethod.MinMax,
extra_options=None,
):
"""
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and weight_type
= QuantType.QInt8. If model is targeted to GPU/TRT, symmetric activation and weight are required. If model is
targeted to CPU, asymmetric activation and symmetric weight are recommended for balance of performance and
accuracy.
Args:
model_input: file path of model or ModelProto to quantize
model_output: file path of quantized model
calibration_data_reader: a calibration data reader. It
enumerates calibration data and generates inputs for the
original model.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
activation_type:
quantization data type of activation. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
calibrate_method:
Currently supported calibration methods are MinMax, Entropy, Percentile, and Distribution.
Please use CalibrationMethod.MinMax, CalibrationMethod.Entropy, CalibrationMethod.Percentile,
or CalibrationMethod.Distribution as options.
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
Dynamic mode is currently supported. More modes will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False :
Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
DeQuantizeLinear node. If True, the weight remains in floating point and both
QuantizeLinear/DeQuantizeLinear nodes are inserted for it.
OpTypesToExcludeOutputQuantization = list of op type :
Default is []. If any op types are specified, the outputs of ops with those
op types are not quantized.
DedicatedQDQPair = True/False :
Default is False. When inserting a QDQ pair, multiple nodes can share a single QDQ pair as their
inputs. If True, an identical and dedicated QDQ pair is created for each node.
QDQOpTypePerChannelSupportToAxis = dictionary :
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
effective only when per channel quantization is supported and per_channel is True. If specific
op type supports per channel quantization but not explicitly specified with channel axis,
default channel axis will be used.
CalibTensorRangeSymmetric = True/False :
Default is False. If enabled, the final tensor range computed during calibration is explicitly
made symmetric around the central point 0.
CalibStridedMinMax = Optional[int] :
Default is None. If set to an integer, the min-max calculation uses only a stride's worth of
data at a time, and the partial results are merged at the end.
CalibMovingAverage = True/False :
Default is False. If enabled, the moving average of the minimum and maximum values will be
computed when the calibration method selected is MinMax.
CalibMovingAverageConstant = float :
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
CalibMaxIntermediateOutputs = Optional[int] :
Default is None. If set to an integer, at most that number of intermediate outputs is loaded
during calculation of the tensors' min-max ranges before the ranges are computed and merged.
This produces the same result as None, but is more memory efficient.
SmoothQuant = True/False :
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
fake input channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
UseQDQContribOps = True/False :
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
TensorQuantOverrides = dictionary :
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
per-channel quantization, the list contains a dictionary for each channel in the tensor.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` is also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` is also set.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` is also set.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
"""
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
if calibrate_method != CalibrationMethod.Distribution:
raise ValueError("Only Distribution calibration method is supported for float quantization.")
extra_options = extra_options or {}
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
mode = QuantizationMode.QLinearOps
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
q_linear_ops = list(QLinearOpsRegistry.keys())
qdq_ops = list(QDQRegistry.keys())
op_types_to_quantize = list(set(q_linear_ops + qdq_ops))
model = (
save_and_reload_model_with_shape_infer(model_input)
if isinstance(model_input, onnx.ModelProto)
else load_model_with_shape_infer(Path(model_input))
)
pre_processed: bool = model_has_pre_process_metadata(model)
if not pre_processed:
logging.warning(
"Please consider to run pre-processing before quantization. Refer to example: "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
calib_extra_options_keys = [
("CalibTensorRangeSymmetric", "symmetric"),
("CalibMovingAverage", "moving_average"),
("CalibMovingAverageConstant", "averaging_constant"),
("CalibMaxIntermediateOutputs", "max_intermediate_outputs"),
]
calib_extra_options = {
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
}
if extra_options.get("SmoothQuant", False):
import importlib
try:
importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
except Exception as e:
logging.error(f"{e}.")
raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
import copy
from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant
def inc_dataloader():
data_reader = copy.deepcopy(calibration_data_reader)
for data in data_reader:
yield data, None
orig_nodes = [i.name for i in model.graph.node]
dataloader = inc_dataloader()
sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
del dataloader
model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
model.save(model_input)
nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if isinstance(model_input, onnx.ModelProto):
output_path = str(Path(quant_tmp_dir) / "model_input.onnx")
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
)
model_input = output_path
calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
extra_options=calib_extra_options,
)
stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
calibrator.collect_data(calibration_data_reader)
else:
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
)
del calibrator
check_static_quant_arguments(quant_format, activation_type, weight_type)
if quant_format is QuantFormat.QOperator:
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
True, # static
weight_type,
activation_type,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
else:
quantizer = QDQQuantizer(
model,
per_channel,
reduce_range,
weight_type,
activation_type,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
if not pre_processed:
logging.warning(
"Please consider pre-processing before quantization. See "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
if extra_options.get("SmoothQuant", False):
sq_path.cleanup()
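# A minimal sketch of driving quantize_static end to end, assuming a float32 ONNX model
# at "model.onnx" with a single input named "input" of shape (1, 3, 224, 224); the file
# names, input name, and shapes are illustrative only.
class _RandomCalibrationDataReader(CalibrationDataReader):  # hypothetical reader for illustration
    def __init__(self, num_samples=8):
        import numpy as np
        self._data = iter(
            [{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)} for _ in range(num_samples)]
        )
    def get_next(self):
        return next(self._data, None)
def _example_quantize_static():  # hypothetical helper for illustration
    quantize_static(
        "model.onnx",
        "model.quant.onnx",
        _RandomCalibrationDataReader(),
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
    )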
def quantize_dynamic(
model_input: Union[str, Path, onnx.ModelProto],
model_output: Union[str, Path],
op_types_to_quantize=None,
per_channel=False,
reduce_range=False,
weight_type=QuantType.QInt8,
nodes_to_quantize=None,
nodes_to_exclude=None,
use_external_data_format=False,
extra_options=None,
):
"""Given an onnx model, create a quantized onnx model and save it into a file
Args:
model_input: file path of model or ModelProto to quantize
model_output: file path of quantized model
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
support more in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose are not quantized if their input is
not already quantized. Set this to True to force such operators to always quantize their input and
thus produce a quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
"""
extra_options = extra_options or {}
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
mode = QuantizationMode.IntegerOps
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
op_types_to_quantize = list(IntegerOpsRegistry.keys())
model = (
save_and_reload_model_with_shape_infer(model_input)
if isinstance(model_input, onnx.ModelProto)
else load_model_with_shape_infer(Path(model_input))
)
pre_processed: bool = model_has_pre_process_metadata(model)
if not pre_processed:
logging.warning(
"Please consider to run pre-processing before quantization. Refer to example: "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
if "MatMulConstBOnly" not in extra_options:
extra_options["MatMulConstBOnly"] = True
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
False, # static
weight_type,
QuantType.QUInt8, # dynamic activation only supports uint8
None,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
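# A minimal sketch of dynamic quantization, which needs no calibration data reader; the
# model paths are illustrative only.
def _example_quantize_dynamic():  # hypothetical helper for illustration
    quantize_dynamic(
        "model.onnx",
        "model.dynamic.quant.onnx",
        weight_type=QuantType.QInt8,
        per_channel=False,
    )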
def quantize(
model_input: Union[str, Path, onnx.ModelProto],
model_output: Union[str, Path],
quant_config: QuantConfig,
):
"""Quantize a model with QuantConfig.
Args:
model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
model_output (str | Path): Path to save the quantized model.
quant_config (QuantConfig): Quantization Configuration.
"""
if isinstance(quant_config, StaticQuantConfig):
quantize_static(
model_input,
model_output,
quant_config.calibration_data_reader,
calibrate_method=quant_config.calibrate_method,
quant_format=quant_config.quant_format,
activation_type=quant_config.activation_type,
weight_type=quant_config.weight_type,
op_types_to_quantize=quant_config.op_types_to_quantize,
nodes_to_quantize=quant_config.nodes_to_quantize,
nodes_to_exclude=quant_config.nodes_to_exclude,
per_channel=quant_config.per_channel,
reduce_range=quant_config.reduce_range,
use_external_data_format=quant_config.use_external_data_format,
extra_options=quant_config.extra_options,
)
elif isinstance(quant_config, DynamicQuantConfig):
quantize_dynamic(
model_input,
model_output,
weight_type=quant_config.weight_type,
op_types_to_quantize=quant_config.op_types_to_quantize,
nodes_to_quantize=quant_config.nodes_to_quantize,
nodes_to_exclude=quant_config.nodes_to_exclude,
per_channel=quant_config.per_channel,
reduce_range=quant_config.reduce_range,
use_external_data_format=quant_config.use_external_data_format,
extra_options=quant_config.extra_options,
)
else:
raise TypeError("Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig.")

View File

@ -0,0 +1,105 @@
from .operators.activation import QDQRemovableActivation, QLinearActivation
from .operators.argmax import QArgMax
from .operators.attention import AttentionQuant
from .operators.base_operator import QuantOperatorBase
from .operators.binary_op import QLinearBinaryOp
from .operators.concat import QLinearConcat
from .operators.conv import ConvInteger, QDQConv, QLinearConv
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
from .operators.gather import GatherQuant, QDQGather
from .operators.gavgpool import QGlobalAveragePool
from .operators.gemm import QDQGemm, QLinearGemm
from .operators.lstm import LSTMQuant
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
from .operators.maxpool import QDQMaxPool, QMaxPool
from .operators.norm import QDQNormalization
from .operators.pad import QPad
from .operators.pooling import QLinearPool
from .operators.qdq_base_operator import QDQOperatorBase
from .operators.resize import QDQResize, QResize
from .operators.softmax import QLinearSoftmax
from .operators.split import QDQSplit, QSplit
from .operators.where import QDQWhere, QLinearWhere
from .quant_utils import QuantizationMode
CommonOpsRegistry = {
"Gather": GatherQuant,
"Transpose": Direct8BitOp,
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
}
IntegerOpsRegistry = {
"Conv": ConvInteger,
"MatMul": MatMulInteger,
"Attention": AttentionQuant,
"LSTM": LSTMQuant,
}
IntegerOpsRegistry.update(CommonOpsRegistry)
QLinearOpsRegistry = {
"ArgMax": QArgMax,
"Conv": QLinearConv,
"Gemm": QLinearGemm,
"MatMul": QLinearMatMul,
"Add": QLinearBinaryOp,
"Mul": QLinearBinaryOp,
"Relu": QLinearActivation,
"Clip": QLinearActivation,
"LeakyRelu": QLinearActivation,
"Sigmoid": QLinearActivation,
"MaxPool": QMaxPool,
"GlobalAveragePool": QGlobalAveragePool,
"Split": QSplit,
"Pad": QPad,
"Reshape": Direct8BitOp,
"Squeeze": Direct8BitOp,
"Unsqueeze": Direct8BitOp,
"Resize": QResize,
"AveragePool": QLinearPool,
"Concat": QLinearConcat,
"Softmax": QLinearSoftmax,
"Where": QLinearWhere,
}
QLinearOpsRegistry.update(CommonOpsRegistry)
QDQRegistry = {
"Conv": QDQConv,
"ConvTranspose": QDQConv,
"Gemm": QDQGemm,
"Clip": QDQRemovableActivation,
"Relu": QDQRemovableActivation,
"Reshape": QDQDirect8BitOp,
"Transpose": QDQDirect8BitOp,
"Squeeze": QDQDirect8BitOp,
"Unsqueeze": QDQDirect8BitOp,
"Resize": QDQResize,
"MaxPool": QDQMaxPool,
"AveragePool": QDQDirect8BitOp,
"MatMul": QDQMatMul,
"Split": QDQSplit,
"Gather": QDQGather,
"Where": QDQWhere,
"InstanceNormalization": QDQNormalization,
"LayerNormalization": QDQNormalization,
"BatchNormalization": QDQNormalization,
}
def CreateDefaultOpQuantizer(onnx_quantizer, node): # noqa: N802
return QuantOperatorBase(onnx_quantizer, node)
def CreateOpQuantizer(onnx_quantizer, node): # noqa: N802
registry = IntegerOpsRegistry if onnx_quantizer.mode == QuantizationMode.IntegerOps else QLinearOpsRegistry
if node.op_type in registry:
op_quantizer = registry[node.op_type](onnx_quantizer, node)
if op_quantizer.should_quantize():
return op_quantizer
return QuantOperatorBase(onnx_quantizer, node)
def CreateQDQQuantizer(onnx_quantizer, node): # noqa: N802
if node.op_type in QDQRegistry:
return QDQRegistry[node.op_type](onnx_quantizer, node)
return QDQOperatorBase(onnx_quantizer, node)
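# Sketch: the registries are plain dictionaries keyed by op type, so the quantizer class
# selected for a node can be inspected directly.
def _example_registry_lookup():  # hypothetical helper for illustration
    assert QLinearOpsRegistry["Conv"] is QLinearConv
    assert QDQRegistry["MatMul"] is QDQMatMul
    assert "Gather" in IntegerOpsRegistry  # merged in from CommonOpsRegistry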

Some files were not shown because too many files have changed in this diff.