I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,12 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import os.path
import sys
sys.path.append(os.path.dirname(__file__))
transformers_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
if transformers_dir not in sys.path:
sys.path.append(transformers_dir)


@@ -0,0 +1,413 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script benchmarks the GPT-2 model with past state.
# For the GPT-2 model without past state, use benchmark.py to measure performance.
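# Example invocation (assuming this file is saved as benchmark_gpt2.py; the file name is not shown in this diff):
#   python benchmark_gpt2.py -m gpt2 --use_gpu -o -p fp16 -b 1 8 --past_sequence_lengths 8 32 128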
import argparse
import csv
import logging
import os
from datetime import datetime
import psutil
import torch
from benchmark_helper import (
Precision,
create_onnxruntime_session,
get_ort_environment_variables,
prepare_environment,
setup_logger,
)
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from packaging import version
from quantize_helper import QuantizeHelper
from transformers import AutoConfig
from transformers import __version__ as transformers_version
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name selected in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument(
"--model_class",
required=False,
type=str,
default="GPT2LMHeadModel",
choices=list(MODEL_CLASSES.keys()),
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--cache_dir",
required=False,
type=str,
default=os.path.join(".", "cache_models"),
help="Directory to cache pre-trained models",
)
parser.add_argument(
"--onnx_dir",
required=False,
type=str,
default=os.path.join(".", "onnx_models"),
help="Directory to store onnx models",
)
parser.add_argument(
"--test_times",
required=False,
default=100,
type=int,
help="Number of repeat times to get average inference latency.",
)
parser.add_argument(
"-v",
"--validate_onnx",
required=False,
action="store_true",
help="Validate ONNX model",
)
parser.add_argument(
"-o",
"--optimize_onnx",
required=False,
action="store_true",
help="Use optimizer.py to optimize onnx model",
)
parser.set_defaults(optimize_onnx=False)
parser.add_argument(
"--stage",
type=int,
default=0,
required=False,
choices=[0, 1, 2],
help="Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). "
"1 - decode the first token when past_sequence_length is zero; "
"2 - decode the remaining tokens when past_sequence_length is not zero; "
"0 - one onnx model for both stages 1 and 2. "
"Note that we will optimize 1 and 2 differently for best performance.",
)
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument(
"-p",
"--precision",
type=Precision,
default=Precision.FLOAT32,
choices=list(Precision),
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
)
parser.add_argument("--torchscript", required=False, action="store_true", help="use Torchscript")
parser.set_defaults(torchscript=False)
parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1], help="batch size")
parser.add_argument(
"--sequence_lengths",
nargs="+",
type=int,
default=[1],
help="sequence lengths (excluding past)",
)
parser.add_argument(
"-s",
"--past_sequence_lengths",
nargs="+",
type=int,
default=[8, 16, 32, 64, 128, 256],
help="past sequence lengths",
)
parser.add_argument(
"-r",
"--result_csv",
required=False,
default=None,
help="CSV file for saving summary results.",
)
parser.add_argument("--thread_num", required=False, type=int, default=-1, help="Threads to use")
parser.add_argument("--include_copy_output_latency", required=False, action="store_true")
parser.set_defaults(include_copy_output_latency=False)
parser.add_argument("--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument("--output_torch_latency", required=False, action="store_true")
parser.set_defaults(output_torch_latency=False)
parser.add_argument("--disable_io_binding", required=False, action="store_true")
parser.set_defaults(disable_io_binding=False)
args = parser.parse_args(argv)
return args
def main(args):
if version.parse(transformers_version) < version.parse(
"3.1.0"
): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
logger.info(f"Arguments:{args}")
if args.precision == Precision.FLOAT16:
assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"
if args.precision == Precision.INT8:
assert not args.use_gpu, "quantization only supports CPU"
if args.stage == 1:
assert args.past_sequence_lengths == [0], "past_sequence_lengths shall be 0 for stage==1 (init decoder)"
torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
print(torch.__config__.parallel_info())
cache_dir = args.cache_dir
output_dir = args.onnx_dir
prepare_environment(cache_dir, output_dir, args.use_gpu)
model_class = MODEL_CLASSES[args.model_class][0]
gpt2helper = Gpt2Helper
config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
# This script does not support float16 for PyTorch.
# if args.float16:
# model.half()
device = torch.device("cuda:0" if args.use_gpu else "cpu")
model.to(device)
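# A single-file ONNX (protobuf) model is limited to 2 GB, so larger models must use the external data format.
# The n_layer > 24 check below is only a rough heuristic for that size (see the TODO).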
use_external_data_format = config.n_layer > 24 # TODO: find a way to check model size > 2GB
onnx_model_paths = gpt2helper.get_onnx_paths(
output_dir,
args.model_name_or_path,
args.model_class,
has_past=True,
new_folder=use_external_data_format,
)
onnx_model_path = onnx_model_paths["raw"]
use_padding = MODEL_CLASSES[args.model_class][2]
gpt2helper.export_onnx(
model,
device,
onnx_model_path,
args.verbose,
use_external_data_format,
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
if args.optimize_onnx or args.precision != Precision.FLOAT32:
onnx_model_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else "fp32"]
gpt2helper.optimize_onnx(
onnx_model_paths["raw"],
onnx_model_path,
args.precision == Precision.FLOAT16,
model.config.num_attention_heads,
model.config.hidden_size,
use_external_data_format,
auto_mixed_precision=True,
stage=args.stage,
)
if args.precision == Precision.INT8:
logger.info("quantizing model...")
QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_paths["int8"], use_external_data_format)
model = QuantizeHelper.quantize_torch_model(model)
logger.info("finished quantizing model")
onnx_model_path = onnx_model_paths["int8"]
if args.torchscript:
model = gpt2helper.torchscript(
model,
config,
device,
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
session = create_onnxruntime_session(
onnx_model_path,
args.use_gpu,
enable_all_optimization=False,
num_threads=args.thread_num,
verbose=args.verbose,
)
if session is None:
return
# Allocate output buffers for IO Binding
max_output_shapes = gpt2helper.get_output_shapes(
max(args.batch_sizes),
max(args.past_sequence_lengths),
max(args.sequence_lengths),
config,
args.model_class,
)
output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = [
"model_name",
"model_class",
"stage",
"environment_variables",
"gpu",
"precision",
"optimizer",
"torchscript",
"batch_size",
"sequence_length",
"past_sequence_length",
"disable_io_binding",
"torch_latency",
"onnxruntime_latency",
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
csv_writer.writeheader()
for batch_size in args.batch_sizes:
for sequence_length in args.sequence_lengths:
for past_sequence_length in args.past_sequence_lengths:
assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
logger.debug(
"Running test for batch_size=%d sequence_length=%d past_sequence_length=%d ...",
batch_size,
sequence_length,
past_sequence_length,
)
dummy_inputs = gpt2helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
config.num_attention_heads,
config.hidden_size,
config.n_layer,
config.vocab_size,
device,
float16=(args.precision == Precision.FLOAT16),
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
output_shapes = gpt2helper.get_output_shapes(
batch_size,
past_sequence_length,
sequence_length,
config,
args.model_class,
)
try:
if args.validate_onnx or args.output_torch_latency:
outputs, torch_latency = gpt2helper.pytorch_inference(model, dummy_inputs, args.test_times)
# Dump Torch output shape
for i, value in enumerate(outputs):
if isinstance(value, tuple):
logger.debug(
f"torch output {i} is tuple of size {len(value)}, shape {value[0].shape}"
)
else:
logger.debug(f"torch output {i} shape {value.shape}")
else:
outputs = None
torch_latency = None
if args.disable_io_binding:
ort_outputs, ort_latency = gpt2helper.onnxruntime_inference(
session, dummy_inputs, args.test_times
)
else:
ort_outputs, ort_latency = gpt2helper.onnxruntime_inference_with_binded_io(
session,
dummy_inputs,
output_buffers,
output_shapes,
args.test_times,
return_numpy=False,
include_copy_output_latency=args.include_copy_output_latency,
)
if args.validate_onnx:
copy_outputs = ort_outputs
if not args.disable_io_binding:
# Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
copy_outputs = []
for output in ort_outputs:
copy_outputs.append(output.cpu().numpy())
if gpt2helper.compare_outputs(
outputs,
copy_outputs,
model_class=args.model_class,
rtol=DEFAULT_TOLERANCE[args.precision],
atol=DEFAULT_TOLERANCE[args.precision],
):
logger.info(
f"Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})."
)
logger.info(
"batch_size=%d, sequence_length=%d, past_sequence_length=%d, onnxruntime_latency=%.2f %s %s",
batch_size,
sequence_length,
past_sequence_length,
ort_latency,
"(disable_io_binding)" if args.disable_io_binding else "",
f", torch_latency={torch_latency}" if torch_latency else "",
)
row = {
"model_name": args.model_name_or_path,
"model_class": args.model_class,
"stage": args.stage,
"environment_variables": get_ort_environment_variables(),
"gpu": args.use_gpu,
"precision": args.precision,
"optimizer": args.optimize_onnx,
"torchscript": args.torchscript,
"batch_size": batch_size,
"sequence_length": sequence_length,
"past_sequence_length": past_sequence_length,
"disable_io_binding": args.disable_io_binding,
"torch_latency": f"{torch_latency:.2f}" if torch_latency else "None",
"onnxruntime_latency": f"{ort_latency:.2f}",
}
csv_writer.writerow(row)
except Exception:
logger.error("Exception", exc_info=True) # noqa: G201
return None
logger.info(f"Results are saved to file {csv_filename}")
return csv_filename
if __name__ == "__main__":
args = parse_arguments()
setup_logger(args.verbose)
main(args)


@@ -0,0 +1,557 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""
This converts GPT2 model to onnx. Examples:
(1) Convert pretrained model 'gpt2' to ONNX
python convert_to_onnx.py -m gpt2 --output gpt2.onnx
(2) Convert pretrained model 'distilgpt2' to ONNX, and use optimizer to get float16 model.
python convert_to_onnx.py -m distilgpt2 --output distilgpt2_fp16.onnx -o -p fp16
(3) Convert a model check point to ONNX, and run optimization and int8 quantization
python convert_to_onnx.py -m ./my_model_checkpoint/ --output my_model_int8.onnx -o -p int8
"""
import argparse
import csv
import json
import logging
import os
import shutil
import sys
from pathlib import Path
import numpy
import torch
from benchmark_helper import (
Precision,
create_onnxruntime_session,
get_ort_environment_variables,
prepare_environment,
setup_logger,
)
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from gpt2_tester import Gpt2Tester
from packaging import version
from quantize_helper import QuantizeHelper
from transformers import AutoConfig
from transformers import __version__ as transformers_version
from onnxruntime import __version__ as ort_version
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument(
"--model_class",
required=False,
type=str,
default="GPT2LMHeadModel",
choices=list(MODEL_CLASSES.keys()),
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--cache_dir",
required=False,
type=str,
default=os.path.join(".", "cache_models"),
help="Directory to cache pre-trained models",
)
parser.add_argument(
"--output",
required=False,
type=str,
default=os.path.join(".", "onnx_models"),
help="Output directory, or model path ends with .onnx",
)
parser.add_argument(
"-o",
"--optimize_onnx",
required=False,
action="store_true",
help="Use optimizer.py to optimize onnx model",
)
parser.set_defaults(optimize_onnx=False)
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument(
"--provider",
required=False,
default=None,
choices=["dml", "rocm", "migraphx", "cuda", "tensorrt"],
help="use dml, rocm, cuda, tensorrt or migraphx for respective backend",
)
parser.add_argument(
"--tolerance",
required=False,
type=float,
default=0,
help="the absolute and relative tolerance for parity verification",
)
parser.add_argument(
"--input_test_file",
"-i",
required=False,
type=str,
default="",
help="Path to the file with inputs to test with",
)
parser.add_argument(
"-p",
"--precision",
required=False,
type=Precision,
default=Precision.FLOAT32,
choices=list(Precision),
help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision, and int8 for quantization",
)
parser.add_argument(
"-t",
"--test_cases",
required=False,
type=int,
default=1000,
help="Number of test cases per run for parity",
)
parser.add_argument(
"-r",
"--test_runs",
required=False,
type=int,
default=10,
help="Number of runs for parity. It is used for significance test.",
)
parser.add_argument("--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true")
parser.set_defaults(use_external_data_format=False)
parser.add_argument("--overwrite", required=False, action="store_true")
parser.set_defaults(overwrite=False)
parser.add_argument(
"--use_int64_inputs",
required=False,
action="store_true",
help="Use int32 instead of int64 for input_ids, position_ids and attention_mask.",
)
parser.set_defaults(use_int64_inputs=False)
parser.add_argument(
"-s",
"--stage",
type=int,
default=0,
required=False,
choices=[0, 1, 2],
help="Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). "
"1 - decode the first token when past_sequence_length is zero; "
"2 - decode the remaining tokens when past_sequence_length is not zero; "
"0 - one onnx model for both stages 1 and 2. "
"Note that we will optimize 1 and 2 differently for best performance.",
)
fp16_option_group = parser.add_argument_group(
'float to float16 conversion parameters that work when "--precision fp16" is specified'
)
fp16_option_group.add_argument(
"-a",
"--auto_mixed_precision",
required=False,
action="store_true",
help="Convert to mixed precision automatically. Other float16 conversion parameters will be ignored.",
)
fp16_option_group.set_defaults(auto_mixed_precision=False)
fp16_option_group.add_argument(
"--keep_io_types",
required=False,
action="store_true",
help="Use float32 for past inputs, present and logits outputs.",
)
fp16_option_group.set_defaults(keep_io_types=False)
fp16_option_group.add_argument(
"--io_block_list",
nargs="+",
default=[],
help="List of inputs or outputs in float32 instead of float16",
)
fp16_option_group.add_argument(
"--op_block_list",
nargs="+",
default=[],
help="List of operators (like Add LayerNormalization SkipLayerNormalization EmbedLayerNormalization FastGelu) "
"to compute in float32 instead of float16.",
)
fp16_option_group.add_argument(
"--node_block_list",
nargs="+",
default=[],
help="List of node names to compute in float32 instead of float16.",
)
fp16_option_group.add_argument(
"--force_fp16_initializers",
required=False,
action="store_true",
help="Convert all float initializers to float16.",
)
fp16_option_group.set_defaults(force_fp16_initializers=False)
args = parser.parse_args(argv)
return args
def get_onnx_model_size(onnx_path: str, use_external_data_format: bool):
if not use_external_data_format:
return os.path.getsize(onnx_path)
else:
return sum([f.stat().st_size for f in Path(onnx_path).parent.rglob("*")])
def get_latency_name(batch_size, sequence_length, past_sequence_length):
return f"average_latency(batch_size={batch_size},sequence_length={sequence_length},past_sequence_length={past_sequence_length})"
def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename: str = "gpt2_parity_results.csv"):
result = {}
if version.parse(transformers_version) < version.parse(
"3.1.0"
): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
args = parse_arguments(argv)
setup_logger(args.verbose)
if not experiment_name:
experiment_name = " ".join(argv if argv else sys.argv[1:])
if args.tolerance == 0:
args.tolerance = DEFAULT_TOLERANCE[args.precision]
logger.info(f"Arguments:{args}")
cache_dir = args.cache_dir
output_dir = args.output if not args.output.endswith(".onnx") else os.path.dirname(args.output)
prepare_environment(cache_dir, output_dir, args.use_gpu)
if args.precision != Precision.FLOAT32:
assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"
if args.precision == Precision.FLOAT16:
assert args.use_gpu, "fp16 requires --use_gpu"
if args.precision == Precision.INT8:
assert not args.use_gpu, "quantization only supports CPU"
model_class = MODEL_CLASSES[args.model_class][0]
use_padding = MODEL_CLASSES[args.model_class][2]
gpt2helper = Gpt2Helper
config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
device = torch.device("cuda:0" if args.use_gpu else "cpu")
model.eval().to(device)
if (not args.use_external_data_format) and (config.n_layer > 24):
logger.info("Try --use_external_data_format when model size > 2GB")
onnx_model_paths = gpt2helper.get_onnx_paths(
output_dir,
args.model_name_or_path,
args.model_class,
new_folder=(args.precision == Precision.INT8),
remove_existing=["fp32", "fp16", "int8"],
) # Do not remove raw model to save time in parity test
raw_onnx_model = onnx_model_paths["raw"]
int_data_type = torch.int64 if args.use_int64_inputs else torch.int32
if os.path.exists(raw_onnx_model) and not args.overwrite:
logger.warning(f"Skip exporting ONNX model since it existed: {raw_onnx_model}")
else:
logger.info(f"Exporting ONNX model to {raw_onnx_model}")
gpt2helper.export_onnx(
model,
device,
raw_onnx_model,
args.verbose,
args.use_external_data_format,
has_position_ids=use_padding,
has_attention_mask=use_padding,
input_ids_dtype=int_data_type,
position_ids_dtype=int_data_type,
attention_mask_dtype=int_data_type,
)
fp16_params = {"keep_io_types": args.keep_io_types}
if args.io_block_list:
fp16_params["keep_io_types"] = args.io_block_list
if args.node_block_list:
fp16_params["node_block_list"] = args.node_block_list
if args.op_block_list:
fp16_params["op_block_list"] = args.op_block_list
if args.force_fp16_initializers:
fp16_params["force_fp16_initializers"] = args.force_fp16_initializers
is_io_float16 = args.precision == Precision.FLOAT16 and not args.keep_io_types
optimized_ops = ""
all_ops = ""
if args.optimize_onnx or args.precision != Precision.FLOAT32:
output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else "fp32"]
logger.info(f"Optimizing model to {output_path}")
m = gpt2helper.optimize_onnx(
raw_onnx_model,
output_path,
args.precision == Precision.FLOAT16,
model.config.num_attention_heads,
model.config.hidden_size,
args.use_external_data_format,
auto_mixed_precision=args.auto_mixed_precision,
stage=args.stage,
**fp16_params,
)
nodes = m.nodes()
op_list = {node.op_type for node in nodes}
all_ops = ",".join(op_list)
# print optimized operators
optimized_op_counter = m.get_fused_operator_statistics()
if optimized_op_counter:
optimized_ops = ",".join([key for key in optimized_op_counter if optimized_op_counter[key] > 0])
else:
output_path = raw_onnx_model
if args.precision == Precision.INT8:
logger.info("quantizing model...")
QuantizeHelper.quantize_onnx_model(output_path, onnx_model_paths["int8"], args.use_external_data_format)
model = QuantizeHelper.quantize_torch_model(model)
logger.info("finished quantizing model")
output_path = onnx_model_paths["int8"]
if args.output.endswith(".onnx") and output_path != args.output and not args.use_external_data_format:
shutil.move(output_path, args.output)
output_path = args.output
logger.info(f"Output path: {output_path}")
model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) # noqa: N806
session = create_onnxruntime_session(
output_path, args.use_gpu, args.provider, enable_all_optimization=True, verbose=args.verbose
)
if args.model_class == "GPT2LMHeadModel" and session is not None:
parity_result = gpt2helper.test_parity(
session,
model,
device,
is_io_float16,
rtol=args.tolerance,
atol=args.tolerance,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding,
input_ids_dtype=int_data_type,
position_ids_dtype=int_data_type,
attention_mask_dtype=int_data_type,
test_cases_per_run=args.test_cases,
total_runs=args.test_runs,
stage=args.stage,
verbose=args.verbose,
)
# An example configuration for testing performance
batch_size = 8
sequence_length = 32 if args.stage == 1 else 1
past_sequence_length = 0 if args.stage == 1 else 32
latency = gpt2helper.test_performance(
session,
model,
device,
is_io_float16,
total_runs=100,
use_io_binding=True,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding,
input_ids_dtype=int_data_type,
position_ids_dtype=int_data_type,
attention_mask_dtype=int_data_type,
batch_size=batch_size,
sequence_length=sequence_length,
past_sequence_length=past_sequence_length,
)
if args.precision == Precision.FLOAT16:
logger.info(f"fp16 conversion parameters:{fp16_params}")
# Write results to file
latency_name = get_latency_name(batch_size, sequence_length, past_sequence_length)
csv_file_existed = os.path.exists(csv_filename)
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = [
"experiment",
"run_id",
"model_name",
"model_class",
"stage",
"gpu",
"precision",
"optimizer",
"test_cases",
"runs",
"keep_io_types",
"io_block_list",
"op_block_list",
"node_block_list",
"force_fp16_initializers",
"auto_mixed_precision",
"optimized_operators",
"operators",
"environment_variables",
"onnxruntime",
latency_name,
"top1_match_rate",
"onnx_size_in_MB",
"diff_50_percentile",
"diff_90_percentile",
"diff_95_percentile",
"diff_99_percentile",
"diff_pass_rate",
"nan_rate",
"top1_match_rate_per_run",
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
if not csv_file_existed:
csv_writer.writeheader()
row = {
"experiment": experiment_name,
"run_id": run_id,
"model_name": args.model_name_or_path,
"model_class": args.model_class,
"stage": args.stage,
"gpu": args.use_gpu,
"precision": args.precision,
"optimizer": args.optimize_onnx,
"test_cases": args.test_cases,
"runs": args.test_runs,
"keep_io_types": args.keep_io_types,
"io_block_list": args.io_block_list,
"op_block_list": args.op_block_list,
"node_block_list": args.node_block_list,
"force_fp16_initializers": args.force_fp16_initializers,
"auto_mixed_precision": args.auto_mixed_precision,
"optimized_operators": optimized_ops,
"operators": all_ops,
"environment_variables": get_ort_environment_variables(),
"onnxruntime": ort_version,
latency_name: f"{latency:.2f}",
"diff_50_percentile": parity_result["max_diff_percentile_50"],
"diff_90_percentile": parity_result["max_diff_percentile_90"],
"diff_95_percentile": parity_result["max_diff_percentile_95"],
"diff_99_percentile": parity_result["max_diff_percentile_99"],
"diff_pass_rate": parity_result["diff_pass_rate"],
"nan_rate": parity_result["nan_rate"],
"top1_match_rate": parity_result["top1_match_rate"],
"top1_match_rate_per_run": parity_result["top1_match_rate_per_run"],
"onnx_size_in_MB": f"{model_size_in_MB}",
}
logger.info(f"result: {row}")
result.update(row)
csv_writer.writerow(row)
if args.input_test_file:
test_inputs = []
# Each line of test file is a JSON string like:
# {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
with open(args.input_test_file) as read_f:
for _, line in enumerate(read_f):
line = line.rstrip() # noqa: PLW2901
data = json.loads(line)
input_ids = torch.from_numpy(numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device)
if use_padding:
if "attention_mask" in data:
numpy_float = numpy.float16 if is_io_float16 else numpy.float32
attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(
device
)
else:
padding = -1
attention_mask = (input_ids != padding).type(torch.float16 if is_io_float16 else torch.float32)
input_ids.masked_fill_(input_ids == padding, 0)
if "position_ids" in data:
position_ids = torch.from_numpy(numpy.asarray(data["position_ids"], dtype=numpy.int64)).to(
device
)
else:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(position_ids < 0, 0)
inputs = {
"input_ids": input_ids.to(int_data_type),
"position_ids": position_ids.to(int_data_type),
"attention_mask": attention_mask.to(int_data_type),
}
else:
inputs = {"input_ids": input_ids.to(int_data_type)}
test_inputs.append(inputs)
Gpt2Tester.test_generation(
session,
model,
device,
test_inputs,
precision=args.precision,
model_class=args.model_class,
top_k=20,
top_k_no_order=True,
max_steps=24,
max_inputs=0,
verbose=args.verbose,
save_test_data=3,
save_test_data_dir=Path(output_path).parent,
)
logger.info(f"Done. Output model: {output_path}")
return result
if __name__ == "__main__":
main()

File diff suppressed because it is too large.


@@ -0,0 +1,513 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script tries different mixed-precision conversion configurations for the GPT-2 model, and
# measures the inference latency, top-1 match rate (compared to the PyTorch FP32 model) and ONNX model size.
# It outputs a csv file with Mann-Whitney U test and t-test results for each pair of experiments, where
# p-value < 0.05 means the two experiments differ significantly in top-1 match rate.
# Use this script to select the best mixed-precision model according to these metrics.
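# Example invocation (assuming this file is saved as gpt2_parity.py; the file name is not shown in this diff):
#   python gpt2_parity.py -m gpt2 --use_gpu --all --csv gpt2_parity_results.csv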
import argparse
import csv
import datetime
import json
import logging
import os
import onnx
import scipy.stats
from benchmark_helper import get_ort_environment_variables, setup_logger
from convert_to_onnx import main
from gpt2_helper import PRETRAINED_GPT2_MODELS, Gpt2Helper
from onnx_model import OnnxModel
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument(
"--csv",
required=False,
type=str,
default="gpt2_parity_results.csv",
help="path of csv file to save the result",
)
parser.add_argument(
"--test_cases",
required=False,
type=int,
default=500,
help="number of test cases per run",
)
parser.add_argument("--runs", required=False, type=int, default=40, help="number of repeated runs")
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument(
"--all",
required=False,
action="store_true",
help="run all combinations of mixed precision",
)
parser.set_defaults(all=False)
parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true")
parser.set_defaults(use_external_data_format=False)
parser.add_argument("--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--skip_test",
required=False,
action="store_true",
help="do not run test, and only rank experiments based on existing csv file",
)
parser.set_defaults(skip_test=False)
parser.add_argument(
"--overwrite",
required=False,
action="store_true",
help="Overwrite existing csv file",
)
parser.set_defaults(overwrite=False)
args = parser.parse_args(argv)
return args
class ParityTask:
def __init__(self, test_cases, total_runs, csv_path):
self.total_runs = total_runs
self.test_cases = test_cases
self.csv_path = csv_path
self.results = []
self.run_id = 0
def run(self, argv, experiment_name):
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
run_id = f"{start_time}_{self.run_id}"
self.run_id += 1
try:
result = main(
[*argv, "-t", f"{self.test_cases}", "-r", f"{self.total_runs}"],
experiment_name=experiment_name,
run_id=run_id,
csv_filename=self.csv_path,
)
if result:
self.results.append(result)
except Exception:
logger.exception(f"Failed to run experiment {experiment_name}")
result = None
return result
def load_results_from_csv(csv_path):
rows = []
import csv
with open(csv_path, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
rows.append(row) # noqa: PERF402
return rows
def get_latency(row):
for name in row:
if name.startswith("average_latency(batch_size="):
return float(row[name])
raise RuntimeError("Failed to get average_latency from output")
def score(row):
"""Scoring function based on 3 metrics. The larger score is better."""
latency_in_ms = get_latency(row)
top1_match_rate = float(row["top1_match_rate"])
onnx_size_in_MB = float(row["onnx_size_in_MB"]) # noqa: N806
# A simple scoring function: cost of 0.1ms latency ~ 0.1% match rate ~ 100MB size
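# Worked example: top1_match_rate=0.99, latency=5 ms, size=500 MB
#   -> 0.99 * 1000 - 5 * 10 - 500 / 100 = 990 - 50 - 5 = 935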
return top1_match_rate * 1000 - latency_in_ms * 10 - onnx_size_in_MB / 100
def print_wins(wins, rows, test_name):
print()
print("*" * 10)
row_map = {}
for row in rows:
row_map[row["run_id"]] = row
sorted_wins = dict(
sorted(
wins.items(),
key=lambda item: (item[1], score(row_map[item[0]])),
reverse=True,
)
)
logger.debug(f"{test_name} Wins:{sorted_wins}")
logger.info(f"Based on {test_name} wins and a scoring function, the ranking:")
rank = 0
previous_value = -1
for count, (key, value) in enumerate(sorted_wins.items()):
if value != previous_value:
rank = count
previous_value = value
for row in rows:
if row["run_id"] == key:
logger.info(
"{:02d}: WINs={:02d}, run_id={}, latency={:5.2f}, top1_match={:.4f}, size={}_MB, experiment={}, {}".format( # noqa: G001
rank,
value,
key,
get_latency(row),
float(row["top1_match_rate"]),
row["onnx_size_in_MB"],
row["experiment"],
get_ort_environment_variables(),
)
)
break
def run_significance_test(rows, output_csv_path):
"""Run U test and T test."""
utest_wins = {}
ttest_wins = {}
for row in rows:
run_id = row["run_id"]
utest_wins[run_id] = 0
ttest_wins[run_id] = 0
with open(output_csv_path, "w", newline="") as csvfile:
column_names = [
"model_name",
"run_id_1",
"experiment_1",
"top1_match_rate_1",
"run_id_2",
"experiment_2",
"top1_match_rate_2",
"U_statistic",
"U_pvalue",
"T_statistic",
"T_pvalue",
]
writer = csv.DictWriter(csvfile, fieldnames=column_names)
writer.writeheader()
required_match_columns = ["model_name", "test_cases", "runs"]
num_results = len(rows)
for i in range(num_results - 1):
result1 = rows[i]
if isinstance(result1["top1_match_rate_per_run"], str):
a = json.loads(result1["top1_match_rate_per_run"])
else:
a = result1["top1_match_rate_per_run"]
for j in range(i + 1, num_results, 1):
result2 = rows[j]
all_matched = True
for column in required_match_columns:
if result1[column] != result2[column]:
all_matched = False
break
if not all_matched:
continue
if isinstance(result2["top1_match_rate_per_run"], str):
b = json.loads(result2["top1_match_rate_per_run"])
else:
b = result2["top1_match_rate_per_run"]
try:
utest_statistic, utest_pvalue = scipy.stats.mannwhitneyu(
a, b, use_continuity=True, alternative="two-sided"
) # TODO: shall we use one-sided: less or greater according to "top1_match_rate"
except ValueError: # ValueError: All numbers are identical in mannwhitneyu
utest_statistic = None
utest_pvalue = None
ttest_statistic, ttest_pvalue = scipy.stats.ttest_ind(a, b, axis=None, equal_var=True)
if utest_pvalue is not None and utest_pvalue < 0.05:
if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]):
utest_wins[result1["run_id"]] += 1
else:
utest_wins[result2["run_id"]] += 1
if ttest_pvalue < 0.05:
if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]):
ttest_wins[result1["run_id"]] += 1
else:
ttest_wins[result2["run_id"]] += 1
row = {
"model_name": result1["model_name"],
"run_id_1": result1["run_id"],
"experiment_1": result1["experiment"],
"top1_match_rate_1": float(result1["top1_match_rate"]),
"run_id_2": result2["run_id"],
"experiment_2": result2["experiment"],
"top1_match_rate_2": float(result2["top1_match_rate"]),
"U_statistic": utest_statistic,
"U_pvalue": utest_pvalue,
"T_statistic": ttest_statistic,
"T_pvalue": ttest_pvalue,
}
writer.writerow(row)
logger.info(f"U-Test and T-Test results are output to {output_csv_path}")
print_wins(utest_wins, rows, "U-Test")
print_wins(ttest_wins, rows, "T-Test")
def get_last_matmul_node_name(raw_onnx_model: str):
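# The node that produces the first graph output (logits) is expected to be the final MatMul (the LM head
# projection). Its name is passed to --node_block_list later so that this node stays in FP32 in mixed-precision runs.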
model = onnx.load(raw_onnx_model)
onnx_model = OnnxModel(model)
output_name_to_node = onnx_model.output_name_to_node()
assert model.graph.output[0].name in output_name_to_node
node = output_name_to_node[model.graph.output[0].name]
if node.op_type == "MatMul":
logger.info(f"Found last MatMul node for logits: {node.name}")
return node.name
logger.warning(f"Failed to find MatMul node for logits. Found {node.op_type} of node {node.name}")
return None
def get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list):
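# Build convert_to_onnx arguments for the mixed-precision baseline: fp16 overall, but the logits output and
# the last MatMul node are kept in fp32, plus any operator types given in op_block_list.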
model = args.model_name_or_path
parameters = f"-m {model} -o --use_gpu -p fp16".split()
if args.use_external_data_format:
parameters.append("--use_external_data_format")
parameters += [
"--io_block_list",
"logits",
"--node_block_list",
last_matmul_node_name,
]
if op_block_list:
parameters.extend(["--op_block_list", *op_block_list])
return parameters
def run_candidate(
task: ParityTask,
args,
last_matmul_node_name,
op_block_list=["FastGelu", "LayerNormalization"], # noqa: B006
):
parameters = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list)
op_block_list_str = ",".join(sorted(op_block_list))
if op_block_list:
name = f"Mixed precision baseline + {op_block_list_str} in FP32"
else:
name = f"Mixed precision baseline (logits output and last MatMul node {last_matmul_node_name} in FP32)"
env_vars = get_ort_environment_variables()
if env_vars:
name = name + f" ({env_vars})"
task.run(parameters, name)
def get_baselines(args):
model = args.model_name_or_path
fp32_baseline = f"-m {model} -o -p fp32".split()
if args.use_gpu:
fp32_baseline.append("--use_gpu")
if args.use_external_data_format:
fp32_baseline.append("--use_external_data_format")
fp16_baseline = f"-m {model} -o --use_gpu -p fp16".split()
if args.use_external_data_format:
fp16_baseline.append("--use_external_data_format")
return fp32_baseline, fp16_baseline
def run_tuning_step0(task, fp16_baseline, all_ops, optimized_ops):
"""Step 0 is to check which operator in FP16 causes most loss"""
fp32_logits = ["--io_block_list", "logits"]
task.run(fp16_baseline + fp32_logits, "FP16 except logits")
fp32_io = ["--keep_io_types"]
task.run(fp16_baseline + fp32_io, "Graph I/O FP32, Other FP16")
# Only weights in FP16
task.run(
fp16_baseline + fp32_io + ["--op_block_list"] + [o for o in all_ops] + ["--force_fp16_initializers"],
"FP32 except weights in FP16",
)
optimized_ops_results = []
op_list = optimized_ops
for op in op_list:
op_block_list = ["--op_block_list"] + [o for o in op_list if o != op]
result = task.run(fp16_baseline + fp32_io + op_block_list, f"FP32 except {op} in FP16")
if result:
optimized_ops_results.append(result)
# Check which optimized operator causes the most loss in precision
min_result = min(optimized_ops_results, key=lambda y: y["top1_match_rate"])
print("step 0: optimized operator causes the most loss in precision", min_result)
def run_tuning_step1(task, mixed_precision_baseline, optimized_ops):
"""Step 1 is to figure out which optimized operator in FP32 could benefit most"""
for op in optimized_ops:
op_block_list = ["--op_block_list", op]
task.run(
mixed_precision_baseline + op_block_list,
f"Mixed precision baseline + {op} in FP32",
)
def run_tuning_step2(task, mixed_precision_baseline, optimized_ops):
"""Assumed that you have run step 0 and 1 to figure out that Logits FP32 and some operators shall be in FP32,
This step will try add one more operator.
"""
candidate_fp32_ops = ["FastGelu", "LayerNormalization", "SkipLayerNormalization"]
fp32_ops = [x for x in candidate_fp32_ops if x in optimized_ops]
for op in optimized_ops:
if op not in fp32_ops:
op_block_list = [*fp32_ops, op]
task.run(
[*mixed_precision_baseline, "--op_block_list", *op_block_list],
"Mixed precision baseline + {},{} in FP32".format(",".join(fp32_ops), op),
)
def run_parity(task: ParityTask, args):
onnx_model_paths = Gpt2Helper.get_onnx_paths(
"onnx_models",
args.model_name_or_path,
new_folder=args.use_external_data_format,
remove_existing=[],
)
fp32_baseline, fp16_baseline = get_baselines(args)
result = task.run(fp32_baseline, "FP32 baseline")
optimized_ops = []
if result and ("optimized_operators" in result) and result["optimized_operators"]:
optimized_ops = result["optimized_operators"].split(",")
else:
raise RuntimeError("Failed to get optimized operators")
all_ops = []
if result and ("operators" in result) and result["operators"]:
all_ops = result["operators"].split(",")
else:
raise RuntimeError("Failed to get operators")
# The following fp16 tests require GPU
if not args.use_gpu:
logger.info("skip mixed precision since --use_gpu is not specified")
return
task.run(fp16_baseline, "FP16 baseline")
last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"])
# Mixed precision baseline
run_candidate(task, args, last_matmul_node_name, op_block_list=[])
def get_fp32_ops(x):
return [op for op in x if op in all_ops]
if args.all:
run_tuning_step0(task, fp16_baseline, all_ops, optimized_ops)
mixed_precision_baseline = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list=[])
run_tuning_step1(task, mixed_precision_baseline, optimized_ops)
run_tuning_step2(task, mixed_precision_baseline, optimized_ops)
else:
run_candidate(
task,
args,
last_matmul_node_name,
op_block_list=get_fp32_ops(["SkipLayerNormalization", "LayerNormalization", "Add"]),
)
run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu"])
# Run a few good candidates
run_candidate(
task,
args,
last_matmul_node_name,
op_block_list=get_fp32_ops(["FastGelu", "SkipLayerNormalization", "LayerNormalization", "Add"]),
)
run_candidate(
task,
args,
last_matmul_node_name,
op_block_list=get_fp32_ops(
["FastGelu", "EmbedLayerNormalization", "SkipLayerNormalization", "LayerNormalization", "Add"]
),
)
if __name__ == "__main__":
args = parse_arguments()
setup_logger(args.verbose)
if args.test_cases < 100 or args.runs < 20 or args.test_cases * args.runs < 10000:
logger.warning(
"Not enough test cases or runs to get stable results or test significance. "
"Recommend test_cases >= 100, runs >= 20, test_cases * runs >= 10000."
)
if os.path.exists(args.csv) and not args.skip_test:
if not args.overwrite:
raise RuntimeError(
f"Output file {args.csv} existed. Please remove the file, or use either --skip_test or --overwrite."
)
else:
logger.info("Remove existing file %s since --overwrite is specified", args.csv)
os.remove(args.csv)
task = ParityTask(args.test_cases, args.runs, args.csv)
if not args.skip_test:
run_parity(task, args)
try:
rows = load_results_from_csv(task.csv_path)
except Exception:
logger.exception(f"Failed to load csv {task.csv_path}")
rows = task.results
logger.info("Start running significance tests...")
summary_csv = task.csv_path.replace(".csv", ".stats.csv")
run_significance_test(rows, summary_csv)


@@ -0,0 +1,501 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script helps evaluate the GPT-2 model.
import logging
import math
import os
import statistics
import timeit
import numpy
import torch
from benchmark_helper import Precision
from gpt2_helper import Gpt2Helper, Gpt2Inputs
logger = logging.getLogger(__name__)
class Gpt2Metric:
def __init__(self, treatment_name, baseline_name="Torch", top_k=20):
assert top_k > 1 and top_k <= 100
self.baseline = baseline_name
self.treatment = treatment_name
self.name: str = f"{treatment_name} vs {baseline_name}"
self.top_k = top_k
self.top_1_error: int = 0
self.top_k_error: int = 0
self.total_samples: int = 0
self.max_logits_diff: float = 0 # for non-empty past state
self.max_logits_diff_no_past: float = 0 # for empty past state
self.batch_top1_error: torch.FloatTensor = None # top 1 error for current batch
self.batch_topk_error: torch.FloatTensor = None # top k error for current batch
self.seq_len_latency = {}
def print(self):
if self.baseline != self.treatment:
print("---")
print(f"Metrics for {self.treatment} (baseline={self.baseline}):")
if self.total_samples > 0:
top_1_error_rate = 100.0 * self.top_1_error / self.total_samples
top_k_error_rate = 100.0 * self.top_k_error / self.total_samples
print(
f"Total={self.total_samples} Top1Error={self.top_1_error} ({top_1_error_rate:.2f}%) Top{self.top_k}Error={self.top_k_error} ({top_k_error_rate:.2f}%)"
)
print("Max logits diffs:")
print(f"\twith past = {self.max_logits_diff:.6f}")
print(f"\tempty past = {self.max_logits_diff_no_past:.6f}")
else:
print(f"Metrics for {self.treatment} (baseline):")
if self.seq_len_latency:
print("Past sequence length range and average latency:")
total = 0
count = 0
for key in sorted(self.seq_len_latency.keys()):
average = statistics.mean(self.seq_len_latency[key]) * 1000.0
if key == 0:
print(f"\t{key}: \t{average:.2f} ms")
else:
print(f"\t[{2**key}, {2 ** (key + 1) - 1}]:\t{average:.2f} ms")
total += average * len(self.seq_len_latency[key])
count += len(self.seq_len_latency[key])
print(f"Average Latency: {total / count:.2f} ms")
def diff_logits(self, baseline_logits, treatment_logits, is_empty_past: bool):
diff = (baseline_logits - treatment_logits).abs().max()
if is_empty_past:
self.max_logits_diff_no_past = max(self.max_logits_diff_no_past, diff)
else:
self.max_logits_diff = max(self.max_logits_diff, diff)
return diff
def start_batch(self, batch_size: int):
self.total_samples += batch_size
self.batch_top1_error = torch.zeros((batch_size, 1), dtype=torch.bool)
self.batch_topk_error = torch.zeros((batch_size, 1), dtype=torch.bool)
def eval_batch(self, baseline, treatment, past_seq_len, verbose=True):
self._eval_topk(baseline.top_1_tokens, treatment.top_1_tokens, 1, verbose)
self._eval_topk(baseline.top_k_tokens, treatment.top_k_tokens, self.top_k, verbose)
max_diff = self.diff_logits(baseline.logits, treatment.logits, past_seq_len == 0)
if verbose:
print(f"Max logits diffs of {self.name}: {max_diff}")
def _eval_topk(self, baseline_topk, treatment_topk, top_k, verbose=True):
if not torch.all(torch.eq(baseline_topk, treatment_topk)):
if top_k == 1:
if verbose:
print(f"Generated tokens not matched for {self.name}")
self.batch_top1_error |= torch.eq(baseline_topk, treatment_topk).logical_not()
else:
if verbose:
print(
f"Top {top_k} tokens not matched for {self.name}. This will lead to wrong beam search results"
)
self.batch_topk_error |= (
torch.eq(baseline_topk, treatment_topk).logical_not().sum(1).unsqueeze(dim=1) > 0
)
def end_batch(self):
self.top_1_error += self.batch_top1_error.sum()
self.top_k_error += self.batch_topk_error.sum()
def add_latency(self, past_seq_len, latency):
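# Bucket latency by past length: key 0 is the empty past; key k covers past lengths in [2**(k-1), 2**k - 1].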
key = int(math.log2(past_seq_len)) + 1 if past_seq_len > 0 else 0
if key not in self.seq_len_latency:
self.seq_len_latency[key] = []
self.seq_len_latency[key].append(latency)
class Gpt2Tester:
def __init__(
self,
input_ids,
position_ids,
attention_mask,
num_attention_heads,
hidden_size,
num_layer,
device,
is_fp16=False,
top_k=20,
top_k_required_order=False,
):
self.batch_size = input_ids.shape[0]
self.input_length = input_ids.shape[1]
self.n_layer = num_layer
self.input_ids = input_ids
self.position_ids = position_ids
self.attention_mask = attention_mask
self.has_position_ids = position_ids is not None
self.has_attention_mask = attention_mask is not None
# Empty past state for first inference
self.past = []
past_shape = [
2,
self.batch_size,
num_attention_heads,
0,
hidden_size // num_attention_heads,
]
for _i in range(num_layer):
empty_past = torch.empty(past_shape).type(torch.float16 if is_fp16 else torch.float32)
self.past.append(empty_past.to(device))
self.logits = None
self.top_1_tokens = None
self.top_k_tokens = None
self.top_k = top_k
self.top_k_required_order = top_k_required_order
def get_inputs(self) -> Gpt2Inputs:
return Gpt2Inputs(self.input_ids, self.position_ids, self.attention_mask, self.past)
def save_test_data(self, session, output, save_test_data_dir, test_case_id):
from onnx import numpy_helper
path = os.path.join(save_test_data_dir, "test_data_set_" + str(test_case_id))
if os.path.exists(path):
print(f"Directory {path} existed. Skip saving test data")
return
os.makedirs(path, exist_ok=True)
def add_tensor(input_tensors, torch_tensor, name):
input_tensors.append(numpy_helper.from_array(torch_tensor.clone().cpu().numpy(), name))
input_tensors = []
add_tensor(input_tensors, self.input_ids, "input_ids")
if self.has_position_ids:
add_tensor(input_tensors, self.position_ids, "position_ids")
if self.has_attention_mask:
add_tensor(input_tensors, self.attention_mask, "attention_mask")
for i in range(self.n_layer):
add_tensor(input_tensors, self.past[i], "past_" + str(i))
for i, tensor in enumerate(input_tensors):
with open(os.path.join(path, f"input_{i}.pb"), "wb") as f:
f.write(tensor.SerializeToString())
output_names = [output.name for output in session.get_outputs()]
for i, _name in enumerate(output_names):
tensor = numpy_helper.from_array(
output[i] if isinstance(output[i], numpy.ndarray) else output[i].clone().cpu().numpy()
)
with open(os.path.join(path, f"output_{i}.pb"), "wb") as f:
f.write(tensor.SerializeToString())
print(f"Test data saved to directory {path}")
def update(self, output, step, device):
"""
Update the inputs for next inference.
"""
self.logits = (
torch.from_numpy(output[0]) if isinstance(output[0], numpy.ndarray) else output[0].clone().detach().cpu()
)
self.top_1_tokens = Gpt2Tester.predict_next_token(self.logits)
self.top_k_tokens = Gpt2Tester.predict_next_token(self.logits, self.top_k, self.top_k_required_order)
self.input_ids = self.top_1_tokens.clone().detach().reshape([self.batch_size, 1]).to(device)
if self.has_position_ids:
self.position_ids = (
torch.tensor([self.input_length + step - 1]).unsqueeze(0).repeat(self.batch_size, 1).to(device)
)
if self.has_attention_mask:
self.attention_mask = torch.cat(
[
self.attention_mask,
torch.ones([self.batch_size, 1]).type_as(self.attention_mask),
],
1,
).to(device)
self.past = []
if isinstance(output[1], tuple): # past in torch output is tuple
self.past = list(output[1])
else:
for i in range(self.n_layer):
past_i = (
torch.from_numpy(output[i + 1])
if isinstance(output[i + 1], numpy.ndarray)
else output[i + 1].clone().detach()
)
self.past.append(past_i.to(device))
def diff(self, baseline):
"""
Compare inputs and logits output.
"""
print("start diff...")
if self.logits is not None:
max_io_diff = (self.logits - baseline.logits).abs().max()
if max_io_diff > 1e-4:
print(f"Max logits difference is too large: {max_io_diff}")
if not torch.all(self.input_ids == baseline.input_ids):
print("Input_ids is different", self.input_ids, baseline.input_ids)
if self.has_position_ids:
if not torch.all(self.position_ids == baseline.position_ids):
print(
"position_ids is different",
self.position_ids,
baseline.position_ids,
)
if self.has_attention_mask:
if not torch.all(self.attention_mask == baseline.attention_mask):
print(
"attention_mask is different",
self.attention_mask,
baseline.attention_mask,
)
assert len(self.past) == len(baseline.past)
for i, past_i in enumerate(self.past):
assert past_i.shape == baseline.past[i].shape
if past_i.nelement() > 0:
max_past_diff = (past_i - baseline.past[i]).abs().max()
if max_past_diff > 1e-4:
print(f"max_past_diff[{i}]={max_past_diff}")
@staticmethod
def predict_next_token(logits, top_k=1, required_order=False):
"""
Get top k tokens based on logits.
"""
# logits has shape (batch_size, seq_len, vocab_size)
# last token logits has shape (batch_size, vocab_size)
lastTokenLogits = logits[:, -1] # noqa: N806
if top_k == 1:
generatedTokens = torch.argmax(lastTokenLogits, 1, True) # noqa: N806
return generatedTokens
else:
topk = torch.argsort(lastTokenLogits, -1, descending=True)[:, :top_k]
if not required_order:
sorted_topk, _ = topk.sort()
return sorted_topk
return topk
@staticmethod
def diff_present(onnx_output, onnx_io_output, n_layer):
"""
Compare the present state outputs from two ONNX Runtime runs.
"""
present_diff_max = []
for i in range(n_layer):
onnx_present_i = (
torch.from_numpy(onnx_output[i + 1])
if isinstance(onnx_output[i + 1], numpy.ndarray)
else onnx_output[i + 1]
)
onnx_io_present_i = (
torch.from_numpy(onnx_io_output[i + 1])
if isinstance(onnx_io_output[i + 1], numpy.ndarray)
else onnx_io_output[i + 1]
)
max_diff = (onnx_present_i - onnx_io_present_i).abs().max()
present_diff_max.append(max_diff)
print(f"present_diff_max={present_diff_max}")
@staticmethod
def is_quantized_onnx_model(onnx_model_path):
"""
Returns True if the ONNX model is quantized.
"""
from onnx import load
model = load(onnx_model_path)
from onnxruntime.quantization.quantize import __producer__ as quantize_producer
return model.producer_name == quantize_producer
@staticmethod
def test_generation(
session,
model,
device,
test_inputs,
precision=Precision.FLOAT32,
model_class="Gpt2LMHeadModel",
top_k=20,
top_k_no_order=True,
max_steps=24,
max_inputs=0,
verbose=False,
save_test_data=0,
save_test_data_dir=".",
):
"""
Test generation using greedy search (without sampling) to compare the PyTorch and ONNX models.
It will print top 1 and top k errors on the given test inputs.
"""
print(
f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
)
n_layer = model.config.n_layer
n_head = model.config.n_head
n_embd = model.config.n_embd
eos_token_id = model.config.eos_token_id
test_data_saved = 0
is_float16 = precision == Precision.FLOAT16
if is_float16:
assert "float16" in session.get_outputs()[0].type
# We still use the fp32 torch model as the baseline when the onnx model is fp16
model.eval().to(device)
# Allocate initial buffers for IO Binding of ONNX Runtime. The buffer size will automatically increase later.
init_output_shapes = Gpt2Helper.get_output_shapes(
batch_size=4,
past_sequence_length=128,
sequence_length=32,
config=model.config,
model_class=model_class,
)
output_buffers = Gpt2Helper.get_output_buffers(init_output_shapes, device, is_float16=is_float16)
baseline_name = "Torch"
treatment_name = "Quantized Onnx" if precision == Precision.INT8 else "Onnx"
torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
onnx_io_metric = Gpt2Metric(treatment_name + " with IO Binding", baseline_name, top_k)
for i, inputs in enumerate(test_inputs):
if max_inputs > 0 and i == max_inputs:
break
if i % 10 == 0:
print(f"{i}")
input_ids = inputs["input_ids"]
position_ids = inputs.get("position_ids", None)
attention_mask = inputs.get("attention_mask", None)
onnx_runner = Gpt2Tester(
input_ids,
position_ids,
attention_mask,
n_head,
n_embd,
n_layer,
device,
is_float16,
top_k,
not top_k_no_order,
)
onnx_io_runner = Gpt2Tester(
input_ids,
position_ids,
attention_mask,
n_head,
n_embd,
n_layer,
device,
is_float16,
top_k,
not top_k_no_order,
)
torch_runner = Gpt2Tester(
input_ids,
position_ids,
attention_mask,
n_head,
n_embd,
n_layer,
device,
False,
top_k,
not top_k_no_order,
) # Torch model baseline is fp32
batch_size = torch_runner.batch_size
onnx_metric.start_batch(batch_size)
onnx_io_metric.start_batch(batch_size)
with torch.no_grad():
done = torch.zeros(batch_size, dtype=torch.bool)
for step in range(max_steps):
seq_len = list(onnx_runner.input_ids.size())[1]
past_seq_len = list(onnx_runner.past[0].size())[3]
start_time = timeit.default_timer()
pytorch_output = Gpt2Helper.pytorch_inference(model, torch_runner.get_inputs())
torch_metric.add_latency(past_seq_len, timeit.default_timer() - start_time)
torch_runner.update(pytorch_output, step, device)
onnx_output, avg_latency_ms = Gpt2Helper.onnxruntime_inference(
session, onnx_runner.get_inputs(), total_runs=1
)
onnx_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
onnx_runner.update(onnx_output, step, device)
output_shapes = Gpt2Helper.get_output_shapes(
batch_size,
past_seq_len,
seq_len,
model.config,
model_class=model_class,
)
Gpt2Helper.auto_increase_buffer_size(output_buffers, output_shapes)
(
onnx_io_output,
avg_latency_ms,
) = Gpt2Helper.onnxruntime_inference_with_binded_io(
session,
onnx_io_runner.get_inputs(),
output_buffers,
output_shapes,
total_runs=1,
return_numpy=False,
include_copy_output_latency=True,
)
onnx_io_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
if test_data_saved < save_test_data:
onnx_io_runner.save_test_data(session, onnx_io_output, save_test_data_dir, test_data_saved)
test_data_saved += 1
onnx_io_runner.update(onnx_io_output, step, device)
if verbose:
onnx_runner.diff(onnx_io_runner)
Gpt2Tester.diff_present(onnx_output, onnx_io_output, n_layer)
print("Top 1 tokens:")
print("\tTorch", torch_runner.top_1_tokens)
print("\tONNX", onnx_runner.top_1_tokens)
print("\tONNX with IO binding", onnx_io_runner.top_1_tokens)
onnx_metric.eval_batch(torch_runner, onnx_runner, past_seq_len, verbose=verbose)
onnx_io_metric.eval_batch(torch_runner, onnx_io_runner, past_seq_len, verbose=verbose)
done = done | (torch_runner.top_1_tokens == eos_token_id).any()
if torch.all(done):
break
onnx_metric.end_batch()
onnx_io_metric.end_batch()
torch_metric.print()
onnx_metric.print()
onnx_io_metric.print()


@@ -0,0 +1,146 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script helps debug parity issues between the fp16 and fp32 versions of the same onnx model
# Please build ORT with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON
import math
import multiprocessing
import os
from pathlib import Path
import numpy
import torch
from benchmark_helper import create_onnxruntime_session
from gpt2_helper import Gpt2Helper
from onnx import TensorProto, numpy_helper
NON_ZERO_VALUE = str(1)
ZERO_VALUE = str(0)
def environ_setting_nodes(node_name_filter=None, node_type_filter=None):
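# These ORT_DEBUG_NODE_IO_* variables take effect only in an ORT build with
# onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON (see the note at the top of this file).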
# Set I/O data as default
os.environ["ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA"] = ZERO_VALUE
os.environ["ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA"] = NON_ZERO_VALUE
os.environ["ORT_DEBUG_NODE_IO_DUMP_OUTPUT_DATA"] = NON_ZERO_VALUE
if node_name_filter is not None:
os.environ["ORT_DEBUG_NODE_IO_NAME_FILTER"] = node_name_filter
elif node_type_filter is not None:
os.environ["ORT_DEBUG_NODE_IO_OP_TYPE_FILTER"] = node_type_filter
else:
os.environ["ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK"] = NON_ZERO_VALUE
def environ_setting_paths(output_path):
# Set dumping values to files as default
os.environ["ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION"] = "files"
os.environ["ORT_DEBUG_NODE_IO_OUTPUT_DIR"] = output_path
def environ_reset():
for flag in [
"ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA",
"ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA",
"ORT_DEBUG_NODE_IO_DUMP_OUTPUT_DATA",
"ORT_DEBUG_NODE_IO_NAME_FILTER",
"ORT_DEBUG_NODE_IO_OP_TYPE_FILTER",
"ORT_DEBUG_NODE_IO_DUMP_DATA_TO_FILES",
"ORT_DEBUG_NODE_IO_OUTPUT_DIR",
"ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK",
]:
if flag in os.environ:
del os.environ[flag]
def inference(model_path, dummy_inputs, outputs_path, use_gpu):
environ_reset()
environ_setting_nodes()
environ_setting_paths(outputs_path)
session = create_onnxruntime_session(model_path, use_gpu, enable_all_optimization=False)
Gpt2Helper.onnxruntime_inference(session, dummy_inputs)
def generate_outputs_files(model_path, dummy_inputs, outputs_path, use_gpu):
dir_path = Path(outputs_path)
if dir_path.exists() and dir_path.is_dir():
import shutil
shutil.rmtree(outputs_path)
dir_path.mkdir(parents=True, exist_ok=True)
process = multiprocessing.Process(target=inference, args=(model_path, dummy_inputs, outputs_path, use_gpu))
process.start()
process.join()
def post_processing(outputs_path, outputs_path_other):
# Compare outputs with e.g. fp16 and fp32
record = {}
if_close = {}
import glob
for filename in glob.glob(os.path.join(outputs_path, "*.tensorproto")):
filename_other = os.path.join(outputs_path_other, Path(filename).name)
if not os.path.exists(filename_other):
continue
with open(filename, "rb") as f:
tensor = TensorProto()
tensor.ParseFromString(f.read())
array = numpy_helper.to_array(tensor)
with open(filename_other, "rb") as f: # noqa: PLW2901
tensor_other = TensorProto()
tensor_other.ParseFromString(f.read())
array_other = numpy_helper.to_array(tensor_other)
if array_other.size == 0:
continue
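# Mean relative difference between the two dumped tensors; the 1e-6 term avoids division by zero.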
diff = numpy.average(numpy.abs(array_other - array) / (numpy.abs(array_other) + 1e-6))
if math.isnan(diff):
continue
record[Path(filename).name.split(".")[0]] = diff
if_close[Path(filename).name.split(".")[0]] = numpy.allclose(array, array_other, rtol=1e-04, atol=1e-04)
results = ["Node\tDiff\tClose"]
for k, v in sorted(record.items(), key=lambda x: x[1], reverse=True):
results.append(f"{k}\t{v}\t{if_close[k]}")
for line in results:
print(line)
if __name__ == "__main__":
# Below example shows how to use this helper to investigate parity issue of gpt-2 fp32 and fp16 onnx model
# Please build ORT with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON !!
multiprocessing.set_start_method("spawn")
# Generate Inputs
sequence_length = 8
past_sequence_length = 8
batch_size = 5
dummy_inputs_fp16 = Gpt2Helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
12,
768,
12,
50257,
device=torch.device("cpu"),
float16=True,
)
dummy_inputs_fp32 = dummy_inputs_fp16.to_fp32()
# Get GPT-2 model from huggingface using convert_to_onnx.py
os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu")
os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp16.onnx -o -p fp16 --use_gpu")
# Specify the directory to dump the node's I/O
outputs_path_fp32_gpu = "./fp32_gpu"
outputs_path_fp16_gpu = "./fp16_gpu"
generate_outputs_files("./gpt2_fp32.onnx", dummy_inputs_fp32, outputs_path_fp32_gpu, use_gpu=True)
generate_outputs_files("./gpt2_fp16.onnx", dummy_inputs_fp16, outputs_path_fp16_gpu, use_gpu=True)
# Compare each node's I/O value and sort based on average rtol
post_processing(outputs_path_fp16_gpu, outputs_path_fp32_gpu)