I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,12 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import os.path
import sys
sys.path.append(os.path.dirname(__file__))
transformers_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
if transformers_dir not in sys.path:
sys.path.append(transformers_dir)


@@ -0,0 +1,413 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script benchmarks the GPT-2 model with past state.
# For the GPT-2 model without past state, use benchmark.py to measure performance.
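# Example invocation (assuming this file is saved as benchmark_gpt2.py; the file name is not shown in this diff):
#   python benchmark_gpt2.py -m gpt2 --use_gpu -o -p fp16 -b 1 8 --past_sequence_lengths 8 32 128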
import argparse
import csv
import logging
import os
from datetime import datetime
import psutil
import torch
from benchmark_helper import (
Precision,
create_onnxruntime_session,
get_ort_environment_variables,
prepare_environment,
setup_logger,
)
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from packaging import version
from quantize_helper import QuantizeHelper
from transformers import AutoConfig
from transformers import __version__ as transformers_version
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name selected in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument(
"--model_class",
required=False,
type=str,
default="GPT2LMHeadModel",
choices=list(MODEL_CLASSES.keys()),
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--cache_dir",
required=False,
type=str,
default=os.path.join(".", "cache_models"),
help="Directory to cache pre-trained models",
)
parser.add_argument(
"--onnx_dir",
required=False,
type=str,
default=os.path.join(".", "onnx_models"),
help="Directory to store onnx models",
)
parser.add_argument(
"--test_times",
required=False,
default=100,
type=int,
help="Number of repeat times to get average inference latency.",
)
parser.add_argument(
"-v",
"--validate_onnx",
required=False,
action="store_true",
help="Validate ONNX model",
)
parser.add_argument(
"-o",
"--optimize_onnx",
required=False,
action="store_true",
help="Use optimizer.py to optimize onnx model",
)
parser.set_defaults(optimize_onnx=False)
parser.add_argument(
"--stage",
type=int,
default=0,
required=False,
choices=[0, 1, 2],
help="Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). "
"1 - decode the first token when past_sequence_length is zero; "
"2 - decode the remaining tokens when past_sequence_length is not zero; "
"0 - one onnx model for both stages 1 and 2. "
"Note that we will optimize 1 and 2 differently for best performance.",
)
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument(
"-p",
"--precision",
type=Precision,
default=Precision.FLOAT32,
choices=list(Precision),
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
)
parser.add_argument("--torchscript", required=False, action="store_true", help="use Torchscript")
parser.set_defaults(torchscript=False)
parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1], help="batch size")
parser.add_argument(
"--sequence_lengths",
nargs="+",
type=int,
default=[1],
help="sequence lengths (excluding past)",
)
parser.add_argument(
"-s",
"--past_sequence_lengths",
nargs="+",
type=int,
default=[8, 16, 32, 64, 128, 256],
help="past sequence lengths",
)
parser.add_argument(
"-r",
"--result_csv",
required=False,
default=None,
help="CSV file for saving summary results.",
)
parser.add_argument("--thread_num", required=False, type=int, default=-1, help="Threads to use")
parser.add_argument("--include_copy_output_latency", required=False, action="store_true")
parser.set_defaults(include_copy_output_latency=False)
parser.add_argument("--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument("--output_torch_latency", required=False, action="store_true")
parser.set_defaults(output_torch_latency=False)
parser.add_argument("--disable_io_binding", required=False, action="store_true")
parser.set_defaults(disable_io_binding=False)
args = parser.parse_args(argv)
return args
def main(args):
if version.parse(transformers_version) < version.parse(
"3.1.0"
): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
logger.info(f"Arguments:{args}")
if args.precision == Precision.FLOAT16:
assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"
if args.precision == Precision.INT8:
assert not args.use_gpu, "quantization only supports CPU"
if args.stage == 1:
assert args.past_sequence_lengths == [0], "past_sequence_lengths shall be 0 for stage==1 (init decoder)"
torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
print(torch.__config__.parallel_info())
cache_dir = args.cache_dir
output_dir = args.onnx_dir
prepare_environment(cache_dir, output_dir, args.use_gpu)
model_class = MODEL_CLASSES[args.model_class][0]
gpt2helper = Gpt2Helper
config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
# This script does not support float16 for PyTorch.
# if args.float16:
# model.half()
device = torch.device("cuda:0" if args.use_gpu else "cpu")
model.to(device)
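# A single-file ONNX (protobuf) model is limited to 2 GB, so larger models must use the external data format.
# The n_layer > 24 check below is only a rough heuristic for that size (see the TODO).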
use_external_data_format = config.n_layer > 24 # TODO: find a way to check model size > 2GB
onnx_model_paths = gpt2helper.get_onnx_paths(
output_dir,
args.model_name_or_path,
args.model_class,
has_past=True,
new_folder=use_external_data_format,
)
onnx_model_path = onnx_model_paths["raw"]
use_padding = MODEL_CLASSES[args.model_class][2]
gpt2helper.export_onnx(
model,
device,
onnx_model_path,
args.verbose,
use_external_data_format,
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
if args.optimize_onnx or args.precision != Precision.FLOAT32:
onnx_model_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else "fp32"]
gpt2helper.optimize_onnx(
onnx_model_paths["raw"],
onnx_model_path,
args.precision == Precision.FLOAT16,
model.config.num_attention_heads,
model.config.hidden_size,
use_external_data_format,
auto_mixed_precision=True,
stage=args.stage,
)
if args.precision == Precision.INT8:
logger.info("quantizing model...")
QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_paths["int8"], use_external_data_format)
model = QuantizeHelper.quantize_torch_model(model)
logger.info("finished quantizing model")
onnx_model_path = onnx_model_paths["int8"]
if args.torchscript:
model = gpt2helper.torchscript(
model,
config,
device,
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
session = create_onnxruntime_session(
onnx_model_path,
args.use_gpu,
enable_all_optimization=False,
num_threads=args.thread_num,
verbose=args.verbose,
)
if session is None:
return
# Allocate output buffers for IO Binding
max_output_shapes = gpt2helper.get_output_shapes(
max(args.batch_sizes),
max(args.past_sequence_lengths),
max(args.sequence_lengths),
config,
args.model_class,
)
output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)
csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = [
"model_name",
"model_class",
"stage",
"environment_variables",
"gpu",
"precision",
"optimizer",
"torchscript",
"batch_size",
"sequence_length",
"past_sequence_length",
"disable_io_binding",
"torch_latency",
"onnxruntime_latency",
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
csv_writer.writeheader()
for batch_size in args.batch_sizes:
for sequence_length in args.sequence_lengths:
for past_sequence_length in args.past_sequence_lengths:
assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
logger.debug(
"Running test for batch_size=%d sequence_length=%d past_sequence_length=%d ...",
batch_size,
sequence_length,
past_sequence_length,
)
dummy_inputs = gpt2helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
config.num_attention_heads,
config.hidden_size,
config.n_layer,
config.vocab_size,
device,
float16=(args.precision == Precision.FLOAT16),
has_position_ids=use_padding,
has_attention_mask=use_padding,
)
output_shapes = gpt2helper.get_output_shapes(
batch_size,
past_sequence_length,
sequence_length,
config,
args.model_class,
)
try:
if args.validate_onnx or args.output_torch_latency:
outputs, torch_latency = gpt2helper.pytorch_inference(model, dummy_inputs, args.test_times)
# Dump Torch output shape
for i, value in enumerate(outputs):
if isinstance(value, tuple):
logger.debug(
f"torch output {i} is tuple of size {len(value)}, shape {value[0].shape}"
)
else:
logger.debug(f"torch output {i} shape {value.shape}")
else:
outputs = None
torch_latency = None
if args.disable_io_binding:
ort_outputs, ort_latency = gpt2helper.onnxruntime_inference(
session, dummy_inputs, args.test_times
)
else:
ort_outputs, ort_latency = gpt2helper.onnxruntime_inference_with_binded_io(
session,
dummy_inputs,
output_buffers,
output_shapes,
args.test_times,
return_numpy=False,
include_copy_output_latency=args.include_copy_output_latency,
)
if args.validate_onnx:
copy_outputs = ort_outputs
if not args.disable_io_binding:
# Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
copy_outputs = []
for output in ort_outputs:
copy_outputs.append(output.cpu().numpy())
if gpt2helper.compare_outputs(
outputs,
copy_outputs,
model_class=args.model_class,
rtol=DEFAULT_TOLERANCE[args.precision],
atol=DEFAULT_TOLERANCE[args.precision],
):
logger.info(
f"Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]})."
)
logger.info(
"batch_size=%d, sequence_length=%d, past_sequence_length=%d, onnxruntime_latency=%.2f %s %s",
batch_size,
sequence_length,
past_sequence_length,
ort_latency,
"(disable_io_binding)" if args.disable_io_binding else "",
f", torch_latency={torch_latency}" if torch_latency else "",
)
row = {
"model_name": args.model_name_or_path,
"model_class": args.model_class,
"stage": args.stage,
"environment_variables": get_ort_environment_variables(),
"gpu": args.use_gpu,
"precision": args.precision,
"optimizer": args.optimize_onnx,
"torchscript": args.torchscript,
"batch_size": batch_size,
"sequence_length": sequence_length,
"past_sequence_length": past_sequence_length,
"disable_io_binding": args.disable_io_binding,
"torch_latency": f"{torch_latency:.2f}" if torch_latency else "None",
"onnxruntime_latency": f"{ort_latency:.2f}",
}
csv_writer.writerow(row)
except Exception:
logger.error("Exception", exc_info=True) # noqa: G201
return None
logger.info(f"Results are saved to file {csv_filename}")
return csv_filename
if __name__ == "__main__":
args = parse_arguments()
setup_logger(args.verbose)
main(args)


@@ -0,0 +1,557 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""
This converts GPT2 model to onnx. Examples:
(1) Convert pretrained model 'gpt2' to ONNX
python convert_to_onnx.py -m gpt2 --output gpt2.onnx
(2) Convert pretrained model 'distilgpt2' to ONNX, and use optimizer to get float16 model.
python convert_to_onnx.py -m distilgpt2 --output distilgpt2_fp16.onnx -o -p fp16
(3) Convert a model check point to ONNX, and run optimization and int8 quantization
python convert_to_onnx.py -m ./my_model_checkpoint/ --output my_model_int8.onnx -o -p int8
"""
import argparse
import csv
import json
import logging
import os
import shutil
import sys
from pathlib import Path
import numpy
import torch
from benchmark_helper import (
Precision,
create_onnxruntime_session,
get_ort_environment_variables,
prepare_environment,
setup_logger,
)
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from gpt2_tester import Gpt2Tester
from packaging import version
from quantize_helper import QuantizeHelper
from transformers import AutoConfig
from transformers import __version__ as transformers_version
from onnxruntime import __version__ as ort_version
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument(
"--model_class",
required=False,
type=str,
default="GPT2LMHeadModel",
choices=list(MODEL_CLASSES.keys()),
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--cache_dir",
required=False,
type=str,
default=os.path.join(".", "cache_models"),
help="Directory to cache pre-trained models",
)
parser.add_argument(
"--output",
required=False,
type=str,
default=os.path.join(".", "onnx_models"),
help="Output directory, or model path ends with .onnx",
)
parser.add_argument(
"-o",
"--optimize_onnx",
required=False,
action="store_true",
help="Use optimizer.py to optimize onnx model",
)
parser.set_defaults(optimize_onnx=False)
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument(
"--provider",
required=False,
default=None,
choices=["dml", "rocm", "migraphx", "cuda", "tensorrt"],
help="use dml, rocm, cuda, tensorrt or migraphx for respective backend",
)
parser.add_argument(
"--tolerance",
required=False,
type=float,
default=0,
help="the absolute and relative tolerance for parity verification",
)
parser.add_argument(
"--input_test_file",
"-i",
required=False,
type=str,
default="",
help="Path to the file with inputs to test with",
)
parser.add_argument(
"-p",
"--precision",
required=False,
type=Precision,
default=Precision.FLOAT32,
choices=list(Precision),
help="Precision of model to run. fp32 for full precision, fp16 for half or mixed precision, and int8 for quantization",
)
parser.add_argument(
"-t",
"--test_cases",
required=False,
type=int,
default=1000,
help="Number of test cases per run for parity",
)
parser.add_argument(
"-r",
"--test_runs",
required=False,
type=int,
default=10,
help="Number of runs for parity. It is used for significance test.",
)
parser.add_argument("--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true")
parser.set_defaults(use_external_data_format=False)
parser.add_argument("--overwrite", required=False, action="store_true")
parser.set_defaults(overwrite=False)
parser.add_argument(
"--use_int64_inputs",
required=False,
action="store_true",
help="Use int32 instead of int64 for input_ids, position_ids and attention_mask.",
)
parser.set_defaults(use_int64_inputs=False)
parser.add_argument(
"-s",
"--stage",
type=int,
default=0,
required=False,
choices=[0, 1, 2],
help="Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). "
"1 - decode the first token when past_sequence_length is zero; "
"2 - decode the remaining tokens when past_sequence_length is not zero; "
"0 - one onnx model for both stages 1 and 2. "
"Note that we will optimize 1 and 2 differently for best performance.",
)
fp16_option_group = parser.add_argument_group(
'float to float16 conversion parameters that work when "--precision fp16" is specified'
)
fp16_option_group.add_argument(
"-a",
"--auto_mixed_precision",
required=False,
action="store_true",
help="Convert to mixed precision automatically. Other float16 conversion parameters will be ignored.",
)
fp16_option_group.set_defaults(auto_mixed_precision=False)
fp16_option_group.add_argument(
"--keep_io_types",
required=False,
action="store_true",
help="Use float32 for past inputs, present and logits outputs.",
)
fp16_option_group.set_defaults(keep_io_types=False)
fp16_option_group.add_argument(
"--io_block_list",
nargs="+",
default=[],
help="List of inputs or outputs in float32 instead of float16",
)
fp16_option_group.add_argument(
"--op_block_list",
nargs="+",
default=[],
help="List of operators (like Add LayerNormalization SkipLayerNormalization EmbedLayerNormalization FastGelu) "
"to compute in float32 instead of float16.",
)
fp16_option_group.add_argument(
"--node_block_list",
nargs="+",
default=[],
help="List of node names to compute in float32 instead of float16.",
)
fp16_option_group.add_argument(
"--force_fp16_initializers",
required=False,
action="store_true",
help="Convert all float initializers to float16.",
)
fp16_option_group.set_defaults(force_fp16_initializers=False)
args = parser.parse_args(argv)
return args
def get_onnx_model_size(onnx_path: str, use_external_data_format: bool):
if not use_external_data_format:
return os.path.getsize(onnx_path)
else:
return sum([f.stat().st_size for f in Path(onnx_path).parent.rglob("*")])
def get_latency_name(batch_size, sequence_length, past_sequence_length):
return f"average_latency(batch_size={batch_size},sequence_length={sequence_length},past_sequence_length={past_sequence_length})"
def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename: str = "gpt2_parity_results.csv"):
result = {}
if version.parse(transformers_version) < version.parse(
"3.1.0"
): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
args = parse_arguments(argv)
setup_logger(args.verbose)
if not experiment_name:
experiment_name = " ".join(argv if argv else sys.argv[1:])
if args.tolerance == 0:
args.tolerance = DEFAULT_TOLERANCE[args.precision]
logger.info(f"Arguments:{args}")
cache_dir = args.cache_dir
output_dir = args.output if not args.output.endswith(".onnx") else os.path.dirname(args.output)
prepare_environment(cache_dir, output_dir, args.use_gpu)
if args.precision != Precision.FLOAT32:
assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"
if args.precision == Precision.FLOAT16:
assert args.use_gpu, "fp16 requires --use_gpu"
if args.precision == Precision.INT8:
assert not args.use_gpu, "quantization only supports CPU"
model_class = MODEL_CLASSES[args.model_class][0]
use_padding = MODEL_CLASSES[args.model_class][2]
gpt2helper = Gpt2Helper
config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)
device = torch.device("cuda:0" if args.use_gpu else "cpu")
model.eval().to(device)
if (not args.use_external_data_format) and (config.n_layer > 24):
logger.info("Try --use_external_data_format when model size > 2GB")
onnx_model_paths = gpt2helper.get_onnx_paths(
output_dir,
args.model_name_or_path,
args.model_class,
new_folder=(args.precision == Precision.INT8),
remove_existing=["fp32", "fp16", "int8"],
) # Do not remove raw model to save time in parity test
raw_onnx_model = onnx_model_paths["raw"]
int_data_type = torch.int64 if args.use_int64_inputs else torch.int32
if os.path.exists(raw_onnx_model) and not args.overwrite:
logger.warning(f"Skip exporting ONNX model since it existed: {raw_onnx_model}")
else:
logger.info(f"Exporting ONNX model to {raw_onnx_model}")
gpt2helper.export_onnx(
model,
device,
raw_onnx_model,
args.verbose,
args.use_external_data_format,
has_position_ids=use_padding,
has_attention_mask=use_padding,
input_ids_dtype=int_data_type,
position_ids_dtype=int_data_type,
attention_mask_dtype=int_data_type,
)
fp16_params = {"keep_io_types": args.keep_io_types}
if args.io_block_list:
fp16_params["keep_io_types"] = args.io_block_list
if args.node_block_list:
fp16_params["node_block_list"] = args.node_block_list
if args.op_block_list:
fp16_params["op_block_list"] = args.op_block_list
if args.force_fp16_initializers:
fp16_params["force_fp16_initializers"] = args.force_fp16_initializers
is_io_float16 = args.precision == Precision.FLOAT16 and not args.keep_io_types
optimized_ops = ""
all_ops = ""
if args.optimize_onnx or args.precision != Precision.FLOAT32:
output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else "fp32"]
logger.info(f"Optimizing model to {output_path}")
m = gpt2helper.optimize_onnx(
raw_onnx_model,
output_path,
args.precision == Precision.FLOAT16,
model.config.num_attention_heads,
model.config.hidden_size,
args.use_external_data_format,
auto_mixed_precision=args.auto_mixed_precision,
stage=args.stage,
**fp16_params,
)
nodes = m.nodes()
op_list = {node.op_type for node in nodes}
all_ops = ",".join(op_list)
# print optimized operators
optimized_op_counter = m.get_fused_operator_statistics()
if optimized_op_counter:
optimized_ops = ",".join([key for key in optimized_op_counter if optimized_op_counter[key] > 0])
else:
output_path = raw_onnx_model
if args.precision == Precision.INT8:
logger.info("quantizing model...")
QuantizeHelper.quantize_onnx_model(output_path, onnx_model_paths["int8"], args.use_external_data_format)
model = QuantizeHelper.quantize_torch_model(model)
logger.info("finished quantizing model")
output_path = onnx_model_paths["int8"]
if args.output.endswith(".onnx") and output_path != args.output and not args.use_external_data_format:
shutil.move(output_path, args.output)
output_path = args.output
logger.info(f"Output path: {output_path}")
model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) # noqa: N806
session = create_onnxruntime_session(
output_path, args.use_gpu, args.provider, enable_all_optimization=True, verbose=args.verbose
)
if args.model_class == "GPT2LMHeadModel" and session is not None:
parity_result = gpt2helper.test_parity(
session,
model,
device,
is_io_float16,
rtol=args.tolerance,
atol=args.tolerance,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding,
input_ids_dtype=int_data_type,
position_ids_dtype=int_data_type,
attention_mask_dtype=int_data_type,
test_cases_per_run=args.test_cases,
total_runs=args.test_runs,
stage=args.stage,
verbose=args.verbose,
)
# An example configuration for testing performance
batch_size = 8
sequence_length = 32 if args.stage == 1 else 1
past_sequence_length = 0 if args.stage == 1 else 32
latency = gpt2helper.test_performance(
session,
model,
device,
is_io_float16,
total_runs=100,
use_io_binding=True,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding,
input_ids_dtype=int_data_type,
position_ids_dtype=int_data_type,
attention_mask_dtype=int_data_type,
batch_size=batch_size,
sequence_length=sequence_length,
past_sequence_length=past_sequence_length,
)
if args.precision == Precision.FLOAT16:
logger.info(f"fp16 conversion parameters:{fp16_params}")
# Write results to file
latency_name = get_latency_name(batch_size, sequence_length, past_sequence_length)
csv_file_existed = os.path.exists(csv_filename)
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = [
"experiment",
"run_id",
"model_name",
"model_class",
"stage",
"gpu",
"precision",
"optimizer",
"test_cases",
"runs",
"keep_io_types",
"io_block_list",
"op_block_list",
"node_block_list",
"force_fp16_initializers",
"auto_mixed_precision",
"optimized_operators",
"operators",
"environment_variables",
"onnxruntime",
latency_name,
"top1_match_rate",
"onnx_size_in_MB",
"diff_50_percentile",
"diff_90_percentile",
"diff_95_percentile",
"diff_99_percentile",
"diff_pass_rate",
"nan_rate",
"top1_match_rate_per_run",
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
if not csv_file_existed:
csv_writer.writeheader()
row = {
"experiment": experiment_name,
"run_id": run_id,
"model_name": args.model_name_or_path,
"model_class": args.model_class,
"stage": args.stage,
"gpu": args.use_gpu,
"precision": args.precision,
"optimizer": args.optimize_onnx,
"test_cases": args.test_cases,
"runs": args.test_runs,
"keep_io_types": args.keep_io_types,
"io_block_list": args.io_block_list,
"op_block_list": args.op_block_list,
"node_block_list": args.node_block_list,
"force_fp16_initializers": args.force_fp16_initializers,
"auto_mixed_precision": args.auto_mixed_precision,
"optimized_operators": optimized_ops,
"operators": all_ops,
"environment_variables": get_ort_environment_variables(),
"onnxruntime": ort_version,
latency_name: f"{latency:.2f}",
"diff_50_percentile": parity_result["max_diff_percentile_50"],
"diff_90_percentile": parity_result["max_diff_percentile_90"],
"diff_95_percentile": parity_result["max_diff_percentile_95"],
"diff_99_percentile": parity_result["max_diff_percentile_99"],
"diff_pass_rate": parity_result["diff_pass_rate"],
"nan_rate": parity_result["nan_rate"],
"top1_match_rate": parity_result["top1_match_rate"],
"top1_match_rate_per_run": parity_result["top1_match_rate_per_run"],
"onnx_size_in_MB": f"{model_size_in_MB}",
}
logger.info(f"result: {row}")
result.update(row)
csv_writer.writerow(row)
if args.input_test_file:
test_inputs = []
# Each line of test file is a JSON string like:
# {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
with open(args.input_test_file) as read_f:
for _, line in enumerate(read_f):
line = line.rstrip() # noqa: PLW2901
data = json.loads(line)
input_ids = torch.from_numpy(numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device)
if use_padding:
if "attention_mask" in data:
numpy_float = numpy.float16 if is_io_float16 else numpy.float32
attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(
device
)
else:
padding = -1
attention_mask = (input_ids != padding).type(torch.float16 if is_io_float16 else torch.float32)
input_ids.masked_fill_(input_ids == padding, 0)
if "position_ids" in data:
position_ids = torch.from_numpy(numpy.asarray(data["position_ids"], dtype=numpy.int64)).to(
device
)
else:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(position_ids < 0, 0)
inputs = {
"input_ids": input_ids.to(int_data_type),
"position_ids": position_ids.to(int_data_type),
"attention_mask": attention_mask.to(int_data_type),
}
else:
inputs = {"input_ids": input_ids.to(int_data_type)}
test_inputs.append(inputs)
Gpt2Tester.test_generation(
session,
model,
device,
test_inputs,
precision=args.precision,
model_class=args.model_class,
top_k=20,
top_k_no_order=True,
max_steps=24,
max_inputs=0,
verbose=args.verbose,
save_test_data=3,
save_test_data_dir=Path(output_path).parent,
)
logger.info(f"Done. Output model: {output_path}")
return result
if __name__ == "__main__":
main()

File diff suppressed because it is too large.


@@ -0,0 +1,513 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script tries different mixed-precision conversion configurations for the GPT-2 model, and
# measures the inference latency, top-1 match rate (compared to the PyTorch FP32 model) and ONNX model size.
# It outputs a csv file with Mann-Whitney U test and t-test results for each pair of experiments, where
# p-value < 0.05 means the two experiments differ significantly in top-1 match rate.
# Use this script to select the best mixed-precision model according to these metrics.
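# Example invocation (assuming this file is saved as gpt2_parity.py; the file name is not shown in this diff):
#   python gpt2_parity.py -m gpt2 --use_gpu --all --csv gpt2_parity_results.csv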
import argparse
import csv
import datetime
import json
import logging
import os
import onnx
import scipy.stats
from benchmark_helper import get_ort_environment_variables, setup_logger
from convert_to_onnx import main
from gpt2_helper import PRETRAINED_GPT2_MODELS, Gpt2Helper
from onnx_model import OnnxModel
logger = logging.getLogger("")
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model_name_or_path",
required=True,
type=str,
help="Model path, or pretrained model name in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
)
parser.add_argument(
"--csv",
required=False,
type=str,
default="gpt2_parity_results.csv",
help="path of csv file to save the result",
)
parser.add_argument(
"--test_cases",
required=False,
type=int,
default=500,
help="number of test cases per run",
)
parser.add_argument("--runs", required=False, type=int, default=40, help="number of repeated runs")
parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
parser.set_defaults(use_gpu=False)
parser.add_argument(
"--all",
required=False,
action="store_true",
help="run all combinations of mixed precision",
)
parser.set_defaults(all=False)
parser.add_argument("-e", "--use_external_data_format", required=False, action="store_true")
parser.set_defaults(use_external_data_format=False)
parser.add_argument("--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--skip_test",
required=False,
action="store_true",
help="do not run test, and only rank experiments based on existing csv file",
)
parser.set_defaults(skip_test=False)
parser.add_argument(
"--overwrite",
required=False,
action="store_true",
help="Overwrite existing csv file",
)
parser.set_defaults(overwrite=False)
args = parser.parse_args(argv)
return args
class ParityTask:
def __init__(self, test_cases, total_runs, csv_path):
self.total_runs = total_runs
self.test_cases = test_cases
self.csv_path = csv_path
self.results = []
self.run_id = 0
def run(self, argv, experiment_name):
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
run_id = f"{start_time}_{self.run_id}"
self.run_id += 1
try:
result = main(
[*argv, "-t", f"{self.test_cases}", "-r", f"{self.total_runs}"],
experiment_name=experiment_name,
run_id=run_id,
csv_filename=self.csv_path,
)
if result:
self.results.append(result)
except Exception:
logger.exception(f"Failed to run experiment {experiment_name}")
result = None
return result
def load_results_from_csv(csv_path):
rows = []
import csv
with open(csv_path, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
rows.append(row) # noqa: PERF402
return rows
def get_latency(row):
for name in row:
if name.startswith("average_latency(batch_size="):
return float(row[name])
raise RuntimeError("Failed to get average_latency from output")
def score(row):
"""Scoring function based on 3 metrics. The larger score is better."""
latency_in_ms = get_latency(row)
top1_match_rate = float(row["top1_match_rate"])
onnx_size_in_MB = float(row["onnx_size_in_MB"]) # noqa: N806
# A simple scoring function: cost of 0.1ms latency ~ 0.1% match rate ~ 100MB size
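# Worked example: top1_match_rate=0.99, latency=5 ms, size=500 MB
#   -> 0.99 * 1000 - 5 * 10 - 500 / 100 = 990 - 50 - 5 = 935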
return top1_match_rate * 1000 - latency_in_ms * 10 - onnx_size_in_MB / 100
def print_wins(wins, rows, test_name):
print()
print("*" * 10)
row_map = {}
for row in rows:
row_map[row["run_id"]] = row
sorted_wins = dict(
sorted(
wins.items(),
key=lambda item: (item[1], score(row_map[item[0]])),
reverse=True,
)
)
logger.debug(f"{test_name} Wins:{sorted_wins}")
logger.info(f"Based on {test_name} wins and a scoring function, the ranking:")
rank = 0
previous_value = -1
for count, (key, value) in enumerate(sorted_wins.items()):
if value != previous_value:
rank = count
previous_value = value
for row in rows:
if row["run_id"] == key:
logger.info(
"{:02d}: WINs={:02d}, run_id={}, latency={:5.2f}, top1_match={:.4f}, size={}_MB, experiment={}, {}".format( # noqa: G001
rank,
value,
key,
get_latency(row),
float(row["top1_match_rate"]),
row["onnx_size_in_MB"],
row["experiment"],
get_ort_environment_variables(),
)
)
break
def run_significance_test(rows, output_csv_path):
"""Run U test and T test."""
utest_wins = {}
ttest_wins = {}
for row in rows:
run_id = row["run_id"]
utest_wins[run_id] = 0
ttest_wins[run_id] = 0
with open(output_csv_path, "w", newline="") as csvfile:
column_names = [
"model_name",
"run_id_1",
"experiment_1",
"top1_match_rate_1",
"run_id_2",
"experiment_2",
"top1_match_rate_2",
"U_statistic",
"U_pvalue",
"T_statistic",
"T_pvalue",
]
writer = csv.DictWriter(csvfile, fieldnames=column_names)
writer.writeheader()
required_match_columns = ["model_name", "test_cases", "runs"]
num_results = len(rows)
for i in range(num_results - 1):
result1 = rows[i]
if isinstance(result1["top1_match_rate_per_run"], str):
a = json.loads(result1["top1_match_rate_per_run"])
else:
a = result1["top1_match_rate_per_run"]
for j in range(i + 1, num_results, 1):
result2 = rows[j]
all_matched = True
for column in required_match_columns:
if result1[column] != result2[column]:
all_matched = False
break
if not all_matched:
continue
if isinstance(result2["top1_match_rate_per_run"], str):
b = json.loads(result2["top1_match_rate_per_run"])
else:
b = result2["top1_match_rate_per_run"]
try:
utest_statistic, utest_pvalue = scipy.stats.mannwhitneyu(
a, b, use_continuity=True, alternative="two-sided"
) # TODO: shall we use one-sided: less or greater according to "top1_match_rate"
except ValueError: # ValueError: All numbers are identical in mannwhitneyu
utest_statistic = None
utest_pvalue = None
ttest_statistic, ttest_pvalue = scipy.stats.ttest_ind(a, b, axis=None, equal_var=True)
if utest_pvalue is not None and utest_pvalue < 0.05:
if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]):
utest_wins[result1["run_id"]] += 1
else:
utest_wins[result2["run_id"]] += 1
if ttest_pvalue < 0.05:
if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]):
ttest_wins[result1["run_id"]] += 1
else:
ttest_wins[result2["run_id"]] += 1
row = {
"model_name": result1["model_name"],
"run_id_1": result1["run_id"],
"experiment_1": result1["experiment"],
"top1_match_rate_1": float(result1["top1_match_rate"]),
"run_id_2": result2["run_id"],
"experiment_2": result2["experiment"],
"top1_match_rate_2": float(result2["top1_match_rate"]),
"U_statistic": utest_statistic,
"U_pvalue": utest_pvalue,
"T_statistic": ttest_statistic,
"T_pvalue": ttest_pvalue,
}
writer.writerow(row)
logger.info(f"U-Test and T-Test results are output to {output_csv_path}")
print_wins(utest_wins, rows, "U-Test")
print_wins(ttest_wins, rows, "T-Test")
def get_last_matmul_node_name(raw_onnx_model: str):
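# The node that produces the first graph output (logits) is expected to be the final MatMul (the LM head
# projection). Its name is passed to --node_block_list later so that this node stays in FP32 in mixed-precision runs.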
model = onnx.load(raw_onnx_model)
onnx_model = OnnxModel(model)
output_name_to_node = onnx_model.output_name_to_node()
assert model.graph.output[0].name in output_name_to_node
node = output_name_to_node[model.graph.output[0].name]
if node.op_type == "MatMul":
logger.info(f"Found last MatMul node for logits: {node.name}")
return node.name
logger.warning(f"Failed to find MatMul node for logits. Found {node.op_type} of node {node.name}")
return None
def get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list):
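# Build convert_to_onnx arguments for the mixed-precision baseline: fp16 overall, but the logits output and
# the last MatMul node are kept in fp32, plus any operator types given in op_block_list.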
model = args.model_name_or_path
parameters = f"-m {model} -o --use_gpu -p fp16".split()
if args.use_external_data_format:
parameters.append("--use_external_data_format")
parameters += [
"--io_block_list",
"logits",
"--node_block_list",
last_matmul_node_name,
]
if op_block_list:
parameters.extend(["--op_block_list", *op_block_list])
return parameters
def run_candidate(
task: ParityTask,
args,
last_matmul_node_name,
op_block_list=["FastGelu", "LayerNormalization"], # noqa: B006
):
parameters = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list)
op_block_list_str = ",".join(sorted(op_block_list))
if op_block_list:
name = f"Mixed precision baseline + {op_block_list_str} in FP32"
else:
name = f"Mixed precision baseline (logits output and last MatMul node {last_matmul_node_name} in FP32)"
env_vars = get_ort_environment_variables()
if env_vars:
name = name + f" ({env_vars})"
task.run(parameters, name)
def get_baselines(args):
model = args.model_name_or_path
fp32_baseline = f"-m {model} -o -p fp32".split()
if args.use_gpu:
fp32_baseline.append("--use_gpu")
if args.use_external_data_format:
fp32_baseline.append("--use_external_data_format")
fp16_baseline = f"-m {model} -o --use_gpu -p fp16".split()
if args.use_external_data_format:
fp16_baseline.append("--use_external_data_format")
return fp32_baseline, fp16_baseline
def run_tuning_step0(task, fp16_baseline, all_ops, optimized_ops):
"""Step 0 is to check which operator in FP16 causes most loss"""
fp32_logits = ["--io_block_list", "logits"]
task.run(fp16_baseline + fp32_logits, "FP16 except logits")
fp32_io = ["--keep_io_types"]
task.run(fp16_baseline + fp32_io, "Graph I/O FP32, Other FP16")
# Only weights in FP16
task.run(
fp16_baseline + fp32_io + ["--op_block_list"] + [o for o in all_ops] + ["--force_fp16_initializers"],
"FP32 except weights in FP16",
)
optimized_ops_results = []
op_list = optimized_ops
for op in op_list:
op_block_list = ["--op_block_list"] + [o for o in op_list if o != op]
result = task.run(fp16_baseline + fp32_io + op_block_list, f"FP32 except {op} in FP16")
if result:
optimized_ops_results.append(result)
# Check which optimized operator causes the most loss in precision
min_result = min(optimized_ops_results, key=lambda y: y["top1_match_rate"])
print("step 0: optimized operator causes the most loss in precision", min_result)
def run_tuning_step1(task, mixed_precision_baseline, optimized_ops):
"""Step 1 is to figure out which optimized operator in FP32 could benefit most"""
for op in optimized_ops:
op_block_list = ["--op_block_list", op]
task.run(
mixed_precision_baseline + op_block_list,
f"Mixed precision baseline + {op} in FP32",
)
def run_tuning_step2(task, mixed_precision_baseline, optimized_ops):
"""Assumed that you have run step 0 and 1 to figure out that Logits FP32 and some operators shall be in FP32,
This step will try add one more operator.
"""
candidate_fp32_ops = ["FastGelu", "LayerNormalization", "SkipLayerNormalization"]
fp32_ops = [x for x in candidate_fp32_ops if x in optimized_ops]
for op in optimized_ops:
if op not in fp32_ops:
op_block_list = [*fp32_ops, op]
task.run(
[*mixed_precision_baseline, "--op_block_list", *op_block_list],
"Mixed precision baseline + {},{} in FP32".format(",".join(fp32_ops), op),
)
def run_parity(task: ParityTask, args):
onnx_model_paths = Gpt2Helper.get_onnx_paths(
"onnx_models",
args.model_name_or_path,
new_folder=args.use_external_data_format,
remove_existing=[],
)
fp32_baseline, fp16_baseline = get_baselines(args)
result = task.run(fp32_baseline, "FP32 baseline")
optimized_ops = []
if result and ("optimized_operators" in result) and result["optimized_operators"]:
optimized_ops = result["optimized_operators"].split(",")
else:
raise RuntimeError("Failed to get optimized operators")
all_ops = []
if result and ("operators" in result) and result["operators"]:
all_ops = result["operators"].split(",")
else:
raise RuntimeError("Failed to get operators")
# The following fp16 tests require GPU
if not args.use_gpu:
logger.info("skip mixed precision since --use_gpu is not specified")
return
task.run(fp16_baseline, "FP16 baseline")
last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"])
# Mixed precision baseline
run_candidate(task, args, last_matmul_node_name, op_block_list=[])
def get_fp32_ops(x):
return [op for op in x if op in all_ops]
if args.all:
run_tuning_step0(task, fp16_baseline, all_ops, optimized_ops)
mixed_precision_baseline = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list=[])
run_tuning_step1(task, mixed_precision_baseline, optimized_ops)
run_tuning_step2(task, mixed_precision_baseline, optimized_ops)
else:
run_candidate(
task,
args,
last_matmul_node_name,
op_block_list=get_fp32_ops(["SkipLayerNormalization", "LayerNormalization", "Add"]),
)
run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu"])
# Run a few good candidates
run_candidate(
task,
args,
last_matmul_node_name,
op_block_list=get_fp32_ops(["FastGelu", "SkipLayerNormalization", "LayerNormalization", "Add"]),
)
run_candidate(
task,
args,
last_matmul_node_name,
op_block_list=get_fp32_ops(
["FastGelu", "EmbedLayerNormalization", "SkipLayerNormalization", "LayerNormalization", "Add"]
),
)
if __name__ == "__main__":
args = parse_arguments()
setup_logger(args.verbose)
if args.test_cases < 100 or args.runs < 20 or args.test_cases * args.runs < 10000:
logger.warning(
"Not enough test cases or runs to get stable results or test significance. "
"Recommend test_cases >= 100, runs >= 20, test_cases * runs >= 10000."
)
if os.path.exists(args.csv) and not args.skip_test:
if not args.overwrite:
raise RuntimeError(
f"Output file {args.csv} existed. Please remove the file, or use either --skip_test or --overwrite."
)
else:
logger.info("Remove existing file %s since --overwrite is specified", args.csv)
os.remove(args.csv)
task = ParityTask(args.test_cases, args.runs, args.csv)
if not args.skip_test:
run_parity(task, args)
try:
rows = load_results_from_csv(task.csv_path)
except Exception:
logger.exception(f"Failed to load csv {task.csv_path}")
rows = task.results
logger.info("Start running significance tests...")
summary_csv = task.csv_path.replace(".csv", ".stats.csv")
run_significance_test(rows, summary_csv)


@@ -0,0 +1,501 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script helps evaluate the GPT-2 model.
import logging
import math
import os
import statistics
import timeit
import numpy
import torch
from benchmark_helper import Precision
from gpt2_helper import Gpt2Helper, Gpt2Inputs
logger = logging.getLogger(__name__)
class Gpt2Metric:
def __init__(self, treatment_name, baseline_name="Torch", top_k=20):
assert top_k > 1 and top_k <= 100
self.baseline = baseline_name
self.treatment = treatment_name
self.name: str = f"{treatment_name} vs {baseline_name}"
self.top_k = top_k
self.top_1_error: int = 0
self.top_k_error: int = 0
self.total_samples: int = 0
self.max_logits_diff: float = 0 # for non-empty past state
self.max_logits_diff_no_past: float = 0 # for empty past state
self.batch_top1_error: torch.FloatTensor = None # top 1 error for current batch
self.batch_topk_error: torch.FloatTensor = None # top k error for current batch
self.seq_len_latency = {}
def print(self):
if self.baseline != self.treatment:
print("---")
print(f"Metrics for {self.treatment} (baseline={self.baseline}):")
if self.total_samples > 0:
top_1_error_rate = 100.0 * self.top_1_error / self.total_samples
top_k_error_rate = 100.0 * self.top_k_error / self.total_samples
print(
f"Total={self.total_samples} Top1Error={self.top_1_error} ({top_1_error_rate:.2f}%) Top{self.top_k}Error={self.top_k_error} ({top_k_error_rate:.2f}%)"
)
print("Max logits diffs:")
print(f"\twith past = {self.max_logits_diff:.6f}")
print(f"\tempty past = {self.max_logits_diff_no_past:.6f}")
else:
print(f"Metrics for {self.treatment} (baseline):")
if self.seq_len_latency:
print("Past sequence length range and average latency:")
total = 0
count = 0
for key in sorted(self.seq_len_latency.keys()):
average = statistics.mean(self.seq_len_latency[key]) * 1000.0
if key == 0:
print(f"\t{key}: \t{average:.2f} ms")
else:
print(f"\t[{2**key}, {2 ** (key + 1) - 1}]:\t{average:.2f} ms")
total += average * len(self.seq_len_latency[key])
count += len(self.seq_len_latency[key])
print(f"Average Latency: {total / count:.2f} ms")
def diff_logits(self, baseline_logits, treatment_logits, is_empty_past: bool):
diff = (baseline_logits - treatment_logits).abs().max()
if is_empty_past:
self.max_logits_diff_no_past = max(self.max_logits_diff_no_past, diff)
else:
self.max_logits_diff = max(self.max_logits_diff, diff)
return diff
def start_batch(self, batch_size: int):
self.total_samples += batch_size
self.batch_top1_error = torch.zeros((batch_size, 1), dtype=torch.bool)
self.batch_topk_error = torch.zeros((batch_size, 1), dtype=torch.bool)
def eval_batch(self, baseline, treatment, past_seq_len, verbose=True):
self._eval_topk(baseline.top_1_tokens, treatment.top_1_tokens, 1, verbose)
self._eval_topk(baseline.top_k_tokens, treatment.top_k_tokens, self.top_k, verbose)
max_diff = self.diff_logits(baseline.logits, treatment.logits, past_seq_len == 0)
if verbose:
print(f"Max logits diffs of {self.name}: {max_diff}")
def _eval_topk(self, baseline_topk, treatment_topk, top_k, verbose=True):
if not torch.all(torch.eq(baseline_topk, treatment_topk)):
if top_k == 1:
if verbose:
print(f"Generated tokens not matched for {self.name}")
self.batch_top1_error |= torch.eq(baseline_topk, treatment_topk).logical_not()
else:
if verbose:
print(
f"Top {top_k} tokens not matched for {self.name}. This will lead to wrong beam search results"
)
self.batch_topk_error |= (
torch.eq(baseline_topk, treatment_topk).logical_not().sum(1).unsqueeze(dim=1) > 0
)
def end_batch(self):
self.top_1_error += self.batch_top1_error.sum()
self.top_k_error += self.batch_topk_error.sum()
def add_latency(self, past_seq_len, latency):
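# Bucket latency by past length: key 0 is the empty past; key k covers past lengths in [2**(k-1), 2**k - 1].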
key = int(math.log2(past_seq_len)) + 1 if past_seq_len > 0 else 0
if key not in self.seq_len_latency:
self.seq_len_latency[key] = []
self.seq_len_latency[key].append(latency)
class Gpt2Tester:
def __init__(
self,
input_ids,
position_ids,
attention_mask,
num_attention_heads,
hidden_size,
num_layer,
device,
is_fp16=False,
top_k=20,
top_k_required_order=False,
):
self.batch_size = input_ids.shape[0]
self.input_length = input_ids.shape[1]
self.n_layer = num_layer
self.input_ids = input_ids
self.position_ids = position_ids
self.attention_mask = attention_mask
self.has_position_ids = position_ids is not None
self.has_attention_mask = attention_mask is not None
# Empty past state for first inference
self.past = []
past_shape = [
2,
self.batch_size,
num_attention_heads,
0,
hidden_size // num_attention_heads,
]
for _i in range(num_layer):
empty_past = torch.empty(past_shape).type(torch.float16 if is_fp16 else torch.float32)
self.past.append(empty_past.to(device))
self.logits = None
self.top_1_tokens = None
self.top_k_tokens = None
self.top_k = top_k
self.top_k_required_order = top_k_required_order
def get_inputs(self) -> Gpt2Inputs:
return Gpt2Inputs(self.input_ids, self.position_ids, self.attention_mask, self.past)
def save_test_data(self, session, output, save_test_data_dir, test_case_id):
from onnx import numpy_helper
path = os.path.join(save_test_data_dir, "test_data_set_" + str(test_case_id))
if os.path.exists(path):
print(f"Directory {path} existed. Skip saving test data")
return
os.makedirs(path, exist_ok=True)
def add_tensor(input_tensors, torch_tensor, name):
input_tensors.append(numpy_helper.from_array(torch_tensor.clone().cpu().numpy(), name))
input_tensors = []
add_tensor(input_tensors, self.input_ids, "input_ids")
if self.has_position_ids:
add_tensor(input_tensors, self.position_ids, "position_ids")
if self.has_attention_mask:
add_tensor(input_tensors, self.attention_mask, "attention_mask")
for i in range(self.n_layer):
add_tensor(input_tensors, self.past[i], "past_" + str(i))
for i, tensor in enumerate(input_tensors):
with open(os.path.join(path, f"input_{i}.pb"), "wb") as f:
f.write(tensor.SerializeToString())
output_names = [output.name for output in session.get_outputs()]
for i, _name in enumerate(output_names):
tensor = numpy_helper.from_array(
output[i] if isinstance(output[i], numpy.ndarray) else output[i].clone().cpu().numpy()
)
with open(os.path.join(path, f"output_{i}.pb"), "wb") as f:
f.write(tensor.SerializeToString())
print(f"Test data saved to directory {path}")
def update(self, output, step, device):
"""
Update the inputs for next inference.
"""
self.logits = (
torch.from_numpy(output[0]) if isinstance(output[0], numpy.ndarray) else output[0].clone().detach().cpu()
)
self.top_1_tokens = Gpt2Tester.predict_next_token(self.logits)
self.top_k_tokens = Gpt2Tester.predict_next_token(self.logits, self.top_k, self.top_k_required_order)
self.input_ids = self.top_1_tokens.clone().detach().reshape([self.batch_size, 1]).to(device)
if self.has_position_ids:
self.position_ids = (
torch.tensor([self.input_length + step - 1]).unsqueeze(0).repeat(self.batch_size, 1).to(device)
)
if self.has_attention_mask:
self.attention_mask = torch.cat(
[
self.attention_mask,
torch.ones([self.batch_size, 1]).type_as(self.attention_mask),
],
1,
).to(device)
self.past = []
if isinstance(output[1], tuple): # past in torch output is tuple
self.past = list(output[1])
else:
for i in range(self.n_layer):
past_i = (
torch.from_numpy(output[i + 1])
if isinstance(output[i + 1], numpy.ndarray)
else output[i + 1].clone().detach()
)
self.past.append(past_i.to(device))
def diff(self, baseline):
"""
Compare inputs and logits output.
"""
print("start diff...")
if self.logits is not None:
max_io_diff = (self.logits - baseline.logits).abs().max()
if max_io_diff > 1e-4:
print(f"Max logits difference is too large: {max_io_diff}")
if not torch.all(self.input_ids == baseline.input_ids):
print("Input_ids is different", self.input_ids, baseline.input_ids)
if self.has_position_ids:
if not torch.all(self.position_ids == baseline.position_ids):
print(
"position_ids is different",
self.position_ids,
baseline.position_ids,
)
if self.has_attention_mask:
if not torch.all(self.attention_mask == baseline.attention_mask):
print(
"attention_mask is different",
self.attention_mask,
baseline.attention_mask,
)
assert len(self.past) == len(baseline.past)
for i, past_i in enumerate(self.past):
assert past_i.shape == baseline.past[i].shape
if past_i.nelement() > 0:
max_past_diff = (past_i - baseline.past[i]).abs().max()
if max_past_diff > 1e-4:
print(f"max_past_diff[{i}]={max_past_diff}")
@staticmethod
def predict_next_token(logits, top_k=1, required_order=False):
"""
Get top k tokens based on logits.
"""
# logits has shape (batch_size, seq_len, vocab_size)
# last token logits has shape (batch_size, vocab_size)
lastTokenLogits = logits[:, -1] # noqa: N806
if top_k == 1:
generatedTokens = torch.argmax(lastTokenLogits, 1, True) # noqa: N806
return generatedTokens
else:
topk = torch.argsort(lastTokenLogits, -1, descending=True)[:, :top_k]
if not required_order:
sorted_topk, _ = topk.sort()
return sorted_topk
return topk
@staticmethod
def diff_present(onnx_output, onnx_io_output, n_layer):
"""
Compare the present state outputs from two ONNX Runtime runs.
"""
present_diff_max = []
for i in range(n_layer):
onnx_present_i = (
torch.from_numpy(onnx_output[i + 1])
if isinstance(onnx_output[i + 1], numpy.ndarray)
else onnx_output[i + 1]
)
onnx_io_present_i = (
torch.from_numpy(onnx_io_output[i + 1])
if isinstance(onnx_io_output[i + 1], numpy.ndarray)
else onnx_io_output[i + 1]
)
max_diff = (onnx_present_i - onnx_io_present_i).abs().max()
present_diff_max.append(max_diff)
print(f"present_diff_max={present_diff_max}")
@staticmethod
def is_quantized_onnx_model(onnx_model_path):
"""
Returns True if the ONNX model is quantized.
"""
from onnx import load
model = load(onnx_model_path)
from onnxruntime.quantization.quantize import __producer__ as quantize_producer
return model.producer_name == quantize_producer
@staticmethod
def test_generation(
session,
model,
device,
test_inputs,
precision=Precision.FLOAT32,
model_class="Gpt2LMHeadModel",
top_k=20,
top_k_no_order=True,
max_steps=24,
max_inputs=0,
verbose=False,
save_test_data=0,
save_test_data_dir=".",
):
"""
Test generation using greedy search (without sampling) to compare the PyTorch and ONNX models.
It will print top 1 and top k errors on the given test inputs.
"""
print(
f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
)
n_layer = model.config.n_layer
n_head = model.config.n_head
n_embd = model.config.n_embd
eos_token_id = model.config.eos_token_id
test_data_saved = 0
is_float16 = precision == Precision.FLOAT16
if is_float16:
assert "float16" in session.get_outputs()[0].type
# We still use the fp32 torch model as the baseline when the onnx model is fp16
model.eval().to(device)
# Allocate initial buffers for IO Binding of ONNX Runtime. The buffer size will automatically increase later.
init_output_shapes = Gpt2Helper.get_output_shapes(
batch_size=4,
past_sequence_length=128,
sequence_length=32,
config=model.config,
model_class=model_class,
)
output_buffers = Gpt2Helper.get_output_buffers(init_output_shapes, device, is_float16=is_float16)
baseline_name = "Torch"
treatment_name = "Quantized Onnx" if precision == Precision.INT8 else "Onnx"
torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
onnx_io_metric = Gpt2Metric(treatment_name + " with IO Binding", baseline_name, top_k)
for i, inputs in enumerate(test_inputs):
if max_inputs > 0 and i == max_inputs:
break
if i % 10 == 0:
print(f"{i}")
input_ids = inputs["input_ids"]
position_ids = inputs.get("position_ids", None)
attention_mask = inputs.get("attention_mask", None)
onnx_runner = Gpt2Tester(
input_ids,
position_ids,
attention_mask,
n_head,
n_embd,
n_layer,
device,
is_float16,
top_k,
not top_k_no_order,
)
onnx_io_runner = Gpt2Tester(
input_ids,
position_ids,
attention_mask,
n_head,
n_embd,
n_layer,
device,
is_float16,
top_k,
not top_k_no_order,
)
torch_runner = Gpt2Tester(
input_ids,
position_ids,
attention_mask,
n_head,
n_embd,
n_layer,
device,
False,
top_k,
not top_k_no_order,
) # Torch model baseline is fp32
batch_size = torch_runner.batch_size
onnx_metric.start_batch(batch_size)
onnx_io_metric.start_batch(batch_size)
with torch.no_grad():
done = torch.zeros(batch_size, dtype=torch.bool)
for step in range(max_steps):
seq_len = list(onnx_runner.input_ids.size())[1]
past_seq_len = list(onnx_runner.past[0].size())[3]
start_time = timeit.default_timer()
pytorch_output = Gpt2Helper.pytorch_inference(model, torch_runner.get_inputs())
torch_metric.add_latency(past_seq_len, timeit.default_timer() - start_time)
torch_runner.update(pytorch_output, step, device)
onnx_output, avg_latency_ms = Gpt2Helper.onnxruntime_inference(
session, onnx_runner.get_inputs(), total_runs=1
)
onnx_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
onnx_runner.update(onnx_output, step, device)
output_shapes = Gpt2Helper.get_output_shapes(
batch_size,
past_seq_len,
seq_len,
model.config,
model_class=model_class,
)
Gpt2Helper.auto_increase_buffer_size(output_buffers, output_shapes)
(
onnx_io_output,
avg_latency_ms,
) = Gpt2Helper.onnxruntime_inference_with_binded_io(
session,
onnx_io_runner.get_inputs(),
output_buffers,
output_shapes,
total_runs=1,
return_numpy=False,
include_copy_output_latency=True,
)
onnx_io_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
if test_data_saved < save_test_data:
onnx_io_runner.save_test_data(session, onnx_io_output, save_test_data_dir, test_data_saved)
test_data_saved += 1
onnx_io_runner.update(onnx_io_output, step, device)
if verbose:
onnx_runner.diff(onnx_io_runner)
Gpt2Tester.diff_present(onnx_output, onnx_io_output, n_layer)
print("Top 1 tokens:")
print("\tTorch", torch_runner.top_1_tokens)
print("\tONNX", onnx_runner.top_1_tokens)
print("\tONNX with IO binding", onnx_io_runner.top_1_tokens)
onnx_metric.eval_batch(torch_runner, onnx_runner, past_seq_len, verbose=verbose)
onnx_io_metric.eval_batch(torch_runner, onnx_io_runner, past_seq_len, verbose=verbose)
done = done | (torch_runner.top_1_tokens == eos_token_id).any()
if torch.all(done):
break
onnx_metric.end_batch()
onnx_io_metric.end_batch()
torch_metric.print()
onnx_metric.print()
onnx_io_metric.print()


@@ -0,0 +1,146 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script helps debug parity issues between the fp16 and fp32 versions of the same onnx model
# Please build ORT with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON
import math
import multiprocessing
import os
from pathlib import Path
import numpy
import torch
from benchmark_helper import create_onnxruntime_session
from gpt2_helper import Gpt2Helper
from onnx import TensorProto, numpy_helper
NON_ZERO_VALUE = str(1)
ZERO_VALUE = str(0)
def environ_setting_nodes(node_name_filter=None, node_type_filter=None):
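# These ORT_DEBUG_NODE_IO_* variables take effect only in an ORT build with
# onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON (see the note at the top of this file).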
# Set I/O data as default
os.environ["ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA"] = ZERO_VALUE
os.environ["ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA"] = NON_ZERO_VALUE
os.environ["ORT_DEBUG_NODE_IO_DUMP_OUTPUT_DATA"] = NON_ZERO_VALUE
if node_name_filter is not None:
os.environ["ORT_DEBUG_NODE_IO_NAME_FILTER"] = node_name_filter
elif node_type_filter is not None:
os.environ["ORT_DEBUG_NODE_IO_OP_TYPE_FILTER"] = node_type_filter
else:
os.environ["ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK"] = NON_ZERO_VALUE
def environ_setting_paths(output_path):
# Set dumping values to files as default
os.environ["ORT_DEBUG_NODE_IO_DUMP_DATA_DESTINATION"] = "files"
os.environ["ORT_DEBUG_NODE_IO_OUTPUT_DIR"] = output_path
def environ_reset():
for flag in [
"ORT_DEBUG_NODE_IO_DUMP_SHAPE_DATA",
"ORT_DEBUG_NODE_IO_DUMP_INPUT_DATA",
"ORT_DEBUG_NODE_IO_DUMP_OUTPUT_DATA",
"ORT_DEBUG_NODE_IO_NAME_FILTER",
"ORT_DEBUG_NODE_IO_OP_TYPE_FILTER",
"ORT_DEBUG_NODE_IO_DUMP_DATA_TO_FILES",
"ORT_DEBUG_NODE_IO_OUTPUT_DIR",
"ORT_DEBUG_NODE_IO_DUMPING_DATA_TO_FILES_FOR_ALL_NODES_IS_OK",
]:
if flag in os.environ:
del os.environ[flag]
def inference(model_path, dummy_inputs, outputs_path, use_gpu):
environ_reset()
environ_setting_nodes()
environ_setting_paths(outputs_path)
session = create_onnxruntime_session(model_path, use_gpu, enable_all_optimization=False)
Gpt2Helper.onnxruntime_inference(session, dummy_inputs)
def generate_outputs_files(model_path, dummy_inputs, outputs_path, use_gpu):
dir_path = Path(outputs_path)
if dir_path.exists() and dir_path.is_dir():
import shutil
shutil.rmtree(outputs_path)
dir_path.mkdir(parents=True, exist_ok=True)
process = multiprocessing.Process(target=inference, args=(model_path, dummy_inputs, outputs_path, use_gpu))
process.start()
process.join()
def post_processing(outputs_path, outputs_path_other):
# Compare outputs with e.g. fp16 and fp32
record = {}
if_close = {}
import glob
for filename in glob.glob(os.path.join(outputs_path, "*.tensorproto")):
filename_other = os.path.join(outputs_path_other, Path(filename).name)
if not os.path.exists(filename_other):
continue
with open(filename, "rb") as f:
tensor = TensorProto()
tensor.ParseFromString(f.read())
array = numpy_helper.to_array(tensor)
with open(filename_other, "rb") as f: # noqa: PLW2901
tensor_other = TensorProto()
tensor_other.ParseFromString(f.read())
array_other = numpy_helper.to_array(tensor_other)
if array_other.size == 0:
continue
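# Mean relative difference between the two dumped tensors; the 1e-6 term avoids division by zero.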
diff = numpy.average(numpy.abs(array_other - array) / (numpy.abs(array_other) + 1e-6))
if math.isnan(diff):
continue
record[Path(filename).name.split(".")[0]] = diff
if_close[Path(filename).name.split(".")[0]] = numpy.allclose(array, array_other, rtol=1e-04, atol=1e-04)
results = ["Node\tDiff\tClose"]
for k, v in sorted(record.items(), key=lambda x: x[1], reverse=True):
results.append(f"{k}\t{v}\t{if_close[k]}")
for line in results:
print(line)
if __name__ == "__main__":
# Below example shows how to use this helper to investigate parity issue of gpt-2 fp32 and fp16 onnx model
# Please build ORT with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON !!
multiprocessing.set_start_method("spawn")
# Generate Inputs
sequence_length = 8
past_sequence_length = 8
batch_size = 5
dummy_inputs_fp16 = Gpt2Helper.get_dummy_inputs(
batch_size,
past_sequence_length,
sequence_length,
12,
768,
12,
50257,
device=torch.device("cpu"),
float16=True,
)
dummy_inputs_fp32 = dummy_inputs_fp16.to_fp32()
# Get GPT-2 model from huggingface using convert_to_onnx.py
os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu")
os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp16.onnx -o -p fp16 --use_gpu")
# Specify the directory to dump the node's I/O
outputs_path_fp32_gpu = "./fp32_gpu"
outputs_path_fp16_gpu = "./fp16_gpu"
generate_outputs_files("./gpt2_fp32.onnx", dummy_inputs_fp32, outputs_path_fp32_gpu, use_gpu=True)
generate_outputs_files("./gpt2_fp16.onnx", dummy_inputs_fp16, outputs_path_fp16_gpu, use_gpu=True)
# Compare each node's I/O value and sort based on average rtol
post_processing(outputs_path_fp16_gpu, outputs_path_fp32_gpu)