AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Append the directory that contains this file to the module search path.
# NOTE(review): presumably this lets sibling modules be imported by their bare
# names -- confirm against the modules that rely on it.
sys.path.append(os.path.dirname(__file__))
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,40 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# Get/Set cpu affinity. Currently only support part of Unix system
|
||||
import logging
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AffinitySetting:
    """Capture and restore the CPU affinity of the current process.

    The helper relies on ``os.sched_getaffinity`` / ``os.sched_setaffinity``.
    When either function is missing from ``os``, ``is_os_supported`` is False
    and both ``get_affinity`` and ``set_affinity`` do nothing.
    """

    def __init__(self):
        # Process whose affinity is read and written.
        self.pid = os.getpid()
        # Last affinity captured by get_affinity(); None until then.
        self.affinity = None
        self.is_os_supported = hasattr(os, "sched_getaffinity") and hasattr(os, "sched_setaffinity")
        if not self.is_os_supported:
            logger.warning("Current OS does not support os.sched_getaffinity() and os.sched_setaffinity()")

    def get_affinity(self):
        """Store the current affinity of ``self.pid`` in ``self.affinity``."""
        if self.is_os_supported:
            self.affinity = os.sched_getaffinity(self.pid)

    def set_affinity(self):
        """Re-apply the affinity captured by ``get_affinity`` if it has changed.

        Nothing happens when the OS is unsupported or when no affinity has been
        captured yet.
        """
        if not self.is_os_supported:
            return

        # Without a captured value there is nothing to restore; handing None to
        # os.sched_setaffinity would raise TypeError.
        if self.affinity is None:
            return

        current_affinity = os.sched_getaffinity(self.pid)
        if self.affinity != current_affinity:
            logger.warning(
                "Replacing affinity setting %s with %s",
                str(current_affinity),
                str(self.affinity),
            )
            os.sched_setaffinity(self.pid, self.affinity)


if __name__ == "__main__":
    affi_helper = AffinitySetting()
    affi_helper.get_affinity()
    affi_helper.set_affinity()
|
||||
@@ -0,0 +1,942 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Benchmarking the inference of pretrained transformer models.
|
||||
PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
|
||||
One difference is that random input_ids is generated in this benchmark.
|
||||
|
||||
For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.
|
||||
|
||||
Example commands:
|
||||
Export all models to ONNX, optimize and validate them:
|
||||
python benchmark.py -b 0 -o -v -i 1 2 3
|
||||
Run OnnxRuntime on GPU for all models:
|
||||
python benchmark.py -g
|
||||
Run OnnxRuntime on GPU for all models with fp32 optimization:
|
||||
python benchmark.py -g -o
|
||||
Run OnnxRuntime on GPU with fp16 optimization:
|
||||
python benchmark.py -g -o -p "fp16"
|
||||
Run TorchScript on GPU for all models:
|
||||
python benchmark.py -e torchscript -g
|
||||
Run TorchScript on GPU for all models with fp16:
|
||||
python benchmark.py -e torchscript -g -p "fp16"
|
||||
Run ONNXRuntime and TorchScript on CPU for all models with quantization:
|
||||
python benchmark.py -e torchscript onnxruntime -p "int8" -o
|
||||
Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
|
||||
python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm
|
||||
|
||||
It is recommended to use run_benchmark.sh to launch benchmark.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import timeit
|
||||
from datetime import datetime
|
||||
|
||||
import numpy
|
||||
import psutil
|
||||
from benchmark_helper import (
|
||||
ConfigModifier,
|
||||
OptimizerInfo,
|
||||
Precision,
|
||||
create_onnxruntime_session,
|
||||
get_latency_result,
|
||||
inference_ort,
|
||||
inference_ort_with_io_binding,
|
||||
output_details,
|
||||
output_fusion_statistics,
|
||||
output_summary,
|
||||
setup_logger,
|
||||
)
|
||||
from fusion_options import FusionOptions
|
||||
from huggingface_models import MODEL_CLASSES, MODELS
|
||||
from onnx_exporter import (
|
||||
create_onnxruntime_input,
|
||||
export_onnx_model_from_pt,
|
||||
export_onnx_model_from_tf,
|
||||
load_pretrained_model,
|
||||
)
|
||||
from packaging import version
|
||||
from quantize_helper import QuantizeHelper
|
||||
|
||||
logger = logging.getLogger("")

# psutil.cpu_count(logical=False) returns None when the number of physical cores
# cannot be determined. Fall back to the logical core count, and finally to 1, so
# that cpu_count is always a positive integer usable as a thread count.
cpu_count = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) or 1

# Set OMP environment variable before importing onnxruntime or torch.
# A value already present in the environment is left untouched.
os.environ.setdefault("OMP_NUM_THREADS", str(cpu_count))
|
||||
|
||||
import torch # noqa: E402
|
||||
from transformers import AutoConfig, AutoTokenizer, LxmertConfig # noqa: E402
|
||||
|
||||
|
||||
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    enable_arm64_bfloat16_fastmath_mlas_gemm,
    args,
):
    """Export each model to ONNX and benchmark it with ONNX Runtime.

    For every model name and input count, the model is exported (from PyTorch
    and/or TensorFlow depending on ``model_source``), a session is created, and
    inference is timed for each combination of batch size and sequence length.

    Returns:
        A list with one result per benchmarked combination. The list is empty
        when a required execution provider is not available.
    """
    import onnxruntime  # noqa: PLC0415

    results = []

    # A GPU run needs at least one of the GPU execution providers.
    if use_gpu:
        available_providers = onnxruntime.get_available_providers()
        gpu_providers = ("CUDAExecutionProvider", "MIGraphXExecutionProvider", "DmlExecutionProvider")
        if all(name not in available_providers for name in gpu_providers):
            logger.error(
                "Please install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
            )
            return results

    # The tensorrt provider forces the raw model and uses warm-up runs.
    is_tensorrt = provider == "tensorrt"
    warm_up_repeat = 5 if is_tensorrt else 0
    if is_tensorrt:
        optimizer_info = OptimizerInfo.NOOPT
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    exports_from_pt = "pt" in model_source
    exports_from_tf = "tf" in model_source
    device = "cuda" if use_gpu else "cpu"

    for model_name in model_names:
        model_entry = MODELS[model_name]
        all_input_names = model_entry[0]
        model_type = model_entry[3]

        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]
            args.model_type = model_type
            fusion_options = FusionOptions.parse(args)

            # Both exporters take the same positional arguments.
            export_args = (
                model_name,
                model_entry[1],
                model_entry[2],
                model_type,
                model_class,
                config_modifier,
                cache_dir,
                onnx_dir,
                input_names,
                use_gpu,
                precision,
                optimizer_info,
                validate_onnx,
                use_raw_attention_mask,
                overwrite,
                model_fusion_statistics,
                fusion_options,
            )

            if exports_from_pt:
                with torch.no_grad():
                    exported = export_onnx_model_from_pt(*export_args)
                    onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = exported
            if exports_from_tf:
                exported = export_onnx_model_from_tf(*export_args)
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = exported

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
                enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = []
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)

            largest_batch = max(batch_sizes)
            max_last_state_size = numpy.prod(
                [
                    largest_batch,
                    max(sequence_lengths),
                    max(vocab_size, config.hidden_size),
                ]
            )
            max_pooler_size = numpy.prod([largest_batch, config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue

                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if exports_from_pt else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )

                    # Fields shared by every result row of this configuration.
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    if config.model_type in ["vit", "swin"]:
                        input_shape = [batch_size, 3, config.image_size, config.image_size]
                    else:
                        input_shape = [batch_size, sequence_length]
                    logger.info(f"Run onnxruntime on {model_name} with input shape {input_shape}")

                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)

                        # One leading entry, then one entry per output; index 2 of a
                        # "gpt" model uses the pooler size (past state output max size).
                        output_buffer_max_sizes = [max_last_state_size] + [
                            max_pooler_size if (index == 2 and model_type == "gpt") else max_last_state_size
                            for index in range(len(ort_outputs))
                        ]

                        data_type = numpy.longlong if exports_from_pt else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )

                    logger.info(result)
                    results.append(result)

    return results
|
||||
|
||||
|
||||
def run_pytorch(
    use_gpu,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    torchscript,
    torch2,
    cache_dir,
    verbose,
):
    """Benchmark models with PyTorch, TorchScript or ``torch.compile``.

    ``torchscript`` selects ``torch.jit.trace``; otherwise ``torch2`` selects
    ``torch.compile``; otherwise the model is called directly.

    Returns:
        A list with one result dict per successfully timed combination of
        model, batch size and sequence length. The list is empty when a GPU
        run is requested but CUDA is not available.
    """
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
        return results

    torch.set_grad_enabled(False)

    if torchscript:
        engine_name = "torchscript"
    elif torch2:
        engine_name = "torch2"
    else:
        engine_name = "torch"
    device_name = "cuda" if use_gpu else "cpu"

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
        config_modifier.modify(config)
        model = load_pretrained_model(
            model_name,
            config=config,
            cache_dir=cache_dir,
            custom_model_class=model_class,
        )

        is_vision_model = config.model_type in ["vit", "swin"]
        if is_vision_model:
            # These models don't use sequence lengths, so just pick the first sequence length so that the summary still works
            sequence_lengths = [sequence_lengths[0]]
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            max_input_size = tokenizer.model_max_length

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if is_vision_model:
                    image_shape = [batch_size, 3, config.image_size, config.image_size]
                    logger.info(f"Run PyTorch on {model_name} with input shape {image_shape}")
                    pixel_dtype = torch.float16 if precision == Precision.FLOAT16 else torch.float32
                    input_ids = torch.randn(
                        size=(batch_size, 3, config.image_size, config.image_size),
                        dtype=pixel_dtype,
                        device=device,
                    )
                else:
                    if max_input_size is not None and sequence_length > max_input_size:
                        continue

                    logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}")
                    input_ids = torch.randint(
                        low=0,
                        high=config.vocab_size - 1,
                        size=(batch_size, sequence_length),
                        dtype=torch.long,
                        device=device,
                    )

                try:
                    if torchscript:
                        inference = torch.jit.trace(model, input_ids)
                    elif torch2:
                        inference = torch.compile(model)
                    else:
                        inference = model

                    # One untimed call before measuring.
                    inference(input_ids)

                    runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1)  # noqa: B023

                    result = {
                        "engine": engine_name,
                        "version": torch.__version__,
                        "providers": "NA",
                        "device": device_name,
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    # Log the failure, free cached CUDA memory, and move on.
                    logger.exception(e)
                    torch.cuda.empty_cache()

    return results
|
||||
|
||||
|
||||
def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
    """Build a decorator that runs a function eagerly or as a ``tf.function``.

    Args:
        do_eager_mode: When True the decorated function is called directly.
        use_xla: Forwarded to ``tf.function`` as ``experimental_compile``;
            must be False when ``do_eager_mode`` is True.

    Returns:
        A decorator that wraps a callable in the selected mode.
    """
    from functools import wraps  # noqa: PLC0415

    import tensorflow as tf  # noqa: PLC0415

    def run_func(func):
        @wraps(func)
        def run_in_eager_mode(*args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        @tf.function(experimental_compile=use_xla)
        def run_in_graph_mode(*args, **kwargs):
            return func(*args, **kwargs)

        # Graph mode is the default; eager mode is only chosen on an explicit True.
        if do_eager_mode is not True:
            return run_in_graph_mode

        assert use_xla is False, (
            "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
        )
        return run_in_eager_mode

    return run_func
|
||||
|
||||
|
||||
def run_tensorflow(
    use_gpu,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    cache_dir,
    verbose,
):
    """Benchmark models with TensorFlow.

    Returns:
        A list with one result dict per successfully timed combination of
        model, batch size and sequence length. The list is empty when a GPU
        run is requested but TensorFlow was not built with CUDA.

    Raises:
        NotImplementedError: If ``precision`` is FLOAT16 or INT8.
    """
    import random  # noqa: PLC0415

    import tensorflow as tf  # noqa: PLC0415

    results = []

    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], "GPU")
    elif not tf.test.is_built_with_cuda():
        logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
        return results
    else:
        # Restrict TensorFlow to only use the first GPU
        physical_devices = tf.config.list_physical_devices("GPU")
        try:
            first_gpu = physical_devices[0]
            tf.config.set_visible_devices(first_gpu, "GPU")
            tf.config.experimental.set_memory_growth(first_gpu, True)
            tf.distribute.OneDeviceStrategy(device="/gpu:0")
        except RuntimeError as e:
            logger.exception(e)

    if precision in (Precision.FLOAT16, Precision.INT8):
        raise NotImplementedError("Mixed precision is currently not supported.")

    device_name = "cuda" if use_gpu else "cpu"

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
        config_modifier.modify(config)

        model = load_pretrained_model(
            model_name,
            config=config,
            cache_dir=cache_dir,
            custom_model_class=model_class,
            is_tf_model=True,
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        max_input_size = tokenizer.model_max_length

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info(f"Run Tensorflow on {model_name} with input shape {[batch_size, sequence_length]}")

                # Random token ids drawn from [0, vocab_size - 1].
                rng = random.Random()
                token_ids = [rng.randint(0, config.vocab_size - 1) for _ in range(batch_size * sequence_length)]
                input_ids = tf.constant(token_ids, shape=(batch_size, sequence_length), dtype=tf.int32)

                try:
                    # Disable both for better inference perf
                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_forward():
                        return model(input_ids, training=False)  # noqa: B023

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_decoder_forward():
                        return model(input_ids, decoder_input_ids=input_ids, training=False)  # noqa: B023

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def lxmert_forward():
                        feats = tf.random.normal([1, 1, config.visual_feat_dim])  # noqa: B023
                        pos = tf.random.normal([1, 1, config.visual_pos_dim])  # noqa: B023
                        return model(  # noqa: B023
                            input_ids,  # noqa: B023
                            visual_feats=feats,
                            visual_pos=pos,
                            training=False,
                        )

                    # Pick the forward function that matches the model configuration.
                    if config.is_encoder_decoder:
                        inference = encoder_decoder_forward
                    elif isinstance(config, LxmertConfig):
                        inference = lxmert_forward
                    else:
                        inference = encoder_forward

                    # One untimed call before measuring.
                    inference()

                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)  # noqa: B023

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "providers": "NA",
                        "device": device_name,
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    from numba import cuda  # noqa: PLC0415

                    # Reset the current CUDA device after a failed run.
                    device = cuda.get_current_device()
                    device.reset()

    return results
|
||||
|
||||
|
||||
def parse_arguments():
    """Define the benchmark command-line options and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``; it also
        carries the options registered by ``FusionOptions.add_arguments``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Model selection.
    model_names = list(MODELS.keys())
    add(
        "-m",
        "--models",
        required=False,
        nargs="+",
        type=str,
        default=["bert-base-cased", "roberta-base", "gpt2"],
        choices=model_names,
        help=f"Pre-trained models in the list: {', '.join(model_names)}",
    )
    add(
        "--model_source",
        required=False,
        nargs=1,
        type=str,
        default="pt",
        choices=["pt", "tf"],
        help="Export onnx from pt or tf",
    )
    add(
        "--model_class",
        required=False,
        type=str,
        default=None,
        choices=list(MODEL_CLASSES),
        help=f"Model type selected in the list: {', '.join(MODEL_CLASSES)}",
    )

    # Engines and storage locations.
    add(
        "-e",
        "--engines",
        required=False,
        nargs="+",
        type=str,
        default=["onnxruntime"],
        choices=["onnxruntime", "torch", "torch2", "torchscript", "tensorflow"],
        help="Engines to benchmark",
    )
    add(
        "-c",
        "--cache_dir",
        required=False,
        type=str,
        default=os.path.join(".", "cache_models"),
        help="Directory to cache pre-trained models",
    )
    add(
        "--onnx_dir",
        required=False,
        type=str,
        default=os.path.join(".", "onnx_models"),
        help="Directory to store onnx models",
    )

    # Device, provider and precision.
    add("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
    add("--provider", required=False, type=str, default=None, help="Execution provider to use")
    add(
        "-p",
        "--precision",
        type=Precision,
        default=Precision.FLOAT32,
        choices=list(Precision),
        help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
    )

    add("--verbose", required=False, action="store_true", help="Print more information")
    add("--overwrite", required=False, action="store_true", help="Overwrite existing models")

    # ONNX optimization and validation.
    add(
        "-o",
        "--optimizer_info",
        type=OptimizerInfo,
        default=OptimizerInfo.BYSCRIPT,
        choices=list(OptimizerInfo),
        help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
    )
    add("-v", "--validate_onnx", required=False, action="store_true", help="Validate ONNX model")

    # Output CSV files.
    add(
        "-f",
        "--fusion_csv",
        required=False,
        default=None,
        help="CSV file for saving summary results of graph optimization.",
    )
    add("-d", "--detail_csv", required=False, default=None, help="CSV file for saving detail results.")
    add("-r", "--result_csv", required=False, default=None, help="CSV file for saving summary results.")

    # Workload shape.
    add(
        "-i",
        "--input_counts",
        required=False,
        nargs="+",
        default=[1],
        type=int,
        choices=[1, 2, 3],
        help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
    )
    add(
        "-t",
        "--test_times",
        required=False,
        default=100,
        type=int,
        help="Number of repeat times to get average inference latency.",
    )
    add("-b", "--batch_sizes", nargs="+", type=int, default=[1])
    add("-s", "--sequence_lengths", nargs="+", type=int, default=[4, 8, 16, 32, 64, 128, 256])

    add(
        "--disable_ort_io_binding",
        required=False,
        action="store_true",
        help="Disable running ONNX Runtime with binded inputs and outputs. ",
    )
    parser.set_defaults(disable_ort_io_binding=False)

    add("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use")
    add(
        "--force_num_layers",
        required=False,
        type=int,
        default=None,
        help="Manually set the model's layer number",
    )

    add(
        "--enable_arm64_bfloat16_fastmath_mlas_gemm",
        required=False,
        action="store_true",
        help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP ",
    )
    parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)

    FusionOptions.add_arguments(parser)

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Parse the command line, run the selected engines, and write CSV reports.

    The benchmark is repeated for every requested thread count. Detail and
    summary CSV files are written when at least one result was produced; the
    fusion statistics CSV is written when the ONNX Runtime run collected any.
    """
    args = parse_arguments()

    setup_logger(args.verbose)

    if args.precision == Precision.FLOAT16 and not args.use_gpu:
        logger.error("fp16 is for GPU only")
        return

    if args.precision == Precision.INT8 and args.use_gpu and args.provider not in ["migraphx"]:
        logger.error("int8 is for CPU only")
        return

    # NOTE(review): "swim" looks like a typo for "swin", the model type checked
    # elsewhere in this file. Left unchanged because correcting it alters the
    # sequence lengths used for swin runs -- confirm the intended behavior first.
    if len(args.models) == 1 and MODELS[args.models[0]][3] in ["vit", "swim"]:
        args.sequence_lengths = [""]

    # Non-positive thread counts mean "use cpu_count"; duplicates are dropped.
    args.num_threads = sorted({cpu_count if x <= 0 else x for x in args.num_threads})

    logger.info(f"Arguments: {args}")

    if not os.path.exists(args.cache_dir):
        try:
            # makedirs also creates missing parent directories, which os.mkdir
            # cannot do for a nested --cache_dir.
            os.makedirs(args.cache_dir, exist_ok=True)
        except OSError:
            logger.error("Creation of the directory %s failed", args.cache_dir)

    enable_torch = "torch" in args.engines
    enable_torch2 = "torch2" in args.engines
    enable_torchscript = "torchscript" in args.engines
    enable_onnxruntime = "onnxruntime" in args.engines
    enable_tensorflow = "tensorflow" in args.engines

    if enable_torch2 and version.parse(torch.__version__) < version.parse("2.0.0"):
        logger.error(f"PyTorch version must be >=2.0.0 and you are using {torch.__version__}")
        return

    config_modifier = ConfigModifier(args.force_num_layers)

    # (enabled, torchscript, torch2) for each PyTorch flavour, in run order.
    torch_variants = (
        (enable_torchscript, True, False),
        (enable_torch, False, False),
        (enable_torch2, False, True),
    )

    results = []

    for num_threads in args.num_threads:
        torch.set_num_threads(num_threads)
        logger.debug(torch.__config__.parallel_info())

        if enable_torch or enable_torch2 or enable_torchscript:
            if args.input_counts != [1]:
                logger.warning("--input_counts is not implemented for torch or torchscript engine.")

            for enabled, use_torchscript, use_torch2 in torch_variants:
                if not enabled:
                    continue
                results += run_pytorch(
                    args.use_gpu,
                    args.models,
                    args.model_class,
                    config_modifier,
                    args.precision,
                    num_threads,
                    args.batch_sizes,
                    args.sequence_lengths,
                    args.test_times,
                    use_torchscript,
                    use_torch2,
                    args.cache_dir,
                    args.verbose,
                )

        if enable_tensorflow:
            results += run_tensorflow(
                args.use_gpu,
                args.models,
                args.model_class,
                config_modifier,
                args.precision,
                num_threads,
                args.batch_sizes,
                args.sequence_lengths,
                args.test_times,
                args.cache_dir,
                args.verbose,
            )

        # Re-created for every thread count, so only the statistics of the last
        # iteration reach the fusion CSV below.
        model_fusion_statistics = {}
        if enable_onnxruntime:
            try:
                use_raw_attention_mask = not args.use_mask_index
                results += run_onnxruntime(
                    args.use_gpu,
                    args.provider,
                    args.models,
                    args.model_class,
                    config_modifier,
                    args.precision,
                    num_threads,
                    args.batch_sizes,
                    args.sequence_lengths,
                    args.test_times,
                    args.input_counts,
                    args.optimizer_info,
                    args.validate_onnx,
                    args.cache_dir,
                    args.onnx_dir,
                    args.verbose,
                    args.overwrite,
                    args.disable_ort_io_binding,
                    use_raw_attention_mask,
                    model_fusion_statistics,
                    args.model_source,
                    args.enable_arm64_bfloat16_fastmath_mlas_gemm,
                    args,
                )
            except Exception:
                # Top-level boundary: log the failure and still report what was collected.
                logger.exception("Exception")

    time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if model_fusion_statistics:
        csv_filename = args.fusion_csv or f"benchmark_fusion_{time_stamp}.csv"
        output_fusion_statistics(model_fusion_statistics, csv_filename)

    if len(results) == 0:
        # The warning is skipped when the batch sizes are exactly [0].
        if args.batch_sizes != [0]:
            logger.warning("No any result available.")
        return

    csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv"
    output_details(results, csv_filename)

    csv_filename = args.result_csv or f"benchmark_summary_{time_stamp}.csv"
    output_summary(results, csv_filename, args)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,643 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
import timeit
|
||||
from abc import ABC, abstractmethod
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from time import sleep
|
||||
from typing import Any
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
import transformers
|
||||
from packaging import version
|
||||
|
||||
import onnxruntime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Precision(Enum):
    """Numeric precision of a benchmarked model; each value is its short string tag."""

    FLOAT32 = "fp32"
    FLOAT16 = "fp16"
    INT8 = "int8"
    INT4 = "int4"

    def __str__(self):
        """Return the short tag (e.g. ``"fp16"``) rather than ``Precision.FLOAT16``."""
        return self.value
|
||||
|
||||
|
||||
class OptimizerInfo(Enum):
    """Which optimizer (if any) was applied to the ONNX model; values are short string tags."""

    # no_opt means using the raw ONNX model, but OnnxRuntime might still apply optimization as long as
    # graph optimization level is not 0 (disable all).
    NOOPT = "no_opt"
    BYORT = "by_ort"
    BYSCRIPT = "by_script"

    def __str__(self):
        """Return the short tag (e.g. ``"by_ort"``) rather than ``OptimizerInfo.BYORT``."""
        return self.value
|
||||
|
||||
|
||||
class ConfigModifier:
    """Overrides the layer-count attributes of a model config object.

    Args:
        num_layers: Layer count to apply, or ``None`` to leave configs untouched.
    """

    def __init__(self, num_layers):
        self.num_layers = num_layers

    def modify(self, config):
        """Set each layer-count attribute that ``config`` already defines to ``self.num_layers``.

        Only attributes present on ``config`` are touched; nothing is added.
        Does nothing when ``self.num_layers`` is ``None``.
        """
        if self.num_layers is None:
            return
        if hasattr(config, "num_hidden_layers"):
            config.num_hidden_layers = self.num_layers
            logger.info(f"Modifying pytorch model's number of hidden layers to: {self.num_layers}")
        if hasattr(config, "encoder_layers"):
            config.encoder_layers = self.num_layers
            logger.info(f"Modifying pytorch model's number of encoder layers to: {self.num_layers}")
        # The attribute name previously carried a trailing space ("decoder_layers "),
        # so this branch could never match and decoder layers were left unmodified.
        if hasattr(config, "decoder_layers"):
            config.decoder_layers = self.num_layers
            logger.info(f"Modifying pytorch model's number of decoder layers to: {self.num_layers}")

    def get_layer_num(self):
        """Return the configured layer count (may be ``None``)."""
        return self.num_layers
|
||||
|
||||
|
||||
# Maps a numpy dtype name (as produced by ``str(array.dtype)``) to the numpy
# element type used when binding that input for IO binding.
IO_BINDING_DATA_TYPE_MAP = {
    "float32": numpy.float32,
    # TODO: Add more.
}
|
||||
|
||||
|
||||
def create_onnxruntime_session(
    onnx_model_path,
    use_gpu,
    provider=None,
    enable_all_optimization=True,
    num_threads=-1,
    enable_profiling=False,
    verbose=False,
    enable_mlas_gemm_fastmath_arm64_bfloat16=False,
    provider_options={},  # map execution provider name to its option  # noqa: B006
):
    """Create an ``onnxruntime.InferenceSession`` for ``onnx_model_path``.

    Args:
        onnx_model_path: Model passed to ``onnxruntime.InferenceSession``.
        use_gpu: When true (and ``provider`` is not an available provider name),
            pick a GPU provider list from the short ``provider`` name.
        provider: Either an exact name from ``onnxruntime.get_available_providers()``
            (used alone), or one of ``"dml"``, ``"migraphx"``, ``"cuda"``,
            ``"tensorrt"``; ``None`` selects CUDA when ``use_gpu`` is true.
        enable_all_optimization: ``ORT_ENABLE_ALL`` when true, else ``ORT_ENABLE_BASIC``.
        num_threads: Sets ``intra_op_num_threads`` when greater than zero.
        enable_profiling: Turns on session profiling.
        verbose: Log severity level 0 when true, otherwise 4.
        enable_mlas_gemm_fastmath_arm64_bfloat16: Adds the session config entry
            ``mlas.enable_gemm_fastmath_arm64_bfloat16`` set to ``"1"``.
        provider_options: Maps execution provider name to its options; the
            mapping is only read, never mutated.

    Returns:
        The created session, or ``None`` if session creation raised (the
        exception is logged).

    Raises:
        RuntimeError: ``use_gpu`` is true and ``provider`` is not recognized.
    """
    sess_options = onnxruntime.SessionOptions()

    if enable_all_optimization:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    else:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC

    if enable_profiling:
        sess_options.enable_profiling = True

    if num_threads > 0:
        sess_options.intra_op_num_threads = num_threads
        logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

    if verbose:
        sess_options.log_severity_level = 0
    else:
        sess_options.log_severity_level = 4

    if provider in onnxruntime.get_available_providers():
        # An exact execution-provider name takes precedence over use_gpu.
        providers = [provider]
    elif use_gpu:
        if provider == "dml":
            providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
        elif provider == "migraphx":
            providers = [
                "MIGraphXExecutionProvider",
                "CPUExecutionProvider",
            ]
        elif provider == "cuda" or provider is None:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif provider == "tensorrt":
            providers = [
                "TensorrtExecutionProvider",
                "CUDAExecutionProvider",
                "CPUExecutionProvider",
            ]
        else:
            raise RuntimeError(f"The execution provider is not supported: {provider}")
    else:
        providers = ["CPUExecutionProvider"]

    if provider_options:
        # Pair each provider that has options with them as a (name, options) tuple.
        providers = [(name, provider_options[name]) if name in provider_options else name for name in providers]

    if enable_mlas_gemm_fastmath_arm64_bfloat16:
        sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1")

    session = None
    try:
        session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers)
    except Exception:
        # Best effort: callers receive None and decide how to proceed.
        logger.exception(f"Failed to create session for {onnx_model_path} with providers={providers}")

    return session
|
||||
|
||||
|
||||
def setup_logger(verbose=True):
    """Configure root logging for the benchmark tools.

    Args:
        verbose: When true, log at DEBUG with file/line/function prefixes.
            Otherwise log bare messages at INFO and raise the ``transformers``
            logger to WARNING.
    """
    if verbose:
        logging.basicConfig(
            format="[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s",
            level=logging.DEBUG,
        )
    else:
        logging.basicConfig(format="%(message)s", level=logging.INFO)
        logging.getLogger("transformers").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
    """Create working directories and check that the runtime environment is usable.

    Args:
        cache_dir: Directory created if it is truthy and does not exist.
        output_dir: Directory created if it is truthy and does not exist.
        use_gpu: When true, assert that a suitable GPU execution provider is available.
        provider: ``"dml"`` requires ``DmlExecutionProvider``; any other value
            requires CUDA or MIGraphX to be available.

    Raises:
        AssertionError: A required execution provider is missing, or an installed
            torch / transformers / onnxruntime version is older than the minimum.
    """
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if use_gpu:
        if provider == "dml":
            assert "DmlExecutionProvider" in onnxruntime.get_available_providers(), (
                "Please install onnxruntime-directml package to test GPU inference."
            )

        else:
            assert not set(onnxruntime.get_available_providers()).isdisjoint(
                ["CUDAExecutionProvider", "MIGraphXExecutionProvider"]
            ), "Please install onnxruntime-gpu package, or install migraphx, to test GPU inference."

    logger.info(f"PyTorch Version:{torch.__version__}")
    logger.info(f"Transformers Version:{transformers.__version__}")
    logger.info(f"OnnxRuntime Version:{onnxruntime.__version__}")

    # Support three major versions of PyTorch and OnnxRuntime, and up to 9 months of transformers.
    assert version.parse(torch.__version__) >= version.parse("1.10.0")
    assert version.parse(transformers.__version__) >= version.parse("4.12.0")
    assert version.parse(onnxruntime.__version__) >= version.parse("1.10.0")
|
||||
|
||||
|
||||
def get_latency_result(latency_list, batch_size):
    """Summarize a list of per-run latencies into formatted statistics.

    Args:
        latency_list: Non-empty sequence of latencies; values are multiplied by
            1000 to report milliseconds, so they are presumably seconds.
        batch_size: Multiplier used for the throughput (QPS) figure.

    Returns:
        Dict with ``test_times`` (int) and string values, each formatted to two
        decimals: ``latency_variance`` (``numpy.var`` of the raw values times
        1000), the 90/95/99 percentiles and the mean in ms, and ``QPS``.

    Raises:
        ZeroDivisionError: ``latency_list`` is empty.
    """
    latency_ms = sum(latency_list) / float(len(latency_list)) * 1000.0
    latency_variance = numpy.var(latency_list, dtype=numpy.float64) * 1000.0
    throughput = batch_size * (1000.0 / latency_ms)

    return {
        "test_times": len(latency_list),
        "latency_variance": f"{latency_variance:.2f}",
        "latency_90_percentile": f"{numpy.percentile(latency_list, 90) * 1000.0:.2f}",
        "latency_95_percentile": f"{numpy.percentile(latency_list, 95) * 1000.0:.2f}",
        "latency_99_percentile": f"{numpy.percentile(latency_list, 99) * 1000.0:.2f}",
        "average_latency_ms": f"{latency_ms:.2f}",
        "QPS": f"{throughput:.2f}",
    }
|
||||
|
||||
|
||||
def output_details(results, csv_filename):
    """Append one CSV row per benchmark result to ``csv_filename``.

    The file is opened in append mode and a header row is written on every
    call, followed by the rows.

    Args:
        results: Iterable of dicts whose keys are a subset of the column names
            below (``csv.DictWriter`` raises ``ValueError`` on unknown keys).
        csv_filename: Path of the CSV file to append to.
    """
    with open(csv_filename, mode="a", newline="", encoding="ascii") as csv_file:
        column_names = [
            "engine",
            "version",
            "providers",
            "device",
            "precision",
            "optimizer",
            "io_binding",
            "model_name",
            "inputs",
            "threads",
            "batch_size",
            "sequence_length",
            "custom_layer_num",
            "datetime",
            "test_times",
            "QPS",
            "average_latency_ms",
            "latency_variance",
            "latency_90_percentile",
            "latency_95_percentile",
            "latency_99_percentile",
        ]

        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()
        for result in results:
            csv_writer.writerow(result)

    logger.info(f"Detail results are saved to csv file: {csv_filename}")
|
||||
|
||||
|
||||
def output_summary(results, csv_filename, args):
    """Append a pivoted latency summary to ``csv_filename``.

    One row is written per (model, input count, engine, io_binding, threads)
    combination that has at least one result. Each row carries the shared
    header fields plus one ``b{batch}`` or ``b{batch}_s{seq}`` column per
    batch size / sequence length, filled with ``average_latency_ms``.

    Args:
        results: Sequence of result dicts (see the keys read below).
        csv_filename: Path of the CSV file; opened in append mode.
        args: Object providing ``batch_sizes``, ``sequence_lengths``, ``models``,
            ``engines`` and ``num_threads``.

    Raises:
        AssertionError: Results grouped into the same row disagree on a header field.
    """
    with open(csv_filename, mode="a", newline="", encoding="ascii") as csv_file:
        header_names = [
            "model_name",
            "inputs",
            "custom_layer_num",
            "engine",
            "version",
            "providers",
            "device",
            "precision",
            "optimizer",
            "io_binding",
            "threads",
        ]
        data_names = []
        for batch_size in args.batch_sizes:
            if args.sequence_lengths == [""]:
                data_names.append(f"b{batch_size}")
            else:
                for sequence_length in args.sequence_lengths:
                    data_names.append(f"b{batch_size}_s{sequence_length}")

        csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names)
        csv_writer.writeheader()
        for model_name in args.models:
            for input_count in [1, 2, 3]:
                for engine_name in args.engines:
                    for io_binding in [True, False, ""]:
                        for threads in args.num_threads:
                            row = {}
                            for result in results:
                                if (
                                    result["model_name"] == model_name
                                    and result["inputs"] == input_count
                                    and result["engine"] == engine_name
                                    and result["io_binding"] == io_binding
                                    and result["threads"] == threads
                                ):
                                    headers = {k: v for k, v in result.items() if k in header_names}
                                    if not row:
                                        # First match: seed the row with headers and blank data cells.
                                        row.update(headers)
                                        row.update(dict.fromkeys(data_names, ""))
                                    else:
                                        # Later matches must describe the same configuration.
                                        for k in header_names:
                                            assert row[k] == headers[k]
                                    b = result["batch_size"]
                                    s = result["sequence_length"]
                                    if s:
                                        row[f"b{b}_s{s}"] = result["average_latency_ms"]
                                    else:
                                        row[f"b{b}"] = result["average_latency_ms"]
                            if row:
                                csv_writer.writerow(row)

    logger.info(f"Summary results are saved to csv file: {csv_filename}")
|
||||
|
||||
|
||||
def output_fusion_statistics(model_fusion_statistics, csv_filename):
    """Append per-model fusion statistics to ``csv_filename``.

    Each entry is annotated in place with ``datetime``, ``transformers``,
    ``torch`` and ``model_filename`` before being written, so the caller's
    dicts are mutated.

    Args:
        model_fusion_statistics: Non-empty mapping of model filename to a dict
            of statistics; the keys of the first entry define the statistic columns.
        csv_filename: Path of the CSV file; opened in append mode.

    Raises:
        StopIteration: ``model_fusion_statistics`` is empty.
    """
    with open(csv_filename, mode="a", newline="", encoding="ascii") as csv_file:
        column_names = [
            "model_filename",
            "datetime",
            "transformers",
            "torch",
            *list(next(iter(model_fusion_statistics.values())).keys()),
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()
        for key in model_fusion_statistics:
            model_fusion_statistics[key]["datetime"] = str(datetime.now())
            model_fusion_statistics[key]["transformers"] = transformers.__version__
            model_fusion_statistics[key]["torch"] = torch.__version__
            model_fusion_statistics[key]["model_filename"] = key
            csv_writer.writerow(model_fusion_statistics[key])
    logger.info(f"Fusion statistics is saved to csv file: {csv_filename}")
|
||||
|
||||
|
||||
def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size, warm_up_repeat=0):
    """Time ``ort_session.run`` and return a result dict.

    Args:
        ort_session: Object exposing ``run(output_names, inputs)``.
        ort_inputs: Inputs passed to ``run``.
        result_template: Dict of fields copied into the result.
        repeat_times: Number of timed runs.
        batch_size: Forwarded to ``get_latency_result`` for throughput.
        warm_up_repeat: Number of untimed warm-up runs executed first.

    Returns:
        ``result_template`` merged with ``io_binding=False`` and the latency statistics.
    """
    result = {}
    timeit.repeat(lambda: ort_session.run(None, ort_inputs), number=1, repeat=warm_up_repeat)  # Dry run
    latency_list = timeit.repeat(lambda: ort_session.run(None, ort_inputs), number=1, repeat=repeat_times)
    result.update(result_template)
    result.update({"io_binding": False})
    result.update(get_latency_result(latency_list, batch_size))
    return result
|
||||
|
||||
|
||||
def inference_ort_with_io_binding(
    ort_session,
    ort_inputs,
    result_template,
    repeat_times,
    ort_output_names,
    ort_outputs,
    output_buffers,
    output_buffer_max_sizes,
    batch_size,
    device,
    data_type=numpy.longlong,
    warm_up_repeat=0,
):
    """Time ``ort_session.run_with_iobinding`` with inputs and outputs bound to ``device``.

    Args:
        ort_session: Session exposing ``io_binding()`` and ``run_with_iobinding``.
        ort_inputs: Mapping of input name to numpy array.
        result_template: Dict of fields copied into the result.
        repeat_times: Number of timed runs.
        ort_output_names: Output names, in the same order as ``ort_outputs``.
        ort_outputs: Objects whose ``.shape`` gives each bound output's shape.
        output_buffers: List of torch tensors used as output storage; filled in
            place via ``allocateOutputBuffers`` when empty so it can be reused.
        output_buffer_max_sizes: Sizes used when allocating ``output_buffers``.
        batch_size: Forwarded to ``get_latency_result`` for throughput.
        device: Torch device the inputs and buffers are placed on.
        data_type: Element type for inputs whose dtype is not in
            ``IO_BINDING_DATA_TYPE_MAP``.
        warm_up_repeat: Number of untimed warm-up runs executed first.

    Returns:
        ``result_template`` merged with ``io_binding=True`` and the latency statistics.
    """
    result = {}

    # Bind inputs and outputs to onnxruntime session
    io_binding = ort_session.io_binding()
    # Bind inputs to device
    for name in ort_inputs:
        np_input = torch.from_numpy(ort_inputs[name]).to(device)
        input_type = IO_BINDING_DATA_TYPE_MAP.get(str(ort_inputs[name].dtype), data_type)
        io_binding.bind_input(
            name,
            np_input.device.type,
            0,
            input_type,
            np_input.shape,
            np_input.data_ptr(),
        )
    # Bind outputs buffers with the sizes needed if not allocated already
    if len(output_buffers) == 0:
        allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device)

    for i, ort_output_name in enumerate(ort_output_names):
        # Outputs are always bound as float32.
        io_binding.bind_output(
            ort_output_name,
            output_buffers[i].device.type,
            0,
            numpy.float32,
            ort_outputs[i].shape,
            output_buffers[i].data_ptr(),
        )

    timeit.repeat(
        lambda: ort_session.run_with_iobinding(io_binding),
        number=1,
        repeat=warm_up_repeat,
    )  # Dry run

    latency_list = timeit.repeat(
        lambda: ort_session.run_with_iobinding(io_binding),
        number=1,
        repeat=repeat_times,
    )
    result.update(result_template)
    result.update({"io_binding": True})
    result.update(get_latency_result(latency_list, batch_size))
    return result
|
||||
|
||||
|
||||
def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device):  # noqa: N802
    """Append one uninitialized float32 tensor per requested size to ``output_buffers``.

    Allocate output tensors with the largest test size needed, so the
    allocated memory can be reused for each test run.

    Args:
        output_buffers: List extended in place with the new tensors.
        output_buffer_max_sizes: Iterable of sizes, each passed to ``torch.empty``.
        device: Torch device on which the tensors are allocated.
    """
    output_buffers.extend(torch.empty(size, dtype=torch.float32, device=device) for size in output_buffer_max_sizes)
|
||||
|
||||
|
||||
def set_random_seed(seed=123):
    """Set random seed manually to get deterministic results.

    Seeds Python's ``random``, numpy, and torch (CPU plus the current and all
    CUDA devices) with the same ``seed``.
    """
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # torch.backends.cudnn.enabled = False
    # torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.deterministic = True
|
||||
|
||||
|
||||
def get_gpu_info() -> list[dict[str, Any]] | None:
    """Query NVML for the memory state of every visible GPU.

    Returns:
        A list with one dict per device (``id``, ``name``, ``total``, ``free``,
        ``used`` as reported by NVML), or ``None`` if NVML raised an error or
        returned an unexpected value.
    """
    from py3nvml.py3nvml import (  # noqa: PLC0415
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None

        for i in range(device_count):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            if isinstance(info, str):
                return None
            result.append(
                {
                    "id": i,
                    "name": nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)),
                    "total": info.total,
                    "free": info.free,
                    "used": info.used,
                }
            )
        nvmlShutdown()
        return result
    except NVMLError as error:
        # Previously print() was given logging-style arguments, so the "%s"
        # placeholder was emitted literally; route through the module logger.
        logger.error("Error fetching GPU information using nvml: %s", error)
        return None
|
||||
|
||||
|
||||
class MemoryMonitor(ABC):
|
||||
def __init__(self, keep_measuring=True):
|
||||
self.keep_measuring = keep_measuring
|
||||
|
||||
def measure_cpu_usage(self):
|
||||
import psutil # noqa: PLC0415
|
||||
|
||||
max_usage = 0
|
||||
while True:
|
||||
max_usage = max(max_usage, psutil.Process(os.getpid()).memory_info().rss / 1024**2)
|
||||
sleep(0.005) # 5ms
|
||||
if not self.keep_measuring:
|
||||
break
|
||||
return max_usage
|
||||
|
||||
@abstractmethod
|
||||
def measure_gpu_usage(self) -> list[dict[str, Any]] | None:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class CudaMemoryMonitor(MemoryMonitor):
    """Samples GPU memory usage through NVML (``py3nvml``)."""

    def __init__(self, keep_measuring=True):
        super().__init__(keep_measuring)

    def measure_gpu_usage(self) -> list[dict[str, Any]] | None:
        """Sample used memory on every device until ``keep_measuring`` is false.

        Returns:
            One dict per device with ``device_id``, ``name`` and ``max_used_MB``,
            or ``None`` if NVML raised an error or returned an unexpected value.
        """
        from py3nvml.py3nvml import (  # noqa: PLC0415
            NVMLError,
            nvmlDeviceGetCount,
            nvmlDeviceGetHandleByIndex,
            nvmlDeviceGetMemoryInfo,
            nvmlDeviceGetName,
            nvmlInit,
            nvmlShutdown,
        )

        max_gpu_usage = []
        gpu_name = []
        try:
            nvmlInit()
            device_count = nvmlDeviceGetCount()
            if not isinstance(device_count, int):
                logger.error(f"nvmlDeviceGetCount result is not integer: {device_count}")
                return None

            max_gpu_usage = [0 for i in range(device_count)]
            gpu_name = [nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)) for i in range(device_count)]
            while True:
                for i in range(device_count):
                    info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                    if isinstance(info, str):
                        logger.error(f"nvmlDeviceGetMemoryInfo returns str: {info}")
                        return None
                    max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
                sleep(0.005)  # 5ms
                if not self.keep_measuring:
                    break
            nvmlShutdown()
            return [
                {
                    "device_id": i,
                    "name": gpu_name[i],
                    "max_used_MB": max_gpu_usage[i],
                }
                for i in range(device_count)
            ]
        except NVMLError as error:
            logger.error("Error fetching GPU information using nvml: %s", error)
            return None
|
||||
|
||||
|
||||
class RocmMemoryMonitor(MemoryMonitor):
    """Samples GPU memory usage through the ``rocm_smi`` Python module, when importable."""

    def __init__(self, keep_measuring=True):
        super().__init__(keep_measuring)
        rocm_smi_path = "/opt/rocm/libexec/rocm_smi"
        if os.path.exists(rocm_smi_path):
            if rocm_smi_path not in sys.path:
                sys.path.append(rocm_smi_path)
        # The import is attempted whether or not the path exists, so that
        # self.rocm_smi is always defined (None when the module is unavailable).
        try:
            import rocm_smi  # noqa: PLC0415

            self.rocm_smi = rocm_smi
            self.rocm_smi.initializeRsmi()
        except ImportError:
            self.rocm_smi = None

    def get_used_memory(self, dev):
        """Return the first ``getMemInfo(dev, "VRAM")`` value divided by 1024**2, or -1 without rocm_smi."""
        if self.rocm_smi is None:
            return -1
        return self.rocm_smi.getMemInfo(dev, "VRAM")[0] / 1024 / 1024

    def measure_gpu_usage(self):
        """Sample used memory on every device until ``keep_measuring`` is false.

        Returns:
            One dict per device with ``device_id``, ``name`` (``"GPU{i}"``) and
            ``max_used_MB``, or ``None`` when ``rocm_smi`` is unavailable.
        """
        if self.rocm_smi is None:
            return None

        device_count = len(self.rocm_smi.listDevices()) if self.rocm_smi is not None else 0
        max_gpu_usage = [0 for i in range(device_count)]
        gpu_name = [f"GPU{i}" for i in range(device_count)]
        while True:
            for i in range(device_count):
                max_gpu_usage[i] = max(max_gpu_usage[i], self.get_used_memory(i))
            time.sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break
        return [
            {
                "device_id": i,
                "name": gpu_name[i],
                "max_used_MB": max_gpu_usage[i],
            }
            for i in range(device_count)
        ]
|
||||
|
||||
|
||||
def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None):
    """Measure the extra memory used while running ``func``.

    A baseline is taken first (or supplied via ``start_memory``); ``func`` then
    runs on a worker thread while a monitor thread samples peak usage.

    Args:
        is_gpu: Measure GPU memory when true, process CPU memory otherwise.
        func: Callable to run, or ``None`` to just return the baseline.
        monitor_type: ``"rocm"`` selects ``RocmMemoryMonitor``; anything else
            selects ``CudaMemoryMonitor``.
        start_memory: Baseline to use instead of measuring one.

    Returns:
        * ``func is None``: the baseline measurement.
        * GPU: the largest per-device increase of ``max_used_MB`` over the
          baseline, or ``None`` if measurement failed or the device lists do
          not line up.
        * CPU: peak usage minus baseline, in MB.
    """
    memory_monitor_type = None
    if monitor_type == "rocm":
        memory_monitor_type = RocmMemoryMonitor
    else:
        memory_monitor_type = CudaMemoryMonitor

    # keep_measuring=False makes the sampling loops take a single sample.
    monitor = memory_monitor_type(False)

    if is_gpu:
        if start_memory is not None:
            memory_before_test = start_memory
        else:
            memory_before_test = monitor.measure_gpu_usage()
        if memory_before_test is None:
            return None

        if func is None:
            return memory_before_test

        with ThreadPoolExecutor() as executor:
            monitor = memory_monitor_type()
            mem_thread = executor.submit(monitor.measure_gpu_usage)
            try:
                fn_thread = executor.submit(func)
                _ = fn_thread.result()
            finally:
                # Always stop the sampler, even if func raised, so the pool can shut down.
                monitor.keep_measuring = False
                max_usage = mem_thread.result()

            if max_usage is None:
                return None

            logger.info(f"GPU memory usage: before={memory_before_test} peak={max_usage}")
            if len(memory_before_test) >= 1 and len(max_usage) >= 1 and len(memory_before_test) == len(max_usage):
                # When there are multiple GPUs, we will check the one with maximum usage.
                max_used = 0
                for i, memory_before in enumerate(memory_before_test):
                    before = memory_before["max_used_MB"]
                    after = max_usage[i]["max_used_MB"]
                    used = after - before
                    max_used = max(max_used, used)
                return max_used
        return None

    # CPU memory
    if start_memory is not None:
        memory_before_test = start_memory
    else:
        memory_before_test = monitor.measure_cpu_usage()

    if func is None:
        return memory_before_test

    with ThreadPoolExecutor() as executor:
        monitor = memory_monitor_type()
        mem_thread = executor.submit(monitor.measure_cpu_usage)
        try:
            fn_thread = executor.submit(func)
            _ = fn_thread.result()
        finally:
            # Always stop the sampler, even if func raised, so the pool can shut down.
            monitor.keep_measuring = False
            max_usage = mem_thread.result()

        logger.info(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB")
        return max_usage - memory_before_test
|
||||
|
||||
|
||||
def get_ort_environment_variables():
    """Return the ORT-related environment variables that are set, as ``NAME=value,...``.

    Variables are reported in the fixed order below; unset ones are skipped.
    Returns an empty string when none are set.
    """
    # Environment variables might impact ORT performance on transformer models. Note that they are for testing only.
    env_names = [
        "ORT_DISABLE_FUSED_ATTENTION",
        "ORT_ENABLE_FUSED_CAUSAL_ATTENTION",
        "ORT_DISABLE_FUSED_CROSS_ATTENTION",
        "ORT_DISABLE_TRT_FLASH_ATTENTION",
        "ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION",
        "ORT_TRANSFORMER_OPTIONS",
        "ORT_CUDA_GEMM_OPTIONS",
    ]
    settings = ((name, os.getenv(name)) for name in env_names)
    return ",".join(f"{name}={value}" for name, value in settings if value is not None)
|
||||
@@ -0,0 +1,629 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# This tool measures the inference performance of onnxruntime on BERT-like model with inputs like input_ids,
|
||||
# token_type_ids (optional), and attention_mask (optional).
|
||||
#
|
||||
# If the model does not have exactly three inputs like above, you might need specify names of inputs with
|
||||
# --input_ids_name, --segment_ids_name and --input_mask_name
|
||||
|
||||
# Example command to run test on batch_size 1 and 2 for a model on GPU:
|
||||
# python bert_perf_test.py --model bert.onnx --batch_size 1 2 --sequence_length 128 --use_gpu --samples 1000 --test_times 1
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import random
|
||||
import statistics
|
||||
import timeit
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import psutil
|
||||
import torch
|
||||
from bert_test_data import generate_test_data, get_bert_inputs
|
||||
|
||||
|
||||
@dataclass
class TestSetting:
    """Parameters describing one performance-test configuration."""

    batch_size: int
    sequence_length: int
    test_cases: int
    test_times: int
    use_gpu: bool
    use_io_binding: bool
    provider: str
    intra_op_num_threads: int
    seed: int
    verbose: bool
    log_severity: int
    average_sequence_length: int
    random_sequence_length: bool
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelSetting:
|
||||
model_path: str
|
||||
input_ids_name: str
|
||||
segment_ids_name: str
|
||||
input_mask_name: str
|
||||
opt_level: int
|
||||
input_tuning_results: str | None
|
||||
output_tuning_results: str | None
|
||||
mask_type: int
|
||||
|
||||
|
||||
def create_session(
    model_path,
    use_gpu,
    provider,
    intra_op_num_threads,
    graph_optimization_level=None,
    log_severity=2,
    tuning_results_path=None,
):
    """Create an onnxruntime inference session for the performance test.

    Args:
        model_path: Model passed to ``onnxruntime.InferenceSession``.
        use_gpu: Use a GPU execution provider list when true, CPU only otherwise.
        provider: ``"dml"``, ``"migraphx"``, ``"cuda"`` or ``"tensorrt"``; any
            other value falls back to CUDA when ``use_gpu`` is true.
        intra_op_num_threads: Set on the session options unless ``None``.
        graph_optimization_level: ``None`` or 99 for ENABLE_ALL, 0 DISABLE_ALL,
            1 ENABLE_BASIC, 2 ENABLE_EXTENDED, 3 ENABLE_LAYOUT; any other value
            is assigned to the session options as is.
        log_severity: Default logger severity and session log severity level.
        tuning_results_path: Optional JSON file loaded into
            ``session.set_tuning_results``.

    Returns:
        The created ``InferenceSession``.

    Raises:
        AssertionError: The session does not report the expected execution provider.
    """
    import onnxruntime  # noqa: PLC0415

    onnxruntime.set_default_logger_severity(log_severity)

    if use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )

    if use_gpu:
        if provider == "dml":
            execution_providers = ["DmlExecutionProvider", "CPUExecutionProvider"]
        elif provider == "migraphx":
            execution_providers = [
                "MIGraphXExecutionProvider",
                "CPUExecutionProvider",
            ]
        elif provider == "cuda":
            execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif provider == "tensorrt":
            execution_providers = [
                "TensorrtExecutionProvider",
                "CUDAExecutionProvider",
                "CPUExecutionProvider",
            ]
        else:
            execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    else:
        execution_providers = ["CPUExecutionProvider"]

    sess_options = onnxruntime.SessionOptions()
    sess_options.log_severity_level = log_severity
    sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL

    if graph_optimization_level is None:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    elif graph_optimization_level == 0:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
    elif graph_optimization_level == 1:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    elif graph_optimization_level == 2:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    elif graph_optimization_level == 3:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_LAYOUT
    elif graph_optimization_level == 99:
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    else:
        # Anything else is assumed to already be a GraphOptimizationLevel value.
        sess_options.graph_optimization_level = graph_optimization_level

    if intra_op_num_threads is not None:
        sess_options.intra_op_num_threads = intra_op_num_threads

    session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)

    # Verify the session reports the provider that was requested.
    if use_gpu:
        if provider == "dml":
            assert "DmlExecutionProvider" in session.get_providers()
        elif provider == "migraphx":
            assert "MIGraphXExecutionProvider" in session.get_providers()
        elif provider == "cuda":
            assert "CUDAExecutionProvider" in session.get_providers()
        elif provider == "tensorrt":
            assert "TensorrtExecutionProvider" in session.get_providers()
            assert "CUDAExecutionProvider" in session.get_providers()
        else:
            assert "CUDAExecutionProvider" in session.get_providers()
    else:
        assert "CPUExecutionProvider" in session.get_providers()

    if tuning_results_path is not None:
        with open(tuning_results_path) as f:
            session.set_tuning_results(json.load(f))

    return session
|
||||
|
||||
|
||||
def numpy_type(torch_type):
    """Return the numpy type matching a torch dtype.

    Supports ``float32``, ``float16``, ``int32`` and ``int64`` (mapped to
    ``np.longlong``).

    Raises:
        KeyError: ``torch_type`` is not one of the supported dtypes.
    """
    type_map = {
        torch.float32: np.float32,
        torch.float16: np.float16,
        torch.int32: np.int32,
        torch.int64: np.longlong,
    }
    return type_map[torch_type]
|
||||
|
||||
|
||||
def create_input_output_tensors(inputs, outputs, device):
    """Convert named numpy arrays into torch tensors on ``device``.

    Args:
        inputs: Mapping of input name to numpy array.
        outputs: Mapping of output name to numpy array.
        device: Torch device the tensors are moved to.

    Returns:
        ``(input_tensors, output_tensors)``: two dicts with the same keys as
        ``inputs`` and ``outputs``.
    """
    input_tensors = {name: torch.from_numpy(array).to(device) for name, array in inputs.items()}
    output_tensors = {name: torch.from_numpy(array).to(device) for name, array in outputs.items()}
    return input_tensors, output_tensors
|
||||
|
||||
|
||||
def create_io_binding(sess, input_tensors, output_tensors):
    """Create an IO binding for ``sess`` that points at existing torch tensors.

    Args:
        sess: Session exposing ``io_binding()``.
        input_tensors: Mapping of input name to torch tensor.
        output_tensors: Mapping of output name to torch tensor used as the output buffer.

    Returns:
        The IO binding with every input and output bound by data pointer
        (device id 0).

    Raises:
        KeyError: A tensor dtype is not supported by ``numpy_type``.
    """
    io_binding = sess.io_binding()
    for name, tensor in input_tensors.items():
        io_binding.bind_input(
            name,
            tensor.device.type,
            0,
            numpy_type(tensor.dtype),
            tensor.shape,
            tensor.data_ptr(),
        )
    for name, tensor in output_tensors.items():
        io_binding.bind_output(
            name,
            tensor.device.type,
            0,
            numpy_type(tensor.dtype),
            tensor.shape,
            tensor.data_ptr(),
        )
    return io_binding
|
||||
|
||||
|
||||
def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting):
    """Run every test case with IO binding and time one bound run per case.

    For each case a regular ``session.run`` produces the reference outputs
    (also used to shape the output buffers); the bound session is then warmed
    up once and a single bound run is timed.

    Args:
        session: onnxruntime session.
        all_inputs: Iterable of input dicts (name to numpy array).
        output_names: Names of the outputs to fetch and bind.
        test_setting: Object whose ``use_gpu`` selects ``"cuda"`` or ``"cpu"``.

    Returns:
        ``(results, latency_list)``: outputs of the regular runs, and the
        ``timeit.default_timer`` duration of each timed bound run.
    """
    results = []
    latency_list = []
    device = "cuda" if test_setting.use_gpu else "cpu"
    for _test_case_id, inputs in enumerate(all_inputs):
        result = session.run(output_names, inputs)
        results.append(result)
        outputs = {}
        for i in range(len(output_names)):
            outputs[output_names[i]] = result[i]

        input_tensors, output_tensors = create_input_output_tensors(inputs, outputs, device)
        io_binding = create_io_binding(session, input_tensors, output_tensors)

        # warm up once
        session.run_with_iobinding(io_binding)

        start_time = timeit.default_timer()
        session.run_with_iobinding(io_binding)
        latency = timeit.default_timer() - start_time
        latency_list.append(latency)

    return results, latency_list
|
||||
|
||||
|
||||
def onnxruntime_inference(session, all_inputs, output_names):
    """Run every test case through ``session.run`` and time each run.

    One randomly chosen input is run first, untimed, as a warm-up (skipped
    when ``all_inputs`` is empty).

    Args:
        session: Object exposing ``run(output_names, inputs)``.
        all_inputs: Sequence of inputs, one per test case.
        output_names: Output names passed to ``run``.

    Returns:
        ``(results, latency_list)``: the output of each run and its
        ``timeit.default_timer`` duration, in input order.
    """
    if len(all_inputs) > 0:
        # Use a random input as warm up.
        session.run(output_names, random.choice(all_inputs))

    results = []
    latency_list = []
    for _test_case_id, inputs in enumerate(all_inputs):
        start_time = timeit.default_timer()
        result = session.run(output_names, inputs)
        latency = timeit.default_timer() - start_time
        results.append(result)
        latency_list.append(latency)
    return results, latency_list
|
||||
|
||||
|
||||
def to_string(model_path, session, test_setting):
    """Build a comma-separated ``key=value`` description of one test configuration.

    Args:
        model_path: Model path; only its base name is included.
        session: Object whose ``get_session_options()`` provides
            ``graph_optimization_level`` and ``intra_op_num_threads``.
        test_setting: Object providing the batch/sequence/test-count/device fields read below.

    Returns:
        The description string, with the ``GraphOptimizationLevel.ORT_`` prefix
        stripped from the optimization level.
    """
    sess_options = session.get_session_options()
    option = f"model={os.path.basename(model_path)},"
    option += f"graph_optimization_level={sess_options.graph_optimization_level},intra_op_num_threads={sess_options.intra_op_num_threads},".replace(
        "GraphOptimizationLevel.ORT_", ""
    )

    option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},"
    option += f"test_cases={test_setting.test_cases},test_times={test_setting.test_times},"
    option += f"use_gpu={test_setting.use_gpu},use_io_binding={test_setting.use_io_binding},"
    option += f"average_sequence_length={test_setting.average_sequence_length},"
    option += f"random_sequence_length={test_setting.random_sequence_length}"
    return option
|
||||
|
||||
|
||||
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
    """Benchmark one configuration and record its latency statistics.

    Creates a session, skips the run when the configuration key is already present in
    ``perf_results``, otherwise runs all inputs ``test_setting.test_times`` times and stores
    ``(average, P50, P75, P90, P95, P99, throughput)`` under that key. Latencies are in
    milliseconds. When ``model_setting.output_tuning_results`` is set, the session's tuning
    results are dumped to that JSON path (or to a timestamped sibling if the path exists).

    Args:
        model_setting: model related settings (path, optimization level, tuning result paths).
        test_setting: test related settings (batch size, providers, repeat count, ...).
        perf_results: mapping from configuration key to result tuple; updated in place.
        all_inputs: list of input feeds, one per test case.
        intra_op_num_threads: thread count passed to ``create_session``.
    """
    session = create_session(
        model_setting.model_path,
        test_setting.use_gpu,
        test_setting.provider,
        intra_op_num_threads,
        model_setting.opt_level,
        log_severity=test_setting.log_severity,
        tuning_results_path=model_setting.input_tuning_results,
    )
    output_names = [graph_output.name for graph_output in session.get_outputs()]

    key = to_string(model_setting.model_path, session, test_setting)
    if key in perf_results:
        print("skip duplicated test:", key)
        return

    print("Running test:", key)

    all_latency_list = []
    for _ in range(test_setting.test_times):
        if test_setting.use_io_binding:
            _, latencies = onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting)
        else:
            _, latencies = onnxruntime_inference(session, all_inputs, output_names)
        all_latency_list.extend(latencies)

    # latency in milliseconds
    latency_ms = np.array(all_latency_list) * 1000

    average_latency = statistics.mean(latency_ms)
    percentiles = [np.percentile(latency_ms, rank) for rank in (50, 75, 90, 95, 99)]
    throughput = test_setting.batch_size * (1000.0 / average_latency)

    perf_results[key] = (average_latency, *percentiles, throughput)

    print(
        "Average latency = {} ms, Throughput = {} QPS".format(f"{average_latency:.2f}", f"{throughput:.2f}")
    )

    if not model_setting.output_tuning_results:
        return

    output_path = os.path.abspath(model_setting.output_tuning_results)
    if os.path.exists(output_path):
        # Do not overwrite an existing file: write next to it with a timestamp suffix.
        old_output_path = output_path
        stem = output_path.rsplit(".json", 1)[0]
        output_path = f"{stem}.{datetime.now().timestamp()}.json"
        print("WARNING:", old_output_path, "exists, will write to", output_path, "instead.")

    trs = session.get_tuning_results()
    with open(output_path, "w") as f:
        json.dump(trs, f)
    print("Tuning results is saved to", output_path)
|
||||
|
||||
|
||||
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
    """Run ``run_one_test`` in a child process and wait for it to finish.

    Args:
        model_setting: model related settings forwarded to ``run_one_test``.
        test_setting: test related settings forwarded to ``run_one_test``.
        perf_results: result mapping forwarded to ``run_one_test``.
        all_inputs: list of input feeds forwarded to ``run_one_test``.
        intra_op_num_threads: thread count forwarded to ``run_one_test``.
    """
    worker_args = (model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
    worker = multiprocessing.Process(target=run_one_test, args=worker_args)
    worker.start()
    worker.join()
|
||||
|
||||
|
||||
def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
    """Launch one test per candidate ``intra_op_num_threads`` value.

    When ``test_setting.intra_op_num_threads`` is set, only that value is tested. Otherwise
    the candidates are the logical core count, the physical core count, and every value in
    ``[1, min(16, logical cores))``, tested from largest to smallest.

    Args:
        model_setting: model related settings forwarded to ``launch_test``.
        test_setting: test related settings forwarded to ``launch_test``.
        perf_results: result mapping forwarded to ``launch_test``.
        all_inputs: list of input feeds forwarded to ``launch_test``.
    """
    if test_setting.intra_op_num_threads is not None:
        launch_test(
            model_setting,
            test_setting,
            perf_results,
            all_inputs,
            test_setting.intra_op_num_threads,
        )
        return

    # psutil.cpu_count() returns None when the count cannot be determined (this happens for
    # physical cores on some platforms). A None candidate would make sorting raise TypeError,
    # so undetermined counts are dropped and the logical count falls back to os.cpu_count().
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True) or os.cpu_count() or 1

    candidate_threads = {count for count in (logical_cores, physical_cores) if count}
    candidate_threads.update(range(1, min(16, logical_cores)))

    for intra_op_num_threads in sorted(candidate_threads, reverse=True):
        launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
|
||||
|
||||
|
||||
def run_performance(model_setting, test_setting, perf_results):
    """Generate test inputs for one test setting and run the performance tests on them.

    Args:
        model_setting: model related settings (path, graph input names, mask type).
        test_setting: test related settings (batch size, sequence length, seed, ...).
        perf_results: result mapping forwarded to ``run_perf_tests``.
    """
    graph_inputs = get_bert_inputs(
        model_setting.model_path,
        model_setting.input_ids_name,
        model_setting.segment_ids_name,
        model_setting.input_mask_name,
    )
    input_ids, segment_ids, input_mask = graph_inputs

    # Do not generate random mask for performance test.
    print(
        f"Generating {test_setting.test_cases} samples for batch_size={test_setting.batch_size} "
        f"sequence_length={test_setting.sequence_length}"
    )
    all_inputs = generate_test_data(
        test_setting.batch_size,
        test_setting.sequence_length,
        test_setting.test_cases,
        test_setting.seed,
        test_setting.verbose,
        input_ids,
        segment_ids,
        input_mask,
        test_setting.average_sequence_length,
        test_setting.random_sequence_length,
        mask_type=model_setting.mask_type,
    )

    run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
|
||||
|
||||
|
||||
def parse_arguments():
    """Parse the command line options of the performance test script.

    Returns:
        argparse.Namespace: parsed options read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Required options.
    add("--model", required=True, type=str, help="bert onnx model path")
    add(
        "-b",
        "--batch_size",
        required=True,
        type=int,
        nargs="+",
        help="batch size of input. Allow one or multiple values in the range of [1, 128].",
    )
    add("-s", "--sequence_length", required=True, type=int, help="maximum sequence length of input")

    # Test size and session options.
    add("--samples", type=int, default=10, help="number of samples to be generated")
    add(
        "-t",
        "--test_times",
        type=int,
        default=0,
        help="number of times to run per sample. By default, the value is 1000 / samples",
    )
    add(
        "--opt_level",
        type=int,
        choices=[0, 1, 2, 3, 99],
        default=99,
        help="onnxruntime optimization level: 0 - disable all, 1 - basic, 2 - extended, 3 - layout, 99 - enable all.",
    )
    add(
        "--seed",
        type=int,
        default=3,
        help="random seed. Use the same seed to make sure test data is same in multiple tests.",
    )
    add("--verbose", action="store_true", help="print verbose information")
    add(
        "--log_severity",
        type=int,
        default=2,
        choices=[0, 1, 2, 3, 4],
        help="0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal",
    )
    add("--use_gpu", action="store_true", help="use GPU")
    add("--use_io_binding", action="store_true", help="use io_binding")
    add("--provider", type=str, default=None, help="Execution provider to use")
    add("-n", "--intra_op_num_threads", type=int, default=None, help=">=0, set intra_op_num_threads")

    # Graph input names.
    add("--input_ids_name", type=str, default=None, help="input name for input ids")
    add("--segment_ids_name", type=str, default=None, help="input name for segment ids")
    add("--input_mask_name", type=str, default=None, help="input name for attention mask")

    # Tuning results.
    add(
        "--input_tuning_results",
        default=None,
        type=str,
        help="tuning results (json) to be loaded before benchmark",
    )
    add(
        "--output_tuning_results",
        default=None,
        type=str,
        help="tuning results (json) to be saved after benchmark",
    )

    # Sequence length distribution and mask layout.
    add("-a", "--average_sequence_length", default=-1, type=int, help="average sequence length excluding padding")
    add(
        "-r",
        "--random_sequence_length",
        action="store_true",
        help="use uniform random instead of fixed sequence length",
    )
    add(
        "--mask_type",
        type=int,
        default=2,
        help="mask type: (1: mask index or sequence length, 2: raw 2D mask, 3: key len, cumulated lengths of query and key)",
    )

    # Explicit defaults for the boolean switches.
    parser.set_defaults(verbose=False, use_gpu=False, use_io_binding=False, random_sequence_length=False)

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Run the performance tests for every requested batch size and save a TSV summary.

    The summary file is written next to the model, with one row per tested configuration,
    ordered by the result tuples (average latency first).

    Raises:
        Exception: if any requested batch size is outside the range [1, 128].
    """
    args = parse_arguments()

    if args.test_times == 0:
        args.test_times = max(1, int(1000 / args.samples))

    if args.average_sequence_length <= 0:
        args.average_sequence_length = args.sequence_length

    manager = multiprocessing.Manager()
    perf_results = manager.dict()

    batch_size_set = set(args.batch_size)
    if min(batch_size_set) < 1 or max(batch_size_set) > 128:
        raise Exception("batch_size not in range [1, 128]")

    model_setting = ModelSetting(
        args.model,
        args.input_ids_name,
        args.segment_ids_name,
        args.input_mask_name,
        args.opt_level,
        args.input_tuning_results,
        args.output_tuning_results,
        args.mask_type,
    )

    for batch_size in batch_size_set:
        test_setting = TestSetting(
            batch_size,
            args.sequence_length,
            args.samples,
            args.test_times,
            args.use_gpu,
            args.use_io_binding,
            args.provider,
            args.intra_op_num_threads,
            args.seed,
            args.verbose,
            args.log_severity,
            args.average_sequence_length,
            args.random_sequence_length,
        )

        print("test setting", test_setting)
        run_performance(model_setting, test_setting, perf_results)

    # Sort the results so that the first one has smallest latency.
    sorted_results = sorted(perf_results.items(), key=lambda entry: entry[1])

    device_tag = "GPU" if args.use_gpu else "CPU"
    batch_tag = "-".join(str(size) for size in sorted(batch_size_set))
    time_tag = datetime.now().strftime("%Y%m%d-%H%M%S")
    summary_file = os.path.join(
        Path(args.model).parent,
        "perf_results_{}_B{}_S{}_{}.txt".format(device_tag, batch_tag, args.sequence_length, time_tag),
    )

    metric_headers = [
        "Latency(ms)",
        "Latency_P50",
        "Latency_P75",
        "Latency_P90",
        "Latency_P95",
        "Latency_P99",
        "Throughput(QPS)",
    ]
    with open(summary_file, "w+", newline="") as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
        for row_index, (key, perf_result) in enumerate(sorted_results):
            # Each key is a comma-separated list of name=value pairs.
            params = [param.split("=") for param in key.split(",")]
            if row_index == 0:
                # The header row is derived from the first result's parameter names.
                tsv_writer.writerow(metric_headers + [pair[0] for pair in params])

            metrics = [format(metric, ".2f") for metric in perf_result]
            tsv_writer.writerow(metrics + [pair[1] for pair in params])

    print("Test summary is saved to", summary_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    # Workaround for a multiprocessing AttributeError seen under Anaconda Jupyter.
    # See https://stackoverflow.com/questions/45720153/python-multiprocessing-error-attributeerror-module-main-has-no-attribute
    __spec__ = None

    main()
|
||||
@@ -0,0 +1,641 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# This tool generates test data for a BERT model.
|
||||
# The test data can be used by onnxruntime_perf_test tool to evaluate the inference latency.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from onnx import ModelProto, TensorProto, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
|
||||
def fake_input_ids_data(
    input_ids: TensorProto, batch_size: int, sequence_length: int, dictionary_size: int
) -> np.ndarray:
    """Generate random token IDs matching the element type of the input_ids graph input.

    Args:
        input_ids (TensorProto): graph input of the input_ids input tensor
        batch_size (int): batch size
        sequence_length (int): sequence length
        dictionary_size (int): vocabulary size; IDs are drawn from [0, dictionary_size)

    Returns:
        np.ndarray: array of shape (batch_size, sequence_length), of type float32, int32 or
            int64 depending on the graph input's element type
    """
    elem_type = input_ids.type.tensor_type.elem_type
    assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]

    # Values are always drawn as int32, then converted when the graph expects another type.
    token_ids = np.random.randint(dictionary_size, size=(batch_size, sequence_length), dtype=np.int32)

    if elem_type == TensorProto.FLOAT:
        return np.float32(token_ids)
    if elem_type == TensorProto.INT64:
        return np.int64(token_ids)
    return token_ids
|
||||
|
||||
|
||||
def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_length: int) -> np.ndarray:
    """Generate all-zero segment IDs matching the element type of the segment_ids graph input.

    Args:
        segment_ids (TensorProto): graph input of the token_type_ids input tensor
        batch_size (int): batch size
        sequence_length (int): sequence length

    Returns:
        np.ndarray: zero array of shape (batch_size, sequence_length), of type float32, int32
            or int64 depending on the graph input's element type
    """
    elem_type = segment_ids.type.tensor_type.elem_type
    assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]

    zeros = np.zeros((batch_size, sequence_length), dtype=np.int32)

    if elem_type == TensorProto.FLOAT:
        return np.float32(zeros)
    if elem_type == TensorProto.INT64:
        return np.int64(zeros)
    return zeros
|
||||
|
||||
|
||||
def get_random_length(max_sequence_length: int, average_sequence_length: int):
    """Draw a uniformly distributed sequence length whose mean is the requested average.

    Args:
        max_sequence_length (int): largest length that may be returned
        average_sequence_length (int): desired mean, in [1, max_sequence_length]

    Returns:
        int: a random length between 1 and max_sequence_length
    """
    assert 1 <= average_sequence_length <= max_sequence_length

    # For uniform distribution, pick bounds that are symmetric around the average:
    # either the upper bound is the maximum length, or the lower bound is 1.
    doubled = 2 * average_sequence_length
    if doubled > max_sequence_length:
        low, high = doubled - max_sequence_length, max_sequence_length
    else:
        low, high = 1, doubled - 1
    return random.randint(low, high)
|
||||
|
||||
|
||||
def fake_input_mask_data(
    input_mask: TensorProto,
    batch_size: int,
    sequence_length: int,
    average_sequence_length: int,
    random_sequence_length: bool,
    mask_type: int = 2,
) -> np.ndarray:
    """Create the attention mask tensor based on the graph input of input_mask.

    Args:
        input_mask (TensorProto): graph input of the attention mask input tensor
        batch_size (int): batch size
        sequence_length (int): sequence length
        average_sequence_length (int): average sequence length excluding paddings
        random_sequence_length (bool): whether use uniform random number for sequence length
        mask_type (int): mask type - 1: mask index (sequence length excluding paddings). Shape is (batch_size).
                                     2: 2D attention mask. Shape is (batch_size, sequence_length).
                                     3: key len, cumulated lengths of query and key. Shape is (3 * batch_size + 2).

    Returns:
        np.ndarray: the mask tensor, of type float32, int32 or int64 depending on the graph
            input's element type
    """
    elem_type = input_mask.type.tensor_type.elem_type
    assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]

    if mask_type == 1:  # sequence length excluding paddings
        data = np.ones((batch_size), dtype=np.int32)
        for row in range(batch_size):
            if random_sequence_length:
                data[row] = get_random_length(sequence_length, average_sequence_length)
            else:
                data[row] = average_sequence_length
    elif mask_type == 2:  # 2D attention mask
        data = np.zeros((batch_size, sequence_length), dtype=np.int32)
        if random_sequence_length:
            for row in range(batch_size):
                valid_length = get_random_length(sequence_length, average_sequence_length)
                data[row, :valid_length] = 1
        else:
            valid_block = np.ones((batch_size, average_sequence_length), dtype=np.int32)
            rows, columns = valid_block.shape
            data[:rows, :columns] = valid_block
    else:
        assert mask_type == 3
        data = np.zeros((batch_size * 3 + 2), dtype=np.int32)
        # Layout: [0, B) lengths, [B, 2B + 1) first cumulated lengths,
        # [2B + 1, 3B + 2) second cumulated lengths; both cumulated runs start at 0.
        first_offsets = batch_size
        second_offsets = 2 * batch_size + 1
        if random_sequence_length:
            for row in range(batch_size):
                data[row] = get_random_length(sequence_length, average_sequence_length)

            # The first entry of each cumulated run stays at its initial value 0.
            for step in range(1, batch_size + 1):
                cumulated = data[first_offsets + step - 1] + data[step - 1]
                data[first_offsets + step] = cumulated
                data[second_offsets + step] = cumulated
        else:
            for row in range(batch_size):
                data[row] = average_sequence_length
            for step in range(batch_size + 1):
                data[first_offsets + step] = step * average_sequence_length
                data[second_offsets + step] = step * average_sequence_length

    if elem_type == TensorProto.FLOAT:
        return np.float32(data)
    if elem_type == TensorProto.INT64:
        return np.int64(data)
    return data
|
||||
|
||||
|
||||
def output_test_data(directory: str, inputs: dict[str, np.ndarray]):
    """Output input tensors of test data to a directory

    Each input is serialized to ``input_<index>.pb`` in iteration order of ``inputs``.

    Args:
        directory (str): path of a directory; it is created (with missing parents) if needed
        inputs (Dict[str, np.ndarray]): map from input name to value
    """
    if not os.path.exists(directory):
        try:
            # makedirs also creates missing parent directories, and exist_ok tolerates the
            # directory being created by someone else between the check and this call.
            os.makedirs(directory, exist_ok=True)
        except OSError:
            print(f"Creation of the directory {directory} failed")
        else:
            print(f"Successfully created the directory {directory} ")
    else:
        print(f"Warning: directory {directory} existed. Files will be overwritten.")

    for index, (name, data) in enumerate(inputs.items()):
        tensor = numpy_helper.from_array(data, name)
        with open(os.path.join(directory, f"input_{index}.pb"), "wb") as file:
            file.write(tensor.SerializeToString())
|
||||
|
||||
|
||||
def fake_test_data(
    batch_size: int,
    sequence_length: int,
    test_cases: int,
    dictionary_size: int,
    verbose: bool,
    random_seed: int,
    input_ids: TensorProto,
    segment_ids: TensorProto,
    input_mask: TensorProto,
    average_sequence_length: int,
    random_sequence_length: bool,
    mask_type: int,
):
    """Build the requested number of random test cases.

    Both the NumPy and the ``random`` generators are seeded with ``random_seed`` first.

    Args:
        batch_size (int): batch size
        sequence_length (int): sequence length
        test_cases (int): number of test cases
        dictionary_size (int): vocabulary size of dictionary for input_ids
        verbose (bool): print the first generated test case or not
        random_seed (int): random seed
        input_ids (TensorProto): graph input of input IDs
        segment_ids (TensorProto): graph input of token type IDs
        input_mask (TensorProto): graph input of attention mask
        average_sequence_length (int): average sequence length excluding paddings
        random_sequence_length (bool): whether use uniform random number for sequence length
        mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key

    Returns:
        List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary
                                       with input name as key and a tensor as value
    """
    assert input_ids is not None

    np.random.seed(random_seed)
    random.seed(random_seed)

    all_inputs = []
    for case_index in range(test_cases):
        inputs = {input_ids.name: fake_input_ids_data(input_ids, batch_size, sequence_length, dictionary_size)}

        # Segment IDs and attention mask are only generated when the graph has those inputs.
        if segment_ids:
            inputs[segment_ids.name] = fake_segment_ids_data(segment_ids, batch_size, sequence_length)

        if input_mask:
            inputs[input_mask.name] = fake_input_mask_data(
                input_mask, batch_size, sequence_length, average_sequence_length, random_sequence_length, mask_type
            )

        if verbose and case_index == 0:
            print("Example inputs", inputs)
        all_inputs.append(inputs)
    return all_inputs
|
||||
|
||||
|
||||
def generate_test_data(
    batch_size: int,
    sequence_length: int,
    test_cases: int,
    seed: int,
    verbose: bool,
    input_ids: TensorProto,
    segment_ids: TensorProto,
    input_mask: TensorProto,
    average_sequence_length: int,
    random_sequence_length: bool,
    mask_type: int,
    dictionary_size: int = 10000,
):
    """Generate test cases through ``fake_test_data`` and report a count mismatch.

    Args:
        batch_size (int): batch size
        sequence_length (int): sequence length
        test_cases (int): number of test cases
        seed (int): random seed
        verbose (bool): print more information or not
        input_ids (TensorProto): graph input of input IDs
        segment_ids (TensorProto): graph input of token type IDs
        input_mask (TensorProto): graph input of attention mask
        average_sequence_length (int): average sequence length excluding paddings
        random_sequence_length (bool): whether use uniform random number for sequence length
        mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key
        dictionary_size (int): vocabulary size of dictionary for input_ids. Defaults to 10000.

    Returns:
        List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary
                                       with input name as key and a tensor as value
    """
    generated = fake_test_data(
        batch_size=batch_size,
        sequence_length=sequence_length,
        test_cases=test_cases,
        dictionary_size=dictionary_size,
        verbose=verbose,
        random_seed=seed,
        input_ids=input_ids,
        segment_ids=segment_ids,
        input_mask=input_mask,
        average_sequence_length=average_sequence_length,
        random_sequence_length=random_sequence_length,
        mask_type=mask_type,
    )
    # A mismatch is only reported; the generated cases are returned either way.
    if len(generated) != test_cases:
        print("Failed to create test data for test.")
    return generated
|
||||
|
||||
|
||||
def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
    """Find the graph input that feeds one input slot of a node.

    The slot's tensor is looked up as a graph input directly; if that fails and the slot's
    parent node is a Cast, the Cast's first input is looked up instead.

    Args:
        onnx_model: model object providing ``find_graph_input`` and ``get_parent``.
        embed_node: node whose ``input`` list is inspected.
        input_index: index of the input slot.

    Returns:
        The matching graph input, or None when the slot does not exist or no graph input is found.
    """
    if input_index >= len(embed_node.input):
        return None

    tensor_name = embed_node.input[input_index]
    graph_input = onnx_model.find_graph_input(tensor_name)
    if graph_input is not None:
        return graph_input

    # Not a direct graph input: look through a Cast parent, if there is one.
    parent_node = onnx_model.get_parent(embed_node, input_index)
    if parent_node is None or parent_node.op_type != "Cast":
        return None
    return onnx_model.find_graph_input(parent_node.input[0])
|
||||
|
||||
|
||||
def find_bert_inputs(
    onnx_model: OnnxModel,
    input_ids_name: str | None = None,
    segment_ids_name: str | None = None,
    input_mask_name: str | None = None,
) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]:
    """Locate the graph inputs of a BERT model.

    When input_ids_name is given, the inputs are looked up by the given names. Otherwise the
    graph must have exactly 3 inputs: they are deduced from the EmbedLayerNormalization node
    when the graph has exactly one, else guessed from the input names.

    Args:
        onnx_model (OnnxModel): onnx model object
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Raises:
        ValueError: Graph does not have input named of input_ids_name or segment_ids_name or input_mask_name
        ValueError: Expected graph input number does not match with specified input_ids_name, segment_ids_name
                    and input_mask_name
        ValueError: The attention mask, or the assignment of the 3 inputs, cannot be determined

    Returns:
        Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: graph inputs of input_ids,
                                                                                 segment_ids and input_mask
    """

    def _lookup(name):
        """Return the graph input called ``name`` or raise ValueError."""
        found = onnx_model.find_graph_input(name)
        if found is None:
            raise ValueError(f"Graph does not have input named {name}")
        return found

    graph_inputs = onnx_model.get_graph_inputs_excluding_initializers()

    # Case 1: the caller named the inputs explicitly.
    if input_ids_name is not None:
        input_ids = _lookup(input_ids_name)
        segment_ids = _lookup(segment_ids_name) if segment_ids_name else None
        input_mask = _lookup(input_mask_name) if input_mask_name else None

        expected_inputs = 1 + bool(segment_ids) + bool(input_mask)
        if len(graph_inputs) != expected_inputs:
            raise ValueError(f"Expect the graph to have {expected_inputs} inputs. Got {len(graph_inputs)}")

        return input_ids, segment_ids, input_mask

    if len(graph_inputs) != 3:
        raise ValueError(f"Expect the graph to have 3 inputs. Got {len(graph_inputs)}")

    # Case 2: deduce the inputs from the single EmbedLayerNormalization node.
    embed_nodes = onnx_model.get_nodes_by_op_type("EmbedLayerNormalization")
    if len(embed_nodes) == 1:
        embed_node = embed_nodes[0]
        input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
        segment_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 1)
        input_mask = get_graph_input_from_embed_node(onnx_model, embed_node, 7)

        if input_mask is None:
            # Fall back to the name; the last graph input containing "mask" wins.
            for candidate in graph_inputs:
                if "mask" in candidate.name.lower():
                    input_mask = candidate
        if input_mask is None:
            raise ValueError("Failed to find attention mask input")

        return input_ids, segment_ids, input_mask

    # Case 3: try guess the inputs based on naming.
    input_ids = None
    segment_ids = None
    input_mask = None
    for candidate in graph_inputs:
        lowered_name = candidate.name.lower()
        if "mask" in lowered_name:  # matches input with name like "attention_mask" or "input_mask"
            input_mask = candidate
        elif "token" in lowered_name or "segment" in lowered_name:
            # matches input with name like "segment_ids" or "token_type_ids"
            segment_ids = candidate
        else:
            input_ids = candidate

    if not (input_ids and segment_ids and input_mask):
        raise ValueError("Fail to assign 3 inputs. You might try rename the graph inputs.")

    return input_ids, segment_ids, input_mask
|
||||
|
||||
|
||||
def get_bert_inputs(
    onnx_file: str,
    input_ids_name: str | None = None,
    segment_ids_name: str | None = None,
    input_mask_name: str | None = None,
) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]:
    """Load an ONNX model file and find the graph inputs for BERT model.

    The lookup itself is delegated to ``find_bert_inputs``.

    Args:
        onnx_file (str): onnx model path
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: graph inputs of input_ids,
                                                                                 segment_ids and input_mask
    """
    model = ModelProto()
    with open(onnx_file, "rb") as file:
        serialized_model = file.read()
    model.ParseFromString(serialized_model)

    return find_bert_inputs(OnnxModel(model), input_ids_name, segment_ids_name, input_mask_name)
|
||||
|
||||
|
||||
def parse_arguments():
    """Parse the command line options of the test data generator.

    Returns:
        argparse.Namespace: parsed options read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add("--model", required=True, type=str, help="bert onnx model path.")
    add(
        "--output_dir",
        type=str,
        default=None,
        help="output test data path. Default is current directory.",
    )

    # Shape of the generated inputs.
    add("--batch_size", type=int, default=1, help="batch size of input")
    add("--sequence_length", type=int, default=128, help="maximum sequence length of input")

    # Graph input names.
    add("--input_ids_name", type=str, default=None, help="input name for input ids")
    add("--segment_ids_name", type=str, default=None, help="input name for segment ids")
    add("--input_mask_name", type=str, default=None, help="input name for attention mask")

    add("--samples", type=int, default=1, help="number of test cases to be generated")
    add("--seed", type=int, default=3, help="random seed")
    add("--verbose", action="store_true", help="print verbose information")
    add(
        "--only_input_tensors",
        action="store_true",
        help="only save input tensors and no output tensors",
    )

    # Sequence length distribution and mask layout.
    add("-a", "--average_sequence_length", default=-1, type=int, help="average sequence length excluding padding")
    add(
        "-r",
        "--random_sequence_length",
        action="store_true",
        help="use uniform random instead of fixed sequence length",
    )
    add(
        "--mask_type",
        type=int,
        default=2,
        help="mask type: (1: mask index, 2: raw 2D mask, 3: key lengths, cumulated lengths of query and key)",
    )

    # Explicit defaults for the boolean switches.
    parser.set_defaults(verbose=False, only_input_tensors=False, random_sequence_length=False)

    return parser.parse_args()
|
||||
|
||||
|
||||
def create_and_save_test_data(
    model: str,
    output_dir: str,
    batch_size: int,
    sequence_length: int,
    test_cases: int,
    seed: int,
    verbose: bool,
    input_ids_name: str | None,
    segment_ids_name: str | None,
    input_mask_name: str | None,
    only_input_tensors: bool,
    average_sequence_length: int,
    random_sequence_length: bool,
    mask_type: int,
):
    """Generate test inputs for a model and save them, optionally with model outputs.

    Test case ``k`` uses the directory ``<output_dir>/test_data_set_<k>``.
    Unless ``only_input_tensors`` is set, the model is also run with
    onnxruntime and each output ``j`` is serialized to ``output_<j>.pb`` in
    the directory of its test case.

    Args:
        model (str): path of ONNX bert model
        output_dir (str): output directory
        batch_size (int): batch size
        sequence_length (int): sequence length
        test_cases (int): number of test cases
        seed (int): random seed
        verbose (bool): whether to print more information
        input_ids_name (str): graph input name of input_ids
        segment_ids_name (str): graph input name of segment_ids
        input_mask_name (str): graph input name of input_mask
        only_input_tensors (bool): only save input tensors, skip running the model
        average_sequence_length (int): average sequence length excluding paddings
        random_sequence_length (bool): whether to use a uniform random number for sequence length
        mask_type (int): mask type
    """

    def case_directory(case_id):
        return os.path.join(output_dir, "test_data_set_" + str(case_id))

    input_ids, segment_ids, input_mask = get_bert_inputs(model, input_ids_name, segment_ids_name, input_mask_name)

    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        segment_ids,
        input_mask,
        average_sequence_length,
        random_sequence_length,
        mask_type,
    )

    # First pass: persist the inputs of every test case.
    for case_id, inputs in enumerate(all_inputs):
        output_test_data(case_directory(case_id), inputs)

    if only_input_tensors:
        return

    import onnxruntime  # noqa: PLC0415

    # Prefer CUDA when this onnxruntime build offers it, keeping CPU as the fallback provider.
    if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    else:
        providers = ["CPUExecutionProvider"]
    session = onnxruntime.InferenceSession(model, providers=providers)
    output_names = [graph_output.name for graph_output in session.get_outputs()]

    # Second pass: run the model on each test case and persist every output tensor.
    for case_id, inputs in enumerate(all_inputs):
        directory = case_directory(case_id)
        result = session.run(output_names, inputs)
        for output_index, output_name in enumerate(output_names):
            tensor_result = numpy_helper.from_array(np.asarray(result[output_index]), output_name)
            output_path = os.path.join(directory, f"output_{output_index}.pb")
            with open(output_path, "wb") as file:
                file.write(tensor_result.SerializeToString())
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, then create and save the test data.

    When ``--output_dir`` is not given, the data is written to a
    ``batch_<batch_size>_seq_<sequence_length>`` directory next to the model.
    """
    args = parse_arguments()

    # A non-positive average length is treated as "not specified": use the full sequence length.
    if args.average_sequence_length <= 0:
        args.average_sequence_length = args.sequence_length

    output_dir = args.output_dir
    if output_dir is None:
        # Default output directory is a sub-directory under the directory of model.
        p = Path(args.model)
        output_dir = os.path.join(p.parent, f"batch_{args.batch_size}_seq_{args.sequence_length}")

    # output_dir is always set by now, so the overwrite warning must be decided by
    # what is on disk (the former `else` branch on `output_dir is None` could never run).
    path = Path(output_dir)
    if path.is_dir():
        print("Directory existed. test data files will be overwritten.")
    else:
        # create the output directory if not existed
        path.mkdir(parents=True, exist_ok=True)

    create_and_save_test_data(
        args.model,
        output_dir,
        args.batch_size,
        args.sequence_length,
        args.samples,
        args.seed,
        args.verbose,
        args.input_ids_name,
        args.segment_ids_name,
        args.input_mask_name,
        args.only_input_tensors,
        args.average_sequence_length,
        args.random_sequence_length,
        args.mask_type,
    )

    print("Test data is saved to directory:", output_dir)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,256 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# It is a tool to compare the inference results of the original model and optimized model.
|
||||
|
||||
import argparse
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import psutil
|
||||
from bert_perf_test import create_session, onnxruntime_inference
|
||||
from bert_test_data import generate_test_data, get_bert_inputs, output_test_data
|
||||
|
||||
|
||||
def run_model(model_path, all_inputs, use_gpu, disable_optimization):
    """Run all test inputs through one model.

    Args:
        model_path: path of the ONNX model to load.
        all_inputs: test inputs handed to ``onnxruntime_inference``.
        use_gpu: selects the "cuda" provider when true, otherwise "cpu".
        disable_optimization: when true, the session is created with
            ``ORT_DISABLE_ALL`` as graph optimization level.

    Returns:
        Tuple ``(results, latency_list, output_names)``.
    """
    import onnxruntime  # noqa: PLC0415

    # None leaves the optimization level to create_session.
    graph_optimization_level = (
        onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL if disable_optimization else None
    )
    provider = "cuda" if use_gpu else "cpu"
    physical_core_count = psutil.cpu_count(logical=False)

    session = create_session(model_path, use_gpu, provider, physical_core_count, graph_optimization_level)

    output_names = [graph_output.name for graph_output in session.get_outputs()]
    results, latency_list = onnxruntime_inference(session, all_inputs, output_names)
    return results, latency_list, output_names
|
||||
|
||||
|
||||
def compare(baseline_results, treatment_results, verbose, rtol=1e-1, atol=1e-3):
    """Check that treatment outputs are close to the baseline outputs and print a summary.

    Args:
        baseline_results: one entry per test case; each entry is a sequence of
            numpy arrays (one per model output).
        treatment_results: same layout as ``baseline_results``.
        verbose: print the differing tensors when true.
        rtol: relative tolerance passed to ``np.allclose``.
        atol: absolute tolerance passed to ``np.allclose``; also the threshold
            for the "percentage of elements exceeding atol" statistic.

    Returns:
        Tuple ``(max_abs_diff, passed)``: the largest absolute difference seen
        over all outputs, and whether every test case was within tolerance.
    """
    failed_case_count = 0
    max_abs_diff = 0
    max_diff_percentage = 0
    for test_case_id, results in enumerate(baseline_results):
        # Tracked per test case so that a case with several bad outputs is counted
        # once, and every failing case is counted (not only the first one).
        case_passed = True
        for i, baseline_output in enumerate(results):
            treatment_output = treatment_results[test_case_id][i]
            abs_diff_tensor = np.abs(treatment_output - baseline_output)
            abs_diff = np.amax(abs_diff_tensor)
            if verbose and abs_diff > atol:
                print("abs_diff", abs_diff)
                print("treatment", treatment_output)
                print("baseline", baseline_output)

            count_exceeding = np.sum(abs_diff_tensor > atol)
            total_elements = abs_diff_tensor.size
            percentage_exceeding = (count_exceeding / total_elements) * 100
            max_diff_percentage = max(max_diff_percentage, percentage_exceeding)

            max_abs_diff = max(max_abs_diff, abs_diff)
            if not np.allclose(baseline_output.tolist(), treatment_output.tolist(), rtol=rtol, atol=atol):
                if case_passed:
                    case_passed = False
                    failed_case_count += 1

                if verbose:
                    print(f"case {test_case_id} output {i}")
                    print(f"baseline={baseline_output.tolist()}\ntreatment={treatment_output}")
                    print(f"abs_diff={abs_diff}")

    if failed_case_count == 0:
        print(f"100% passed for {len(baseline_results)} random inputs given thresholds (rtol={rtol}, atol={atol}).")
    else:
        print(
            f"WARNING: {failed_case_count} out of {len(baseline_results)} results NOT passed for thresholds (rtol={rtol}, atol={atol})."
        )

    print(f"maximum absolute difference={max_abs_diff}")
    print(f"maximum percentage of elements that exceeds atol={atol} is {max_diff_percentage:.3f}%")
    return max_abs_diff, failed_case_count == 0
|
||||
|
||||
|
||||
def run_test(
    baseline_model,
    optimized_model,
    output_dir,
    batch_size,
    sequence_length,
    use_gpu,
    test_cases,
    seed,
    verbose,
    rtol,
    atol,
    input_ids_name,
    segment_ids_name,
    input_mask_name,
    mask_type,
    dictionary_size: int = 1024,
):
    """Run the same generated inputs through both models and compare the outputs.

    The baseline model runs with all graph optimizations disabled; the
    optimized model runs with the default session settings.

    Args:
        baseline_model: path of the baseline ONNX model.
        optimized_model: path of the optimized ONNX model; input names are deduced from it.
        output_dir: directory for saving the generated inputs, or None to skip saving.
        batch_size: batch size of the generated inputs.
        sequence_length: maximum sequence length of the generated inputs.
        use_gpu: run both models on GPU when true.
        test_cases: number of test cases to generate.
        seed: random seed.
        verbose: print latency and per-tensor differences.
        rtol: relative tolerance for the comparison.
        atol: absolute tolerance for the comparison.
        input_ids_name: graph input name of input ids (None to deduce).
        segment_ids_name: graph input name of segment ids (None to deduce).
        input_mask_name: graph input name of the attention mask (None to deduce).
        mask_type: mask type forwarded to ``generate_test_data``.
        dictionary_size: forwarded to ``generate_test_data``.

    Returns:
        The ``(max_abs_diff, passed)`` tuple produced by ``compare``.
    """
    # Try deduce input names from optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(
        optimized_model, input_ids_name, segment_ids_name, input_mask_name
    )

    # Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
    average_sequence_length = int(sequence_length / 2) if sequence_length >= 2 else sequence_length
    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases,
        seed,
        verbose,
        input_ids,
        segment_ids,
        input_mask,
        average_sequence_length,
        True,  # random sequence length
        mask_type,
        dictionary_size=dictionary_size,
    )

    baseline_results, baseline_latency, _ = run_model(baseline_model, all_inputs, use_gpu, disable_optimization=True)
    if verbose:
        print(f"baseline average latency (all optimizations disabled): {statistics.mean(baseline_latency) * 1000} ms")

    if output_dir is not None:
        for i, inputs in enumerate(all_inputs):
            # bert_test_data's own writer calls output_test_data(directory, inputs) with one
            # "test_data_set_<i>" directory per case; the former three-argument call did not
            # match that usage. NOTE(review): confirm against the definition of output_test_data.
            output_test_data(str(Path(output_dir) / f"test_data_set_{i}"), inputs)

    treatment_results, treatment_latency, _ = run_model(
        optimized_model, all_inputs, use_gpu, disable_optimization=False
    )
    if verbose:
        print(f"treatment average latency: {statistics.mean(treatment_latency) * 1000} ms")

    # Validate the output of baseline and treatment, to make sure the results are similar.
    return compare(baseline_results, treatment_results, verbose, rtol, atol)
|
||||
|
||||
|
||||
def parse_arguments():
    """Build the command-line parser of the comparison tool and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the model paths, input shape, tolerances,
        sampling options and optional graph input names.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Models and optional location for the generated inputs.
    add("--baseline_model", required=True, type=str, help="baseline onnx model path.")
    add(
        "--optimized_model",
        required=True,
        type=str,
        default=None,
        help="path of the optimized model. It shall have same inputs as the baseline model.",
    )
    add(
        "--output_dir",
        required=False,
        type=str,
        default=None,
        help="output test data path. If not specified, test data will not be saved.",
    )

    # Shape of the generated inputs.
    add("--batch_size", required=True, type=int, help="batch size of input")
    add("--sequence_length", required=True, type=int, help="maximum sequence length of input")

    # Comparison thresholds and sampling.
    add("--rtol", required=False, type=float, default=1e-3, help="relative tolerance")
    add("--atol", required=False, type=float, default=1e-4, help="absolute tolerance")
    add("--samples", required=False, type=int, default=100, help="number of test cases to be generated")
    add("--seed", required=False, type=int, default=3, help="random seed")

    # Boolean switches.
    add("--use_gpu", required=False, action="store_true", help="use GPU")
    add("--verbose", required=False, action="store_true", help="print verbose information")
    parser.set_defaults(use_gpu=False, verbose=False)

    # Optional explicit graph input names.
    for flag, description in (
        ("--input_ids", "input name for input ids"),
        ("--segment_ids", "input name for segment ids"),
        ("--input_mask", "input name for attention mask"),
    ):
        add(flag, required=False, type=str, default=None, help=description)

    add(
        "--mask_type",
        required=False,
        type=int,
        default=2,
        help="mask type: (1: mask index or sequence length, 2: raw 2D mask, 3: key len, cumulated lengths of query and key)",
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse the arguments and run the comparison test."""
    args = parse_arguments()

    if args.output_dir is not None:
        # create the output directory if not existed
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    run_test(
        baseline_model=args.baseline_model,
        optimized_model=args.optimized_model,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
        sequence_length=args.sequence_length,
        use_gpu=args.use_gpu,
        test_cases=args.samples,
        seed=args.seed,
        verbose=args.verbose,
        rtol=args.rtol,
        atol=args.atol,
        input_ids_name=args.input_ids,
        segment_ids_name=args.segment_ids,
        input_mask_name=args.input_mask,
        mask_type=args.mask_type,
    )


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,47 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Operators:
    """Operator type names, as plain strings, shared by the transformer tools."""

    ATTENTION = "Attention"
    LAYERNORM = "LayerNormalization"
    MULTI_HEAD_ATTENTION = "MultiHeadAttention"
    PACKEDATTENTION = "PackedAttention"
    PACKED_MULTI_HEAD_ATTENTION = "PackedMultiHeadAttention"
    REMOVEPADDING = "RemovePadding"
    RESTOREPADDING = "RestorePadding"
    SKIPLAYERNORM = "SkipLayerNormalization"
|
||||
|
||||
|
||||
class AttentionInputIDs:
    """Positions of the named inputs in the input list of an Attention node."""

    INPUT = 0
    WEIGHTS = 1
    BIAS = 2
    MASK_INDEX = 3
    PAST = 4
    ATTENTION_BIAS = 5
    PAST_SEQUENCE_LENGTH = 6
|
||||
|
||||
|
||||
class AttentionOutputIDs:
    """Positions of the named outputs in the output list of an Attention node."""

    OUTPUT = 0
    PRESENT = 1
|
||||
|
||||
|
||||
class MultiHeadAttentionInputIDs:
    """Positions of the named inputs in the input list of a MultiHeadAttention node."""

    QUERY = 0
    KEY = 1
    VALUE = 2
    BIAS = 3
    KEY_PADDING_MASK = 4
    ATTENTION_BIAS = 5
    PAST_KEY = 6
    PAST_VALUE = 7
|
||||
|
||||
|
||||
class MultiHeadAttentionOutputIDs:
    """Positions of the named outputs in the output list of a MultiHeadAttention node."""

    OUTPUT = 0
    PRESENT_KEY = 1
    PRESENT_VALUE = 2
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,205 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import glob
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
# Supported TensorFlow checkpoints, keyed by model name. Each value is a 4-tuple:
#   [0] model type; also the suffix of the transformers "load_tf_weights_in_<type>" loader
#   [1] name of the transformers config class
#   [2] name of the transformers model class, or "" to use AutoModelForPreTraining
#   [3] download URL: a .zip / .tar.gz archive, or a base URL holding the checkpoint files
TFMODELS = {
    "bert-base-uncased": (
        "bert",
        "BertConfig",
        "",
        "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip",
    ),
    # The base cased checkpoint (12 layers, hidden size 768); this entry previously
    # pointed at the whole-word-masking BERT-large archive.
    "bert-base-cased": (
        "bert",
        "BertConfig",
        "",
        "https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip",
    ),
    "bert-large-uncased": (
        "bert",
        "BertConfig",
        "",
        "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip",
    ),
    "albert-base": (
        "albert",
        "AlbertConfig",
        "",
        "https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz",
    ),
    "albert-large": (
        "albert",
        "AlbertConfig",
        "",
        "https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz",
    ),
    "gpt-2-117M": (
        "gpt2",
        "GPT2Config",
        "GPT2Model",
        "https://storage.googleapis.com/gpt-2/models/117M",
    ),
    "gpt-2-124M": (
        "gpt2",
        "GPT2Config",
        "GPT2Model",
        "https://storage.googleapis.com/gpt-2/models/124M",
    ),
}
|
||||
|
||||
|
||||
def download_compressed_file(tf_ckpt_url, ckpt_dir):
    """Download the archive at ``tf_ckpt_url`` into ``ckpt_dir``.

    Args:
        tf_ckpt_url: URL of the archive; its last path segment is used as file name.
        ckpt_dir: existing directory that receives the file.

    Returns:
        Path of the downloaded file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    compressed_file_name = tf_ckpt_url.split("/")[-1]
    compressed_file_path = os.path.join(ckpt_dir, compressed_file_name)

    # Stream the body to disk in chunks instead of holding the whole archive in memory.
    with requests.get(tf_ckpt_url, stream=True) as response:
        # Fail here rather than saving an HTTP error page as if it were the archive.
        response.raise_for_status()
        with open(compressed_file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return compressed_file_path
|
||||
|
||||
|
||||
def get_ckpt_prefix_path(ckpt_dir):
    """Locate the checkpoint inside ``ckpt_dir`` and return its path prefix.

    The checkpoint is looked up either directly in ``ckpt_dir`` or, when the
    first directory entry is not a regular file, inside that entry (the
    sub-folder an archive was extracted into). The prefix is the path of the
    ``*data-00000-of-00001`` file without its last extension, e.g.
    ``<dir>/model.ckpt`` for ``<dir>/model.ckpt.data-00000-of-00001``.

    Args:
        ckpt_dir: directory holding the downloaded / extracted checkpoint.

    Returns:
        The checkpoint path prefix.

    Raises:
        FileNotFoundError: if the directory is empty or holds no
            ``*data-00000-of-00001`` file.
    """
    entries = os.listdir(ckpt_dir)
    if not entries:
        raise FileNotFoundError(f"no checkpoint files found in {ckpt_dir}")

    first_entry = os.path.join(ckpt_dir, entries[0])
    search_dir = ckpt_dir if os.path.isfile(first_entry) else first_entry

    # Work on the matched path itself instead of parsing str() of the glob result list.
    matches = sorted(glob.glob(os.path.join(search_dir, "*data-00000-of-00001")))
    if not matches:
        raise FileNotFoundError(f"no '*data-00000-of-00001' checkpoint file found in {search_dir}")

    prefix = os.path.basename(matches[0]).rpartition(".")[0]
    return os.path.join(search_dir, prefix)
|
||||
|
||||
|
||||
def download_tf_checkpoint(model_name, tf_models_dir="tf_models"):
    """Download (and extract, if archived) the TensorFlow checkpoint of ``model_name``.

    Files are stored under ``<directory of this file>/<tf_models_dir>/<model_name>``.

    Args:
        model_name: key of ``TFMODELS``.
        tf_models_dir: name of the directory, next to this file, that holds all checkpoints.

    Returns:
        The checkpoint path prefix found by ``get_ckpt_prefix_path``.

    Raises:
        KeyError: if ``model_name`` is not in ``TFMODELS``.
        requests.HTTPError: if a checkpoint file download answers with an error status.
    """
    import pathlib  # noqa: PLC0415

    base_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), tf_models_dir)
    ckpt_dir = os.path.join(base_dir, model_name)
    os.makedirs(ckpt_dir, exist_ok=True)

    tf_ckpt_url = TFMODELS[model_name][3]

    # Plain suffix tests: the former regexes left "." unescaped, so it matched any character.
    if tf_ckpt_url.endswith(".zip"):
        zip_path = download_compressed_file(tf_ckpt_url, ckpt_dir)

        # unzip file
        import zipfile  # noqa: PLC0415

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(ckpt_dir)
        os.remove(zip_path)

    elif tf_ckpt_url.endswith(".tar.gz"):
        tar_path = download_compressed_file(tf_ckpt_url, ckpt_dir)

        # untar file
        import tarfile  # noqa: PLC0415

        with tarfile.open(tar_path, "r") as tar_ref:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would land outside ckpt_dir, on interpreters that
                # provide extraction filters.
                tar_ref.extractall(ckpt_dir, filter="data")
            else:
                tar_ref.extractall(ckpt_dir)
        os.remove(tar_path)

    else:
        # Not an archive: the URL is a base location holding the individual checkpoint files.
        for filename in (
            "checkpoint",
            "model.ckpt.data-00000-of-00001",
            "model.ckpt.index",
            "model.ckpt.meta",
        ):
            r = requests.get(tf_ckpt_url + "/" + filename)
            # Do not save an HTTP error page as if it were a checkpoint file.
            r.raise_for_status()
            with open(os.path.join(ckpt_dir, filename), "wb") as f:
                f.write(r.content)

    return get_ckpt_prefix_path(ckpt_dir)
|
||||
|
||||
|
||||
def init_pytorch_model(model_name, tf_checkpoint_path):
    """Create the config and an uninitialized PyTorch model for ``model_name``.

    The config is read from the first ``*config.json`` file found next to the
    checkpoint; when there is none, the default config of the class is used.

    Args:
        model_name: key of ``TFMODELS``.
        tf_checkpoint_path: checkpoint path prefix; only its directory is used here.

    Returns:
        Tuple ``(config, init_model)``.
    """
    import transformers  # noqa: PLC0415

    config_name = TFMODELS[model_name][1]
    model_class_name = TFMODELS[model_name][2]
    model_config = getattr(transformers, config_name)

    # os.path handles both separator styles, and an empty parent means the current
    # directory (splitting on "/" made the glob search the filesystem root instead).
    parent_path = os.path.dirname(tf_checkpoint_path)
    config_path = glob.glob(os.path.join(parent_path, "*config.json"))
    config = model_config.from_json_file(str(config_path[0])) if config_path else model_config()

    if not model_class_name:
        # No explicit model class configured: let transformers choose from the config.
        init_model = transformers.AutoModelForPreTraining.from_config(config)
    else:
        init_model = getattr(transformers, model_class_name)(config)
    return config, init_model
|
||||
|
||||
|
||||
def convert_tf_checkpoint_to_pytorch(model_name, config, init_model, tf_checkpoint_path, is_tf2):
    """Load the weights of a TensorFlow checkpoint into ``init_model``.

    Args:
        model_name: key of ``TFMODELS``.
        config: transformers config matching ``init_model``.
        init_model: PyTorch model that receives the weights.
        tf_checkpoint_path: checkpoint path prefix.
        is_tf2: truthy for a TensorFlow 2 checkpoint (supported for Bert only).

    Returns:
        The loaded model, switched to eval mode.

    Raises:
        NotImplementedError: for a TensorFlow 2 checkpoint of a non-Bert model.
    """
    model_type = TFMODELS[model_name][0]

    # Plain truthiness instead of `is False`, so falsy flags such as 0 or None select
    # the TF1 loader rather than falling through to the TF2 branch.
    if not is_tf2:
        import transformers  # noqa: PLC0415

        load_tf_weight_func = getattr(transformers, "load_tf_weights_in_" + model_type)
        model = load_tf_weight_func(init_model, config, tf_checkpoint_path)
    else:
        if model_type != "bert":
            raise NotImplementedError("Only support tf2 ckeckpoint for Bert model")
        from transformers import convert_bert_original_tf2_checkpoint_to_pytorch  # noqa: PLC0415

        load_tf_weight_func = convert_bert_original_tf2_checkpoint_to_pytorch.load_tf2_weights_in_bert
        # The TF2 loader takes (model, checkpoint path, config): a different order than the TF1 loaders.
        model = load_tf_weight_func(init_model, tf_checkpoint_path, config)

    model.eval()
    return model
|
||||
|
||||
|
||||
def tf2pt_pipeline(model_name, is_tf2=False):
    """Download a TensorFlow checkpoint and convert it to a PyTorch model.

    Args:
        model_name: key of ``TFMODELS``.
        is_tf2: whether the checkpoint is a TensorFlow 2 checkpoint.

    Returns:
        Tuple ``(config, model)``.

    Raises:
        NotImplementedError: if ``model_name`` is not a key of ``TFMODELS``.
    """
    if model_name in TFMODELS:
        checkpoint_prefix = download_tf_checkpoint(model_name)
        config, blank_model = init_pytorch_model(model_name, checkpoint_prefix)
        converted = convert_tf_checkpoint_to_pytorch(model_name, config, blank_model, checkpoint_prefix, is_tf2)
        # Could then use the model in Benchmark
        return config, converted

    raise NotImplementedError(model_name + " not implemented")
|
||||
|
||||
|
||||
def tf2pt_pipeline_test():
    """Convert every model in ``TFMODELS`` and run one forward pass on random token ids.

    A ``RuntimeError`` raised by the forward pass is logged, not propagated.
    """
    # For test on linux only
    import logging  # noqa: PLC0415

    import torch  # noqa: PLC0415

    logger = logging.getLogger("")
    for model_name in TFMODELS:
        config, model = tf2pt_pipeline(model_name)
        # Compare by value: `is` on strings only holds when both happen to be the same object.
        assert config.model_type == TFMODELS[model_name][0]

        # Batch of 4 sequences with 128 random token ids each.
        input_ids = torch.randint(low=0, high=config.vocab_size - 1, size=(4, 128), dtype=torch.long)
        try:
            model(input_ids)
        except RuntimeError as e:
            logger.exception(e)


if __name__ == "__main__":
    tf2pt_pipeline_test()
|
||||
@@ -0,0 +1,385 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
||||
from constants import (
|
||||
AttentionInputIDs,
|
||||
AttentionOutputIDs,
|
||||
MultiHeadAttentionInputIDs,
|
||||
MultiHeadAttentionOutputIDs,
|
||||
Operators,
|
||||
)
|
||||
from onnx import helper, load_model
|
||||
from onnx_model import NodeProto, OnnxModel
|
||||
from shape_infer_helper import SymbolicShapeInferenceHelper
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PackingAttentionBase:
    """Common driver for rewriting the attention nodes of a model into packing mode.

    Subclasses decide whether the attention nodes can be converted
    (``_are_attentions_supported``) and build the replacement nodes
    (``_replace_attention_with_packing_attention``). ``convert`` inserts a
    RemovePadding node in front of the first attention node and a
    RestorePadding node after the last layer normalization node.
    """

    def __init__(self, model: OnnxModel, attention_op_type: str):
        self.model: OnnxModel = model
        self.attention_op_type = attention_op_type
        self.attention_nodes = self.model.get_nodes_by_op_type(attention_op_type)
        self.this_graph_name: str = self.model.model.graph.name
        # Pending graph edits; they are applied together at the end of convert().
        self.nodes_to_remove: list = []
        self.nodes_to_add: list = []
        self.node_name_to_graph_name: dict = {}
        self.prune_graph: bool = False

    def _try_getting_attention_mask(self) -> str | None:
        """Return the mask input shared by all attention nodes, or None if there is no such mask."""
        if self.attention_op_type == Operators.ATTENTION:
            mask_index = AttentionInputIDs.MASK_INDEX
        else:
            mask_index = MultiHeadAttentionInputIDs.KEY_PADDING_MASK

        first_attention_node = self._try_getting_first_attention()
        # check if attention has mask
        if not first_attention_node or len(first_attention_node.input) <= mask_index:
            return None

        attention_mask = first_attention_node.input[mask_index]

        # check if all attention nodes have same mask
        same_mask_everywhere = all(
            len(node.input) > mask_index and node.input[mask_index] == attention_mask
            for node in self.attention_nodes
        )
        return attention_mask if same_mask_everywhere else None

    def _try_getting_first_attention(self) -> NodeProto | None:
        """Return the first collected attention node, or None when there is none."""
        return self.attention_nodes[0] if len(self.attention_nodes) > 0 else None

    def _try_getting_last_layernorm(self) -> NodeProto | None:
        """Return the last (Skip)LayerNormalization node in model node order, or None."""
        layernorm_types = (Operators.LAYERNORM, Operators.SKIPLAYERNORM)
        found = None
        for node in self.model.nodes():
            if node.op_type in layernorm_types:
                found = node
        return found

    def _are_attentions_supported(self) -> bool:
        """Tell whether all attention nodes can be converted; implemented by subclasses."""
        raise NotImplementedError()

    def _insert_contrib_node(self, op_type: str, inputs: list[str], outputs: list[str]) -> None:
        """Queue a new ``com.microsoft`` node of type ``op_type`` for insertion into this graph."""
        node = helper.make_node(
            op_type,
            inputs=inputs,
            outputs=outputs,
            name=self.model.create_node_name(op_type),
        )
        node.domain = "com.microsoft"
        self.nodes_to_add.append(node)
        self.node_name_to_graph_name[node.name] = self.this_graph_name

    def _insert_removepadding_node(self, inputs: list[str], outputs: list[str]) -> None:
        """Queue a RemovePadding node with the given inputs and outputs."""
        self._insert_contrib_node(Operators.REMOVEPADDING, inputs, outputs)

    def _insert_restorepadding_node(self, inputs: list[str], outputs: list[str]) -> None:
        """Queue a RestorePadding node with the given inputs and outputs."""
        self._insert_contrib_node(Operators.RESTOREPADDING, inputs, outputs)

    def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None:
        """Queue the packed replacements of all attention nodes; implemented by subclasses."""
        raise NotImplementedError()

    def _get_input_to_remove_padding(self, first_attention_node) -> str | None:
        """Return the tensor name that feeds RemovePadding, or None if it cannot be determined here."""
        if self.attention_op_type != Operators.ATTENTION:
            return None
        return first_attention_node.input[AttentionInputIDs.INPUT]

    def convert(self, use_symbolic_shape_infer: bool = True) -> None:
        """Convert the model to packing mode in place.

        Returns without touching the model when the attention nodes are not
        supported, when they do not share one mask, when there is no layer
        normalization node, or when the input for RemovePadding is unknown.

        Args:
            use_symbolic_shape_infer: run symbolic shape inference on the converted model.
        """
        logger.debug("start converting to packing model...")

        if not self._are_attentions_supported():
            return

        attention_mask = self._try_getting_attention_mask()
        if not attention_mask:
            return

        first_attention_node = self._try_getting_first_attention()
        last_layernorm_node = self._try_getting_last_layernorm()
        if not last_layernorm_node:
            return

        # insert RemovePadding
        padded_input = self._get_input_to_remove_padding(first_attention_node)
        if not padded_input:
            return

        unpadded_output = padded_input + "_no_padding"
        token_offset = padded_input + "_token_offset"
        cumulated_seq_len = padded_input + "_cumulated_seq_len"
        max_seq_len = padded_input + "_max_seq_len"
        self._insert_removepadding_node(
            [padded_input, attention_mask],
            [unpadded_output, token_offset, cumulated_seq_len, max_seq_len],
        )
        self.model.replace_input_of_all_nodes(padded_input, unpadded_output)
        logger.debug("inserted RemovePadding before Attention")

        # insert RestorePadding
        final_output = last_layernorm_node.output[0]
        restorepadding_input = final_output + "_restore_input"
        self._insert_restorepadding_node([restorepadding_input, token_offset], [final_output])
        self.model.replace_output_of_all_nodes(final_output, restorepadding_input)
        logger.debug(f"inserted RestorePadding after last {last_layernorm_node.op_type} layer")

        # insert PackedAttention
        self._replace_attention_with_packing_attention(token_offset, cumulated_seq_len)
        logger.debug(f"replaced {self.attention_op_type} with Packed{self.attention_op_type}")

        # Apply the queued edits to the graph.
        self.model.remove_nodes(self.nodes_to_remove)
        self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name)

        if self.prune_graph:
            self.model.prune_graph()
        elif self.nodes_to_remove or self.nodes_to_add:
            self.model.update_graph()
        self.model.clean_shape_infer()

        if not use_symbolic_shape_infer:
            return

        # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc)
        # are not recognized by onnx shape inference.
        shape_infer_helper = SymbolicShapeInferenceHelper(self.model.model, verbose=0)
        inferred_model = shape_infer_helper.infer_shapes(self.model.model, auto_merge=True, guess_output_rank=False)
        if inferred_model:
            self.model.model = inferred_model
|
||||
|
||||
|
||||
class PackingAttention(PackingAttentionBase):
    """Packing-mode conversion for Attention nodes (replaced by PackedAttention)."""

    def __init__(self, model: OnnxModel):
        super().__init__(model, Operators.ATTENTION)

    def _are_attentions_supported(self) -> bool:
        """Return True when no attention node uses a feature that PackedAttention lacks.

        Rejected: the ``past_present_share_buffer`` or ``do_rotary`` attribute,
        a non-zero ``unidirectional`` attribute, and a connected ``past`` or
        ``past_sequence_length`` input.
        """
        for node in self.attention_nodes:
            if OnnxModel.get_node_attribute(node, "past_present_share_buffer") is not None:
                return False
            if OnnxModel.get_node_attribute(node, "do_rotary") is not None:
                return False
            unidirection_attr = OnnxModel.get_node_attribute(node, "unidirectional")
            if unidirection_attr is not None and unidirection_attr != 0:
                return False
            # An empty name marks an omitted optional input, so only a non-empty name means
            # the input is connected. (The test used to be inverted: it rejected nodes that
            # merely had an empty placeholder, e.g. in front of an attention bias, and let
            # nodes with a real past input through.)
            if len(node.input) > AttentionInputIDs.PAST and node.input[AttentionInputIDs.PAST]:
                return False
            if (
                len(node.input) > AttentionInputIDs.PAST_SEQUENCE_LENGTH
                and node.input[AttentionInputIDs.PAST_SEQUENCE_LENGTH]
            ):
                return False
        return True

    def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None:
        """Queue one PackedAttention node per Attention node and mark the originals for removal.

        Args:
            token_offset: name of the token offset tensor produced by RemovePadding.
            cumulative_sequence_length: name of the cumulated sequence length tensor
                produced by RemovePadding.
        """
        for attention in self.attention_nodes:
            # Optional input: an empty name stands for "no attention bias".
            attention_bias = (
                attention.input[AttentionInputIDs.ATTENTION_BIAS]
                if len(attention.input) > AttentionInputIDs.ATTENTION_BIAS
                else ""
            )
            packed_attention = helper.make_node(
                Operators.PACKEDATTENTION,
                inputs=[
                    attention.input[AttentionInputIDs.INPUT],
                    attention.input[AttentionInputIDs.WEIGHTS],
                    attention.input[AttentionInputIDs.BIAS],
                    token_offset,
                    cumulative_sequence_length,
                    attention_bias,
                ],
                outputs=[attention.output[AttentionOutputIDs.OUTPUT]],
                name=self.model.create_node_name(Operators.PACKEDATTENTION),
            )

            # Only these attributes are carried over to the packed node.
            packed_attention.attribute.extend(
                attr for attr in attention.attribute if attr.name in ["num_heads", "qkv_hidden_sizes", "scale"]
            )
            packed_attention.domain = "com.microsoft"
            self.nodes_to_add.append(packed_attention)
            self.nodes_to_remove.append(attention)
            self.node_name_to_graph_name[packed_attention.name] = self.this_graph_name

        logger.info("Converted %d Attention nodes to PackedAttention.", len(self.attention_nodes))
|
||||
|
||||
|
||||
class PackingMultiHeadAttention(PackingAttentionBase):
|
||||
def __init__(self, model: OnnxModel):
    """Set up the conversion for the MultiHeadAttention nodes of ``model``."""
    super().__init__(model, Operators.MULTI_HEAD_ATTENTION)
|
||||
|
||||
def _check_empty_input(self, node, index: int, name: str):
    """Return True when input ``index`` of ``node`` is absent or empty.

    Otherwise an error mentioning ``name`` is logged and False is returned.
    """
    input_is_connected = len(node.input) > index and len(node.input[index]) > 0
    if not input_is_connected:
        return True

    logger.error(f"node input {index} ({name}) is not supported in PackedMultiHeadAttention: {node}")
    return False
|
||||
|
||||
def _check_empty_output(self, node, index: int, name: str):
    """Return True when output ``index`` of ``node`` is absent or empty.

    Otherwise an error mentioning ``name`` is logged and False is returned.
    """
    output_is_connected = len(node.output) > index and len(node.output[index]) > 0
    if not output_is_connected:
        return True

    logger.error(f"node output {index} ({name}) is not supported in PackedMultiHeadAttention: {node}")
    return False
|
||||
|
||||
def _are_attentions_supported(self) -> bool:
    """Return True when every MultiHeadAttention node can become a PackedMultiHeadAttention.

    Rejected (with an error log): attributes other than ``num_heads``,
    ``mask_filter_value`` and ``scale``; a key input without a value input
    (packed KV); connected past key/value inputs or present key/value outputs.
    """
    for node in self.attention_nodes:
        for attr in node.attribute:
            if attr.name not in ["num_heads", "mask_filter_value", "scale"]:
                logger.error(f"node attribute {attr.name} is not supported in PackedMultiHeadAttention: {node}")
                return False

        # Trailing optional inputs may be left out altogether: treat a missing slot like an
        # empty name instead of indexing past the end of the input list.
        inputs = node.input
        key = inputs[MultiHeadAttentionInputIDs.KEY] if len(inputs) > MultiHeadAttentionInputIDs.KEY else ""
        value = inputs[MultiHeadAttentionInputIDs.VALUE] if len(inputs) > MultiHeadAttentionInputIDs.VALUE else ""
        if key and not value:
            logger.error("packed kv format is not supported in PackedMultiHeadAttention")
            return False

        # Each check logs the slot it rejected, so the label has to name that slot.
        if not (
            self._check_empty_input(node, MultiHeadAttentionInputIDs.PAST_KEY, "past_key")
            and self._check_empty_input(node, MultiHeadAttentionInputIDs.PAST_VALUE, "past_value")
            and self._check_empty_output(node, MultiHeadAttentionOutputIDs.PRESENT_KEY, "present_key")
            and self._check_empty_output(node, MultiHeadAttentionOutputIDs.PRESENT_VALUE, "present_value")
        ):
            return False

    return True
|
||||
|
||||
def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None:
|
||||
gated_relative_pos_bias_count = 0
|
||||
for mha in self.attention_nodes:
|
||||
attention_bias = (
|
||||
mha.input[MultiHeadAttentionInputIDs.ATTENTION_BIAS]
|
||||
if len(mha.input) > MultiHeadAttentionInputIDs.ATTENTION_BIAS
|
||||
else ""
|
||||
)
|
||||
packed_mha = helper.make_node(
|
||||
Operators.PACKED_MULTI_HEAD_ATTENTION,
|
||||
inputs=[
|
||||
mha.input[MultiHeadAttentionInputIDs.QUERY],
|
||||
mha.input[MultiHeadAttentionInputIDs.KEY],
|
||||
mha.input[MultiHeadAttentionInputIDs.VALUE],
|
||||
mha.input[MultiHeadAttentionInputIDs.BIAS],
|
||||
token_offset,
|
||||
cumulative_sequence_length,
|
||||
attention_bias,
|
||||
],
|
||||
outputs=[mha.output[MultiHeadAttentionOutputIDs.OUTPUT]],
|
||||
name=self.model.create_node_name(Operators.PACKED_MULTI_HEAD_ATTENTION),
|
||||
)
|
||||
|
||||
attributes = []
|
||||
for attr in mha.attribute:
|
||||
if attr.name in ["num_heads", "mask_filter_value", "scale"]:
|
||||
attributes.append(attr)
|
||||
|
||||
packed_mha.attribute.extend(attributes)
|
||||
packed_mha.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(packed_mha)
|
||||
self.nodes_to_remove.append(mha)
|
||||
self.node_name_to_graph_name[packed_mha.name] = self.this_graph_name
|
||||
|
||||
# Append token_offset input to GatedRelativePositionBias
|
||||
if attention_bias:
|
||||
rel_pos_bias_node = self.model.get_parent(mha, MultiHeadAttentionInputIDs.ATTENTION_BIAS)
|
||||
if (
|
||||
rel_pos_bias_node
|
||||
and rel_pos_bias_node.op_type == "GatedRelativePositionBias"
|
||||
and len(rel_pos_bias_node.input) == 6
|
||||
):
|
||||
rel_pos_bias_node.input.append(token_offset)
|
||||
gated_relative_pos_bias_count += 1
|
||||
|
||||
logger.info("Converted %d MultiHeadAttention nodes to PackedMultiHeadAttention.", len(self.attention_nodes))
|
||||
logger.info("Converted %d GatedRelativePositionBias nodes to packing mode.", gated_relative_pos_bias_count)
|
||||
|
||||
def _get_input_to_remove_padding(self, first_attention_node) -> str | None:
|
||||
# When there are query, key and value inputs, we need to find the first input of the parent MatMul node.
|
||||
matmul = self.model.get_parent(first_attention_node, 0)
|
||||
if matmul and matmul.op_type == "MatMul":
|
||||
return matmul.input[0]
|
||||
return None
|
||||
|
||||
|
||||
class PackingMode:
    """Pick the packing converter that matches the attention operator present in the model."""

    def __init__(self, model: OnnxModel):
        self.model = model

    def convert(self, use_symbolic_shape_infer: bool = True) -> None:
        """Convert the model to packing mode.

        Logs an error and returns without converting when the graph contains both Attention and
        MultiHeadAttention nodes, or neither of them.

        Args:
            use_symbolic_shape_infer: forwarded to the selected converter's ``convert``.
        """
        attention_nodes = self.model.get_nodes_by_op_type(Operators.ATTENTION)
        mha_nodes = self.model.get_nodes_by_op_type(Operators.MULTI_HEAD_ATTENTION)

        if attention_nodes and mha_nodes:
            logger.error("Packing mode does not support both Attention and MultiHeadAttention in same graph.")
            return None

        if not attention_nodes and not mha_nodes:
            logger.error("Packing mode requires either Attention or MultiHeadAttention node in onnx graph.")
            return None

        converter_class = PackingAttention if attention_nodes else PackingMultiHeadAttention
        converter = converter_class(self.model)
        return converter.convert(use_symbolic_shape_infer)
|
||||
|
||||
|
||||
def _parse_arguments():
    """Parse the command line of the packing mode conversion tool and return the argparse namespace."""
    arg_parser = argparse.ArgumentParser(
        description="Convert to packing mode tool for ONNX Runtime. It converts BERT like model to use packing mode."
    )

    # Required model paths.
    arg_parser.add_argument("--input", required=True, type=str, help="input onnx model path")
    arg_parser.add_argument("--output", required=True, type=str, help="optimized onnx model path")

    # Optional switches, both off unless given on the command line.
    arg_parser.add_argument(
        "--verbose",
        required=False,
        action="store_true",
        default=False,
        help="show debug information.",
    )
    arg_parser.add_argument(
        "--use_external_data_format",
        required=False,
        action="store_true",
        default=False,
        help="use external data format to store large model (>2GB)",
    )

    return arg_parser.parse_args()
|
||||
|
||||
|
||||
def _setup_logger(verbose):
    """Configure root logging: DEBUG with file/line/function prefix when verbose, otherwise INFO."""
    if verbose:
        log_format = "[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s"
        log_level = logging.DEBUG
    else:
        log_format = "%(funcName)20s: %(message)s"
        log_level = logging.INFO

    logging.basicConfig(format=log_format, level=log_level)
|
||||
|
||||
|
||||
def main():
    """Command line entry point: load the input model, convert it to packing mode and save it."""
    args = _parse_arguments()
    _setup_logger(args.verbose)
    logger.debug(f"arguments:{args}")

    input_path = os.path.realpath(args.input)
    output_path = os.path.realpath(args.output)
    if input_path == output_path:
        logger.warning("Specified the same input and output path. Note that this may overwrite the original model")

    onnx_model = OnnxModel(load_model(args.input))
    packing_mode = PackingMode(onnx_model)
    packing_mode.convert()

    # The model held by packing_mode is saved whatever the outcome of convert().
    packing_mode.model.save_model_to_file(args.output, use_external_data_format=args.use_external_data_format)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,205 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from collections.abc import Sequence
|
||||
from logging import getLogger
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
class DynamoOnnxHelper:
    """
    Helper class for processing ONNX models exported by Torch Dynamo.
    """

    def __init__(self, model: onnx.ModelProto):
        self.model = OnnxModel(model)

    def update_edges(self, edge_mapping: dict) -> None:
        """
        Rename edges: every node input/output and graph input/output whose name is a key of
        edge_mapping is replaced by the mapped name.
        """
        graph = self.model.model.graph

        for node in graph.node:
            for idx, edge in enumerate(node.input):
                if edge in edge_mapping:
                    node.input[idx] = edge_mapping[edge]
            for idx, edge in enumerate(node.output):
                if edge in edge_mapping:
                    node.output[idx] = edge_mapping[edge]

        for value_info in graph.input:
            if value_info.name in edge_mapping:
                value_info.name = edge_mapping[value_info.name]
        for value_info in graph.output:
            if value_info.name in edge_mapping:
                value_info.name = edge_mapping[value_info.name]

    def unroll_function(self, func_name: str) -> None:
        """
        Inline the model function called func_name: the calling nodes are replaced by the
        function body and the function's formal edge names are mapped from the call-site names.
        """
        logger.debug(f"Unrolling function {func_name}...")
        graph = self.model.model.graph

        # Call sites, and their edges in input-then-output order.
        call_nodes = [node for node in graph.node if node.op_type == func_name]
        call_edges = []
        for node in call_nodes:
            call_edges.extend(list(node.input) + list(node.output))

        # Function body, and the function's own edges in the same order.
        body_nodes = []
        body_edges = []
        func_to_remove = None
        for func in self.model.model.functions:
            if func.name == func_name:
                body_nodes.extend(list(func.node))
                body_edges.extend(list(func.input) + list(func.output))
                func_to_remove = func

        assert len(call_edges) == len(body_edges)

        for node in call_nodes:
            graph.node.remove(node)
        for node in body_nodes:
            graph.node.append(node)
        if func_to_remove is not None:
            self.model.model.functions.remove(func_to_remove)

        # Position i of the call edges corresponds to position i of the function edges.
        edge_mapping = {}
        for idx, old_name in enumerate(call_edges):
            new_name = body_edges[idx]
            if old_name != new_name:
                edge_mapping[old_name] = new_name

        return self.update_edges(edge_mapping)

    def remove_function(self, func_name: str, input_id: int, output_id: int) -> None:
        """
        Remove every node whose op_type contains func_name, mapping the name of its input
        input_id to the name of its output output_id.
        """
        matching_nodes = [node for node in self.model.model.graph.node if func_name in node.op_type]

        edge_mapping = {}
        for node in matching_nodes:
            edge_mapping[node.input[input_id]] = node.output[output_id]

        for node in matching_nodes:
            self.model.model.graph.node.remove(node)

        self.update_edges(edge_mapping)

    def remove_dropout_layer(self) -> None:
        """
        Removes the dropout layer in the model.
        """
        logger.debug("Removing dropout layer...")
        self.remove_function("Dropout", 0, 0)

    def remove_lm_head_layer(self) -> None:
        """
        Removes the LM head layer in the model.
        """
        logger.debug("Removing LM head layer...")
        # bugbug: need to copy the right vi over
        self.remove_function("Linear_lm_head", 2, 0)

    def add_initializer(self, name: str, data_type: int, dims: Sequence[int], vals: Any, raw: bool = True):
        """
        Create a tensor, add it to the model as an initializer and return it.

        When raw is true, vals is converted to the numpy dtype matching data_type and stored as
        raw bytes; otherwise vals is handed to helper.make_tensor unchanged.
        """
        if raw:
            np_type = helper.tensor_dtype_to_np_dtype(data_type)
            if isinstance(vals, np.ndarray):
                typed_array = vals.astype(np_type)
            else:
                typed_array = np.array(vals, dtype=np_type)
            tensor_vals = typed_array.tobytes()
        else:
            tensor_vals = vals

        tensor = helper.make_tensor(
            name=name,
            data_type=data_type,
            dims=dims,
            vals=tensor_vals,
            raw=bool(raw),
        )

        self.model.add_initializer(tensor)
        return tensor

    def convert_constants_to_initializers(self, min_size: int = 1) -> None:
        """
        Converts Constant ops of size [min_size] or higher to initializers
        """
        logger.debug(f"Converting constants greater than size {min_size} to initializers")

        converted_nodes = []
        for node in self.model.get_nodes_by_op_type("Constant"):
            output_name = node.output[0]
            np_data = self.model.get_constant_value(output_name)

            # Skip constants without a value or with fewer than min_size elements.
            if np_data is None or np_data.size < min_size:
                continue

            # The initializer reuses the Constant's output name, so consumers need no rewiring.
            value_attr = next((att for att in node.attribute if att.name == "value"), None)
            if value_attr is not None:
                self.add_initializer(
                    name=output_name,
                    data_type=value_attr.t.data_type,
                    dims=list(np_data.shape),
                    vals=np_data,
                )

            converted_nodes.append(node)

        # Remove Constant ops from graph
        self.model.remove_nodes(converted_nodes)

    def clear_metadata(self) -> None:
        """
        Clear the metadata_props field of all graphs and all nodes
        """
        for graph in self.model.graphs():
            graph.ClearField("metadata_props")
        for node in self.model.nodes():
            node.ClearField("metadata_props")

    @staticmethod
    def fold_transpose_initializers(model) -> None:
        """
        Constant fold Transpose initializers without changing the initializer names

        NOTE(review): model is used through the onnxscript ir API (graph.initializers,
        consumers()), so it is presumably an ir model rather than a ModelProto - confirm with callers.
        """
        from onnxscript import ir  # noqa: PLC0415

        for name, initializer in model.graph.initializers.items():
            consumers = initializer.consumers()
            # Only fold when the Transpose is the sole consumer of the initializer.
            if len(consumers) != 1 or consumers[0].op_type != "Transpose":
                continue

            transpose_node = consumers[0]
            perm = transpose_node.attributes.get("perm")
            weights = initializer.const_value.numpy()
            if perm is None:
                transposed_tensor = ir.tensor(weights.transpose())
            else:
                transposed_tensor = ir.tensor(weights.transpose(perm.as_ints()))

            new_initializer = ir.Value(
                name=initializer.name,
                shape=transposed_tensor.shape,
                type=ir.TensorType(transposed_tensor.dtype),
                const_value=transposed_tensor,
            )
            ir.convenience.replace_all_uses_with(transpose_node.outputs[0], new_initializer)
            model.graph.initializers[name] = new_initializer
            transpose_node.graph.remove(transpose_node, safe=True)
|
||||
@@ -0,0 +1,501 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
|
||||
# Modifications:
|
||||
# (1) Update default value of min_positive_val and max_finite_val
|
||||
# (2) keep_io_types can be list of names
|
||||
# (3) convert initializers if needed to preserve precision
|
||||
# (4) add force_fp16_initializers option
|
||||
# (5) handle Resize and GroupNorm with mixed float inputs
|
||||
# (6) allow convert_float_to_float16 to accept model path
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import AttributeProto, GraphProto, ModelProto, NodeProto, TensorProto, helper, numpy_helper
|
||||
from onnx.shape_inference import infer_shapes, infer_shapes_path
|
||||
from packaging import version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _npfloat16_to_int(np_list):
    """
    Convert numpy float16 to python int.

    Each value is reinterpreted as an unsigned 16-bit integer, i.e. the result holds the raw
    float16 bit patterns, not the rounded numeric values.

    :param np_list: numpy float16 list
    :return int_list: python int list
    """
    # A uint16 view exposes the 16 raw bits directly; int() turns the numpy scalar into a python int.
    return [int(value.view(np.uint16)) for value in np_list]
|
||||
|
||||
|
||||
def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """
    Convert float32 numpy array to float16 without changing sign or finiteness.

    Magnitudes strictly between 0 and min_positive_val are raised to min_positive_val, and finite
    magnitudes strictly above max_finite_val are lowered to max_finite_val, keeping the sign.
    NaN, 0, inf, and -inf are unchanged.
    """
    # Report (at debug level) the positive extremes that fall outside the representable range.
    positives = np_array[np.where(np_array > 0)]
    if positives.shape[0] > 0:
        positive_max = positives.max()
        positive_min = positives.min()
        if positive_max >= max_finite_val:
            logger.debug(f"the float32 number {positive_max} will be truncated to {max_finite_val}")
        if positive_min <= min_positive_val:
            logger.debug(f"the float32 number {positive_min} will be truncated to {min_positive_val}")

    # Same report for the negative side.
    negatives = np_array[np.where(np_array < 0)]
    if negatives.shape[0] > 0:
        negative_max = negatives.max()
        negative_min = negatives.min()
        if negative_min <= -max_finite_val:
            logger.debug(f"the float32 number {negative_min} will be truncated to {-max_finite_val}")
        if negative_max >= -min_positive_val:
            logger.debug(f"the float32 number {negative_max} will be truncated to {-min_positive_val}")

    # (low, high, replacement): values strictly inside (low, high) become replacement.
    # The intervals are open, so 0 and +/-inf never match; NaN fails every comparison.
    clamp_rules = (
        (0, min_positive_val, min_positive_val),
        (-min_positive_val, 0, -min_positive_val),
        (max_finite_val, float("inf"), max_finite_val),
        (float("-inf"), -max_finite_val, -max_finite_val),
    )
    for low, high, replacement in clamp_rules:
        inside = np.logical_and(low < np_array, np_array < high)
        np_array = np.where(inside, replacement, np_array)

    return np.float16(np_array)
|
||||
|
||||
|
||||
def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """Convert tensor float to float16.

    The tensor is modified in place. Tensors whose data type is not float are returned unchanged.

    Args:
        tensor (TensorProto): the tensor to convert.
        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
        max_finite_val (float, optional): maximal finite value. Defaults to 65504.0.

    Raises:
        ValueError: input type is not TensorProto.

    Returns:
        TensorProto: the converted tensor.
    """
    if not isinstance(tensor, TensorProto):
        raise ValueError(f"Expected input type is an ONNX TensorProto but got {type(tensor)}")

    if tensor.data_type != TensorProto.FLOAT:
        return tensor

    tensor.data_type = TensorProto.FLOAT16

    # convert float_data (float type) to float16 and write to int32_data
    if tensor.float_data:
        float16_data = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val)
        tensor.int32_data[:] = _npfloat16_to_int(float16_data)
        tensor.float_data[:] = []

    # convert raw_data (bytes type)
    if tensor.raw_data:
        # reinterpret the raw bytes as float32, clamp/convert to float16 and write the bytes back
        float32_values = np.frombuffer(tensor.raw_data, dtype="float32")
        float16_values = convert_np_to_float16(float32_values, min_positive_val, max_finite_val)
        tensor.raw_data = float16_values.tobytes()

    return tensor
|
||||
|
||||
|
||||
def make_value_info_from_tensor(tensor):
    """Build a tensor value info carrying the name, data type and array shape of the given tensor."""
    tensor_shape = numpy_helper.to_array(tensor).shape
    return helper.make_tensor_value_info(
        tensor.name,
        tensor.data_type,
        tensor_shape,
    )
|
||||
|
||||
|
||||
# Op types that convert_float_to_float16 leaves as float32 when no op_block_list is given.
DEFAULT_OP_BLOCK_LIST = [
    "ArrayFeatureExtractor", "Binarizer", "CastMap", "CategoryMapper",
    "DictVectorizer", "FeatureVectorizer", "Imputer", "LabelEncoder",
    "LinearClassifier", "LinearRegressor", "Normalizer", "OneHotEncoder",
    "RandomUniformLike", "SVMClassifier", "SVMRegressor", "Scaler",
    "TreeEnsembleClassifier", "TreeEnsembleRegressor", "TreeEnsemble", "ZipMap",
    "NonMaxSuppression", "TopK", "RoiAlign", "Range",
    "CumSum", "Min", "Max", "Upsample",
]  # fmt: skip
|
||||
|
||||
|
||||
# Some operators have the data type of certain inputs fixed as float.
# Key is op_type, value is the list of indices of those inputs.
# Note that DirectML allows float16 gamma and beta in GroupNorm; the force_fp16_inputs parameter
# of convert_float_to_float16 can be used to override an entry of this table.
ALWAYS_FLOAT_INPUTS = {
    "Resize": [2],
    "GroupNorm": [1, 2],
    "SkipGroupNorm": [1, 2],
}
|
||||
|
||||
|
||||
class InitializerTracker:
    """Record which nodes consume an initializer as float32 and which as float16."""

    def __init__(self, initializer: TensorProto):
        self.initializer = initializer
        # Consumers that need the float32 value / consumers that can take the float16 value.
        self.fp32_nodes = []
        self.fp16_nodes = []

    def add_node(self, node: NodeProto, is_node_blocked):
        """Register a consumer node: in fp32_nodes when is_node_blocked is true, else in fp16_nodes."""
        consumers = self.fp32_nodes if is_node_blocked else self.fp16_nodes
        consumers.append(node)
|
||||
|
||||
|
||||
def convert_float_to_float16(
    model,
    min_positive_val=5.96e-08,
    max_finite_val=65504.0,
    keep_io_types=False,
    disable_shape_infer=False,
    op_block_list=None,
    node_block_list=None,
    force_fp16_initializers=False,
    force_fp16_inputs=None,
    use_bfloat16_as_blocked_nodes_dtype=False,
):
    """Convert tensor float type in the input ONNX model to tensor float16.

    Args:
        model (ModelProto or str): The ONNX model or path of the model to convert.
        min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08.
        max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504.
        keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names.
                                                          If True, model inputs/outputs should be left as float32.
                                                          Defaults to False.
        disable_shape_infer (bool, optional): Skips running onnx shape/type inference.
                                              Useful if shape inference has been done. Defaults to False.
        op_block_list (List[str], optional): List of op types to leave as float32.
                                             Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST`.
        node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None.
        force_fp16_initializers(bool): force converting all float initializers to float16.
                                       Default to false, which will convert only the one needed to avoid precision loss.
        force_fp16_inputs(Dict[str, List[int]]): Force the conversion of the inputs of some operators to float16, even if
                                                 this script's preference it to keep them in float32.
        use_bfloat16_as_blocked_nodes_dtype (bool, optional): If True, the Cast nodes inserted in front of blocked
                                                 nodes cast to bfloat16 instead of float, and the value info created
                                                 for the outputs of blocked nodes is typed bfloat16. Defaults to False.

    Raises:
        ValueError: input type is not ModelProto.
        AssertionError: min_positive_val or max_finite_val is outside of the float16 range.

    Returns:
        ModelProto: converted model.
    """
    assert min_positive_val >= 5.96e-08, (
        "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05"
    )
    assert max_finite_val <= float(np.finfo(np.float16).max), "invalid max_finite_val. largest float16 value: 65504"

    force_fp16_inputs_dict = {} if force_fp16_inputs is None else force_fp16_inputs

    if isinstance(model, str):
        model_path = model
        if version.parse(onnx.__version__) >= version.parse("1.8.0") and not disable_shape_infer:
            # shape_infer_model_path should be in the same folder of model_path
            with tempfile.NamedTemporaryFile(dir=os.path.dirname(model_path)) as tmpfile:
                shape_infer_model_path = tmpfile.name
                # infer_shapes_path can be used for model >2GB, and infer_shapes cannot.
                infer_shapes_path(model_path, shape_infer_model_path)
                model = onnx.load(shape_infer_model_path)
                # shapes are already inferred, do not run inference again below
                disable_shape_infer = True
        else:
            model = onnx.load(model_path)

    if not isinstance(model, ModelProto):
        raise ValueError(f"Expected an ONNX ModelProto but got {type(model)}")

    func_infer_shape = None
    if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse("1.2.0"):
        func_infer_shape = infer_shapes

    # create blocklists
    if op_block_list is None:
        op_block_list = DEFAULT_OP_BLOCK_LIST
    if node_block_list is None:
        node_block_list = []
    op_block_list = set(op_block_list)
    node_block_list = set(node_block_list)

    logger.debug(
        f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}"
    )

    # create a queue for BFS
    queue = []
    value_info_list = []
    node_list = []

    # Some operators (Like Resize or GroupNorm) have data type fixed as float for some input.
    # When it is converted to float16, there are mixed types: some inputs are float32 and some are float16.
    # This list keeps track of such nodes that are not in block list.
    mixed_float_type_node_list = []

    # type inference on input model
    if func_infer_shape is not None:
        model = func_infer_shape(model)
    queue.append(model)
    name_mapping = {}
    graph_io_to_skip = set()
    io_casts = set()

    fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == TensorProto.FLOAT]
    fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == TensorProto.FLOAT]
    if isinstance(keep_io_types, list):
        fp32_inputs = [n for n in fp32_inputs if n in keep_io_types]
        fp32_outputs = [n for n in fp32_outputs if n in keep_io_types]
    elif not keep_io_types:
        fp32_inputs = []
        fp32_outputs = []

    for i, n in enumerate(model.graph.input):
        if n.name in fp32_inputs:
            output_name = "graph_input_cast_" + str(i)
            name_mapping[n.name] = output_name
            graph_io_to_skip.add(n.name)

            node_name = "graph_input_cast" + str(i)
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = TensorProto.FLOAT16
            # add Cast node (from tensor(float) to tensor(float16) after graph input
            new_node = [helper.make_node("Cast", [n.name], [output_name], to=TensorProto.FLOAT16, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    for i, n in enumerate(model.graph.output):
        if n.name in fp32_outputs:
            input_name = "graph_output_cast_" + str(i)
            name_mapping[n.name] = input_name
            graph_io_to_skip.add(n.name)

            node_name = "graph_output_cast" + str(i)
            # add Cast node (from tensor(float16) to tensor(float) before graph output
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = input_name
            new_value_info.type.tensor_type.elem_type = TensorProto.FLOAT16
            new_node = [helper.make_node("Cast", [input_name], [n.name], to=TensorProto.FLOAT, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    # Operators with a "dtype" attribute that has to follow the float16 conversion.
    ops_with_dtype = (
        "EyeLike",
        "Multinomial",
        "RandomNormal",
        "RandomNormalLike",
        "RandomUniform",
        "RandomUniformLike",
        "SequenceEmpty",
        "Bernoulli",
    )
    # Subset of the above where dtype is optional and defaults to FLOAT.
    ops_with_default_float_dtype = ("RandomNormal", "RandomUniform", "SequenceEmpty")

    fp32_initializers: dict[str, InitializerTracker] = {}
    while queue:
        next_level = []
        for q in queue:
            # if q is model, push q.graph (GraphProto)
            if isinstance(q, ModelProto):
                next_level.append(q.graph)
            # if q is model.graph, push q.node.attribute (AttributeProto)
            if isinstance(q, GraphProto):
                for n in q.initializer:  # TensorProto type
                    if n.data_type == TensorProto.FLOAT:
                        assert n.name not in fp32_initializers
                        fp32_initializers[n.name] = InitializerTracker(n)

                for n in q.node:
                    # if n is in the block list (doesn't support float16), no conversion for the node,
                    # and save the node for further processing
                    if n.name in io_casts:
                        continue
                    # redirect edges of kept float32 graph inputs/outputs to the inserted Cast nodes
                    for i, edge_name in enumerate(n.input):
                        if edge_name in name_mapping:
                            n.input[i] = name_mapping[edge_name]
                    for i, edge_name in enumerate(n.output):
                        if edge_name in name_mapping:
                            n.output[i] = name_mapping[edge_name]

                    is_node_blocked = n.op_type in op_block_list or n.name in node_block_list
                    for i, input_name in enumerate(n.input):
                        if input_name in fp32_initializers:
                            # For Resize/GroupNorm, only the first input can be float16
                            use_fp32_weight = is_node_blocked or (
                                i in ALWAYS_FLOAT_INPUTS.get(n.op_type, [])
                                and i not in force_fp16_inputs_dict.get(n.op_type, [])
                            )
                            fp32_initializers[input_name].add_node(n, use_fp32_weight)

                    if is_node_blocked:
                        node_list.append(n)
                    else:
                        if n.op_type == "Cast":
                            for attr in n.attribute:
                                if attr.name == "to" and attr.i == TensorProto.FLOAT:
                                    attr.i = TensorProto.FLOAT16
                                    break

                        if n.op_type in ops_with_dtype:
                            has_dtype = False
                            for attr in n.attribute:
                                if attr.name == "dtype":
                                    has_dtype = True
                                    if attr.i == TensorProto.FLOAT:
                                        attr.i = TensorProto.FLOAT16

                            # The dtype attribute is optional and default is FLOAT in the following operators
                            # so we need add dtype attribute to specify the data type float16
                            if n.op_type in ops_with_default_float_dtype and not has_dtype:
                                n.attribute.extend([helper.make_attribute("dtype", TensorProto.FLOAT16)])

                        # For Resize/GroupNorm, attribute data type cannot be changed
                        if n.op_type not in ALWAYS_FLOAT_INPUTS or n.op_type in force_fp16_inputs_dict:
                            next_level.extend(n.attribute)
                        else:
                            mixed_float_type_node_list.append(n)

            # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto)
            # and process node.attribute.t and node.attribute.tensors (TensorProto)
            if isinstance(q, AttributeProto):
                next_level.append(q.g)
                next_level.extend(q.graphs)
                q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val))
                for n in q.tensors:
                    # the tensor is converted in place; the returned object is the same tensor
                    convert_tensor_float_to_float16(n, min_positive_val, max_finite_val)
            # if q is graph, process input, output and value_info (ValueInfoProto)
            if isinstance(q, GraphProto):
                # Note that float initializers tracked by fp32_initializers will be processed later.
                # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to
                # tensor(float16) except map and seq(map). And save them in value_info_list for further processing
                for n in itertools.chain(q.input, q.output, q.value_info):
                    if n.type.tensor_type.elem_type == TensorProto.FLOAT:
                        if n.name not in graph_io_to_skip:
                            n.type.tensor_type.elem_type = TensorProto.FLOAT16
                            value_info_list.append(n)
                    if n.type.HasField("sequence_type"):
                        if n.type.sequence_type.elem_type.tensor_type.elem_type == TensorProto.FLOAT:
                            if n.name not in graph_io_to_skip:
                                n.type.sequence_type.elem_type.tensor_type.elem_type = TensorProto.FLOAT16
                                value_info_list.append(n)

        queue = next_level

    for value in fp32_initializers.values():
        # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes.
        if force_fp16_initializers or value.fp16_nodes:
            value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val)
            value_info_list.append(make_value_info_from_tensor(value.initializer))
            if value.fp32_nodes and not force_fp16_initializers:
                logger.info(
                    f"initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{value.fp16_nodes}"
                )

    # value_info_list is complete from here on. Index it by name, keeping the first entry of each
    # name, so the lookups below do not scan the whole list for every node input and output.
    fp16_value_infos = {}
    for value_info in value_info_list:
        fp16_value_infos.setdefault(value_info.name, value_info)

    # Some operators have data type fixed as float for some input. Add a float16 to float cast for those inputs.
    for node in mixed_float_type_node_list:
        for i, input_name in enumerate(node.input):
            if i not in ALWAYS_FLOAT_INPUTS[node.op_type] or i in force_fp16_inputs_dict.get(node.op_type, []):
                continue
            value_info = fp16_value_infos.get(input_name)
            if value_info is None:
                continue
            # create new value_info for current node's new input name
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(value_info)
            output_name = node.name + "_input_cast_" + str(i)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = TensorProto.FLOAT
            # add Cast node (from tensor(float16) to tensor(float) before current node
            node_name = node.name + "_input_cast" + str(i)
            new_node = [helper.make_node("Cast", [input_name], [output_name], to=TensorProto.FLOAT, name=node_name)]
            model.graph.node.extend(new_node)
            # change current node's input name
            node.input[i] = output_name

    accuracy_type = TensorProto.BFLOAT16 if use_bfloat16_as_blocked_nodes_dtype else TensorProto.FLOAT
    # process the nodes in block list that doesn't support tensor(float16)
    for node in node_list:
        # if input's name is in the value_info_list meaning input is tensor(float16) type,
        # insert a float16 to float Cast node before the node,
        # change current node's input name and create new value_info for the new name
        for i in range(len(node.input)):
            input_name = node.input[i]
            value_info = fp16_value_infos.get(input_name)
            if value_info is None:
                continue
            # create new value_info for current node's new input name
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(value_info)
            output_name = node.name + "_input_cast_" + str(i)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = accuracy_type
            # add Cast node (from tensor(float16) to tensor(float) before current node
            node_name = node.name + "_input_cast" + str(i)
            new_node = [helper.make_node("Cast", [input_name], [output_name], to=accuracy_type, name=node_name)]
            model.graph.node.extend(new_node)
            # change current node's input name
            node.input[i] = output_name
        # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to
        # float16 Cast node after the node, change current node's output name and create new value_info for the new name
        for i in range(len(node.output)):
            output = node.output[i]
            value_info = fp16_value_infos.get(output)
            if value_info is None:
                continue
            # create new value_info for current node's new output
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(value_info)
            input_name = node.name + "_output_cast_" + str(i)
            new_value_info.name = input_name
            new_value_info.type.tensor_type.elem_type = accuracy_type
            # add Cast node (from tensor(float) to tensor(float16) after current node
            node_name = node.name + "_output_cast" + str(i)
            new_node = [helper.make_node("Cast", [input_name], [output], to=TensorProto.FLOAT16, name=node_name)]
            model.graph.node.extend(new_node)
            # change current node's output name
            node.output[i] = input_name
    return model
|
||||
|
||||
|
||||
def float_to_float16_max_diff(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0):
    """Return the largest absolute error caused by converting a float tensor to float16.

    Args:
        tensor: ONNX ``TensorProto`` whose ``data_type`` is ``TensorProto.FLOAT``.
        min_positive_val: forwarded unchanged to ``convert_np_to_float16``.
        max_finite_val: forwarded unchanged to ``convert_np_to_float16``.

    Returns:
        ``np.amax`` of the element-wise absolute difference between the source values
        and the converted values cast back with ``np.float32``.

    Raises:
        ValueError: ``tensor`` is not a ``TensorProto`` or its data type is not float.
        RuntimeError: neither ``raw_data`` nor ``float_data`` holds any values.
    """
    if not isinstance(tensor, TensorProto):
        raise ValueError(f"Expected input type is an ONNX TensorProto but got {type(tensor)}")
    if tensor.data_type != TensorProto.FLOAT:
        raise ValueError("Expected tensor data type is float.")

    # raw_data takes precedence over float_data when both fields are populated.
    if tensor.raw_data:
        source_values = np.frombuffer(tensor.raw_data, dtype="float32")
    elif tensor.float_data:
        source_values = np.array(tensor.float_data)
    else:
        raise RuntimeError("external data not loaded!")

    half_values = convert_np_to_float16(source_values, min_positive_val, max_finite_val)
    absolute_error = np.abs(source_values - np.float32(half_values))
    return np.amax(absolute_error)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,340 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
|
||||
from fusion_attention import AttentionMask, FusionAttention
|
||||
from fusion_options import AttentionMaskFormat
|
||||
from onnx import NodeProto
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
class FusionAttentionClip(FusionAttention):
|
||||
"""
|
||||
Fuse Attention subgraph of Clip into one Attention node.
|
||||
"""
|
||||
|
||||
def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
    """Set up the fusion with a no-mask attention mask, searching from SkipLayerNormalization nodes.

    Args:
        model (OnnxModel): model to operate on.
        hidden_size (int): user specified hidden size, passed to the base class.
        num_heads (int): user specified number of heads, passed to the base class.
    """
    no_mask = AttentionMask(model)
    no_mask.mask_format = AttentionMaskFormat.NoMask

    super().__init__(
        model,
        hidden_size,
        num_heads,
        no_mask,
        use_multi_head_attention=False,
        search_op_types=["SkipLayerNormalization"],
    )
|
||||
|
||||
def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> tuple[int, int]:
    """Detect num_heads and hidden_size from the shape input of the Q reshape node.

    The shape is expected to come from a 4-input Concat whose last two inputs are constants
    holding num_heads and head_size; hidden_size is their product. Whenever that cannot be
    confirmed, the user specified ``self.num_heads`` and ``self.hidden_size`` are returned.

    Args:
        reshape_q (NodeProto): reshape node for q

    Returns:
        Tuple[int, int]: num_heads and hidden_size
    """
    user_specified = (self.num_heads, self.hidden_size)

    concat = self.model.match_parent(reshape_q, "Concat", 1)
    if concat is None or len(concat.input) != 4:
        return user_specified

    # The shape is a tensor like [?, ?, num_heads, head_size]
    detected = []
    for dim_input in (concat.input[2], concat.input[3]):
        dim_value = self.model.get_constant_value(dim_input)
        if dim_value is None or len(dim_value) != 1 or dim_value[0] <= 0:
            return user_specified  # Fall back to user specified value
        detected.append(dim_value[0])

    num_heads, head_size = detected
    hidden_size = num_heads * head_size

    # Each mismatch warning is shown at most once.
    heads_differ = self.num_heads > 0 and num_heads != self.num_heads
    if heads_differ and self.num_heads_warning:
        logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
        self.num_heads_warning = False

    hidden_differs = self.hidden_size > 0 and hidden_size != self.hidden_size
    if hidden_differs and self.hidden_size_warning:
        logger.warning(
            f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
        )
        self.hidden_size_warning = False

    return num_heads, hidden_size
|
||||
|
||||
def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
    """Try to replace the attention subgraph ending at ``normalize_node`` with one fused node.

    Starting from ``normalize_node`` the method matches, in order: the node that provides the
    root input, the output (qkv) path, the V path, the QK path (with optional mask Add), the Q
    path and the K path. If any step does not match it returns without queuing any change.
    On success the node built by ``create_attention_node`` is appended to ``self.nodes_to_add``,
    the last Reshape and the Transpose of the output path are appended to
    ``self.nodes_to_remove``, and ``self.prune_graph`` is set to True.

    Args:
        normalize_node: node where the search starts.
        input_name_to_nodes: mapping passed to ``find_first_child_by_type``.
        output_name_to_node: not used by this method.
    """
    model = self.model

    def first_matching_path(start_node, candidates):
        # Return the first parent path that matches, trying the candidates in the given order.
        for op_types, input_indices in candidates:
            path = model.match_parent_path(start_node, op_types, input_indices)
            if path is not None:
                return path
        return None

    # Look for a SkipLayerNormalization parent. There is deliberately no early exit:
    # input 0 is examined last and takes precedence when both inputs match.
    skip_input_index = None
    node_before_layer_norm = None
    for index in (1, 0):
        skip_parent = model.match_parent(normalize_node, "SkipLayerNormalization", index)
        if skip_parent is not None:
            skip_input_index = index
            node_before_layer_norm = skip_parent

    root_input = None
    if node_before_layer_norm is not None:
        root_input = node_before_layer_norm.output[0]
    else:
        # Deal with the first attention after the embedding layer.
        for index in (0, 1):
            add_parent = model.match_parent(normalize_node, "Add", index)
            layer_norm_parent = model.match_parent(normalize_node, "LayerNormalization", index)
            # Case 1 (Add parent, preferred):     Case 2 (LayerNorm parent):
            #   Add -----------+                    Add
            #    |             |                     |
            #   LayerNorm      |                    LayerNorm --------+
            #    |             |                     |                |
            #   LayerNorm      |                    LayerNorm         |
            #    |             |                     |                |
            #   Attention subgraph                  Attention subgraph
            #    |             |                     |                |
            #   SkipLayerNorm -+                    SkipLayerNorm ----+
            node_before_layer_norm = add_parent if add_parent is not None else layer_norm_parent
            if node_before_layer_norm is None:
                continue

            child = model.find_first_child_by_type(
                node_before_layer_norm,
                "LayerNormalization",
                input_name_to_nodes,
                False,
            )
            if child is None:
                continue

            root_input = child.output[0]
            skip_input_index = index
            break

        if skip_input_index is None:
            return

    # Output path: the attention result enters normalize_node through the non-skip input.
    qkv_nodes = first_matching_path(
        normalize_node,
        [
            (
                ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"],
                [1 - skip_input_index, None, None, 0, 0, 0],
            ),
            (["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0]),
        ],
    )
    if qkv_nodes is None:
        logger.debug("fuse_attention: failed to match qkv path")
        return
    reshape_qkv = qkv_nodes[2]
    transpose_qkv = qkv_nodes[3]
    matmul_qkv = qkv_nodes[-1]

    v_nodes = first_matching_path(
        matmul_qkv,
        [
            (["Reshape", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, 0, None]),
            (["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]),
        ],
    )
    if v_nodes is None:
        logger.debug("fuse_attention: failed to match v path")
        return
    add_v, matmul_v = v_nodes[-2], v_nodes[-1]

    causal_mask_input_index = None
    add_mask = None
    add_mask_indices = []
    qk_nodes = model.match_parent_path(
        matmul_qkv,
        ["Softmax", "Reshape", "Add", "Reshape", "MatMul"],
        [0, 0, 0, None, 0],
        return_indice=add_mask_indices,
    )
    if qk_nodes is not None:
        assert len(add_mask_indices) == 1
        causal_mask_input_index = 1 - add_mask_indices[0]
        add_mask = qk_nodes[2]
    else:
        # Fallback patterns, tried in order. The third item is the position of the mask Add
        # inside the matched path, or None when the pattern has no attention mask.
        # The last two patterns cover models where Cast nodes were added for fp16.
        qk_fallbacks = (
            (["Softmax", "MatMul"], [0, 0], None),
            (["Softmax", "Add", "Mul", "MatMul"], [0, 0, 0, 0], 1),
            (["Softmax", "Mul", "MatMul"], [0, 0, 0], None),
            (["Cast", "Cast", "Softmax", "Add", "Mul", "MatMul"], [0, 0, 0, 0, 0, 0], 3),
            (["Cast", "Cast", "Softmax", "Mul", "MatMul"], [0, 0, 0, 0, 0], None),
        )
        for op_types, input_indices, mask_position in qk_fallbacks:
            qk_nodes = model.match_parent_path(matmul_qkv, op_types, input_indices)
            if qk_nodes is None:
                continue
            if mask_position is not None:
                add_mask = qk_nodes[mask_position]
            break

        if qk_nodes is None:
            logger.debug("fuse_attention: failed to match qk path")
            return

    matmul_qk = qk_nodes[-1]

    # The position of the Q reshape depends on which Q pattern matched.
    q_nodes = model.match_parent_path(
        matmul_qk,
        ["Reshape", "Transpose", "Reshape", "Mul", "Add", "MatMul"],
        [0, 0, 0, 0, None, None],
    )
    if q_nodes is not None:
        reshape_q = q_nodes[2]
    else:
        q_nodes = model.match_parent_path(
            matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]
        )
        if q_nodes is None:
            logger.debug("fuse_attention: failed to match q path")
            return
        reshape_q = q_nodes[1]
    add_q, matmul_q = q_nodes[-2], q_nodes[-1]

    k_nodes = first_matching_path(
        matmul_qk,
        [
            (["Transpose", "Reshape", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, 0, 0, None]),
            (["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]),
        ],
    )
    if k_nodes is None:
        logger.debug("fuse_attention: failed to match k path")
        return
    add_k, matmul_k = k_nodes[-2], k_nodes[-1]

    if any(matmul.input[0] != root_input for matmul in (matmul_q, matmul_k, matmul_v)):
        logger.debug("fuse_attention: expect to have same input to q, k and v matmul")
        return

    num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
    if num_heads <= 0 or hidden_size <= 0:
        logger.debug("fuse_attention: failed to detect num_heads or hidden_size")
        return

    attention_last_node = reshape_qkv

    add_qk = ""
    causal_mask_nodes_1 = None
    causal_mask_nodes_2 = None
    if add_mask is not None:
        if add_mask.input[1] == "attention_mask":
            add_qk = add_mask.input[1]
        elif (
            # 4D Add after Q x K'
            model.match_parent_path(
                add_mask,
                ["Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze", "Reshape", "Reshape", "Cast"],
                [1, 2, 1, 0, 0, 0, 0, 0, 0],
            )
            is not None
        ):
            add_qk = add_mask.input[1]
        else:
            # The whole causal mask subgraph is very complex, so only a key path of the
            # causal mask computation is checked here.
            causal_mask_nodes_1 = model.match_parent_path(
                add_mask,
                ["Concat", "Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"],
                [causal_mask_input_index, 0, 0, 0, 0, 0],
            )
            # If the model is exported with batch_size == 1, there is no Concat node
            causal_mask_nodes_2 = model.match_parent_path(
                add_mask,
                ["Expand", "Unsqueeze", "Unsqueeze", "Where", "Less"],
                [causal_mask_input_index, 0, 0, 0, 0],
            )
            if causal_mask_nodes_1 is None and causal_mask_nodes_2 is None:
                logger.debug("fuse_attention: failed to match causal mask subgraph")
                return

    is_causal = not (causal_mask_nodes_1 is None and causal_mask_nodes_2 is None)
    new_node = self.create_attention_node(
        mask_index=None,
        q_matmul=matmul_q,
        k_matmul=matmul_k,
        v_matmul=matmul_v,
        q_add=add_q,
        k_add=add_k,
        v_add=add_v,
        num_heads=num_heads,
        hidden_size=hidden_size,
        first_input=root_input,
        output=attention_last_node.output[0],
        add_qk_str=add_qk,
        scale=None,
        causal=is_causal,
    )
    if new_node is None:
        logger.debug("fuse_attention: failed to create fused node")
        return

    self.nodes_to_add.append(new_node)
    self.node_name_to_graph_name[new_node.name] = self.this_graph_name
    self.nodes_to_remove.extend([attention_last_node, transpose_qkv])

    # Use prune graph to remove nodes since they are shared by all attention nodes.
    self.prune_graph = True
|
||||
@@ -0,0 +1,533 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
|
||||
import numpy as np
|
||||
from fusion_base import Fusion
|
||||
from fusion_utils import NumpyHelper
|
||||
from onnx import NodeProto, helper, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
class FusionMultiHeadAttentionSam2(Fusion):
|
||||
"""
|
||||
Fuse MultiHeadAttention subgraph of Segment Anything v2 (SAM2).
|
||||
"""
|
||||
|
||||
def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
    """Register the fusion for "MultiHeadAttention", searching from LayerNormalization nodes.

    Args:
        model (OnnxModel): model to operate on.
        hidden_size (int): user specified hidden size, used when detection fails.
        num_heads (int): user specified number of heads, used when detection fails.
    """
    super().__init__(model, "MultiHeadAttention", ["LayerNormalization"])
    self.num_heads = num_heads
    self.hidden_size = hidden_size

    # Flags to show each mismatch warning only once
    self.num_heads_warning = self.hidden_size_warning = True
|
||||
|
||||
def get_decoder_num_heads(self, reshape_q: NodeProto) -> int:
    """Detect num_heads from the constant shape input of a reshape node.

    Args:
        reshape_q (NodeProto): reshape node for Q

    Returns:
        int: num_heads, or 0 if not found
    """
    # Reshape fusion is assumed to be done already, so the shape is expected to be a constant
    # tensor like [0, 0, num_heads, head_size].
    target_shape = self.model.get_constant_value(reshape_q.input[1])
    if not isinstance(target_shape, np.ndarray) or target_shape.shape != (4,):
        return 0

    detected = int(target_shape[2])
    return detected if detected > 0 else 0
|
||||
|
||||
def get_encoder_num_heads(self, reshape_in: NodeProto) -> int:
    """Detect num_heads from the shape input of the reshape node after the input projection.

    Two sources are tried: a constant shape with 5 elements (num_heads is element 3), or,
    when the shape is not a constant, a 5-input Concat whose fourth input is a one-element
    constant.

    Args:
        reshape_in (NodeProto): reshape node whose second input is the target shape

    Returns:
        int: num_heads, or 0 if not found
    """
    model = self.model
    detected = 0

    constant_shape = model.get_constant_value(reshape_in.input[1])
    if constant_shape is None:
        # The shape is computed at runtime: look for a Concat that assembles it.
        concat_shape = model.match_parent(reshape_in, "Concat", 1)
        if concat_shape is not None and len(concat_shape.input) == 5:
            heads_value = model.get_constant_value(concat_shape.input[3])
            if isinstance(heads_value, np.ndarray) and heads_value.shape == (1,):
                detected = int(heads_value[0])
    elif isinstance(constant_shape, np.ndarray) and constant_shape.shape == (5,):
        detected = int(constant_shape[3])

    return detected if detected > 0 else 0
|
||||
|
||||
def get_hidden_size(self, layernorm_node):
    """Detect hidden_size from the bias initializer of a LayerNormalization node.

    The bias is the third input of the node and is optional in ONNX LayerNormalization.
    A node without a bias input (or with an empty bias name) therefore yields 0 instead of
    raising IndexError, so that callers can fall back to the user specified value.

    Args:
        layernorm_node (NodeProto): LayerNormalization node before Q, K and V

    Returns:
        int: hidden_size, or 0 if not found
    """
    # Guard the optional bias input before indexing it.
    if len(layernorm_node.input) < 3 or not layernorm_node.input[2]:
        return 0

    layernorm_bias = self.model.get_initializer(layernorm_node.input[2])
    if not layernorm_bias:
        # The bias exists but is not an initializer, so its size is not known statically.
        return 0

    return NumpyHelper.to_array(layernorm_bias).shape[0]
|
||||
|
||||
def get_num_heads_and_hidden_size(
    self, reshape_q: NodeProto, layernorm_node: NodeProto, is_encoder: bool = False
) -> tuple[int, int]:
    """Detect num_heads and hidden_size, falling back to the user specified values.

    A warning is logged, at most once per quantity, when a positive user specified value
    differs from the value that is going to be returned.

    Args:
        reshape_q (NodeProto): reshape node used to detect num_heads
        layernorm_node (NodeProto): LayerNormalization node used to detect hidden_size
        is_encoder (bool): use ``get_encoder_num_heads`` instead of ``get_decoder_num_heads``

    Returns:
        Tuple[int, int]: num_heads and hidden_size
    """
    detect_num_heads = self.get_encoder_num_heads if is_encoder else self.get_decoder_num_heads
    num_heads = detect_num_heads(reshape_q)
    if num_heads <= 0:
        num_heads = self.num_heads  # Fall back to user specified value

    heads_differ = self.num_heads > 0 and num_heads != self.num_heads
    if heads_differ and self.num_heads_warning:
        logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
        self.num_heads_warning = False  # Do not show the warning more than once

    hidden_size = self.get_hidden_size(layernorm_node)
    if hidden_size <= 0:
        hidden_size = self.hidden_size  # Fall back to user specified value

    hidden_differs = self.hidden_size > 0 and hidden_size != self.hidden_size
    if hidden_differs and self.hidden_size_warning:
        logger.warning(
            f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
        )
        self.hidden_size_warning = False  # Do not show the warning more than once

    return num_heads, hidden_size
|
||||
|
||||
def create_attention_node(
    self,
    q_matmul: NodeProto,
    q_add: NodeProto,
    k_matmul: NodeProto,
    k_add: NodeProto,
    v_matmul: NodeProto,
    v_add: NodeProto,
    num_heads: int,
    hidden_size: int,
    output: str,
) -> NodeProto | None:
    """Create a MultiHeadAttention node fed by the outputs of the Q, K and V bias Add nodes.

    Args:
        q_matmul (NodeProto): MatMul node in fully connection for Q
        q_add (NodeProto): Add bias node in fully connection for Q
        k_matmul (NodeProto): MatMul node in fully connection for K
        k_add (NodeProto): Add bias node in fully connection for K
        v_matmul (NodeProto): MatMul node in fully connection for V
        v_add (NodeProto): Add bias node in fully connection for V
        num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
        hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
        output (str): output name

    Returns:
        Union[NodeProto, None]: the node created, or None when hidden_size is not a multiple of
        num_heads or when one of the three MatMul weights is not an initializer.
    """
    if hidden_size > 0 and (hidden_size % num_heads) != 0:
        logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
        return None

    # The weights are only required to exist as initializers; they are not copied into the new node.
    weights = [self.model.get_initializer(matmul.input[1]) for matmul in (q_matmul, k_matmul, v_matmul)]
    if not all(weights):
        return None

    qw, kw, vw = (NumpyHelper.to_array(weight) for weight in weights)
    logger.debug(f"qw={qw.shape} kw={kw.shape} vw={vw.shape} hidden_size={hidden_size}")

    attention_node = helper.make_node(
        "MultiHeadAttention",
        inputs=[projection.output[0] for projection in (q_add, k_add, v_add)],
        outputs=[output],
        name=self.model.create_node_name("MultiHeadAttention"),
    )
    attention_node.domain = "com.microsoft"
    attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

    self.increase_counter("MultiHeadAttention ({})".format("cross attention"))
    return attention_node
|
||||
|
||||
def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
    """Fuse the attention subgraph found above ``normalize_node`` into MultiHeadAttention.

    The SAM encoder pattern is tried first; when it fuses, nothing else is done. Otherwise the
    Q/K/V pattern is matched from ``normalize_node``, and if that fails, from the Add node
    that produces its first input. On success the new node is queued in
    ``self.nodes_to_add``, the output Reshape and Transpose are queued in
    ``self.nodes_to_remove`` and ``self.prune_graph`` is set.

    Args:
        normalize_node: LayerNormalization node where the search starts.
        input_name_to_nodes: mapping passed through to ``fuse_sam_encoder_pattern``.
        output_name_to_node: mapping from an output name to the node that produces it.
    """
    if self.fuse_sam_encoder_pattern(normalize_node, input_name_to_nodes, output_name_to_node):
        return

    matched = self.match_attention_subgraph(normalize_node)
    if matched is None:
        # Retry from the Add that feeds the first input of normalize_node.
        producer_output = normalize_node.input[0]
        if producer_output not in output_name_to_node:
            return

        skip_add = output_name_to_node[producer_output]
        if skip_add.op_type != "Add":
            return

        matched = self.match_attention_subgraph(skip_add)
        if matched is None:
            return

    (
        reshape_qkv,
        transpose_qkv,
        reshape_q,
        matmul_q,
        add_q,
        matmul_k,
        add_k,
        matmul_v,
        add_v,
    ) = matched

    q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q, normalize_node, False)
    if q_num_heads <= 0:
        logger.debug("fuse_attention: failed to detect num_heads")
        return

    # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
    new_node = self.create_attention_node(
        matmul_q,
        add_q,
        matmul_k,
        add_k,
        matmul_v,
        add_v,
        q_num_heads,
        q_hidden_size,
        output=reshape_qkv.output[0],
    )
    if new_node is None:
        return

    self.nodes_to_add.append(new_node)
    self.node_name_to_graph_name[new_node.name] = self.this_graph_name
    self.nodes_to_remove.extend([reshape_qkv, transpose_qkv])

    # Use prune graph to remove nodes since they are shared by all attention nodes.
    self.prune_graph = True
|
||||
|
||||
def match_attention_subgraph(self, node_after_output_projection):
    """Match Q, K and V paths exported by PyTorch 2.*

    Args:
        node_after_output_projection: node that consumes the attention output projection.

    Returns:
        None when any path fails to match, otherwise the tuple
        (reshape_qkv, transpose_qkv, reshape_q, matmul_q, add_q, matmul_k, add_k, matmul_v, add_v).
    """
    find_path = self.model.match_parent_path

    qkv_nodes = find_path(
        node_after_output_projection,
        ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
        [None, None, None, 0, 0],
    )
    if qkv_nodes is None:
        return None
    reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes[2], qkv_nodes[3], qkv_nodes[4]

    v_nodes = find_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
    if v_nodes is None:
        logger.debug("fuse_attention: failed to match v path")
        return None
    add_v, matmul_v = v_nodes[2], v_nodes[3]

    qk_nodes = find_path(matmul_qkv, ["Softmax", "MatMul"], [0, 0])
    if qk_nodes is None:
        logger.debug("fuse_attention: failed to match qk path")
        return None
    matmul_qk = qk_nodes[1]

    q_nodes = find_path(
        matmul_qk, ["Mul", "Transpose", "Reshape", "Add", "MatMul"], [0, None, 0, 0, None]
    )
    if q_nodes is None:
        logger.debug("fuse_attention: failed to match q path")
        return None
    mul_q = q_nodes[0]
    reshape_q, add_q, matmul_q = q_nodes[2], q_nodes[3], q_nodes[4]

    k_nodes = find_path(
        matmul_qk, ["Mul", "Transpose", "Reshape", "Add", "MatMul"], [1, None, 0, 0, None]
    )
    if k_nodes is None:
        logger.debug("fuse_attention: failed to match k path")
        return None
    add_k, matmul_k = k_nodes[3], k_nodes[4]

    # The scalar for Q and K is sqrt(1.0/sqrt(head_size)).
    # The scale path must end at the same Reshape node as the Q path.
    scale_nodes = find_path(
        mul_q,
        ["Sqrt", "Div", "Sqrt", "Cast", "Slice", "Shape", "Transpose", "Reshape"],
        [None, 0, 1, 0, 0, 0, 0, 0],
    )
    if scale_nodes is None or scale_nodes[-1] != reshape_q:
        logger.debug("fuse_attention: failed to match mul_q path")
        return None

    return reshape_qkv, transpose_qkv, reshape_q, matmul_q, add_q, matmul_k, add_k, matmul_v, add_v
|
||||
|
||||
# --------------------------------------------------------
|
||||
# The following are for SAM encoder
|
||||
# --------------------------------------------------------
|
||||
def fuse_sam_encoder_pattern(self, normalize_node, input_name_to_nodes, output_name_to_node) -> bool:
    """Fuse the self attention of a SAM encoder layer into a MultiHeadAttention node.

    Args:
        normalize_node: LayerNormalization node after the attention layer (search start).
        input_name_to_nodes: mapping passed to ``get_children``.
        output_name_to_node: not used by this method.

    Returns:
        bool: True when the pattern matched and the fused node was queued, otherwise False.
    """
    # SAM encoder attention layer pattern:
    #           Add -----------+
    #            |             |
    #        LayerNorm         |
    #            |             |
    #        Reshape           |
    #            |             |
    #        Transpose         |
    #            |             |
    #        MatMul            |
    #            |             |
    #           Add            |
    #            |             |
    #        Reshape           |
    #            |             |
    #         Split            |
    #            |             |
    #  Self Attention subgraph |
    #            |             |
    #        Reshape           |
    #            |             |
    #        Transpose         |
    #            |             |
    #        Reshape           |
    #            |             |
    #           Add -----------+
    #            |
    #         LayerNorm (starts from here)
    model = self.model

    # Patterns for the path above normalize_node, from most to least specific.
    entry_patterns = (
        (["Add", "Reshape", "Transpose", "Reshape"], [0, None, 0, 0]),
        (["Add", "Slice", "Slice", "Reshape", "Transpose", "Reshape"], [0, None, 0, 0, 0, 0]),
        (["Add"], [0]),
    )
    nodes = None
    for op_types, input_indices in entry_patterns:
        nodes = model.match_parent_path(normalize_node, op_types, input_indices)
        if nodes is not None:
            break
    if nodes is None:
        return False

    # When only the Add matched, the attention output is required on its second input.
    matched_sdpa = self.match_sam_encoder_attention_subgraph(
        nodes[-1], input_index=1 if len(nodes) == 1 else None
    )
    if matched_sdpa is None:
        return False

    reshape_out, transpose_out, split_qkv, transpose_q, transpose_k, transpose_v = matched_sdpa

    # Expected permutations:
    #   Q: B, S, N, H => B, N, S, H
    #   K: B, S, N, H => B, N, H, S
    #   V: B, S, N, H => B, N, S, H
    expected_permutations = (
        (transpose_q, [0, 2, 1, 3]),
        (transpose_k, [0, 2, 3, 1]),
        (transpose_v, [0, 2, 1, 3]),
    )
    for transpose_node, expected in expected_permutations:
        permutation = OnnxModel.get_node_attribute(transpose_node, "perm")
        if not (isinstance(permutation, list) and permutation == expected):
            return False

    input_projection_nodes = model.match_parent_path(
        split_qkv,
        ["Reshape", "Add", "MatMul"],
        [0, 0, None],
    )
    if input_projection_nodes is None:
        return False
    reshape_in = input_projection_nodes[0]

    q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_in, normalize_node, True)
    if q_num_heads <= 0:
        logger.debug("fuse_attention: failed to detect num_heads")
        return False

    # Add a shape to convert 4D BxSxNxH to 3D BxSxD, which is required by MHA operator.
    # The initializer is created only once and then shared by name.
    new_dims_name = "bsnh_to_bsd_reshape_dims"
    if model.get_initializer(new_dims_name) is None:
        new_dims = numpy_helper.from_array(np.array([0, 0, -1], dtype="int64"), name=new_dims_name)
        model.add_initializer(new_dims, self.this_graph_name)

    reshape_q_name = model.create_node_name("Reshape")
    q_source = transpose_q.input[0]
    reshape_q = helper.make_node(
        "Reshape",
        inputs=[q_source, new_dims_name],
        outputs=[q_source + "_BSD"],
        name=reshape_q_name,
    )
    self.nodes_to_add.append(reshape_q)
    self.node_name_to_graph_name[reshape_q.name] = self.this_graph_name

    # Reuse the transpose_q node to transpose K from BSNH to BNSH. Here we update the input and output of the node.
    transpose_k_bnsh = transpose_q
    k_source = transpose_k.input[0]
    transpose_k_bnsh.input[0] = k_source
    transpose_k_bnsh.output[0] = k_source + "_BNSH"

    logger.debug(f"Found MHA: {q_num_heads=} {q_hidden_size=}")

    # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
    new_node = self.create_mha_node(
        reshape_q,
        transpose_k_bnsh,
        transpose_v,
        q_num_heads,
    )
    if new_node is None:
        return False

    # Update the input of the next node that consumes the output of the MHA.
    # NOTE(review): this check runs after the graph has already been modified above and is
    # skipped under `python -O`; consider validating before mutating.
    assert len(model.get_children(transpose_out, input_name_to_nodes)) == 1
    reshape_out.input[0] = new_node.output[0]

    self.nodes_to_add.append(new_node)
    self.node_name_to_graph_name[new_node.name] = self.this_graph_name
    self.nodes_to_remove.extend([transpose_out])

    # Use prune graph to remove nodes since they are shared by all attention nodes.
    self.prune_graph = True
    return True
|
||||
|
||||
def match_sam_encoder_attention_subgraph(self, node_after_output_projection, input_index=None):
    """Match SDPA pattern in SAM2 encoder.

    Args:
        node_after_output_projection: node that consumes the attention output projection.
        input_index: input of that node where the projection Add is expected (None for any).

    Returns:
        None when any path fails to match, otherwise the tuple
        (reshape_out, transpose_out, split_qkv, transpose_q, transpose_k, transpose_v).
    """
    find_path = self.model.match_parent_path

    # nodes of output projection and the second MatMul in SDPA.
    out_nodes = find_path(
        node_after_output_projection,
        ["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
        [input_index, None, None, 0, 0],
    )
    if out_nodes is None:
        return None
    reshape_out, transpose_out, matmul_qk_v = out_nodes[2], out_nodes[3], out_nodes[4]

    # Split and Reshape is for packed QKV
    v_nodes = find_path(matmul_qk_v, ["Transpose", "Squeeze", "Split", "Reshape"], [1, 0, 0, 0])
    if v_nodes is None:
        logger.debug("failed to match v path")
        return None
    transpose_v, split_qkv = v_nodes[0], v_nodes[2]

    qk_nodes = find_path(matmul_qk_v, ["Softmax", "MatMul"], [0, 0])
    if qk_nodes is None:
        logger.debug("failed to match qk path")
        return None
    matmul_qk = qk_nodes[1]

    # Q either comes straight from the Split, or passes through a MaxPool block first.
    q_nodes = find_path(matmul_qk, ["Mul", "Transpose", "Squeeze", "Split"], [0, None, 0, 0])
    if q_nodes is None:
        q_nodes = find_path(
            matmul_qk,
            ["Mul", "Transpose", "Reshape", "Transpose", "MaxPool", "Transpose", "Reshape", "Squeeze", "Split"],
            [0, None, 0, 0, 0, 0, 0, 0, 0],
        )
        if q_nodes is None:
            logger.debug("failed to match q path")
            return None

    # Q must originate from the same Split node as V.
    if q_nodes[-1] != split_qkv:
        return None
    transpose_q = q_nodes[1]

    k_nodes = find_path(matmul_qk, ["Mul", "Transpose", "Squeeze", "Split"], [1, None, 0, 0])
    if k_nodes is None:
        logger.debug("failed to match k path")
        return None

    # K must originate from the same Split node as well.
    if k_nodes[-1] != split_qkv:
        return None
    transpose_k = k_nodes[1]

    return reshape_out, transpose_out, split_qkv, transpose_q, transpose_k, transpose_v
|
||||
|
||||
def create_mha_node(
    self,
    reshape_q: NodeProto,
    transpose_k: NodeProto,
    transpose_v: NodeProto,
    num_heads: int,
) -> NodeProto:
    """Create a MultiHeadAttention node for SAM2 encoder.

    Args:
        reshape_q (NodeProto): Reshape node for Q, output is 3D BxSxNH format
        transpose_k (NodeProto): Transpose node for K, output is BNSH format
        transpose_v (NodeProto): Transpose node for V, output is BNSH format
        num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.

    Returns:
        NodeProto: the MultiHeadAttention node created.
    """
    node_name = self.model.create_node_name("MultiHeadAttention")

    # Create a new output name since the shape is 3D, which is different from the original output shape (4D).
    mha_node = helper.make_node(
        "MultiHeadAttention",
        inputs=[producer.output[0] for producer in (reshape_q, transpose_k, transpose_v)],
        outputs=[node_name + "_out"],
        name=node_name,
    )
    mha_node.domain = "com.microsoft"
    mha_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

    self.increase_counter("MultiHeadAttention ({})".format("self attention"))
    return mha_node
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,300 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from logging import getLogger
|
||||
|
||||
import numpy as np
|
||||
from fusion_base import Fusion
|
||||
from onnx import NodeProto, TensorProto, helper, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
class FusionAttentionVae(Fusion):
    """
    Fuse Attention subgraph of Vae Decoder into one Attention node.
    """

    def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int):
        """Set up the fusion, searching for subgraphs anchored at Softmax nodes.

        Args:
            model (OnnxModel): model wrapper the fusion operates on
            hidden_size (int): user specified hidden size, used when detection fails
            num_heads (int): user specified number of heads, used when detection fails
        """
        super().__init__(model, "Attention", ["Softmax"])
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # Flags to show warning only once
        self.num_heads_warning = True
        self.hidden_size_warning = True

    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto, add_q: NodeProto) -> tuple[int, int]:
        """Detect num_heads and hidden_size from a reshape node.

        num_heads is read from the third input of the 4-input node that feeds the
        shape input (index 1) of ``reshape_q``; hidden_size is the length of the
        1-D constant bias of ``add_q``. Whenever a step of the detection fails,
        the user specified values are returned instead.

        Args:
            reshape_q (NodeProto): reshape node for Q
            add_q (NodeProto): add node for Q

        Returns:
            tuple[int, int]: num_heads and hidden_size
        """
        concat = self.model.get_parent(reshape_q, 1)
        if concat is None or len(concat.input) != 4:
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        value = self.model.get_constant_value(concat.input[2])
        if not (value is not None and isinstance(value, np.ndarray) and value.size == 1):
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        # value holds exactly one element; .item() avoids the deprecated implicit
        # conversion of an array with ndim > 0 to a Python scalar.
        num_heads = int(value.item())
        if num_heads <= 0:
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        _, bias = self.model.get_constant_input(add_q)
        if (bias is None) or (not isinstance(bias, np.ndarray)) or bias.ndim != 1:
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        hidden_size = bias.shape[0]

        if self.num_heads > 0 and num_heads != self.num_heads:
            if self.num_heads_warning:
                logger.warning(
                    "Detected number of attention heads is %d. Ignore --num_heads %d", num_heads, self.num_heads
                )
                self.num_heads_warning = False  # Do not show the warning more than once

        if self.hidden_size > 0 and hidden_size != self.hidden_size:
            if self.hidden_size_warning:
                logger.warning("Detected hidden size is %d. Ignore --hidden_size %d", hidden_size, self.hidden_size)
                self.hidden_size_warning = False  # Do not show the warning more than once

        return num_heads, hidden_size

    def create_attention_node(
        self,
        q_matmul: NodeProto,
        q_add: NodeProto,
        k_matmul: NodeProto,
        k_add: NodeProto,
        v_matmul: NodeProto,
        v_add: NodeProto,
        num_heads: int,
        hidden_size: int,
        input_name: str,
        output_name: str,
    ) -> NodeProto | None:
        """Create an Attention node.

        The Q, K and V weights are packed into one ``<node>_qkv_weight`` initializer
        and the Q, K and V biases into one ``<node>_qkv_bias`` initializer; both are
        registered through ``self.add_initializer``.

        Args:
            q_matmul (NodeProto): MatMul node in fully connection for Q
            q_add (NodeProto): Add bias node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            k_add (NodeProto): Add bias node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            input_name (str): input name
            output_name (str): output name

        Returns:
            Union[NodeProto, None]: the node created or None if failed.

        Raises:
            ValueError: if hidden_size is positive and differs from the input dimension of the Q/K/V weights.
        """
        if q_matmul.input[0] != input_name or k_matmul.input[0] != input_name or v_matmul.input[0] != input_name:
            logger.debug(
                "For self attention, input hidden state for q and k/v shall be same. Got %s, %s, %s",
                q_matmul.input[0],
                k_matmul.input[0],
                v_matmul.input[0],
            )
            return None

        if hidden_size > 0 and (hidden_size % num_heads) != 0:
            logger.debug("input hidden size %d is not a multiple of num of heads %d", hidden_size, num_heads)
            return None

        q_weight_tensor = self.model.get_initializer(q_matmul.input[1])
        k_weight_tensor = self.model.get_initializer(k_matmul.input[1])
        v_weight_tensor = self.model.get_initializer(v_matmul.input[1])
        if not (q_weight_tensor and k_weight_tensor and v_weight_tensor):
            return None

        # The bias may be either input of the Add node.
        q_bias_tensor = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
        k_bias_tensor = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
        v_bias_tensor = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
        if q_bias_tensor is None or k_bias_tensor is None or v_bias_tensor is None:
            # Without constant biases there is nothing to pack; skip the fusion instead of crashing.
            logger.debug("fuse_attention: bias of q, k or v is not an initializer")
            return None

        # Sometimes weights are stored in fp16
        if q_weight_tensor.data_type == TensorProto.FLOAT16:
            logger.debug("weights are in fp16. Please run fp16 conversion after optimization")
            return None

        q_bias = numpy_helper.to_array(q_bias_tensor)
        k_bias = numpy_helper.to_array(k_bias_tensor)
        v_bias = numpy_helper.to_array(v_bias_tensor)

        q_weight = numpy_helper.to_array(q_weight_tensor)
        k_weight = numpy_helper.to_array(k_weight_tensor)
        v_weight = numpy_helper.to_array(v_weight_tensor)

        # q, k and v weights are required to have the same shape
        if q_weight.shape != k_weight.shape or q_weight.shape != v_weight.shape:
            return None

        qw_in_size = int(q_weight.shape[0])

        if hidden_size > 0 and hidden_size != qw_in_size:
            raise ValueError(
                f"Input hidden size ({hidden_size}) is not same as weight dimension of q,k,v ({qw_in_size}). "
                "Please provide a correct input hidden size or pass in 0"
            )

        # All the matrices can have the same shape or q, k matrices can have the same shape with v being different
        # For 2d weights, the shapes would be [in_size, out_size].
        # For 3d weights, shape would be [in_size, a, b] where a*b = out_size
        qw_out_size = int(np.prod(q_weight.shape[1:]))

        # Validate the biases explicitly (an assert would be stripped under -O):
        # all three must have the same number of elements, matching the weight output size.
        if not (q_bias.size == k_bias.size == v_bias.size == qw_out_size):
            logger.debug(
                "fuse_attention: bias sizes (%d, %d, %d) do not match weight output size %d",
                q_bias.size,
                k_bias.size,
                v_bias.size,
                qw_out_size,
            )
            return None

        # Stacking on axis 1 gives [in_size, 3, ...]; stored with dims [in_size, 3 * out_size].
        qkv_weight = np.stack((q_weight, k_weight, v_weight), axis=1)
        qkv_weight_dim = 3 * qw_out_size

        # Keep the real biases of the Add nodes being fused; stacking on axis 0 lays them out as q, k, v.
        qkv_bias = np.stack((q_bias, k_bias, v_bias), axis=0)
        qkv_bias_dim = 3 * int(q_bias.size)

        attention_node_name = self.model.create_node_name("Attention")

        self.add_initializer(
            name=attention_node_name + "_qkv_weight",
            data_type=TensorProto.FLOAT,
            dims=[qw_in_size, qkv_weight_dim],
            vals=qkv_weight,
        )

        self.add_initializer(
            name=attention_node_name + "_qkv_bias",
            data_type=TensorProto.FLOAT,
            dims=[qkv_bias_dim],
            vals=qkv_bias,
        )

        attention_inputs = [
            input_name,
            attention_node_name + "_qkv_weight",
            attention_node_name + "_qkv_bias",
        ]

        attention_node = helper.make_node(
            "Attention",
            inputs=attention_inputs,
            outputs=[output_name],
            name=attention_node_name,
        )
        attention_node.domain = "com.microsoft"
        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])

        self.increase_counter("Attention (self attention)")
        return attention_node

    def fuse(self, softmax_node, input_name_to_nodes, output_name_to_node):
        """Try to fuse the attention subgraph around ``softmax_node`` into one Attention node.

        The method returns without changing anything as soon as one expected node
        is missing. On success the new node is queued in ``self.nodes_to_add`` and
        graph pruning is requested.

        Args:
            softmax_node: the Softmax node used as anchor of the pattern
            input_name_to_nodes: mapping passed through to the child lookups
            output_name_to_node: not used by this method
        """
        # Walk down from Softmax: MatMul -> Reshape -> Transpose -> Reshape -> MatMul -> Add -> Transpose
        matmul_qkv = self.model.find_first_child_by_type(softmax_node, "MatMul", input_name_to_nodes, recursive=False)
        if matmul_qkv is None:
            return

        reshape_qkv = self.model.find_first_child_by_type(matmul_qkv, "Reshape", input_name_to_nodes, recursive=False)
        if reshape_qkv is None:
            return

        transpose_qkv = self.model.find_first_child_by_type(
            reshape_qkv, "Transpose", input_name_to_nodes, recursive=False
        )
        if transpose_qkv is None:
            return

        reshape_out = self.model.find_first_child_by_type(
            transpose_qkv, "Reshape", input_name_to_nodes, recursive=False
        )
        if reshape_out is None:
            return

        matmul_out = self.model.find_first_child_by_type(reshape_out, "MatMul", input_name_to_nodes, recursive=False)
        if matmul_out is None:
            return

        add_out = self.model.find_first_child_by_type(matmul_out, "Add", input_name_to_nodes, recursive=False)
        if add_out is None:
            return

        # Only the existence of the trailing Transpose is checked; the node itself is not used below.
        transpose_out = self.model.find_first_child_by_type(add_out, "Transpose", input_name_to_nodes, recursive=False)
        if transpose_out is None:
            return

        # V path: parents of the second input of the MatMul that follows Softmax
        v_nodes = self.model.match_parent_path(
            matmul_qkv, ["Reshape", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, 0, None]
        )
        if v_nodes is None:
            logger.debug("fuse_attention: failed to match v path")
            return
        (_, _, _, add_v, matmul_v) = v_nodes

        # QK path: parents of the first input of the MatMul that follows Softmax
        qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Add", "Mul", "MatMul"], [0, 0, 0, 0])
        if qk_nodes is not None:
            (_softmax_qk, _add_zero, _mul_qk, matmul_qk) = qk_nodes
        else:
            logger.debug("fuse_attention: failed to match qk path")
            return

        # Q path: parents of the first input of the QK MatMul
        q_nodes = self.model.match_parent_path(
            matmul_qk, ["Reshape", "Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, 0, None]
        )
        if q_nodes is None:
            logger.debug("fuse_attention: failed to match q path")
            return
        (_, _transpose_q, reshape_q, add_q, matmul_q) = q_nodes

        # K path: parents of the second input of the QK MatMul (one extra leading Transpose)
        k_nodes = self.model.match_parent_path(
            matmul_qk, ["Transpose", "Reshape", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, 0, 0, None]
        )
        if k_nodes is None:
            logger.debug("fuse_attention: failed to match k path")
            return
        (_, _, _, _, add_k, matmul_k) = k_nodes

        # The fused node takes over the output of this Reshape.
        attention_last_node = reshape_out

        q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q, add_q)
        if q_num_heads <= 0:
            logger.debug("fuse_attention: failed to detect num_heads")
            return

        # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
        new_node = self.create_attention_node(
            matmul_q,
            add_q,
            matmul_k,
            add_k,
            matmul_v,
            add_v,
            q_num_heads,
            q_hidden_size,
            matmul_q.input[0],
            attention_last_node.output[0],
        )
        if new_node is None:
            return

        self.nodes_to_add.append(new_node)
        self.node_name_to_graph_name[new_node.name] = self.this_graph_name

        self.nodes_to_remove.extend([attention_last_node, transpose_qkv])

        # Use prune graph to remove nodes since they are shared by all attention nodes.
        self.prune_graph = True
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user