I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,72 @@
# mypy: allow-untyped-defs
import types
from contextlib import contextmanager
# The idea for this parameter is that we forbid bare assignment
# to torch.backends.<cudnn|mkldnn>.enabled and friends when running our
# test suite, where it's very easy to forget to undo the change
# later.
__allow_nonbracketed_mutation_flag = True
def disable_global_flags():
global __allow_nonbracketed_mutation_flag
__allow_nonbracketed_mutation_flag = False
def flags_frozen():
return not __allow_nonbracketed_mutation_flag
@contextmanager
def __allow_nonbracketed_mutation():
global __allow_nonbracketed_mutation_flag
old = __allow_nonbracketed_mutation_flag
__allow_nonbracketed_mutation_flag = True
try:
yield
finally:
__allow_nonbracketed_mutation_flag = old
class ContextProp:
def __init__(self, getter, setter):
self.getter = getter
self.setter = setter
def __get__(self, obj, objtype):
return self.getter()
def __set__(self, obj, val):
if not flags_frozen():
self.setter(val)
else:
raise RuntimeError(
f"not allowed to set {obj.__name__} flags "
"after disable_global_flags; please use flags() context manager instead"
)
class PropModule(types.ModuleType):
def __init__(self, m, name):
super().__init__(name)
self.m = m
def __getattr__(self, attr):
return self.m.__getattribute__(attr)
from torch.backends import (
cpu as cpu,
cuda as cuda,
cudnn as cudnn,
cusparselt as cusparselt,
mha as mha,
mkl as mkl,
mkldnn as mkldnn,
mps as mps,
nnpack as nnpack,
openmp as openmp,
quantized as quantized,
)
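A minimal behavioral sketch of the machinery above, assuming a standard torch install (note that `disable_global_flags()` freezes bare assignment for the rest of the process):

import torch

# Bare assignment through a ContextProp works while flags are not frozen.
torch.backends.cudnn.enabled = torch.backends.cudnn.enabled

torch.backends.disable_global_flags()
assert torch.backends.flags_frozen()

try:
    torch.backends.cudnn.enabled = False  # ContextProp.__set__ now raises
except RuntimeError as err:
    print(err)

# Bracketed mutation via a flags() context manager is still permitted because it
# temporarily re-enables __allow_nonbracketed_mutation_flag.
with torch.backends.cudnn.flags(enabled=False):
    pass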


@@ -0,0 +1,148 @@
# mypy: allow-untyped-defs
import hashlib
import json
from typing import Dict, Tuple
import coremltools as ct # type: ignore[import]
from coremltools.converters.mil.input_types import TensorType # type: ignore[import]
from coremltools.converters.mil.mil import types # type: ignore[import]
from coremltools.models.neural_network import quantization_utils # type: ignore[import]
import torch
CT_METADATA_VERSION = "com.github.apple.coremltools.version"
CT_METADATA_SOURCE = "com.github.apple.coremltools.source"
class ScalarType:
Float = 0
Double = 1
Int = 2
Long = 3
Undefined = 4
# Supported Tensor types in coremltools:
# https://github.com/apple/coremltools/blob/main/coremltools/converters/mil/frontend/torch/converter.py#L28
torch_to_mil_types = {
ScalarType.Float: types.fp32,
ScalarType.Double: types.fp64,
ScalarType.Int: types.int32,
ScalarType.Long: types.int64,
}
class CoreMLComputeUnit:
CPU = "cpuOnly"
CPUAndGPU = "cpuAndGPU"
ALL = "all"
class CoreMLQuantizationMode:
LINEAR = "linear"
LINEAR_SYMMETRIC = "linear_symmetric"
NONE = "none"
def TensorSpec(shape, dtype=ScalarType.Float):
return (shape, dtype)
def CompileSpec(
inputs,
outputs,
backend=CoreMLComputeUnit.CPU,
allow_low_precision=True,
quantization_mode=CoreMLQuantizationMode.NONE,
mlmodel_export_path=None,
):
return (
inputs,
outputs,
backend,
allow_low_precision,
quantization_mode,
mlmodel_export_path,
)
def _check_enumerated_shape(shape):
for s in shape:
if not isinstance(s, (list, tuple)):
return False
return True
def _convert_to_mil_type(shape, dtype, name: str):
mil_shape = shape
if _check_enumerated_shape(shape):
mil_shape = ct.EnumeratedShapes(shape)
ml_type = TensorType(shape=mil_shape, dtype=torch_to_mil_types[dtype])
ml_type.name = name
return ml_type
def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]):
spec = compile_spec["forward"]
(
input_specs,
output_specs,
backend,
allow_low_precision,
quantization_mode,
mlmodel_export_path,
) = spec
mil_inputs = []
inputs = []
for index, input in enumerate(input_specs):
shape, dtype = input
name = "input_" + str(index)
inputs.append([name, str(dtype), str(shape)])
ml_type = _convert_to_mil_type(shape, dtype, name)
mil_inputs.append(ml_type)
model = torch.jit.RecursiveScriptModule._construct(script_module, lambda x: None)
mlmodel = ct.convert(model, inputs=mil_inputs)
if quantization_mode != CoreMLQuantizationMode.NONE:
quant_model_spec = quantization_utils.quantize_weights(
mlmodel, nbits=8, quantization_mode=quantization_mode
)
mlmodel = ct.models.MLModel(quant_model_spec)
spec = mlmodel.get_spec()
assert len(spec.description.output) == len(output_specs) # type: ignore[attr-defined]
outputs = []
for index, output in enumerate(output_specs):
shape, dtype = output
name = spec.description.output[index].name # type: ignore[attr-defined]
outputs.append([name, str(dtype), str(shape)])
mlmodel = ct.models.model.MLModel(spec)
print(mlmodel)
if mlmodel_export_path is not None:
print(f"Saving CoreML .mlmodel file to {mlmodel_export_path}")
mlmodel.save(mlmodel_export_path)
config = {
"spec_ver": str(spec.specificationVersion), # type: ignore[attr-defined]
"backend": backend,
"allow_low_precision": str(allow_low_precision),
}
metadata = {
"coremltool_ver": mlmodel.user_defined_metadata[CT_METADATA_VERSION],
"torch_ver": mlmodel.user_defined_metadata[CT_METADATA_SOURCE],
}
coreml_compile_spec = {
"inputs": inputs,
"outputs": outputs,
"config": config,
"metadata": metadata,
}
mlmodel = spec.SerializeToString() # type: ignore[attr-defined]
return {
"model": mlmodel,
"hash": str(hashlib.sha256(mlmodel).hexdigest()),
"extra": json.dumps(coreml_compile_spec),
}
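A hedged sketch of how these helpers are typically combined into a compile spec and lowered to the Core ML delegate; coremltools must be installed, and the `torch._C._jit_to_backend("coreml", ...)` entry point from the to_backend flow is assumed here:

import torch
from torch.backends._coreml.preprocess import (
    CompileSpec,
    CoreMLComputeUnit,
    TensorSpec,
)

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

scripted = torch.jit.script(TinyModel())
compile_spec = {
    "forward": CompileSpec(
        inputs=(TensorSpec(shape=[1, 3, 224, 224]),),  # dtype defaults to ScalarType.Float
        outputs=(TensorSpec(shape=[1, 3, 224, 224]),),
        backend=CoreMLComputeUnit.ALL,
        allow_low_precision=True,
    ),
}
# Lowering invokes preprocess() above, which converts the TorchScript module
# with coremltools and returns the serialized model plus metadata.
coreml_module = torch._C._jit_to_backend("coreml", scripted, compile_spec)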


@@ -0,0 +1,199 @@
# mypy: allow-untyped-decorators
# mypy: allow-untyped-defs
from typing import List, Optional
import torch
from torch.backends._nnapi.serializer import _NnapiSerializer
ANEURALNETWORKS_PREFER_LOW_POWER = 0
ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1
ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2
class NnapiModule(torch.nn.Module):
"""Torch Module that wraps an NNAPI Compilation.
This module handles preparing the weights, initializing the
NNAPI TorchBind object, and adjusting the memory formats
of all inputs and outputs.
"""
# _nnapi.Compilation is defined
comp: Optional[torch.classes._nnapi.Compilation] # type: ignore[name-defined]
weights: List[torch.Tensor]
out_templates: List[torch.Tensor]
def __init__(
self,
shape_compute_module: torch.nn.Module,
ser_model: torch.Tensor,
weights: List[torch.Tensor],
inp_mem_fmts: List[int],
out_mem_fmts: List[int],
compilation_preference: int,
relax_f32_to_f16: bool,
):
super().__init__()
self.shape_compute_module = shape_compute_module
self.ser_model = ser_model
self.weights = weights
self.inp_mem_fmts = inp_mem_fmts
self.out_mem_fmts = out_mem_fmts
self.out_templates = []
self.comp = None
self.compilation_preference = compilation_preference
self.relax_f32_to_f16 = relax_f32_to_f16
@torch.jit.export
def init(self, args: List[torch.Tensor]):
assert self.comp is None
self.out_templates = self.shape_compute_module.prepare(self.ser_model, args) # type: ignore[operator]
self.weights = [w.contiguous() for w in self.weights]
comp = torch.classes._nnapi.Compilation()
comp.init2(
self.ser_model,
self.weights,
self.compilation_preference,
self.relax_f32_to_f16,
)
self.comp = comp
def forward(self, args: List[torch.Tensor]) -> List[torch.Tensor]:
if self.comp is None:
self.init(args)
comp = self.comp
assert comp is not None
outs = [torch.empty_like(out) for out in self.out_templates]
assert len(args) == len(self.inp_mem_fmts)
fixed_args = []
for idx in range(len(args)):
fmt = self.inp_mem_fmts[idx]
# These constants match the values in DimOrder in serializer.py
# TODO: See if it's possible to use those directly.
if fmt == 0:
fixed_args.append(args[idx].contiguous())
elif fmt == 1:
fixed_args.append(args[idx].permute(0, 2, 3, 1).contiguous())
else:
raise ValueError("Invalid mem_fmt")
comp.run(fixed_args, outs)
assert len(outs) == len(self.out_mem_fmts)
for idx in range(len(self.out_templates)):
fmt = self.out_mem_fmts[idx]
# These constants match the values in DimOrder in serializer.py
# TODO: See if it's possible to use those directly.
if fmt in (0, 2):
pass
elif fmt == 1:
outs[idx] = outs[idx].permute(0, 3, 1, 2)
else:
raise ValueError("Invalid mem_fmt")
return outs
def convert_model_to_nnapi(
model,
inputs,
serializer=None,
return_shapes=None,
use_int16_for_qint16=False,
compilation_preference=ANEURALNETWORKS_PREFER_SUSTAINED_SPEED,
relax_f32_to_f16=False,
):
(
shape_compute_module,
ser_model_tensor,
used_weights,
inp_mem_fmts,
out_mem_fmts,
retval_count,
) = process_for_nnapi(
model, inputs, serializer, return_shapes, use_int16_for_qint16
)
nnapi_model = NnapiModule(
shape_compute_module,
ser_model_tensor,
used_weights,
inp_mem_fmts,
out_mem_fmts,
compilation_preference,
relax_f32_to_f16,
)
class NnapiInterfaceWrapper(torch.nn.Module):
"""NNAPI list-ifying and de-list-ifying wrapper.
NNAPI always expects a list of inputs and provides a list of outputs.
This module allows us to accept inputs as separate arguments.
It returns results as either a single tensor or tuple,
matching the original module.
"""
def __init__(self, mod):
super().__init__()
self.mod = mod
wrapper_model_py = NnapiInterfaceWrapper(nnapi_model)
wrapper_model = torch.jit.script(wrapper_model_py)
# TODO: Maybe make these names match the original.
arg_list = ", ".join(f"arg_{idx}" for idx in range(len(inputs)))
if retval_count < 0:
ret_expr = "retvals[0]"
else:
ret_expr = "".join(f"retvals[{idx}], " for idx in range(retval_count))
wrapper_model.define(
f"def forward(self, {arg_list}):\n"
f" retvals = self.mod([{arg_list}])\n"
f" return {ret_expr}\n"
)
return wrapper_model
def process_for_nnapi(
model, inputs, serializer=None, return_shapes=None, use_int16_for_qint16=False
):
model = torch.jit.freeze(model)
if isinstance(inputs, torch.Tensor):
inputs = [inputs]
serializer = serializer or _NnapiSerializer(
config=None, use_int16_for_qint16=use_int16_for_qint16
)
(
ser_model,
used_weights,
inp_mem_fmts,
out_mem_fmts,
shape_compute_lines,
retval_count,
) = serializer.serialize_model(model, inputs, return_shapes)
ser_model_tensor = torch.tensor(ser_model, dtype=torch.int32)
# We have to create a new class here every time this function is called
# because module.define adds a method to the *class*, not the instance.
class ShapeComputeModule(torch.nn.Module):
"""Code-gen-ed module for tensor shape computation.
module.prepare will mutate ser_model according to the computed operand
shapes, based on the shapes of args. Returns a list of output templates.
"""
shape_compute_module = torch.jit.script(ShapeComputeModule())
real_shape_compute_lines = [
"def prepare(self, ser_model: torch.Tensor, args: List[torch.Tensor]) -> List[torch.Tensor]:\n",
] + [f" {line}\n" for line in shape_compute_lines]
shape_compute_module.define("".join(real_shape_compute_lines))
return (
shape_compute_module,
ser_model_tensor,
used_weights,
inp_mem_fmts,
out_mem_fmts,
retval_count,
)
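A hedged usage sketch for convert_model_to_nnapi. Conversion happens on the host; which ops convert depends on what `_NnapiSerializer` supports, and running the result requires an Android device with NNAPI:

import torch
from torch.backends._nnapi.prepare import convert_model_to_nnapi

model = torch.nn.ReLU().eval()
example_input = torch.zeros(1, 8)
traced = torch.jit.trace(model, example_input)

nnapi_model = convert_model_to_nnapi(traced, example_input)
# The wrapper's forward takes the same positional arguments as the original
# module and lazily builds the NNAPI Compilation on first call (on device).
nnapi_model.save("relu_nnapi.pt")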

File diff suppressed because it is too large.


@@ -0,0 +1,20 @@
import torch
__all__ = [
"get_cpu_capability",
]
def get_cpu_capability() -> str:
r"""Return cpu capability as a string value.
Possible values:
- "DEFAULT"
- "VSX"
- "Z VECTOR"
- "NO AVX"
- "AVX2"
- "AVX512"
"""
return torch._C._get_cpu_capability()
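A quick check of the detected capability; the exact string depends on the host CPU and on how PyTorch was built:

import torch

print(torch.backends.cpu.get_cpu_capability())  # e.g. "AVX2" or "AVX512" on x86-64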


@@ -0,0 +1,478 @@
# mypy: allow-untyped-defs
import contextlib
from typing import Union
from typing_extensions import deprecated
import torch
__all__ = [
"is_built",
"cuFFTPlanCacheAttrContextProp",
"cuFFTPlanCache",
"cuFFTPlanCacheManager",
"cuBLASModule",
"preferred_linalg_library",
"preferred_blas_library",
"cufft_plan_cache",
"matmul",
"SDPAParams",
"enable_cudnn_sdp",
"cudnn_sdp_enabled",
"enable_flash_sdp",
"flash_sdp_enabled",
"enable_mem_efficient_sdp",
"mem_efficient_sdp_enabled",
"math_sdp_enabled",
"enable_math_sdp",
"allow_fp16_bf16_reduction_math_sdp",
"fp16_bf16_reduction_math_sdp_allowed",
"is_flash_attention_available",
"can_use_flash_attention",
"can_use_efficient_attention",
"can_use_cudnn_attention",
"sdp_kernel",
]
def is_built():
r"""
Return whether PyTorch is built with CUDA support.
Note that this doesn't necessarily mean CUDA is available; just that if this PyTorch
binary were run on a machine with working CUDA drivers and devices, we would be able to use it.
"""
return torch._C._has_cuda
class cuFFTPlanCacheAttrContextProp:
# Like regular ContextProp, but uses the `.device_index` attribute from the
# calling object as the first argument to the getter and setter.
def __init__(self, getter, setter):
self.getter = getter
self.setter = setter
def __get__(self, obj, objtype):
return self.getter(obj.device_index)
def __set__(self, obj, val):
if isinstance(self.setter, str):
raise RuntimeError(self.setter)
self.setter(obj.device_index, val)
class cuFFTPlanCache:
r"""
Represent a specific plan cache for a specific `device_index`.
The attributes `size` and `max_size`, and method `clear`, can fetch and/or
change properties of the C++ cuFFT plan cache.
"""
def __init__(self, device_index):
self.device_index = device_index
size = cuFFTPlanCacheAttrContextProp(
torch._cufft_get_plan_cache_size,
".size is a read-only property showing the number of plans currently in the "
"cache. To change the cache capacity, set cufft_plan_cache.max_size.",
)
max_size = cuFFTPlanCacheAttrContextProp(
torch._cufft_get_plan_cache_max_size, torch._cufft_set_plan_cache_max_size
)
def clear(self):
return torch._cufft_clear_plan_cache(self.device_index)
class cuFFTPlanCacheManager:
r"""
Represent all cuFFT plan caches, returning the cuFFTPlanCache for a given device when indexed.
When this object is used directly as a `cuFFTPlanCache` object (e.g., setting
the `.max_size` attribute), the current device's cuFFT plan cache is used.
"""
__initialized = False
def __init__(self):
self.caches = []
self.__initialized = True
def __getitem__(self, device):
index = torch.cuda._utils._get_device_index(device)
if index < 0 or index >= torch.cuda.device_count():
raise RuntimeError(
f"cufft_plan_cache: expected 0 <= device index < {torch.cuda.device_count()}, but got "
f"device with index {index}"
)
if len(self.caches) == 0:
self.caches.extend(
cuFFTPlanCache(index) for index in range(torch.cuda.device_count())
)
return self.caches[index]
def __getattr__(self, name):
return getattr(self[torch.cuda.current_device()], name)
def __setattr__(self, name, value):
if self.__initialized:
return setattr(self[torch.cuda.current_device()], name, value)
else:
return super().__setattr__(name, value)
class cuBLASModule:
def __getattr__(self, name):
if name == "allow_tf32":
return torch._C._get_cublas_allow_tf32()
elif name == "allow_fp16_reduced_precision_reduction":
return torch._C._get_cublas_allow_fp16_reduced_precision_reduction()
elif name == "allow_bf16_reduced_precision_reduction":
return torch._C._get_cublas_allow_bf16_reduced_precision_reduction()
raise AttributeError("Unknown attribute " + name)
def __setattr__(self, name, value):
if name == "allow_tf32":
return torch._C._set_cublas_allow_tf32(value)
elif name == "allow_fp16_reduced_precision_reduction":
return torch._C._set_cublas_allow_fp16_reduced_precision_reduction(value)
elif name == "allow_bf16_reduced_precision_reduction":
return torch._C._set_cublas_allow_bf16_reduced_precision_reduction(value)
raise AttributeError("Unknown attribute " + name)
_LinalgBackends = {
"default": torch._C._LinalgBackend.Default,
"cusolver": torch._C._LinalgBackend.Cusolver,
"magma": torch._C._LinalgBackend.Magma,
}
_LinalgBackends_str = ", ".join(_LinalgBackends.keys())
def preferred_linalg_library(
backend: Union[None, str, torch._C._LinalgBackend] = None
) -> torch._C._LinalgBackend:
r"""
Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
.. warning:: This flag is experimental and subject to change.
When PyTorch runs a CUDA linear algebra operation it often uses the cuSOLVER or MAGMA libraries,
and if both are available it decides which to use with a heuristic.
This flag (a :class:`str`) allows overriding those heuristics.
* If `"cusolver"` is set then cuSOLVER will be used wherever possible.
* If `"magma"` is set then MAGMA will be used wherever possible.
* If `"default"` (the default) is set then heuristics will be used to pick between
cuSOLVER and MAGMA if both are available.
* When no input is given, this function returns the currently preferred library.
* User may use the environment variable TORCH_LINALG_PREFER_CUSOLVER=1 to set the preferred library to cuSOLVER
globally.
This flag only sets the initial value of the preferred library and the preferred library
may still be overridden by this function call later in your script.
Note: When a library is preferred other libraries may still be used if the preferred library
doesn't implement the operation(s) called.
This flag may achieve better performance if PyTorch's heuristic library selection is incorrect
for your application's inputs.
Currently supported linalg operators:
* :func:`torch.linalg.inv`
* :func:`torch.linalg.inv_ex`
* :func:`torch.linalg.cholesky`
* :func:`torch.linalg.cholesky_ex`
* :func:`torch.cholesky_solve`
* :func:`torch.cholesky_inverse`
* :func:`torch.linalg.lu_factor`
* :func:`torch.linalg.lu`
* :func:`torch.linalg.lu_solve`
* :func:`torch.linalg.qr`
* :func:`torch.linalg.eigh`
* :func:`torch.linalg.eigvalsh`
* :func:`torch.linalg.svd`
* :func:`torch.linalg.svdvals`
"""
if backend is None:
pass
elif isinstance(backend, str):
if backend not in _LinalgBackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_LinalgBackends_str}."
)
torch._C._set_linalg_preferred_backend(_LinalgBackends[backend])
elif isinstance(backend, torch._C._LinalgBackend):
torch._C._set_linalg_preferred_backend(backend)
else:
raise RuntimeError("Unknown input value type.")
return torch._C._get_linalg_preferred_backend()
_BlasBackends = {
"cublas": torch._C._BlasBackend.Cublas,
"cublaslt": torch._C._BlasBackend.Cublaslt,
"hipblaslt": torch._C._BlasBackend.Cublaslt, # alias
}
_BlasBackends_str = ", ".join(_BlasBackends.keys())
def preferred_blas_library(
backend: Union[None, str, torch._C._BlasBackend] = None
) -> torch._C._BlasBackend:
r"""
Override the library PyTorch uses for BLAS operations. Choose between cuBLAS and cuBLASLt.
.. warning:: This flag is experimental and subject to change.
When PyTorch runs a CUDA BLAS operation it defaults to cuBLAS even if both cuBLAS and cuBLASLt are available.
For PyTorch built for ROCm, hipBLAS and hipBLASLt may offer different performance.
This flag (a :class:`str`) allows overriding which BLAS library to use.
* If `"cublas"` is set then cuBLAS will be used wherever possible.
* If `"cublaslt"` is set then cuBLASLt will be used wherever possible.
* When no input is given, this function returns the currently preferred library.
* User may use the environment variable TORCH_BLAS_PREFER_CUBLASLT=1 to set the preferred library to cuBLASLt
globally.
This flag only sets the initial value of the preferred library and the preferred library
may still be overridden by this function call later in your script.
Note: When a library is preferred other libraries may still be used if the preferred library
doesn't implement the operation(s) called.
This flag may achieve better performance if PyTorch's library selection is incorrect
for your application's inputs.
"""
if backend is None:
pass
elif isinstance(backend, str):
if backend not in _BlasBackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_BlasBackends_str}."
)
torch._C._set_blas_preferred_backend(_BlasBackends[backend])
elif isinstance(backend, torch._C._BlasBackend):
torch._C._set_blas_preferred_backend(backend)
else:
raise RuntimeError("Unknown input value type.")
return torch._C._get_blas_preferred_backend()
from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
# Set the __module__ attribute
SDPAParams.__module__ = "torch.backends.cuda"
SDPAParams.__name__ = "SDPAParams"
def flash_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether flash scaled dot product attention is enabled or not.
"""
return torch._C._get_flash_sdp_enabled()
def enable_flash_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables flash scaled dot product attention.
"""
torch._C._set_sdp_use_flash(enabled)
def mem_efficient_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether memory efficient scaled dot product attention is enabled or not.
"""
return torch._C._get_mem_efficient_sdp_enabled()
def enable_mem_efficient_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables memory efficient scaled dot product attention.
"""
torch._C._set_sdp_use_mem_efficient(enabled)
def math_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether math scaled dot product attention is enabled or not.
"""
return torch._C._get_math_sdp_enabled()
def enable_math_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables math scaled dot product attention.
"""
torch._C._set_sdp_use_math(enabled)
def allow_fp16_bf16_reduction_math_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables fp16/bf16 reduction in math scaled dot product attention.
"""
torch._C._set_math_sdp_allow_fp16_bf16_reduction(enabled)
def fp16_bf16_reduction_math_sdp_allowed():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether fp16/bf16 reduction in math scaled dot product attention is enabled or not.
"""
return torch._C._get_math_sdp_allow_fp16_bf16_reduction()
def is_flash_attention_available() -> bool:
r"""Check if PyTorch was built with FlashAttention for scaled_dot_product_attention.
Returns:
True if FlashAttention is built and available; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._is_flash_attention_available()
def can_use_flash_attention(params: SDPAParams, debug: bool = False) -> bool:
r"""Check if FlashAttention can be utilized in scaled_dot_product_attention.
Args:
params: An instance of SDPAParams containing the tensors for query,
key, value, an optional attention mask, dropout rate, and
a flag indicating if the attention is causal.
debug: Whether to logging.warn debug information as to why FlashAttention could not be run.
Defaults to False.
Returns:
True if FlashAttention can be used with the given parameters; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._can_use_flash_attention(params, debug)
def can_use_efficient_attention(params: SDPAParams, debug: bool = False) -> bool:
r"""Check if efficient_attention can be utilized in scaled_dot_product_attention.
Args:
params: An instance of SDPAParams containing the tensors for query,
key, value, an optional attention mask, dropout rate, and
a flag indicating if the attention is causal.
debug: Whether to logging.warn with information as to why efficient_attention could not be run.
Defaults to False.
Returns:
True if efficient_attention can be used with the given parameters; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._can_use_mem_efficient_attention(params, debug)
def can_use_cudnn_attention(params: SDPAParams, debug: bool = False) -> bool:
r"""Check if cudnn_attention can be utilized in scaled_dot_product_attention.
Args:
params: An instance of SDPAParams containing the tensors for query,
key, value, an optional attention mask, dropout rate, and
a flag indicating if the attention is causal.
debug: Whether to logging.warn with information as to why cuDNN attention could not be run.
Defaults to False.
Returns:
True if cuDNN can be used with the given parameters; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._can_use_cudnn_attention(params, debug)
def cudnn_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether cuDNN scaled dot product attention is enabled or not.
"""
return torch._C._get_cudnn_sdp_enabled()
def enable_cudnn_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables cuDNN scaled dot product attention.
"""
torch._C._set_sdp_use_cudnn(enabled)
@contextlib.contextmanager
@deprecated(
(
"`torch.backends.cuda.sdp_kernel()` is deprecated. "
"In the future, this context manager will be removed. "
"Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, "
"with updated signature."
),
category=FutureWarning,
)
def sdp_kernel(
enable_flash: bool = True,
enable_math: bool = True,
enable_mem_efficient: bool = True,
enable_cudnn: bool = True,
):
r"""
.. warning:: This flag is beta and subject to change.
This context manager can be used to temporarily enable or disable any of the four backends for scaled dot product attention.
Upon exiting the context manager, the previous state of the flags will be restored.
"""
from torch.nn.attention import sdpa_kernel
backend_list = []
if enable_flash:
backend_list.append(SDPBackend.FLASH_ATTENTION)
if enable_mem_efficient:
backend_list.append(SDPBackend.EFFICIENT_ATTENTION)
if enable_math:
backend_list.append(SDPBackend.MATH)
if enable_cudnn:
backend_list.append(SDPBackend.CUDNN_ATTENTION)
with sdpa_kernel(backend_list) as context:
try:
yield context
finally:
pass
cufft_plan_cache = cuFFTPlanCacheManager()
matmul = cuBLASModule()
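A hedged sketch combining a few of the knobs above. The attention part uses `torch.nn.attention.sdpa_kernel`, the replacement that the deprecation notice on `sdp_kernel` points to, and assumes a CUDA device is present:

import torch
import torch.nn.functional as F

print(torch.backends.cuda.is_built())
print(torch.backends.cuda.preferred_linalg_library())  # query without changing anything
torch.backends.cuda.matmul.allow_tf32 = True           # routed through cuBLASModule.__setattr__

if torch.cuda.is_available():
    from torch.nn.attention import sdpa_kernel, SDPBackend

    q = k = v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
    # Restrict SDPA to flash attention with a math fallback, scoped to this block.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.MATH]):
        out = F.scaled_dot_product_attention(q, k, v)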


@@ -0,0 +1,208 @@
# mypy: allow-untyped-defs
import os
import sys
import warnings
from contextlib import contextmanager
from typing import Optional
import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
try:
from torch._C import _cudnn
except ImportError:
_cudnn = None # type: ignore[assignment]
# Write:
#
# torch.backends.cudnn.enabled = False
#
# to globally disable CuDNN/MIOpen
__cudnn_version: Optional[int] = None
if _cudnn is not None:
def _init():
global __cudnn_version
if __cudnn_version is None:
__cudnn_version = _cudnn.getVersionInt()
runtime_version = _cudnn.getRuntimeVersion()
compile_version = _cudnn.getCompileVersion()
runtime_major, runtime_minor, _ = runtime_version
compile_major, compile_minor, _ = compile_version
# Different major versions are always incompatible
# Starting with cuDNN 7, minor versions are backwards-compatible
# Not sure about MIOpen (ROCm), so always do a strict check
if runtime_major != compile_major:
cudnn_compatible = False
elif runtime_major < 7 or not _cudnn.is_cuda:
cudnn_compatible = runtime_minor == compile_minor
else:
cudnn_compatible = runtime_minor >= compile_minor
if not cudnn_compatible:
if os.environ.get("PYTORCH_SKIP_CUDNN_COMPATIBILITY_CHECK", "0") == "1":
return True
base_error_msg = (
f"cuDNN version incompatibility: "
f"PyTorch was compiled against {compile_version} "
f"but found runtime version {runtime_version}. "
f"PyTorch already comes bundled with cuDNN. "
f"One option to resolving this error is to ensure PyTorch "
f"can find the bundled cuDNN. "
)
if "LD_LIBRARY_PATH" in os.environ:
ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
if any(
substring in ld_library_path for substring in ["cuda", "cudnn"]
):
raise RuntimeError(
f"{base_error_msg}"
f"Looks like your LD_LIBRARY_PATH contains incompatible version of cudnn. "
f"Please either remove it from the path or install cudnn {compile_version}"
)
else:
raise RuntimeError(
f"{base_error_msg}"
f"one possibility is that there is a "
f"conflicting cuDNN in LD_LIBRARY_PATH."
)
else:
raise RuntimeError(base_error_msg)
return True
else:
def _init():
return False
def version():
"""Return the version of cuDNN."""
if not _init():
return None
return __cudnn_version
CUDNN_TENSOR_DTYPES = {
torch.half,
torch.float,
torch.double,
}
def is_available():
r"""Return a bool indicating if CUDNN is currently available."""
return torch._C._has_cudnn
def is_acceptable(tensor):
if not torch._C._get_cudnn_enabled():
return False
if tensor.device.type != "cuda" or tensor.dtype not in CUDNN_TENSOR_DTYPES:
return False
if not is_available():
warnings.warn(
"PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild "
"PyTorch making sure the library is visible to the build system."
)
return False
if not _init():
warnings.warn(
"cuDNN/MIOpen library not found. Check your {libpath}".format(
libpath={"darwin": "DYLD_LIBRARY_PATH", "win32": "PATH"}.get(
sys.platform, "LD_LIBRARY_PATH"
)
)
)
return False
return True
def set_flags(
_enabled=None,
_benchmark=None,
_benchmark_limit=None,
_deterministic=None,
_allow_tf32=None,
):
orig_flags = (
torch._C._get_cudnn_enabled(),
torch._C._get_cudnn_benchmark(),
None if not is_available() else torch._C._cuda_get_cudnn_benchmark_limit(),
torch._C._get_cudnn_deterministic(),
torch._C._get_cudnn_allow_tf32(),
)
if _enabled is not None:
torch._C._set_cudnn_enabled(_enabled)
if _benchmark is not None:
torch._C._set_cudnn_benchmark(_benchmark)
if _benchmark_limit is not None and is_available():
torch._C._cuda_set_cudnn_benchmark_limit(_benchmark_limit)
if _deterministic is not None:
torch._C._set_cudnn_deterministic(_deterministic)
if _allow_tf32 is not None:
torch._C._set_cudnn_allow_tf32(_allow_tf32)
return orig_flags
@contextmanager
def flags(
enabled=False,
benchmark=False,
benchmark_limit=10,
deterministic=False,
allow_tf32=True,
):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(
enabled, benchmark, benchmark_limit, deterministic, allow_tf32
)
try:
yield
finally:
# recover the previous values
with __allow_nonbracketed_mutation():
set_flags(*orig_flags)
# The magic here is to allow us to intercept code like this:
#
# torch.backends.<cudnn|mkldnn>.enabled = True
class CudnnModule(PropModule):
def __init__(self, m, name):
super().__init__(m, name)
enabled = ContextProp(torch._C._get_cudnn_enabled, torch._C._set_cudnn_enabled)
deterministic = ContextProp(
torch._C._get_cudnn_deterministic, torch._C._set_cudnn_deterministic
)
benchmark = ContextProp(
torch._C._get_cudnn_benchmark, torch._C._set_cudnn_benchmark
)
benchmark_limit = None
if is_available():
benchmark_limit = ContextProp(
torch._C._cuda_get_cudnn_benchmark_limit,
torch._C._cuda_set_cudnn_benchmark_limit,
)
allow_tf32 = ContextProp(
torch._C._get_cudnn_allow_tf32, torch._C._set_cudnn_allow_tf32
)
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = CudnnModule(sys.modules[__name__], __name__)
# Add type annotation for the replaced module
enabled: bool
deterministic: bool
benchmark: bool
allow_tf32: bool
benchmark_limit: int
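A short usage sketch for the CudnnModule attributes and the bracketed flags() context manager defined above; this is safe on CPU-only builds, where version() simply returns None:

import torch

print(torch.backends.cudnn.is_available(), torch.backends.cudnn.version())
print(torch.backends.cudnn.enabled, torch.backends.cudnn.benchmark)

# Scoped changes: the previous values are restored on exit.
with torch.backends.cudnn.flags(enabled=True, benchmark=True, deterministic=False):
    pass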


@@ -0,0 +1,64 @@
# mypy: allow-untyped-defs
import torch.cuda
try:
from torch._C import _cudnn
except ImportError:
# Uses of all the functions below should be guarded by torch.backends.cudnn.is_available(),
# so it's safe to not emit any checks here.
_cudnn = None # type: ignore[assignment]
def get_cudnn_mode(mode):
if mode == "RNN_RELU":
return int(_cudnn.RNNMode.rnn_relu)
elif mode == "RNN_TANH":
return int(_cudnn.RNNMode.rnn_tanh)
elif mode == "LSTM":
return int(_cudnn.RNNMode.lstm)
elif mode == "GRU":
return int(_cudnn.RNNMode.gru)
else:
raise Exception(f"Unknown mode: {mode}") # noqa: TRY002
# NB: We don't actually need this class anymore (in fact, we could serialize the
# dropout state for even better reproducibility), but it is kept for backwards
# compatibility for old models.
class Unserializable:
def __init__(self, inner):
self.inner = inner
def get(self):
return self.inner
def __getstate__(self):
# Note: can't return {}, because python2 won't call __setstate__
# if the value evaluates to False
return "<unserializable>"
def __setstate__(self, state):
self.inner = None
def init_dropout_state(dropout, train, dropout_seed, dropout_state):
dropout_desc_name = "desc_" + str(torch.cuda.current_device())
dropout_p = dropout if train else 0
if (dropout_desc_name not in dropout_state) or (
dropout_state[dropout_desc_name].get() is None
):
if dropout_p == 0:
dropout_state[dropout_desc_name] = Unserializable(None)
else:
dropout_state[dropout_desc_name] = Unserializable(
torch._cudnn_init_dropout_state( # type: ignore[call-arg]
dropout_p,
train,
dropout_seed,
self_ty=torch.uint8,
device=torch.device("cuda"),
)
)
dropout_ts = dropout_state[dropout_desc_name].get()
return dropout_ts


@@ -0,0 +1,42 @@
# mypy: allow-untyped-defs
from typing import Optional
import torch
__all__ = [
"version",
"is_available",
]
try:
from torch._C import _cusparselt
except ImportError:
_cusparselt = None # type: ignore[assignment]
__cusparselt_version: Optional[int] = None
if _cusparselt is not None:
def _init():
global __cusparselt_version
if __cusparselt_version is None:
__cusparselt_version = _cusparselt.getVersionInt()
return True
else:
def _init():
return False
def version() -> Optional[int]:
"""Return the version of cuSPARSELt"""
if not _init():
return None
return __cusparselt_version
def is_available() -> bool:
r"""Return a bool indicating if cuSPARSELt is currently available."""
return torch._C._has_cusparselt


@@ -0,0 +1,25 @@
# Config options to enable/disable C++ kernel for nn.functional.MHA
# and nn.TransformerEncoder
import torch
_is_fastpath_enabled: bool = True
def get_fastpath_enabled() -> bool:
"""Returns whether fast path for TransformerEncoder and MultiHeadAttention
is enabled, or ``True`` if jit is scripting.
.. note::
The fastpath might not be run even if ``get_fastpath_enabled`` returns
``True`` unless all conditions on inputs are met.
"""
if not torch.jit.is_scripting():
return _is_fastpath_enabled
return True
def set_fastpath_enabled(value: bool) -> None:
"""Sets whether fast path is enabled"""
global _is_fastpath_enabled
_is_fastpath_enabled = value
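A brief sketch of toggling the fastpath flag around a TransformerEncoder call (the module sizes are illustrative):

import torch

print(torch.backends.mha.get_fastpath_enabled())  # True by default
torch.backends.mha.set_fastpath_enabled(False)    # force the non-fused code path

layer = torch.nn.TransformerEncoderLayer(d_model=16, nhead=2, batch_first=True)
encoder = torch.nn.TransformerEncoder(layer, num_layers=1).eval()
with torch.no_grad():
    out = encoder(torch.randn(2, 4, 16))

torch.backends.mha.set_fastpath_enabled(True)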


@@ -0,0 +1,57 @@
# mypy: allow-untyped-defs
import torch
def is_available():
r"""Return whether PyTorch is built with MKL support."""
return torch._C.has_mkl
VERBOSE_OFF = 0
VERBOSE_ON = 1
class verbose:
"""
On-demand oneMKL verbosing functionality.
To make it easier to debug performance issues, oneMKL can dump verbose
messages containing execution information such as kernel duration. Verbose
logging can be enabled globally via the `MKL_VERBOSE` environment variable,
but that dumps messages for every step and produces a large amount of output.
For performance investigations, verbose messages from a single iteration are
usually enough. This on-demand functionality makes it possible to limit the
scope of verbose message dumping. In the following example, verbose messages
will be dumped out for the second inference only.
.. highlight:: python
.. code-block:: python
import torch
model(data)
with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
model(data)
Args:
level: Verbose level
- ``VERBOSE_OFF``: Disable verbosing
- ``VERBOSE_ON``: Enable verbosing
"""
def __init__(self, enable):
self.enable = enable
def __enter__(self):
if self.enable == VERBOSE_OFF:
return
st = torch._C._verbose.mkl_set_verbose(self.enable)
assert (
st
), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
return self
def __exit__(self, exc_type, exc_val, exc_tb):
torch._C._verbose.mkl_set_verbose(VERBOSE_OFF)
return False


@@ -0,0 +1,100 @@
# mypy: allow-untyped-defs
import sys
from contextlib import contextmanager
from typing import TYPE_CHECKING
import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
def is_available():
r"""Return whether PyTorch is built with MKL-DNN support."""
return torch._C._has_mkldnn
VERBOSE_OFF = 0
VERBOSE_ON = 1
VERBOSE_ON_CREATION = 2
class verbose:
"""
On-demand oneDNN (former MKL-DNN) verbosing functionality.
To make it easier to debug performance issues, oneDNN can dump verbose
messages containing information such as kernel size, input data size, and
execution duration while executing the kernel. Verbose logging can be enabled
globally via the `DNNL_VERBOSE` environment variable, but that dumps messages
for every step and produces a large amount of output. For performance
investigations, verbose messages from a single iteration are usually enough.
This on-demand functionality makes it possible to limit the scope of verbose
message dumping. In the following example, verbose messages will be dumped
out for the second inference only.
.. highlight:: python
.. code-block:: python
import torch
model(data)
with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
model(data)
Args:
level: Verbose level
- ``VERBOSE_OFF``: Disable verbosing
- ``VERBOSE_ON``: Enable verbosing
- ``VERBOSE_ON_CREATION``: Enable verbosing, including oneDNN kernel creation
"""
def __init__(self, level):
self.level = level
def __enter__(self):
if self.level == VERBOSE_OFF:
return
st = torch._C._verbose.mkldnn_set_verbose(self.level)
assert (
st
), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
return self
def __exit__(self, exc_type, exc_val, exc_tb):
torch._C._verbose.mkldnn_set_verbose(VERBOSE_OFF)
return False
def set_flags(_enabled, _deterministic=None):
orig_flags = (torch._C._get_mkldnn_enabled(), torch._C._get_mkldnn_deterministic())
torch._C._set_mkldnn_enabled(_enabled)
if _deterministic is not None:
torch._C._set_mkldnn_deterministic(_deterministic)
return orig_flags
@contextmanager
def flags(enabled=False, deterministic=False):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled, deterministic)
try:
yield
finally:
with __allow_nonbracketed_mutation():
set_flags(*orig_flags)
class MkldnnModule(PropModule):
def __init__(self, m, name):
super().__init__(m, name)
enabled = ContextProp(torch._C._get_mkldnn_enabled, torch._C._set_mkldnn_enabled)
deterministic = ContextProp(
torch._C._get_mkldnn_deterministic, torch._C._set_mkldnn_deterministic
)
if TYPE_CHECKING:
enabled: ContextProp
deterministic: ContextProp
sys.modules[__name__] = MkldnnModule(sys.modules[__name__], __name__)
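A usage sketch for the flags() context manager and the module-level ContextProps installed above:

import torch

print(torch.backends.mkldnn.is_available())
print(torch.backends.mkldnn.enabled)

# Temporarily disable oneDNN; the previous setting is restored on exit.
with torch.backends.mkldnn.flags(enabled=False):
    torch.nn.functional.conv2d(torch.randn(1, 3, 8, 8), torch.randn(4, 3, 3, 3))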


@@ -0,0 +1,55 @@
# mypy: allow-untyped-defs
from functools import lru_cache as _lru_cache
from typing import Optional, TYPE_CHECKING
import torch
from torch.library import Library as _Library
__all__ = ["is_built", "is_available", "is_macos13_or_newer", "is_macos_or_newer"]
def is_built() -> bool:
r"""Return whether PyTorch is built with MPS support.
Note that this doesn't necessarily mean MPS is available; just that
if this PyTorch binary were run on a machine with working MPS drivers
and devices, we would be able to use it.
"""
return torch._C._has_mps
@_lru_cache
def is_available() -> bool:
r"""Return a bool indicating if MPS is currently available."""
return torch._C._mps_is_available()
@_lru_cache
def is_macos_or_newer(major: int, minor: int) -> bool:
r"""Return a bool indicating whether MPS is running on given MacOS or newer."""
return torch._C._mps_is_on_macos_or_newer(major, minor)
@_lru_cache
def is_macos13_or_newer(minor: int = 0) -> bool:
r"""Return a bool indicating whether MPS is running on MacOS 13 or newer."""
return torch._C._mps_is_on_macos_or_newer(13, minor)
_lib: Optional[_Library] = None
def _init():
r"""Register prims as implementation of var_mean and group_norm."""
global _lib
if _lib is not None or not is_built():
return
from torch._decomp.decompositions import native_group_norm_backward
from torch._refs import native_group_norm
_lib = _Library("aten", "IMPL") # noqa: TOR901
_lib.impl("native_group_norm", native_group_norm, "MPS")
_lib.impl("native_group_norm_backward", native_group_norm_backward, "MPS")


@@ -0,0 +1,32 @@
# mypy: allow-untyped-defs
from contextlib import contextmanager
import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
__all__ = ["is_available", "flags", "set_flags"]
def is_available():
r"""Return whether PyTorch is built with NNPACK support."""
return torch._nnpack_available()
def set_flags(_enabled):
r"""Set if nnpack is enabled globally"""
orig_flags = (torch._C._get_nnpack_enabled(),)
torch._C._set_nnpack_enabled(_enabled)
return orig_flags
@contextmanager
def flags(enabled=False):
r"""Context manager for setting if nnpack is enabled globally"""
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled)
try:
yield
finally:
with __allow_nonbracketed_mutation():
set_flags(orig_flags[0])
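A short sketch of the scoped toggle; as with the other backends, the original flag value is restored when the block exits:

import torch

print(torch.backends.nnpack.is_available())
with torch.backends.nnpack.flags(enabled=False):
    torch.nn.functional.conv2d(torch.randn(1, 3, 8, 8), torch.randn(4, 3, 3, 3))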


@@ -0,0 +1,7 @@
# mypy: allow-untyped-defs
import torch
def is_available():
r"""Return whether PyTorch is built with OpenMP support."""
return torch._C.has_openmp


@@ -0,0 +1,112 @@
# mypy: allow-untyped-defs
import sys
import warnings
from contextlib import contextmanager
from functools import lru_cache as _lru_cache
from typing import Any
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
try:
import opt_einsum as _opt_einsum # type: ignore[import]
except ImportError:
_opt_einsum = None
@_lru_cache
def is_available() -> bool:
r"""Return a bool indicating if opt_einsum is currently available."""
return _opt_einsum is not None
def get_opt_einsum() -> Any:
r"""Return the opt_einsum package if opt_einsum is currently available, else None."""
return _opt_einsum
def _set_enabled(_enabled: bool) -> None:
if not is_available() and _enabled:
raise ValueError(
f"opt_einsum is not available, so setting `enabled` to {_enabled} will not reap "
"the benefits of calculating an optimal path for einsum. torch.einsum will "
"fall back to contracting from left to right. To enable this optimal path "
"calculation, please install opt-einsum."
)
global enabled
enabled = _enabled
def _get_enabled() -> bool:
return enabled
def _set_strategy(_strategy: str) -> None:
if not is_available():
raise ValueError(
f"opt_einsum is not available, so setting `strategy` to {_strategy} will not be meaningful. "
"torch.einsum will bypass path calculation and simply contract from left to right. "
"Please install opt_einsum or unset `strategy`."
)
if not enabled:
raise ValueError(
f"opt_einsum is not enabled, so setting a `strategy` to {_strategy} will not be meaningful. "
"torch.einsum will bypass path calculation and simply contract from left to right. "
"Please set `enabled` to `True` as well or unset `strategy`."
)
if _strategy not in ["auto", "greedy", "optimal"]:
raise ValueError(
f"`strategy` must be one of the following: [auto, greedy, optimal] but is {_strategy}"
)
global strategy
strategy = _strategy
def _get_strategy() -> str:
return strategy
def set_flags(_enabled=None, _strategy=None):
orig_flags = (enabled, None if not is_available() else strategy)
if _enabled is not None:
_set_enabled(_enabled)
if _strategy is not None:
_set_strategy(_strategy)
return orig_flags
@contextmanager
def flags(enabled=None, strategy=None):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled, strategy)
try:
yield
finally:
# recover the previous values
with __allow_nonbracketed_mutation():
set_flags(*orig_flags)
# The magic here is to allow us to intercept code like this:
#
# torch.backends.opt_einsum.enabled = True
class OptEinsumModule(PropModule):
def __init__(self, m, name):
super().__init__(m, name)
global enabled
enabled = ContextProp(_get_enabled, _set_enabled)
global strategy
strategy = None
if is_available():
strategy = ContextProp(_get_strategy, _set_strategy)
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = OptEinsumModule(sys.modules[__name__], __name__)
enabled = True if is_available() else False
strategy = "auto" if is_available() else None


@@ -0,0 +1,66 @@
# mypy: allow-untyped-defs
import sys
import types
from typing import List
import torch
# This function should correspond to the enums present in c10/core/QEngine.h
def _get_qengine_id(qengine: str) -> int:
if qengine == "none" or qengine == "" or qengine is None:
ret = 0
elif qengine == "fbgemm":
ret = 1
elif qengine == "qnnpack":
ret = 2
elif qengine == "onednn":
ret = 3
elif qengine == "x86":
ret = 4
else:
ret = -1
raise RuntimeError(f"{qengine} is not a valid value for quantized engine")
return ret
# This function should correspond to the enums present in c10/core/QEngine.h
def _get_qengine_str(qengine: int) -> str:
all_engines = {0: "none", 1: "fbgemm", 2: "qnnpack", 3: "onednn", 4: "x86"}
return all_engines.get(qengine, "*undefined")
class _QEngineProp:
def __get__(self, obj, objtype) -> str:
return _get_qengine_str(torch._C._get_qengine())
def __set__(self, obj, val: str) -> None:
torch._C._set_qengine(_get_qengine_id(val))
class _SupportedQEnginesProp:
def __get__(self, obj, objtype) -> List[str]:
qengines = torch._C._supported_qengines()
return [_get_qengine_str(qe) for qe in qengines]
def __set__(self, obj, val) -> None:
raise RuntimeError("Assignment not supported")
class QuantizedEngine(types.ModuleType):
def __init__(self, m, name):
super().__init__(name)
self.m = m
def __getattr__(self, attr):
return self.m.__getattribute__(attr)
engine = _QEngineProp()
supported_engines = _SupportedQEnginesProp()
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = QuantizedEngine(sys.modules[__name__], __name__)
engine: str
supported_engines: List[str]
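A sketch of querying and selecting the quantized engine through the properties installed above; which engines are listed depends on the build and platform:

import torch

print(torch.backends.quantized.supported_engines)  # e.g. ['none', 'fbgemm', 'x86', ...]
print(torch.backends.quantized.engine)

if "fbgemm" in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = "fbgemm"  # routed through _QEngineProp.__set__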


@@ -0,0 +1,943 @@
# mypy: allow-untyped-defs
"""
This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable Processors with optimal configurations.
Both single-instance and multi-instance inference are supported.
Note: the term "instance" here doesn't refer to a cloud instance. This script is executed as a single process. It invokes
multiple "instances", each formed from a group of threads. In this context, an "instance" is a group of threads.
Illustrated as below:
::
+-----------------------------+----------------------+-------+
| process | thread | core |
+=============================+======================+=======+
| torch.backends.xeon.run_cpu | instance 0: thread 0 | 0 |
| | thread 1 | 1 |
| +----------------------+-------+
| | instance 1: thread 0 | 2 |
| | thread 1 | 3 |
| +----------------------+-------+
| | ... | ... |
| +----------------------+-------+
| | instance N: thread 0 | M |
| | thread 1 | M+1 |
+-----------------------------+----------------------+-------+
To get peak performance on Intel(R) Xeon(R) Scalable Processors, the script optimizes the configuration of thread and memory
management. For thread management, the script configures thread affinity and preloads the Intel OpenMP library.
For memory management, it configures NUMA binding and preloads an optimized memory allocation library (e.g. TCMalloc or JeMalloc).
Environment variables that will be set by this script:
+------------------+-------------------------------------------------------------------------------------------------+
| Environ Variable | Value |
+==================+=================================================================================================+
| LD_PRELOAD | Depending on knobs you set, <lib>/libiomp5.so, <lib>/libjemalloc.so, <lib>/libtcmalloc.so might |
| | be appended to LD_PRELOAD. |
+------------------+-------------------------------------------------------------------------------------------------+
| KMP_AFFINITY | If libiomp5.so is preloaded, KMP_AFFINITY could be set to "granularity=fine,compact,1,0". |
+------------------+-------------------------------------------------------------------------------------------------+
| KMP_BLOCKTIME | If libiomp5.so is preloaded, KMP_BLOCKTIME is set to "1". |
+------------------+-------------------------------------------------------------------------------------------------+
| OMP_NUM_THREADS | value of ncores_per_instance |
+------------------+-------------------------------------------------------------------------------------------------+
| MALLOC_CONF | If libjemalloc.so is preloaded, MALLOC_CONF will be set to |
| | "oversize_threshold:1,background_thread:true,metadata_thp:auto". |
+------------------+-------------------------------------------------------------------------------------------------+
*Note*: This script respects environment variables that are already set. That is, if you set any of the environment variables
mentioned above before running the script, the script will not overwrite their values.
How to use this module:
~~~~~~~~~~~~~~~~~~~~~~~
Single instance inference
-------------------------
1. Run single-instance inference on a single node with all CPU nodes.
::
python -m torch.backends.xeon.run_cpu --throughput-mode script.py args
2. Run single-instance inference on a single CPU node.
::
python -m torch.backends.xeon.run_cpu --node-id 1 script.py args
Multi-instance inference
------------------------
1. Multi-instance
By default this tool runs one process per node. If you want to set the number of instances and cores per instance,
--ninstances and --ncores-per-instance should be set.
::
python -m torch.backends.xeon.run_cpu -- python_script args
e.g.: on an Intel(R) Xeon(R) Scalable Processor, run 14 instances with 4 cores per instance
::
python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores-per-instance 4 python_script args
2. Run single-instance inference among multiple instances.
By default, all ninstances are run. If you want to independently run a single instance among ninstances, specify --rank.
e.g.: run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 0-27)
::
python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args
e.g.: run the 1st instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 28-55)
::
python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args
e.g.: run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances, 2 cores per instance,
using the first four cores (rank 0 runs on cores 0-1, i.e., numactl -C 0-1)
::
python -m torch.backends.xeon.run_cpu --core-list "0, 1, 2, 3" --ninstances 2 --ncores-per-instance 2
--rank 0 python_script args
3. To look up what optional arguments this module offers:
::
python -m torch.backends.xeon.run_cpu --help
Memory allocator
----------------
"--enable-tcmalloc" and "--enable-jemalloc" can be used to enable different memory allcator.
"""
import glob
import logging
import os
import platform
import re
import subprocess
import sys
from argparse import ArgumentParser, RawTextHelpFormatter, REMAINDER
from os.path import expanduser
from typing import Dict, List
from torch.distributed.elastic.multiprocessing import (
DefaultLogsSpecs as _DefaultLogsSpecs,
start_processes,
Std,
)
format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=format_str)
logger = logging.getLogger(__name__)
class _CPUinfo:
"""Get CPU information, such as cores list and NUMA information."""
def __init__(self, test_input=""):
self.cpuinfo = []
if platform.system() in ["Windows", "Darwin"]:
raise RuntimeError(f"{platform.system()} is not supported!!!")
elif platform.system() == "Linux":
# Sample output of: `lscpu --parse=CPU,Core,Socket,Node`
#
# # The following is the parsable format, which can be fed to other
# # programs. Each different item in every column has an unique ID
# # starting from zero.
# # CPU,Core,Socket,Node
# 0,0,0,0
# 1,1,0,0
# ...
if test_input == "":
lscpu_cmd = ["lscpu", "--parse=CPU,Core,Socket,Node"]
lscpu_info = subprocess.check_output(
lscpu_cmd, universal_newlines=True
).split("\n")
else:
lscpu_info = test_input.split("\n")
# Get information about cpu, core, socket and node
for line in lscpu_info:
pattern = r"^([\d]+,[\d]+,[\d]+,[\d]?)"
regex_out = re.search(pattern, line)
if regex_out:
self.cpuinfo.append(regex_out.group(1).strip().split(","))
# physical cores := core column in lscpu output
# logical cores := CPU column in lscpu output
self.node_nums = int(max(line[3] for line in self.cpuinfo)) + 1
self.node_physical_cores: List[List[int]] = [] # node_id is index
self.node_logical_cores: List[List[int]] = [] # node_id is index
self.physical_core_node_map = {} # physical core to numa node id
self.logical_core_node_map = {} # logical core to numa node id
for node_id in range(self.node_nums):
cur_node_physical_core = []
cur_node_logical_core = []
for cpuinfo in self.cpuinfo:
nid = cpuinfo[3] if cpuinfo[3] != "" else "0"
if node_id == int(nid):
if int(cpuinfo[1]) not in cur_node_physical_core:
cur_node_physical_core.append(int(cpuinfo[1]))
self.physical_core_node_map[int(cpuinfo[1])] = int(node_id)
cur_node_logical_core.append(int(cpuinfo[0]))
self.logical_core_node_map[int(cpuinfo[0])] = int(node_id)
self.node_physical_cores.append(cur_node_physical_core)
self.node_logical_cores.append(cur_node_logical_core)
def _physical_core_nums(self):
return len(self.node_physical_cores) * len(self.node_physical_cores[0])
def _logical_core_nums(self):
return len(self.node_logical_cores) * len(self.node_logical_cores[0])
def get_node_physical_cores(self, node_id):
if node_id < 0 or node_id > self.node_nums - 1:
raise ValueError(
f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
)
return self.node_physical_cores[node_id]
def get_node_logical_cores(self, node_id):
if node_id < 0 or node_id > self.node_nums - 1:
raise ValueError(
f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
)
return self.node_logical_cores[node_id]
def get_all_physical_cores(self):
all_cores = []
for cores in self.node_physical_cores:
all_cores.extend(cores)
return all_cores
def get_all_logical_cores(self):
all_cores = []
for cores in self.node_logical_cores:
all_cores.extend(cores)
return all_cores
def numa_aware_check(self, core_list):
"""
Check whether all cores in core_list are in the same NUMA node.
Crossing NUMA nodes reduces performance.
We strongly advise against using cores from different NUMA nodes.
"""
cores_numa_map = self.logical_core_node_map
numa_ids = []
for core in core_list:
numa_id = cores_numa_map[core]
if numa_id not in numa_ids:
numa_ids.append(numa_id)
if len(numa_ids) > 1:
logger.warning(
"Numa Aware: cores:%s on different NUMA nodes:%s. To avoid \
this behavior, please use --ncores-per-instance knob to make sure number of cores is divisible by --ncores-per-\
instance. Alternatively, please use --skip-cross-node-cores knob.",
str(core_list),
str(numa_ids),
)
if len(numa_ids) == 0:
raise RuntimeError(
"invalid number of NUMA nodes; please make sure numa_ids >= 1"
)
return numa_ids
class _Launcher:
r"""Class for launcher."""
msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
{expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
def __init__(self) -> None:
self.cpuinfo = _CPUinfo()
def add_lib_preload(self, lib_type):
"""Enable TCMalloc/JeMalloc/intel OpenMP."""
library_paths = []
if "CONDA_PREFIX" in os.environ:
library_paths.append(f"{os.environ['CONDA_PREFIX']}/lib")
if "VIRTUAL_ENV" in os.environ:
library_paths.append(f"{os.environ['VIRTUAL_ENV']}/lib")
library_paths += [
f"{expanduser('~')}/.local/lib",
"/usr/local/lib",
"/usr/local/lib64",
"/usr/lib",
"/usr/lib64",
]
lib_find = False
lib_set = False
for item in os.getenv("LD_PRELOAD", "").split(":"):
if item.endswith(f"lib{lib_type}.so"):
lib_set = True
break
if not lib_set:
for lib_path in library_paths:
library_file = os.path.join(lib_path, f"lib{lib_type}.so")
matches = glob.glob(library_file)
if len(matches) > 0:
ld_preloads = [f"{matches[0]}", os.getenv("LD_PRELOAD", "")]
os.environ["LD_PRELOAD"] = os.pathsep.join(
[p.strip(os.pathsep) for p in ld_preloads if p]
)
lib_find = True
break
return lib_set or lib_find
def is_numactl_available(self):
numactl_available = False
try:
cmd = ["numactl", "-C", "0", "-m", "0", "hostname"]
r = subprocess.run(
cmd,
env=os.environ,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
if r.returncode == 0:
numactl_available = True
except Exception:
pass
return numactl_available
def set_memory_allocator(
self, enable_tcmalloc=True, enable_jemalloc=False, use_default_allocator=False
):
"""
Enable TCMalloc/JeMalloc with LD_PRELOAD and set configuration for JeMalloc.
By default, PTMalloc is used by PyTorch, but TCMalloc and JeMalloc can achieve better
memory reuse and reduce page faults, improving performance.
"""
if enable_tcmalloc and enable_jemalloc:
raise RuntimeError(
"Unable to enable TCMalloc and JEMalloc at the same time."
)
if enable_tcmalloc:
find_tc = self.add_lib_preload(lib_type="tcmalloc")
if not find_tc:
msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge gperftools" to install {{0}}'
logger.warning(msg.format("TCmalloc", "tcmalloc")) # noqa: G001
else:
logger.info("Use TCMalloc memory allocator")
elif enable_jemalloc:
find_je = self.add_lib_preload(lib_type="jemalloc")
if not find_je:
msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge jemalloc" to install {{0}}'
logger.warning(msg.format("Jemalloc", "jemalloc")) # noqa: G001
else:
logger.info("Use JeMalloc memory allocator")
self.set_env(
"MALLOC_CONF",
"oversize_threshold:1,background_thread:true,metadata_thp:auto",
)
elif use_default_allocator:
pass
else:
find_tc = self.add_lib_preload(lib_type="tcmalloc")
if find_tc:
logger.info("Use TCMalloc memory allocator")
return
find_je = self.add_lib_preload(lib_type="jemalloc")
if find_je:
logger.info("Use JeMalloc memory allocator")
return
            logger.warning(
                """Neither TCMalloc nor JeMalloc was found in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib
or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or
%s/.local/lib/ so the LD_PRELOAD environment variable will not be set.
This may degrade performance.""",
                expanduser("~"),
            )
def log_env_var(self, env_var_name=""):
if env_var_name in os.environ:
logger.info("%s=%s", env_var_name, os.environ[env_var_name])
def set_env(self, env_name, env_value):
if not env_value:
logger.warning("%s is None", env_name)
if env_name not in os.environ:
os.environ[env_name] = env_value
elif os.environ[env_name] != env_value:
logger.warning(
"Overriding value with the one set in environment variable: %s. \
Value applied: %s. Value ignored: %s",
env_name,
os.environ[env_name],
env_value,
)
self.log_env_var(env_name)
    # set_kmp_affinity controls whether KMP_AFFINITY is set.
    # When all cores on all nodes are used, including logical cores, setting
    # KMP_AFFINITY disables the logical cores, so in that case KMP_AFFINITY should not be set.
def set_multi_thread_and_allocator(
self,
ncores_per_instance,
disable_iomp=False,
set_kmp_affinity=True,
enable_tcmalloc=True,
enable_jemalloc=False,
use_default_allocator=False,
):
"""
Set multi-thread configuration and enable Intel openMP and TCMalloc/JeMalloc.
By default, GNU openMP and PTMalloc are used in PyTorch. but Intel openMP and TCMalloc/JeMalloc are better alternatives
to get performance benefit.
"""
self.set_memory_allocator(
enable_tcmalloc, enable_jemalloc, use_default_allocator
)
self.set_env("OMP_NUM_THREADS", str(ncores_per_instance))
if not disable_iomp:
find_iomp = self.add_lib_preload(lib_type="iomp5")
if not find_iomp:
msg = f'{self.msg_lib_notfound} you can use "conda install mkl" to install {{0}}'
logger.warning(msg.format("iomp", "iomp5")) # noqa: G001
else:
logger.info("Using Intel OpenMP")
if set_kmp_affinity:
self.set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
self.set_env("KMP_BLOCKTIME", "1")
self.log_env_var("LD_PRELOAD")
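        # With the defaults, an instance typically ends up with an environment
        # like the following (values are illustrative only):
        #   OMP_NUM_THREADS=<ncores_per_instance>
        #   KMP_AFFINITY=granularity=fine,compact,1,0
        #   KMP_BLOCKTIME=1
        #   LD_PRELOAD=.../libiomp5.so:.../libtcmalloc.so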
r"""
Launcher for single instance and multi-instance
"""
def launch(self, args):
cores = []
set_kmp_affinity = True
enable_taskset = False
        if args.core_list:  # user specified which cores to use via --core-list
            cores = [int(x) for x in args.core_list.split(",")]
            if args.ncores_per_instance == -1:
                raise RuntimeError(
                    'please specify "--ncores-per-instance" when passing the --core-list parameter'
                )
elif (
args.ninstances > 1
and args.ncores_per_instance * args.ninstances < len(cores)
):
                logger.warning(
                    "only the first %s cores will be used, "
                    "but you specified %s cores in --core-list",
                    args.ncores_per_instance * args.ninstances,
                    len(cores),
                )
else:
args.ninstances = len(cores) // args.ncores_per_instance
else:
if args.use_logical_core:
if args.node_id != -1:
cores = self.cpuinfo.get_node_logical_cores(args.node_id)
else:
cores = self.cpuinfo.get_all_logical_cores()
# When using all cores on all nodes, including logical cores,
# setting KMP_AFFINITY disables logical cores. Thus, KMP_AFFINITY should not be set.
set_kmp_affinity = False
else:
if args.node_id != -1:
cores = self.cpuinfo.get_node_physical_cores(args.node_id)
else:
cores = self.cpuinfo.get_all_physical_cores()
if (
not args.multi_instance
and args.ninstances == -1
and args.ncores_per_instance == -1
):
args.ninstances = 1
args.ncores_per_instance = len(cores)
elif (
args.multi_instance
and args.ninstances == -1
and args.ncores_per_instance == -1
):
args.throughput_mode = True
elif args.ncores_per_instance == -1 and args.ninstances != -1:
if args.ninstances > len(cores):
                    raise RuntimeError(
                        f"there are {len(cores)} total cores but you specified {args.ninstances} ninstances; "
                        "please make sure ninstances <= total_cores"
                    )
else:
args.ncores_per_instance = len(cores) // args.ninstances
elif args.ncores_per_instance != -1 and args.ninstances == -1:
if not args.skip_cross_node_cores:
args.ninstances = len(cores) // args.ncores_per_instance
else:
ncore_per_node = len(self.cpuinfo.node_physical_cores[0])
num_leftover_cores = ncore_per_node % args.ncores_per_instance
if args.ncores_per_instance > ncore_per_node:
# too many ncores_per_instance to skip cross-node cores
                        logger.warning(
                            "there are %s physical core(s) per NUMA node, but you specified %s "
                            "ncores_per_instance together with skip_cross_node_cores. Please make "
                            "sure --ncores-per-instance <= core(s) per node",
                            ncore_per_node,
                            args.ncores_per_instance,
                        )
sys.exit(-1)
elif num_leftover_cores == 0:
                        # there are no cross-node cores to skip
logger.info(
"--skip-cross-node-cores is set, but there are no cross-node cores."
)
args.ninstances = len(cores) // args.ncores_per_instance
else:
# skip cross-node cores
if args.ninstances != -1:
                            logger.warning(
                                "--skip-cross-node-cores overrides --ninstances; the explicitly "
                                "set --ninstances value will be ignored."
                            )
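                        # Drop the trailing cores of each node that cannot form a
                        # full instance. For example (hypothetically, 28 physical
                        # cores per node and --ncores-per-instance 12), the last 4
                        # cores of every node are removed so that each instance
                        # stays within a single node.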
i = 1
leftover_cores = set()
while ncore_per_node * i <= len(cores):
leftover_cores.update(
cores[
ncore_per_node * i
- num_leftover_cores : ncore_per_node * i
]
)
i += 1
cores = list(set(cores) - leftover_cores)
assert len(cores) % args.ncores_per_instance == 0
args.ninstances = len(cores) // args.ncores_per_instance
else:
if args.ninstances * args.ncores_per_instance > len(cores):
raise RuntimeError(
"Please make sure ninstances * ncores_per_instance <= total_cores"
)
if args.latency_mode:
            logger.warning(
                "--latency-mode overrides --ninstances, --ncores-per-instance, --node-id and "
                "--use-logical-core; these knobs won't take effect even if they are set explicitly."
            )
args.ncores_per_instance = 4
cores = self.cpuinfo.get_all_physical_cores()
args.ninstances = len(cores) // args.ncores_per_instance
if args.throughput_mode:
            logger.warning(
                "--throughput-mode overrides --ninstances, --ncores-per-instance, --node-id and "
                "--use-logical-core; these knobs won't take effect even if they are set explicitly."
            )
args.ninstances = self.cpuinfo.node_nums
cores = self.cpuinfo.get_all_physical_cores()
args.ncores_per_instance = len(cores) // args.ninstances
if args.ninstances > 1 and args.rank != -1:
logger.info(
"assigning %s cores for instance %s",
args.ncores_per_instance,
args.rank,
)
if not args.disable_numactl:
numactl_available = self.is_numactl_available()
if not numactl_available:
if not args.disable_taskset:
                    logger.warning(
                        "Core binding with numactl is not available. Disabling numactl and using taskset instead. "
                        "This may affect performance on multi-socket systems; please use numactl if memory binding is needed."
                    )
args.disable_numactl = True
enable_taskset = True
else:
                    logger.warning(
                        "Core binding with numactl is not available, and --disable-taskset is set. "
                        "Please unset --disable-taskset to fall back to taskset instead of numactl."
                    )
sys.exit(-1)
if not args.disable_taskset:
enable_taskset = True
self.set_multi_thread_and_allocator(
args.ncores_per_instance,
args.disable_iomp,
set_kmp_affinity,
args.enable_tcmalloc,
args.enable_jemalloc,
args.use_default_allocator,
)
entrypoint = ""
launch_args = {}
launch_envs: Dict[int, Dict] = {}
launch_tee = {}
        # check whether we were launched from torchrun with --nproc-per-node <num workers>
local_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))
for i in range(args.ninstances):
cmd = []
cur_process_cores = ""
if not args.disable_numactl or enable_taskset:
if not args.disable_numactl:
cmd = ["numactl"]
elif enable_taskset:
cmd = ["taskset"]
cores = sorted(cores)
if (
args.rank == -1
): # sequentially assign ncores_per_instance to ninstances
core_list = cores[
i
* args.ncores_per_instance : (i + 1)
* args.ncores_per_instance
]
else: # assign ncores_per_instance from rank
core_list = cores[
args.rank
* args.ncores_per_instance : (args.rank + 1)
* args.ncores_per_instance
]
core_ranges: List[Dict] = []
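                # When launched from torchrun (LOCAL_WORLD_SIZE > 1), split this
                # instance's cores evenly across the local ranks so that each
                # worker process gets a disjoint slice.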
if local_size > 1:
total_num_cores = len(core_list)
cores_per_rank = total_num_cores // local_size
assert (
cores_per_rank >= 1
), "At least one core needs to be assigned to each rank"
core_list = core_list[
cores_per_rank * local_rank : cores_per_rank * (local_rank + 1)
]
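                # Fold the sorted core list into contiguous "start-end" ranges,
                # e.g. [0, 1, 2, 5, 6] becomes "0-2,5-6" for numactl -C / taskset -c.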
for core in core_list:
if len(core_ranges) == 0:
range_elem = {"start": core, "end": core}
core_ranges.append(range_elem)
else:
if core - core_ranges[-1]["end"] == 1:
core_ranges[-1]["end"] = core
else:
range_elem = {"start": core, "end": core}
core_ranges.append(range_elem)
for r in core_ranges:
cur_process_cores = f"{cur_process_cores}{r['start']}-{r['end']},"
cur_process_cores = cur_process_cores[:-1]
if not args.disable_numactl:
numa_params = f"-C {cur_process_cores} "
numa_ids = ",".join(
[
str(numa_id)
for numa_id in self.cpuinfo.numa_aware_check(core_list)
]
)
numa_params += f"-m {numa_ids}"
cmd.extend(numa_params.split())
elif enable_taskset:
taskset_params = f"-c {cur_process_cores} "
cmd.extend(taskset_params.split())
with_python = not args.no_python
if with_python:
cmd.append(sys.executable)
cmd.append("-u")
if args.module:
cmd.append("-m")
cmd.append(args.program)
cmd.extend(args.program_args)
cmd_s = " ".join(cmd)
logger.info(cmd_s)
if entrypoint == "":
entrypoint = cmd[0]
del cmd[0]
launch_args[i] = tuple(cmd)
launch_envs[i] = {}
launch_tee[i] = Std.ALL
            if args.rank != -1:  # launch only a single instance, for the given rank
break
ctx = start_processes(
name=args.log_file_prefix,
entrypoint=entrypoint,
args=launch_args,
envs=launch_envs,
logs_specs=_DefaultLogsSpecs(log_dir=args.log_path, tee=launch_tee),
)
ctx.wait()
def _add_memory_allocator_params(parser):
group = parser.add_argument_group("Memory Allocator Parameters")
# allocator control
group.add_argument(
"--enable-tcmalloc",
"--enable_tcmalloc",
action="store_true",
default=False,
help="Enable tcmalloc allocator",
)
group.add_argument(
"--enable-jemalloc",
"--enable_jemalloc",
action="store_true",
default=False,
help="Enable jemalloc allocator",
)
group.add_argument(
"--use-default-allocator",
"--use_default_allocator",
action="store_true",
default=False,
help="Use default memory allocator",
)
def _add_multi_instance_params(parser):
group = parser.add_argument_group("Multi-instance Parameters")
# multi-instance control
group.add_argument(
"--ncores-per-instance",
"--ncores_per_instance",
metavar="\b",
default=-1,
type=int,
help="Cores per instance",
)
group.add_argument(
"--ninstances",
metavar="\b",
default=-1,
type=int,
help="For multi-instance, you should give the cores number you used for per instance.",
)
group.add_argument(
"--skip-cross-node-cores",
"--skip_cross_node_cores",
action="store_true",
default=False,
help="If specified --ncores-per-instance, skips cross-node cores.",
)
group.add_argument(
"--rank",
metavar="\b",
default="-1",
type=int,
help="Specify instance index to assign ncores_per_instance for rank; \
otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to \
https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md",
)
group.add_argument(
"--latency-mode",
"--latency_mode",
action="store_true",
default=False,
help="By default 4 core per instance and use all physical cores",
)
group.add_argument(
"--throughput-mode",
"--throughput_mode",
action="store_true",
default=False,
help="By default one instance per node and use all physical cores",
)
group.add_argument(
"--node-id",
"--node_id",
metavar="\b",
default=-1,
type=int,
help="node id for multi-instance, by default all nodes will be used",
)
group.add_argument(
"--use-logical-core",
"--use_logical_core",
action="store_true",
default=False,
help="Whether only use physical cores",
)
group.add_argument(
"--disable-numactl",
"--disable_numactl",
action="store_true",
default=False,
help="Disable numactl",
)
group.add_argument(
"--disable-taskset",
"--disable_taskset",
action="store_true",
default=False,
help="Disable taskset",
)
group.add_argument(
"--core-list",
"--core_list",
metavar="\b",
default=None,
type=str,
        help='Specify the core list as "core_id,core_id,..."; otherwise, all the cores will be used.',
)
group.add_argument(
"--log-path",
"--log_path",
metavar="\b",
default="",
type=str,
help="The log file directory. Default path is "
", which means disable logging to files.",
)
group.add_argument(
"--log-file-prefix",
"--log_file_prefix",
metavar="\b",
default="run",
type=str,
help="log file prefix",
)
def _add_kmp_iomp_params(parser):
group = parser.add_argument_group("IOMP Parameters")
group.add_argument(
"--disable-iomp",
"--disable_iomp",
action="store_true",
default=False,
help="By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD",
)
def create_args(parser=None):
"""
Parse the command line options.
@retval ArgumentParser
"""
parser.add_argument(
"--multi-instance",
"--multi_instance",
action="store_true",
default=False,
help="Enable multi-instance, by default one instance per node",
)
parser.add_argument(
"-m",
"--module",
default=False,
action="store_true",
help="Changes each process to interpret the launch script "
"as a python module, executing with the same behavior as"
'"python -m".',
)
parser.add_argument(
"--no-python",
"--no_python",
default=False,
action="store_true",
help='Do not prepend the --program script with "python" - just exec '
"it directly. Useful when the script is not a Python script.",
)
_add_memory_allocator_params(parser)
_add_kmp_iomp_params(parser)
_add_multi_instance_params(parser)
# positional
parser.add_argument(
"program",
type=str,
help="The full path to the program/script to be launched. "
"followed by all the arguments for the script",
)
# rest from the training program
parser.add_argument("program_args", nargs=REMAINDER)
def main(args):
env_before = set(os.environ.keys())
if platform.system() in ["Windows", "Darwin"]:
raise RuntimeError(f"{platform.system()} is not supported!!!")
if args.log_path:
os.makedirs(args.log_path, exist_ok=True)
else:
args.log_path = os.devnull
if args.latency_mode and args.throughput_mode:
        raise RuntimeError(
            "--latency-mode and --throughput-mode cannot be set at the same time"
        )
if not args.no_python and not args.program.endswith(".py"):
        raise RuntimeError(
            'For a non-Python script, please use the "--no-python" parameter.'
        )
# Verify LD_PRELOAD
if "LD_PRELOAD" in os.environ:
lst_valid = []
tmp_ldpreload = os.environ["LD_PRELOAD"]
for item in tmp_ldpreload.split(":"):
matches = glob.glob(item)
if len(matches) > 0:
lst_valid.append(item)
else:
logger.warning("%s doesn't exist. Removing it from LD_PRELOAD.", item)
if len(lst_valid) > 0:
os.environ["LD_PRELOAD"] = ":".join(lst_valid)
else:
os.environ["LD_PRELOAD"] = ""
launcher = _Launcher()
launcher.launch(args)
for x in sorted(set(os.environ.keys()) - env_before):
logger.debug("%s=%s", x, os.environ[x])
if __name__ == "__main__":
parser = ArgumentParser(
description="This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable "
"Processors with optimal configurations. Single instance inference, "
"multi-instance inference are enable. To get the peak performance on Intel(R) "
"Xeon(R) Scalable Processors, the script optimizes the configuration "
"of thread and memory management. For thread management, the script configures thread "
"affinity and the preload of Intel OMP library. For memory management, it configures "
"NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc) "
"\n################################# Basic usage ############################# \n"
"\n 1. single instance\n"
"\n >>> python -m torch.backends.xeon.run_cpu python_script args \n"
"\n2. multi-instance \n"
"\n >>> python -m torch.backends.xeon.run_cpu --ninstances xxx "
"--ncores-per-instance xx python_script args\n"
"\n############################################################################# \n",
formatter_class=RawTextHelpFormatter,
)
create_args(parser)
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,29 @@
# mypy: allow-untyped-defs
import sys
import types
import torch
class _XNNPACKEnabled:
def __get__(self, obj, objtype):
return torch._C._is_xnnpack_enabled()
def __set__(self, obj, val):
raise RuntimeError("Assignment not supported")
class XNNPACKEngine(types.ModuleType):
def __init__(self, m, name):
super().__init__(name)
self.m = m
def __getattr__(self, attr):
return self.m.__getattribute__(attr)
enabled = _XNNPACKEnabled()
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = XNNPACKEngine(sys.modules[__name__], __name__)
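# A usage sketch (assuming this file is installed as torch.backends.xnnpack):
# reading `torch.backends.xnnpack.enabled` reports whether PyTorch was built
# with XNNPACK support, and assigning to it raises RuntimeError by design.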