I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,72 @@
# mypy: allow-untyped-defs
import types
from contextlib import contextmanager
# The idea for this parameter is that we forbid bare assignment
# to torch.backends.<cudnn|mkldnn>.enabled and friends when running our
# test suite, where it's very easy to forget to undo the change
# later.
__allow_nonbracketed_mutation_flag = True
def disable_global_flags():
global __allow_nonbracketed_mutation_flag
__allow_nonbracketed_mutation_flag = False
def flags_frozen():
return not __allow_nonbracketed_mutation_flag
@contextmanager
def __allow_nonbracketed_mutation():
global __allow_nonbracketed_mutation_flag
old = __allow_nonbracketed_mutation_flag
__allow_nonbracketed_mutation_flag = True
try:
yield
finally:
__allow_nonbracketed_mutation_flag = old
class ContextProp:
def __init__(self, getter, setter):
self.getter = getter
self.setter = setter
def __get__(self, obj, objtype):
return self.getter()
def __set__(self, obj, val):
if not flags_frozen():
self.setter(val)
else:
raise RuntimeError(
f"not allowed to set {obj.__name__} flags "
"after disable_global_flags; please use flags() context manager instead"
)
class PropModule(types.ModuleType):
def __init__(self, m, name):
super().__init__(name)
self.m = m
def __getattr__(self, attr):
return self.m.__getattribute__(attr)
from torch.backends import (
cpu as cpu,
cuda as cuda,
cudnn as cudnn,
cusparselt as cusparselt,
mha as mha,
mkl as mkl,
mkldnn as mkldnn,
mps as mps,
nnpack as nnpack,
openmp as openmp,
quantized as quantized,
)
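A minimal behavioral sketch of the machinery above, assuming a standard torch install (note that `disable_global_flags()` freezes bare assignment for the rest of the process):

import torch

# Bare assignment through a ContextProp works while flags are not frozen.
torch.backends.cudnn.enabled = torch.backends.cudnn.enabled

torch.backends.disable_global_flags()
assert torch.backends.flags_frozen()

try:
    torch.backends.cudnn.enabled = False  # ContextProp.__set__ now raises
except RuntimeError as err:
    print(err)

# Bracketed mutation via a flags() context manager is still permitted because it
# temporarily re-enables __allow_nonbracketed_mutation_flag.
with torch.backends.cudnn.flags(enabled=False):
    pass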


@@ -0,0 +1,148 @@
# mypy: allow-untyped-defs
import hashlib
import json
from typing import Dict, Tuple
import coremltools as ct # type: ignore[import]
from coremltools.converters.mil.input_types import TensorType # type: ignore[import]
from coremltools.converters.mil.mil import types # type: ignore[import]
from coremltools.models.neural_network import quantization_utils # type: ignore[import]
import torch
CT_METADATA_VERSION = "com.github.apple.coremltools.version"
CT_METADATA_SOURCE = "com.github.apple.coremltools.source"
class ScalarType:
Float = 0
Double = 1
Int = 2
Long = 3
Undefined = 4
# Supported Tensor types in coremltools:
# https://github.com/apple/coremltools/blob/main/coremltools/converters/mil/frontend/torch/converter.py#L28
torch_to_mil_types = {
ScalarType.Float: types.fp32,
ScalarType.Double: types.fp64,
ScalarType.Int: types.int32,
ScalarType.Long: types.int64,
}
class CoreMLComputeUnit:
CPU = "cpuOnly"
CPUAndGPU = "cpuAndGPU"
ALL = "all"
class CoreMLQuantizationMode:
LINEAR = "linear"
LINEAR_SYMMETRIC = "linear_symmetric"
NONE = "none"
def TensorSpec(shape, dtype=ScalarType.Float):
return (shape, dtype)
def CompileSpec(
inputs,
outputs,
backend=CoreMLComputeUnit.CPU,
allow_low_precision=True,
quantization_mode=CoreMLQuantizationMode.NONE,
mlmodel_export_path=None,
):
return (
inputs,
outputs,
backend,
allow_low_precision,
quantization_mode,
mlmodel_export_path,
)
def _check_enumerated_shape(shape):
for s in shape:
if not isinstance(s, (list, tuple)):
return False
return True
def _convert_to_mil_type(shape, dtype, name: str):
mil_shape = shape
if _check_enumerated_shape(shape):
mil_shape = ct.EnumeratedShapes(shape)
ml_type = TensorType(shape=mil_shape, dtype=torch_to_mil_types[dtype])
ml_type.name = name
return ml_type
def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]):
spec = compile_spec["forward"]
(
input_specs,
output_specs,
backend,
allow_low_precision,
quantization_mode,
mlmodel_export_path,
) = spec
mil_inputs = []
inputs = []
for index, input in enumerate(input_specs):
shape, dtype = input
name = "input_" + str(index)
inputs.append([name, str(dtype), str(shape)])
ml_type = _convert_to_mil_type(shape, dtype, name)
mil_inputs.append(ml_type)
model = torch.jit.RecursiveScriptModule._construct(script_module, lambda x: None)
mlmodel = ct.convert(model, inputs=mil_inputs)
if quantization_mode != CoreMLQuantizationMode.NONE:
quant_model_spec = quantization_utils.quantize_weights(
mlmodel, nbits=8, quantization_mode=quantization_mode
)
mlmodel = ct.models.MLModel(quant_model_spec)
spec = mlmodel.get_spec()
assert len(spec.description.output) == len(output_specs) # type: ignore[attr-defined]
outputs = []
for index, output in enumerate(output_specs):
shape, dtype = output
name = spec.description.output[index].name # type: ignore[attr-defined]
outputs.append([name, str(dtype), str(shape)])
mlmodel = ct.models.model.MLModel(spec)
print(mlmodel)
if mlmodel_export_path is not None:
print(f"Saving CoreML .mlmodel file to {mlmodel_export_path}")
mlmodel.save(mlmodel_export_path)
config = {
"spec_ver": str(spec.specificationVersion), # type: ignore[attr-defined]
"backend": backend,
"allow_low_precision": str(allow_low_precision),
}
metadata = {
"coremltool_ver": mlmodel.user_defined_metadata[CT_METADATA_VERSION],
"torch_ver": mlmodel.user_defined_metadata[CT_METADATA_SOURCE],
}
coreml_compile_spec = {
"inputs": inputs,
"outputs": outputs,
"config": config,
"metadata": metadata,
}
mlmodel = spec.SerializeToString() # type: ignore[attr-defined]
return {
"model": mlmodel,
"hash": str(hashlib.sha256(mlmodel).hexdigest()),
"extra": json.dumps(coreml_compile_spec),
}
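A hedged sketch of how these helpers are typically combined into a compile spec and lowered to the Core ML delegate; coremltools must be installed, and the `torch._C._jit_to_backend("coreml", ...)` entry point from the to_backend flow is assumed here:

import torch
from torch.backends._coreml.preprocess import (
    CompileSpec,
    CoreMLComputeUnit,
    TensorSpec,
)

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

scripted = torch.jit.script(TinyModel())
compile_spec = {
    "forward": CompileSpec(
        inputs=(TensorSpec(shape=[1, 3, 224, 224]),),  # dtype defaults to ScalarType.Float
        outputs=(TensorSpec(shape=[1, 3, 224, 224]),),
        backend=CoreMLComputeUnit.ALL,
        allow_low_precision=True,
    ),
}
# Lowering invokes preprocess() above, which converts the TorchScript module
# with coremltools and returns the serialized model plus metadata.
coreml_module = torch._C._jit_to_backend("coreml", scripted, compile_spec)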


@@ -0,0 +1,199 @@
# mypy: allow-untyped-decorators
# mypy: allow-untyped-defs
from typing import List, Optional
import torch
from torch.backends._nnapi.serializer import _NnapiSerializer
ANEURALNETWORKS_PREFER_LOW_POWER = 0
ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1
ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2
class NnapiModule(torch.nn.Module):
"""Torch Module that wraps an NNAPI Compilation.
This module handles preparing the weights, initializing the
NNAPI TorchBind object, and adjusting the memory formats
of all inputs and outputs.
"""
# _nnapi.Compilation is defined
comp: Optional[torch.classes._nnapi.Compilation] # type: ignore[name-defined]
weights: List[torch.Tensor]
out_templates: List[torch.Tensor]
def __init__(
self,
shape_compute_module: torch.nn.Module,
ser_model: torch.Tensor,
weights: List[torch.Tensor],
inp_mem_fmts: List[int],
out_mem_fmts: List[int],
compilation_preference: int,
relax_f32_to_f16: bool,
):
super().__init__()
self.shape_compute_module = shape_compute_module
self.ser_model = ser_model
self.weights = weights
self.inp_mem_fmts = inp_mem_fmts
self.out_mem_fmts = out_mem_fmts
self.out_templates = []
self.comp = None
self.compilation_preference = compilation_preference
self.relax_f32_to_f16 = relax_f32_to_f16
@torch.jit.export
def init(self, args: List[torch.Tensor]):
assert self.comp is None
self.out_templates = self.shape_compute_module.prepare(self.ser_model, args) # type: ignore[operator]
self.weights = [w.contiguous() for w in self.weights]
comp = torch.classes._nnapi.Compilation()
comp.init2(
self.ser_model,
self.weights,
self.compilation_preference,
self.relax_f32_to_f16,
)
self.comp = comp
def forward(self, args: List[torch.Tensor]) -> List[torch.Tensor]:
if self.comp is None:
self.init(args)
comp = self.comp
assert comp is not None
outs = [torch.empty_like(out) for out in self.out_templates]
assert len(args) == len(self.inp_mem_fmts)
fixed_args = []
for idx in range(len(args)):
fmt = self.inp_mem_fmts[idx]
# These constants match the values in DimOrder in serializer.py
# TODO: See if it's possible to use those directly.
if fmt == 0:
fixed_args.append(args[idx].contiguous())
elif fmt == 1:
fixed_args.append(args[idx].permute(0, 2, 3, 1).contiguous())
else:
raise ValueError("Invalid mem_fmt")
comp.run(fixed_args, outs)
assert len(outs) == len(self.out_mem_fmts)
for idx in range(len(self.out_templates)):
fmt = self.out_mem_fmts[idx]
# These constants match the values in DimOrder in serializer.py
# TODO: See if it's possible to use those directly.
if fmt in (0, 2):
pass
elif fmt == 1:
outs[idx] = outs[idx].permute(0, 3, 1, 2)
else:
raise ValueError("Invalid mem_fmt")
return outs
def convert_model_to_nnapi(
model,
inputs,
serializer=None,
return_shapes=None,
use_int16_for_qint16=False,
compilation_preference=ANEURALNETWORKS_PREFER_SUSTAINED_SPEED,
relax_f32_to_f16=False,
):
(
shape_compute_module,
ser_model_tensor,
used_weights,
inp_mem_fmts,
out_mem_fmts,
retval_count,
) = process_for_nnapi(
model, inputs, serializer, return_shapes, use_int16_for_qint16
)
nnapi_model = NnapiModule(
shape_compute_module,
ser_model_tensor,
used_weights,
inp_mem_fmts,
out_mem_fmts,
compilation_preference,
relax_f32_to_f16,
)
class NnapiInterfaceWrapper(torch.nn.Module):
"""NNAPI list-ifying and de-list-ifying wrapper.
NNAPI always expects a list of inputs and provides a list of outputs.
This module allows us to accept inputs as separate arguments.
It returns results as either a single tensor or tuple,
matching the original module.
"""
def __init__(self, mod):
super().__init__()
self.mod = mod
wrapper_model_py = NnapiInterfaceWrapper(nnapi_model)
wrapper_model = torch.jit.script(wrapper_model_py)
# TODO: Maybe make these names match the original.
arg_list = ", ".join(f"arg_{idx}" for idx in range(len(inputs)))
if retval_count < 0:
ret_expr = "retvals[0]"
else:
ret_expr = "".join(f"retvals[{idx}], " for idx in range(retval_count))
wrapper_model.define(
f"def forward(self, {arg_list}):\n"
f" retvals = self.mod([{arg_list}])\n"
f" return {ret_expr}\n"
)
return wrapper_model
def process_for_nnapi(
model, inputs, serializer=None, return_shapes=None, use_int16_for_qint16=False
):
model = torch.jit.freeze(model)
if isinstance(inputs, torch.Tensor):
inputs = [inputs]
serializer = serializer or _NnapiSerializer(
config=None, use_int16_for_qint16=use_int16_for_qint16
)
(
ser_model,
used_weights,
inp_mem_fmts,
out_mem_fmts,
shape_compute_lines,
retval_count,
) = serializer.serialize_model(model, inputs, return_shapes)
ser_model_tensor = torch.tensor(ser_model, dtype=torch.int32)
# We have to create a new class here every time this function is called
# because module.define adds a method to the *class*, not the instance.
class ShapeComputeModule(torch.nn.Module):
"""Code-gen-ed module for tensor shape computation.
module.prepare will mutate ser_model according to the computed operand
shapes, based on the shapes of args. Returns a list of output templates.
"""
shape_compute_module = torch.jit.script(ShapeComputeModule())
real_shape_compute_lines = [
"def prepare(self, ser_model: torch.Tensor, args: List[torch.Tensor]) -> List[torch.Tensor]:\n",
] + [f" {line}\n" for line in shape_compute_lines]
shape_compute_module.define("".join(real_shape_compute_lines))
return (
shape_compute_module,
ser_model_tensor,
used_weights,
inp_mem_fmts,
out_mem_fmts,
retval_count,
)
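A hedged usage sketch for convert_model_to_nnapi. Conversion happens on the host; which ops convert depends on what `_NnapiSerializer` supports, and running the result requires an Android device with NNAPI:

import torch
from torch.backends._nnapi.prepare import convert_model_to_nnapi

model = torch.nn.ReLU().eval()
example_input = torch.zeros(1, 8)
traced = torch.jit.trace(model, example_input)

nnapi_model = convert_model_to_nnapi(traced, example_input)
# The wrapper's forward takes the same positional arguments as the original
# module and lazily builds the NNAPI Compilation on first call (on device).
nnapi_model.save("relu_nnapi.pt")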

File diff suppressed because it is too large.


@@ -0,0 +1,20 @@
import torch
__all__ = [
"get_cpu_capability",
]
def get_cpu_capability() -> str:
r"""Return cpu capability as a string value.
Possible values:
- "DEFAULT"
- "VSX"
- "Z VECTOR"
- "NO AVX"
- "AVX2"
- "AVX512"
"""
return torch._C._get_cpu_capability()
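A quick check of the detected capability; the exact string depends on the host CPU and on how PyTorch was built:

import torch

print(torch.backends.cpu.get_cpu_capability())  # e.g. "AVX2" or "AVX512" on x86-64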


@@ -0,0 +1,478 @@
# mypy: allow-untyped-defs
import contextlib
from typing import Union
from typing_extensions import deprecated
import torch
__all__ = [
"is_built",
"cuFFTPlanCacheAttrContextProp",
"cuFFTPlanCache",
"cuFFTPlanCacheManager",
"cuBLASModule",
"preferred_linalg_library",
"preferred_blas_library",
"cufft_plan_cache",
"matmul",
"SDPAParams",
"enable_cudnn_sdp",
"cudnn_sdp_enabled",
"enable_flash_sdp",
"flash_sdp_enabled",
"enable_mem_efficient_sdp",
"mem_efficient_sdp_enabled",
"math_sdp_enabled",
"enable_math_sdp",
"allow_fp16_bf16_reduction_math_sdp",
"fp16_bf16_reduction_math_sdp_allowed",
"is_flash_attention_available",
"can_use_flash_attention",
"can_use_efficient_attention",
"can_use_cudnn_attention",
"sdp_kernel",
]
def is_built():
r"""
Return whether PyTorch is built with CUDA support.
Note that this doesn't necessarily mean CUDA is available; just that if this PyTorch
binary were run on a machine with working CUDA drivers and devices, we would be able to use it.
"""
return torch._C._has_cuda
class cuFFTPlanCacheAttrContextProp:
# Like regular ContextProp, but uses the `.device_index` attribute from the
# calling object as the first argument to the getter and setter.
def __init__(self, getter, setter):
self.getter = getter
self.setter = setter
def __get__(self, obj, objtype):
return self.getter(obj.device_index)
def __set__(self, obj, val):
if isinstance(self.setter, str):
raise RuntimeError(self.setter)
self.setter(obj.device_index, val)
class cuFFTPlanCache:
r"""
Represent a specific plan cache for a specific `device_index`.
The attributes `size` and `max_size`, and method `clear`, can fetch and/or
change properties of the C++ cuFFT plan cache.
"""
def __init__(self, device_index):
self.device_index = device_index
size = cuFFTPlanCacheAttrContextProp(
torch._cufft_get_plan_cache_size,
".size is a read-only property showing the number of plans currently in the "
"cache. To change the cache capacity, set cufft_plan_cache.max_size.",
)
max_size = cuFFTPlanCacheAttrContextProp(
torch._cufft_get_plan_cache_max_size, torch._cufft_set_plan_cache_max_size
)
def clear(self):
return torch._cufft_clear_plan_cache(self.device_index)
class cuFFTPlanCacheManager:
r"""
Represent all cuFFT plan caches, returning the cuFFTPlanCache for a given device when indexed.
When this object is used directly as a `cuFFTPlanCache` object (e.g., setting
the `.max_size` attribute), the current device's cuFFT plan cache is used.
"""
__initialized = False
def __init__(self):
self.caches = []
self.__initialized = True
def __getitem__(self, device):
index = torch.cuda._utils._get_device_index(device)
if index < 0 or index >= torch.cuda.device_count():
raise RuntimeError(
f"cufft_plan_cache: expected 0 <= device index < {torch.cuda.device_count()}, but got "
f"device with index {index}"
)
if len(self.caches) == 0:
self.caches.extend(
cuFFTPlanCache(index) for index in range(torch.cuda.device_count())
)
return self.caches[index]
def __getattr__(self, name):
return getattr(self[torch.cuda.current_device()], name)
def __setattr__(self, name, value):
if self.__initialized:
return setattr(self[torch.cuda.current_device()], name, value)
else:
return super().__setattr__(name, value)
class cuBLASModule:
def __getattr__(self, name):
if name == "allow_tf32":
return torch._C._get_cublas_allow_tf32()
elif name == "allow_fp16_reduced_precision_reduction":
return torch._C._get_cublas_allow_fp16_reduced_precision_reduction()
elif name == "allow_bf16_reduced_precision_reduction":
return torch._C._get_cublas_allow_bf16_reduced_precision_reduction()
raise AttributeError("Unknown attribute " + name)
def __setattr__(self, name, value):
if name == "allow_tf32":
return torch._C._set_cublas_allow_tf32(value)
elif name == "allow_fp16_reduced_precision_reduction":
return torch._C._set_cublas_allow_fp16_reduced_precision_reduction(value)
elif name == "allow_bf16_reduced_precision_reduction":
return torch._C._set_cublas_allow_bf16_reduced_precision_reduction(value)
raise AttributeError("Unknown attribute " + name)
_LinalgBackends = {
"default": torch._C._LinalgBackend.Default,
"cusolver": torch._C._LinalgBackend.Cusolver,
"magma": torch._C._LinalgBackend.Magma,
}
_LinalgBackends_str = ", ".join(_LinalgBackends.keys())
def preferred_linalg_library(
backend: Union[None, str, torch._C._LinalgBackend] = None
) -> torch._C._LinalgBackend:
r"""
Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
.. warning:: This flag is experimental and subject to change.
When PyTorch runs a CUDA linear algebra operation it often uses the cuSOLVER or MAGMA libraries,
and if both are available it decides which to use with a heuristic.
This flag (a :class:`str`) allows overriding those heuristics.
* If `"cusolver"` is set then cuSOLVER will be used wherever possible.
* If `"magma"` is set then MAGMA will be used wherever possible.
* If `"default"` (the default) is set then heuristics will be used to pick between
cuSOLVER and MAGMA if both are available.
* When no input is given, this function returns the currently preferred library.
* User may use the environment variable TORCH_LINALG_PREFER_CUSOLVER=1 to set the preferred library to cuSOLVER
globally.
This flag only sets the initial value of the preferred library and the preferred library
may still be overridden by this function call later in your script.
Note: When a library is preferred other libraries may still be used if the preferred library
doesn't implement the operation(s) called.
This flag may achieve better performance if PyTorch's heuristic library selection is incorrect
for your application's inputs.
Currently supported linalg operators:
* :func:`torch.linalg.inv`
* :func:`torch.linalg.inv_ex`
* :func:`torch.linalg.cholesky`
* :func:`torch.linalg.cholesky_ex`
* :func:`torch.cholesky_solve`
* :func:`torch.cholesky_inverse`
* :func:`torch.linalg.lu_factor`
* :func:`torch.linalg.lu`
* :func:`torch.linalg.lu_solve`
* :func:`torch.linalg.qr`
* :func:`torch.linalg.eigh`
* :func:`torch.linalg.eigvalsh`
* :func:`torch.linalg.svd`
* :func:`torch.linalg.svdvals`
"""
if backend is None:
pass
elif isinstance(backend, str):
if backend not in _LinalgBackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_LinalgBackends_str}."
)
torch._C._set_linalg_preferred_backend(_LinalgBackends[backend])
elif isinstance(backend, torch._C._LinalgBackend):
torch._C._set_linalg_preferred_backend(backend)
else:
raise RuntimeError("Unknown input value type.")
return torch._C._get_linalg_preferred_backend()
_BlasBackends = {
"cublas": torch._C._BlasBackend.Cublas,
"cublaslt": torch._C._BlasBackend.Cublaslt,
"hipblaslt": torch._C._BlasBackend.Cublaslt, # alias
}
_BlasBackends_str = ", ".join(_BlasBackends.keys())
def preferred_blas_library(
backend: Union[None, str, torch._C._BlasBackend] = None
) -> torch._C._BlasBackend:
r"""
Override the library PyTorch uses for BLAS operations. Choose between cuBLAS and cuBLASLt.
.. warning:: This flag is experimental and subject to change.
When PyTorch runs a CUDA BLAS operation it defaults to cuBLAS even if both cuBLAS and cuBLASLt are available.
For PyTorch built for ROCm, hipBLAS and hipBLASLt may offer different performance.
This flag (a :class:`str`) allows overriding which BLAS library to use.
* If `"cublas"` is set then cuBLAS will be used wherever possible.
* If `"cublaslt"` is set then cuBLASLt will be used wherever possible.
* When no input is given, this function returns the currently preferred library.
* User may use the environment variable TORCH_BLAS_PREFER_CUBLASLT=1 to set the preferred library to cuBLASLt
globally.
This flag only sets the initial value of the preferred library and the preferred library
may still be overridden by this function call later in your script.
Note: When a library is preferred other libraries may still be used if the preferred library
doesn't implement the operation(s) called.
This flag may achieve better performance if PyTorch's library selection is incorrect
for your application's inputs.
"""
if backend is None:
pass
elif isinstance(backend, str):
if backend not in _BlasBackends:
raise RuntimeError(
"Unknown input value. " f"Choose from: {_BlasBackends_str}."
)
torch._C._set_blas_preferred_backend(_BlasBackends[backend])
elif isinstance(backend, torch._C._BlasBackend):
torch._C._set_blas_preferred_backend(backend)
else:
raise RuntimeError("Unknown input value type.")
return torch._C._get_blas_preferred_backend()
from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
# Set the __module__ attribute
SDPAParams.__module__ = "torch.backends.cuda"
SDPAParams.__name__ = "SDPAParams"
def flash_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether flash scaled dot product attention is enabled or not.
"""
return torch._C._get_flash_sdp_enabled()
def enable_flash_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables flash scaled dot product attention.
"""
torch._C._set_sdp_use_flash(enabled)
def mem_efficient_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether memory efficient scaled dot product attention is enabled or not.
"""
return torch._C._get_mem_efficient_sdp_enabled()
def enable_mem_efficient_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables memory efficient scaled dot product attention.
"""
torch._C._set_sdp_use_mem_efficient(enabled)
def math_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether math scaled dot product attention is enabled or not.
"""
return torch._C._get_math_sdp_enabled()
def enable_math_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables math scaled dot product attention.
"""
torch._C._set_sdp_use_math(enabled)
def allow_fp16_bf16_reduction_math_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables fp16/bf16 reduction in math scaled dot product attention.
"""
torch._C._set_math_sdp_allow_fp16_bf16_reduction(enabled)
def fp16_bf16_reduction_math_sdp_allowed():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether fp16/bf16 reduction in math scaled dot product attention is enabled or not.
"""
return torch._C._get_math_sdp_allow_fp16_bf16_reduction()
def is_flash_attention_available() -> bool:
r"""Check if PyTorch was built with FlashAttention for scaled_dot_product_attention.
Returns:
True if FlashAttention is built and available; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._is_flash_attention_available()
def can_use_flash_attention(params: SDPAParams, debug: bool = False) -> bool:
r"""Check if FlashAttention can be utilized in scaled_dot_product_attention.
Args:
params: An instance of SDPAParams containing the tensors for query,
key, value, an optional attention mask, dropout rate, and
a flag indicating if the attention is causal.
debug: Whether to logging.warn debug information as to why FlashAttention could not be run.
Defaults to False.
Returns:
True if FlashAttention can be used with the given parameters; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._can_use_flash_attention(params, debug)
def can_use_efficient_attention(params: SDPAParams, debug: bool = False) -> bool:
r"""Check if efficient_attention can be utilized in scaled_dot_product_attention.
Args:
params: An instance of SDPAParams containing the tensors for query,
key, value, an optional attention mask, dropout rate, and
a flag indicating if the attention is causal.
debug: Whether to logging.warn with information as to why efficient_attention could not be run.
Defaults to False.
Returns:
True if efficient_attention can be used with the given parameters; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._can_use_mem_efficient_attention(params, debug)
def can_use_cudnn_attention(params: SDPAParams, debug: bool = False) -> bool:
r"""Check if cudnn_attention can be utilized in scaled_dot_product_attention.
Args:
params: An instance of SDPAParams containing the tensors for query,
key, value, an optional attention mask, dropout rate, and
a flag indicating if the attention is causal.
debug: Whether to logging.warn with information as to why cuDNN attention could not be run.
Defaults to False.
Returns:
True if cuDNN can be used with the given parameters; otherwise, False.
Note:
This function is dependent on a CUDA-enabled build of PyTorch. It will return False
in non-CUDA environments.
"""
return torch._C._can_use_cudnn_attention(params, debug)
def cudnn_sdp_enabled():
r"""
.. warning:: This flag is beta and subject to change.
Returns whether cuDNN scaled dot product attention is enabled or not.
"""
return torch._C._get_cudnn_sdp_enabled()
def enable_cudnn_sdp(enabled: bool):
r"""
.. warning:: This flag is beta and subject to change.
Enables or disables cuDNN scaled dot product attention.
"""
torch._C._set_sdp_use_cudnn(enabled)
@contextlib.contextmanager
@deprecated(
(
"`torch.backends.cuda.sdp_kernel()` is deprecated. "
"In the future, this context manager will be removed. "
"Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, "
"with updated signature."
),
category=FutureWarning,
)
def sdp_kernel(
enable_flash: bool = True,
enable_math: bool = True,
enable_mem_efficient: bool = True,
enable_cudnn: bool = True,
):
r"""
.. warning:: This flag is beta and subject to change.
This context manager can be used to temporarily enable or disable any of the four backends for scaled dot product attention.
Upon exiting the context manager, the previous state of the flags will be restored.
"""
from torch.nn.attention import sdpa_kernel
backend_list = []
if enable_flash:
backend_list.append(SDPBackend.FLASH_ATTENTION)
if enable_mem_efficient:
backend_list.append(SDPBackend.EFFICIENT_ATTENTION)
if enable_math:
backend_list.append(SDPBackend.MATH)
if enable_cudnn:
backend_list.append(SDPBackend.CUDNN_ATTENTION)
with sdpa_kernel(backend_list) as context:
try:
yield context
finally:
pass
cufft_plan_cache = cuFFTPlanCacheManager()
matmul = cuBLASModule()
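A hedged sketch combining a few of the knobs above. The attention part uses `torch.nn.attention.sdpa_kernel`, the replacement that the deprecation notice on `sdp_kernel` points to, and assumes a CUDA device is present:

import torch
import torch.nn.functional as F

print(torch.backends.cuda.is_built())
print(torch.backends.cuda.preferred_linalg_library())  # query without changing anything
torch.backends.cuda.matmul.allow_tf32 = True           # routed through cuBLASModule.__setattr__

if torch.cuda.is_available():
    from torch.nn.attention import sdpa_kernel, SDPBackend

    q = k = v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
    # Restrict SDPA to flash attention with a math fallback, scoped to this block.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.MATH]):
        out = F.scaled_dot_product_attention(q, k, v)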


@@ -0,0 +1,208 @@
# mypy: allow-untyped-defs
import os
import sys
import warnings
from contextlib import contextmanager
from typing import Optional
import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
try:
from torch._C import _cudnn
except ImportError:
_cudnn = None # type: ignore[assignment]
# Write:
#
# torch.backends.cudnn.enabled = False
#
# to globally disable CuDNN/MIOpen
__cudnn_version: Optional[int] = None
if _cudnn is not None:
def _init():
global __cudnn_version
if __cudnn_version is None:
__cudnn_version = _cudnn.getVersionInt()
runtime_version = _cudnn.getRuntimeVersion()
compile_version = _cudnn.getCompileVersion()
runtime_major, runtime_minor, _ = runtime_version
compile_major, compile_minor, _ = compile_version
# Different major versions are always incompatible
# Starting with cuDNN 7, minor versions are backwards-compatible
# Not sure about MIOpen (ROCm), so always do a strict check
if runtime_major != compile_major:
cudnn_compatible = False
elif runtime_major < 7 or not _cudnn.is_cuda:
cudnn_compatible = runtime_minor == compile_minor
else:
cudnn_compatible = runtime_minor >= compile_minor
if not cudnn_compatible:
if os.environ.get("PYTORCH_SKIP_CUDNN_COMPATIBILITY_CHECK", "0") == "1":
return True
base_error_msg = (
f"cuDNN version incompatibility: "
f"PyTorch was compiled against {compile_version} "
f"but found runtime version {runtime_version}. "
f"PyTorch already comes bundled with cuDNN. "
f"One option to resolving this error is to ensure PyTorch "
f"can find the bundled cuDNN. "
)
if "LD_LIBRARY_PATH" in os.environ:
ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
if any(
substring in ld_library_path for substring in ["cuda", "cudnn"]
):
raise RuntimeError(
f"{base_error_msg}"
f"Looks like your LD_LIBRARY_PATH contains incompatible version of cudnn. "
f"Please either remove it from the path or install cudnn {compile_version}"
)
else:
raise RuntimeError(
f"{base_error_msg}"
f"one possibility is that there is a "
f"conflicting cuDNN in LD_LIBRARY_PATH."
)
else:
raise RuntimeError(base_error_msg)
return True
else:
def _init():
return False
def version():
"""Return the version of cuDNN."""
if not _init():
return None
return __cudnn_version
CUDNN_TENSOR_DTYPES = {
torch.half,
torch.float,
torch.double,
}
def is_available():
r"""Return a bool indicating if CUDNN is currently available."""
return torch._C._has_cudnn
def is_acceptable(tensor):
if not torch._C._get_cudnn_enabled():
return False
if tensor.device.type != "cuda" or tensor.dtype not in CUDNN_TENSOR_DTYPES:
return False
if not is_available():
warnings.warn(
"PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild "
"PyTorch making sure the library is visible to the build system."
)
return False
if not _init():
warnings.warn(
"cuDNN/MIOpen library not found. Check your {libpath}".format(
libpath={"darwin": "DYLD_LIBRARY_PATH", "win32": "PATH"}.get(
sys.platform, "LD_LIBRARY_PATH"
)
)
)
return False
return True
def set_flags(
_enabled=None,
_benchmark=None,
_benchmark_limit=None,
_deterministic=None,
_allow_tf32=None,
):
orig_flags = (
torch._C._get_cudnn_enabled(),
torch._C._get_cudnn_benchmark(),
None if not is_available() else torch._C._cuda_get_cudnn_benchmark_limit(),
torch._C._get_cudnn_deterministic(),
torch._C._get_cudnn_allow_tf32(),
)
if _enabled is not None:
torch._C._set_cudnn_enabled(_enabled)
if _benchmark is not None:
torch._C._set_cudnn_benchmark(_benchmark)
if _benchmark_limit is not None and is_available():
torch._C._cuda_set_cudnn_benchmark_limit(_benchmark_limit)
if _deterministic is not None:
torch._C._set_cudnn_deterministic(_deterministic)
if _allow_tf32 is not None:
torch._C._set_cudnn_allow_tf32(_allow_tf32)
return orig_flags
@contextmanager
def flags(
enabled=False,
benchmark=False,
benchmark_limit=10,
deterministic=False,
allow_tf32=True,
):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(
enabled, benchmark, benchmark_limit, deterministic, allow_tf32
)
try:
yield
finally:
# recover the previous values
with __allow_nonbracketed_mutation():
set_flags(*orig_flags)
# The magic here is to allow us to intercept code like this:
#
# torch.backends.<cudnn|mkldnn>.enabled = True
class CudnnModule(PropModule):
def __init__(self, m, name):
super().__init__(m, name)
enabled = ContextProp(torch._C._get_cudnn_enabled, torch._C._set_cudnn_enabled)
deterministic = ContextProp(
torch._C._get_cudnn_deterministic, torch._C._set_cudnn_deterministic
)
benchmark = ContextProp(
torch._C._get_cudnn_benchmark, torch._C._set_cudnn_benchmark
)
benchmark_limit = None
if is_available():
benchmark_limit = ContextProp(
torch._C._cuda_get_cudnn_benchmark_limit,
torch._C._cuda_set_cudnn_benchmark_limit,
)
allow_tf32 = ContextProp(
torch._C._get_cudnn_allow_tf32, torch._C._set_cudnn_allow_tf32
)
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = CudnnModule(sys.modules[__name__], __name__)
# Add type annotation for the replaced module
enabled: bool
deterministic: bool
benchmark: bool
allow_tf32: bool
benchmark_limit: int
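A short usage sketch for the CudnnModule attributes and the bracketed flags() context manager defined above; this is safe on CPU-only builds, where version() simply returns None:

import torch

print(torch.backends.cudnn.is_available(), torch.backends.cudnn.version())
print(torch.backends.cudnn.enabled, torch.backends.cudnn.benchmark)

# Scoped changes: the previous values are restored on exit.
with torch.backends.cudnn.flags(enabled=True, benchmark=True, deterministic=False):
    pass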


@@ -0,0 +1,64 @@
# mypy: allow-untyped-defs
import torch.cuda
try:
from torch._C import _cudnn
except ImportError:
# Uses of all the functions below should be guarded by torch.backends.cudnn.is_available(),
# so it's safe to not emit any checks here.
_cudnn = None # type: ignore[assignment]
def get_cudnn_mode(mode):
if mode == "RNN_RELU":
return int(_cudnn.RNNMode.rnn_relu)
elif mode == "RNN_TANH":
return int(_cudnn.RNNMode.rnn_tanh)
elif mode == "LSTM":
return int(_cudnn.RNNMode.lstm)
elif mode == "GRU":
return int(_cudnn.RNNMode.gru)
else:
raise Exception(f"Unknown mode: {mode}") # noqa: TRY002
# NB: We don't actually need this class anymore (in fact, we could serialize the
# dropout state for even better reproducibility), but it is kept for backwards
# compatibility for old models.
class Unserializable:
def __init__(self, inner):
self.inner = inner
def get(self):
return self.inner
def __getstate__(self):
# Note: can't return {}, because python2 won't call __setstate__
# if the value evaluates to False
return "<unserializable>"
def __setstate__(self, state):
self.inner = None
def init_dropout_state(dropout, train, dropout_seed, dropout_state):
dropout_desc_name = "desc_" + str(torch.cuda.current_device())
dropout_p = dropout if train else 0
if (dropout_desc_name not in dropout_state) or (
dropout_state[dropout_desc_name].get() is None
):
if dropout_p == 0:
dropout_state[dropout_desc_name] = Unserializable(None)
else:
dropout_state[dropout_desc_name] = Unserializable(
torch._cudnn_init_dropout_state( # type: ignore[call-arg]
dropout_p,
train,
dropout_seed,
self_ty=torch.uint8,
device=torch.device("cuda"),
)
)
dropout_ts = dropout_state[dropout_desc_name].get()
return dropout_ts


@@ -0,0 +1,42 @@
# mypy: allow-untyped-defs
from typing import Optional
import torch
__all__ = [
"version",
"is_available",
]
try:
from torch._C import _cusparselt
except ImportError:
_cusparselt = None # type: ignore[assignment]
__cusparselt_version: Optional[int] = None
if _cusparselt is not None:
def _init():
global __cusparselt_version
if __cusparselt_version is None:
__cusparselt_version = _cusparselt.getVersionInt()
return True
else:
def _init():
return False
def version() -> Optional[int]:
"""Return the version of cuSPARSELt"""
if not _init():
return None
return __cusparselt_version
def is_available() -> bool:
r"""Return a bool indicating if cuSPARSELt is currently available."""
return torch._C._has_cusparselt


@@ -0,0 +1,25 @@
# Config options to enable/disable C++ kernel for nn.functional.MHA
# and nn.TransformerEncoder
import torch
_is_fastpath_enabled: bool = True
def get_fastpath_enabled() -> bool:
"""Returns whether fast path for TransformerEncoder and MultiHeadAttention
is enabled, or ``True`` if jit is scripting.
.. note::
The fastpath might not be run even if ``get_fastpath_enabled`` returns
``True`` unless all conditions on inputs are met.
"""
if not torch.jit.is_scripting():
return _is_fastpath_enabled
return True
def set_fastpath_enabled(value: bool) -> None:
"""Sets whether fast path is enabled"""
global _is_fastpath_enabled
_is_fastpath_enabled = value
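A brief sketch of toggling the fastpath flag around a TransformerEncoder call (the module sizes are illustrative):

import torch

print(torch.backends.mha.get_fastpath_enabled())  # True by default
torch.backends.mha.set_fastpath_enabled(False)    # force the non-fused code path

layer = torch.nn.TransformerEncoderLayer(d_model=16, nhead=2, batch_first=True)
encoder = torch.nn.TransformerEncoder(layer, num_layers=1).eval()
with torch.no_grad():
    out = encoder(torch.randn(2, 4, 16))

torch.backends.mha.set_fastpath_enabled(True)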


@@ -0,0 +1,57 @@
# mypy: allow-untyped-defs
import torch
def is_available():
r"""Return whether PyTorch is built with MKL support."""
return torch._C.has_mkl
VERBOSE_OFF = 0
VERBOSE_ON = 1
class verbose:
"""
On-demand oneMKL verbosing functionality.
To make it easier to debug performance issues, oneMKL can dump verbose
messages containing execution information such as kernel duration. Verbose
logging can be enabled globally via the `MKL_VERBOSE` environment variable,
but that dumps messages for every step and produces a large amount of output.
For performance investigations, verbose messages from a single iteration are
usually enough. This on-demand functionality makes it possible to limit the
scope of verbose message dumping. In the following example, verbose messages
will be dumped out for the second inference only.
.. highlight:: python
.. code-block:: python
import torch
model(data)
with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
model(data)
Args:
level: Verbose level
- ``VERBOSE_OFF``: Disable verbosing
- ``VERBOSE_ON``: Enable verbosing
"""
def __init__(self, enable):
self.enable = enable
def __enter__(self):
if self.enable == VERBOSE_OFF:
return
st = torch._C._verbose.mkl_set_verbose(self.enable)
assert (
st
), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
return self
def __exit__(self, exc_type, exc_val, exc_tb):
torch._C._verbose.mkl_set_verbose(VERBOSE_OFF)
return False


@@ -0,0 +1,100 @@
# mypy: allow-untyped-defs
import sys
from contextlib import contextmanager
from typing import TYPE_CHECKING
import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
def is_available():
r"""Return whether PyTorch is built with MKL-DNN support."""
return torch._C._has_mkldnn
VERBOSE_OFF = 0
VERBOSE_ON = 1
VERBOSE_ON_CREATION = 2
class verbose:
"""
On-demand oneDNN (former MKL-DNN) verbosing functionality.
To make it easier to debug performance issues, oneDNN can dump verbose
messages containing information such as kernel size, input data size, and
execution duration while executing the kernel. Verbose logging can be enabled
globally via the `DNNL_VERBOSE` environment variable, but that dumps messages
for every step and produces a large amount of output. For performance
investigations, verbose messages from a single iteration are usually enough.
This on-demand functionality makes it possible to limit the scope of verbose
message dumping. In the following example, verbose messages will be dumped
out for the second inference only.
.. highlight:: python
.. code-block:: python
import torch
model(data)
with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
model(data)
Args:
level: Verbose level
- ``VERBOSE_OFF``: Disable verbosing
- ``VERBOSE_ON``: Enable verbosing
- ``VERBOSE_ON_CREATION``: Enable verbosing, including oneDNN kernel creation
"""
def __init__(self, level):
self.level = level
def __enter__(self):
if self.level == VERBOSE_OFF:
return
st = torch._C._verbose.mkldnn_set_verbose(self.level)
assert (
st
), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
return self
def __exit__(self, exc_type, exc_val, exc_tb):
torch._C._verbose.mkldnn_set_verbose(VERBOSE_OFF)
return False
def set_flags(_enabled, _deterministic=None):
orig_flags = (torch._C._get_mkldnn_enabled(), torch._C._get_mkldnn_deterministic())
torch._C._set_mkldnn_enabled(_enabled)
if _deterministic is not None:
torch._C._set_mkldnn_deterministic(_deterministic)
return orig_flags
@contextmanager
def flags(enabled=False, deterministic=False):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled, deterministic)
try:
yield
finally:
with __allow_nonbracketed_mutation():
set_flags(*orig_flags)
class MkldnnModule(PropModule):
def __init__(self, m, name):
super().__init__(m, name)
enabled = ContextProp(torch._C._get_mkldnn_enabled, torch._C._set_mkldnn_enabled)
deterministic = ContextProp(
torch._C._get_mkldnn_deterministic, torch._C._set_mkldnn_deterministic
)
if TYPE_CHECKING:
enabled: ContextProp
deterministic: ContextProp
sys.modules[__name__] = MkldnnModule(sys.modules[__name__], __name__)
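A usage sketch for the flags() context manager and the module-level ContextProps installed above:

import torch

print(torch.backends.mkldnn.is_available())
print(torch.backends.mkldnn.enabled)

# Temporarily disable oneDNN; the previous setting is restored on exit.
with torch.backends.mkldnn.flags(enabled=False):
    torch.nn.functional.conv2d(torch.randn(1, 3, 8, 8), torch.randn(4, 3, 3, 3))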


@@ -0,0 +1,55 @@
# mypy: allow-untyped-defs
from functools import lru_cache as _lru_cache
from typing import Optional, TYPE_CHECKING
import torch
from torch.library import Library as _Library
__all__ = ["is_built", "is_available", "is_macos13_or_newer", "is_macos_or_newer"]
def is_built() -> bool:
r"""Return whether PyTorch is built with MPS support.
Note that this doesn't necessarily mean MPS is available; just that
if this PyTorch binary were run on a machine with working MPS drivers
and devices, we would be able to use it.
"""
return torch._C._has_mps
@_lru_cache
def is_available() -> bool:
r"""Return a bool indicating if MPS is currently available."""
return torch._C._mps_is_available()
@_lru_cache
def is_macos_or_newer(major: int, minor: int) -> bool:
r"""Return a bool indicating whether MPS is running on given MacOS or newer."""
return torch._C._mps_is_on_macos_or_newer(major, minor)
@_lru_cache
def is_macos13_or_newer(minor: int = 0) -> bool:
r"""Return a bool indicating whether MPS is running on MacOS 13 or newer."""
return torch._C._mps_is_on_macos_or_newer(13, minor)
_lib: Optional[_Library] = None
def _init():
r"""Register prims as implementation of var_mean and group_norm."""
global _lib
if _lib is not None or not is_built():
return
from torch._decomp.decompositions import native_group_norm_backward
from torch._refs import native_group_norm
_lib = _Library("aten", "IMPL") # noqa: TOR901
_lib.impl("native_group_norm", native_group_norm, "MPS")
_lib.impl("native_group_norm_backward", native_group_norm_backward, "MPS")


@@ -0,0 +1,32 @@
# mypy: allow-untyped-defs
from contextlib import contextmanager
import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
__all__ = ["is_available", "flags", "set_flags"]
def is_available():
r"""Return whether PyTorch is built with NNPACK support."""
return torch._nnpack_available()
def set_flags(_enabled):
r"""Set if nnpack is enabled globally"""
orig_flags = (torch._C._get_nnpack_enabled(),)
torch._C._set_nnpack_enabled(_enabled)
return orig_flags
@contextmanager
def flags(enabled=False):
r"""Context manager for setting if nnpack is enabled globally"""
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled)
try:
yield
finally:
with __allow_nonbracketed_mutation():
set_flags(orig_flags[0])
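A short sketch of the scoped toggle; as with the other backends, the original flag value is restored when the block exits:

import torch

print(torch.backends.nnpack.is_available())
with torch.backends.nnpack.flags(enabled=False):
    torch.nn.functional.conv2d(torch.randn(1, 3, 8, 8), torch.randn(4, 3, 3, 3))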


@@ -0,0 +1,7 @@
# mypy: allow-untyped-defs
import torch
def is_available():
r"""Return whether PyTorch is built with OpenMP support."""
return torch._C.has_openmp


@@ -0,0 +1,112 @@
# mypy: allow-untyped-defs
import sys
import warnings
from contextlib import contextmanager
from functools import lru_cache as _lru_cache
from typing import Any
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
try:
import opt_einsum as _opt_einsum # type: ignore[import]
except ImportError:
_opt_einsum = None
@_lru_cache
def is_available() -> bool:
r"""Return a bool indicating if opt_einsum is currently available."""
return _opt_einsum is not None
def get_opt_einsum() -> Any:
r"""Return the opt_einsum package if opt_einsum is currently available, else None."""
return _opt_einsum
def _set_enabled(_enabled: bool) -> None:
if not is_available() and _enabled:
raise ValueError(
f"opt_einsum is not available, so setting `enabled` to {_enabled} will not reap "
"the benefits of calculating an optimal path for einsum. torch.einsum will "
"fall back to contracting from left to right. To enable this optimal path "
"calculation, please install opt-einsum."
)
global enabled
enabled = _enabled
def _get_enabled() -> bool:
return enabled
def _set_strategy(_strategy: str) -> None:
if not is_available():
raise ValueError(
f"opt_einsum is not available, so setting `strategy` to {_strategy} will not be meaningful. "
"torch.einsum will bypass path calculation and simply contract from left to right. "
"Please install opt_einsum or unset `strategy`."
)
if not enabled:
raise ValueError(
f"opt_einsum is not enabled, so setting a `strategy` to {_strategy} will not be meaningful. "
"torch.einsum will bypass path calculation and simply contract from left to right. "
"Please set `enabled` to `True` as well or unset `strategy`."
)
if _strategy not in ["auto", "greedy", "optimal"]:
raise ValueError(
f"`strategy` must be one of the following: [auto, greedy, optimal] but is {_strategy}"
)
global strategy
strategy = _strategy
def _get_strategy() -> str:
return strategy
def set_flags(_enabled=None, _strategy=None):
orig_flags = (enabled, None if not is_available() else strategy)
if _enabled is not None:
_set_enabled(_enabled)
if _strategy is not None:
_set_strategy(_strategy)
return orig_flags
@contextmanager
def flags(enabled=None, strategy=None):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled, strategy)
try:
yield
finally:
# recover the previous values
with __allow_nonbracketed_mutation():
set_flags(*orig_flags)
# The magic here is to allow us to intercept code like this:
#
# torch.backends.opt_einsum.enabled = True
class OptEinsumModule(PropModule):
def __init__(self, m, name):
super().__init__(m, name)
global enabled
enabled = ContextProp(_get_enabled, _set_enabled)
global strategy
strategy = None
if is_available():
strategy = ContextProp(_get_strategy, _set_strategy)
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = OptEinsumModule(sys.modules[__name__], __name__)
enabled = True if is_available() else False
strategy = "auto" if is_available() else None


@@ -0,0 +1,66 @@
# mypy: allow-untyped-defs
import sys
import types
from typing import List
import torch
# This function should correspond to the enums present in c10/core/QEngine.h
def _get_qengine_id(qengine: str) -> int:
if qengine == "none" or qengine == "" or qengine is None:
ret = 0
elif qengine == "fbgemm":
ret = 1
elif qengine == "qnnpack":
ret = 2
elif qengine == "onednn":
ret = 3
elif qengine == "x86":
ret = 4
else:
ret = -1
raise RuntimeError(f"{qengine} is not a valid value for quantized engine")
return ret
# This function should correspond to the enums present in c10/core/QEngine.h
def _get_qengine_str(qengine: int) -> str:
all_engines = {0: "none", 1: "fbgemm", 2: "qnnpack", 3: "onednn", 4: "x86"}
return all_engines.get(qengine, "*undefined")
class _QEngineProp:
def __get__(self, obj, objtype) -> str:
return _get_qengine_str(torch._C._get_qengine())
def __set__(self, obj, val: str) -> None:
torch._C._set_qengine(_get_qengine_id(val))
class _SupportedQEnginesProp:
def __get__(self, obj, objtype) -> List[str]:
qengines = torch._C._supported_qengines()
return [_get_qengine_str(qe) for qe in qengines]
def __set__(self, obj, val) -> None:
raise RuntimeError("Assignment not supported")
class QuantizedEngine(types.ModuleType):
def __init__(self, m, name):
super().__init__(name)
self.m = m
def __getattr__(self, attr):
return self.m.__getattribute__(attr)
engine = _QEngineProp()
supported_engines = _SupportedQEnginesProp()
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = QuantizedEngine(sys.modules[__name__], __name__)
engine: str
supported_engines: List[str]
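A sketch of querying and selecting the quantized engine through the properties installed above; which engines are listed depends on the build and platform:

import torch

print(torch.backends.quantized.supported_engines)  # e.g. ['none', 'fbgemm', 'x86', ...]
print(torch.backends.quantized.engine)

if "fbgemm" in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = "fbgemm"  # routed through _QEngineProp.__set__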


@@ -0,0 +1,943 @@
# mypy: allow-untyped-defs
"""
This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable Processors with optimal configurations.
Both single-instance and multi-instance inference are supported.
Note: the term "instance" here doesn't refer to a cloud instance. This script is executed as a single process. It invokes
multiple "instances", each formed from a group of threads. In this context, an "instance" is a group of threads.
Illustrated as below:
::
+-----------------------------+----------------------+-------+
| process | thread | core |
+=============================+======================+=======+
| torch.backends.xeon.run_cpu | instance 0: thread 0 | 0 |
| | thread 1 | 1 |
| +----------------------+-------+
| | instance 1: thread 0 | 2 |
| | thread 1 | 3 |
| +----------------------+-------+
| | ... | ... |
| +----------------------+-------+
| | instance N: thread 0 | M |
| | thread 1 | M+1 |
+-----------------------------+----------------------+-------+
To get peak performance on Intel(R) Xeon(R) Scalable Processors, the script optimizes the configuration of thread and memory
management. For thread management, the script configures thread affinity and preloads the Intel OpenMP library.
For memory management, it configures NUMA binding and preloads an optimized memory allocation library (e.g. TCMalloc or JeMalloc).
Environment variables that will be set by this script:
+------------------+-------------------------------------------------------------------------------------------------+
| Environ Variable | Value |
+==================+=================================================================================================+
| LD_PRELOAD | Depending on knobs you set, <lib>/libiomp5.so, <lib>/libjemalloc.so, <lib>/libtcmalloc.so might |
| | be appended to LD_PRELOAD. |
+------------------+-------------------------------------------------------------------------------------------------+
| KMP_AFFINITY | If libiomp5.so is preloaded, KMP_AFFINITY could be set to "granularity=fine,compact,1,0". |
+------------------+-------------------------------------------------------------------------------------------------+
| KMP_BLOCKTIME | If libiomp5.so is preloaded, KMP_BLOCKTIME is set to "1". |
+------------------+-------------------------------------------------------------------------------------------------+
| OMP_NUM_THREADS | value of ncores_per_instance |
+------------------+-------------------------------------------------------------------------------------------------+
| MALLOC_CONF | If libjemalloc.so is preloaded, MALLOC_CONF will be set to |
| | "oversize_threshold:1,background_thread:true,metadata_thp:auto". |
+------------------+-------------------------------------------------------------------------------------------------+
*Note*: This script respects environment variables that are already set. That is, if you set any of the environment variables
mentioned above before running the script, the script will not overwrite their values.
How to use this module:
~~~~~~~~~~~~~~~~~~~~~~~
Single instance inference
-------------------------
1. Run single-instance inference on a single node with all CPU nodes.
::
python -m torch.backends.xeon.run_cpu --throughput-mode script.py args
2. Run single-instance inference on a single CPU node.
::
python -m torch.backends.xeon.run_cpu --node-id 1 script.py args
Multi-instance inference
------------------------
1. Multi-instance
By default this tool runs one process per node. If you want to set the number of instances and cores per instance,
--ninstances and --ncores-per-instance should be set.
::
python -m torch.backends.xeon.run_cpu -- python_script args
e.g.: on an Intel(R) Xeon(R) Scalable Processor, run 14 instances with 4 cores per instance
::
python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores-per-instance 4 python_script args
2. Run single-instance inference among multiple instances.
By default, all ninstances are run. If you want to independently run a single instance among ninstances, specify --rank.
e.g.: run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 0-27)
::
python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args
e.g.: run the 1st instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 28-55)
::
python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args
e.g.: run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances, 2 cores per instance,
using the first four cores (rank 0 runs on cores 0-1, i.e., numactl -C 0-1)
::
python -m torch.backends.xeon.run_cpu --core-list "0, 1, 2, 3" --ninstances 2 --ncores-per-instance 2
--rank 0 python_script args
3. To look up what optional arguments this module offers:
::
python -m torch.backends.xeon.run_cpu --help
Memory allocator
----------------
"--enable-tcmalloc" and "--enable-jemalloc" can be used to enable different memory allcator.
"""
import glob
import logging
import os
import platform
import re
import subprocess
import sys
from argparse import ArgumentParser, RawTextHelpFormatter, REMAINDER
from os.path import expanduser
from typing import Dict, List
from torch.distributed.elastic.multiprocessing import (
DefaultLogsSpecs as _DefaultLogsSpecs,
start_processes,
Std,
)
format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=format_str)
logger = logging.getLogger(__name__)
class _CPUinfo:
"""Get CPU information, such as cores list and NUMA information."""
def __init__(self, test_input=""):
self.cpuinfo = []
if platform.system() in ["Windows", "Darwin"]:
raise RuntimeError(f"{platform.system()} is not supported!!!")
elif platform.system() == "Linux":
# Sample output of: `lscpu --parse=CPU,Core,Socket,Node`
#
# # The following is the parsable format, which can be fed to other
# # programs. Each different item in every column has an unique ID
# # starting from zero.
# # CPU,Core,Socket,Node
# 0,0,0,0
# 1,1,0,0
# ...
if test_input == "":
lscpu_cmd = ["lscpu", "--parse=CPU,Core,Socket,Node"]
lscpu_info = subprocess.check_output(
lscpu_cmd, universal_newlines=True
).split("\n")
else:
lscpu_info = test_input.split("\n")
# Get information about cpu, core, socket and node
for line in lscpu_info:
pattern = r"^([\d]+,[\d]+,[\d]+,[\d]?)"
regex_out = re.search(pattern, line)
if regex_out:
self.cpuinfo.append(regex_out.group(1).strip().split(","))
# physical cores := core column in lscpu output
# logical cores := CPU column in lscpu output
self.node_nums = int(max(line[3] for line in self.cpuinfo)) + 1
self.node_physical_cores: List[List[int]] = [] # node_id is index
self.node_logical_cores: List[List[int]] = [] # node_id is index
self.physical_core_node_map = {} # physical core to numa node id
self.logical_core_node_map = {} # logical core to numa node id
for node_id in range(self.node_nums):
cur_node_physical_core = []
cur_node_logical_core = []
for cpuinfo in self.cpuinfo:
nid = cpuinfo[3] if cpuinfo[3] != "" else "0"
if node_id == int(nid):
if int(cpuinfo[1]) not in cur_node_physical_core:
cur_node_physical_core.append(int(cpuinfo[1]))
self.physical_core_node_map[int(cpuinfo[1])] = int(node_id)
cur_node_logical_core.append(int(cpuinfo[0]))
self.logical_core_node_map[int(cpuinfo[0])] = int(node_id)
self.node_physical_cores.append(cur_node_physical_core)
self.node_logical_cores.append(cur_node_logical_core)
def _physical_core_nums(self):
return len(self.node_physical_cores) * len(self.node_physical_cores[0])
def _logical_core_nums(self):
return len(self.node_logical_cores) * len(self.node_logical_cores[0])
def get_node_physical_cores(self, node_id):
if node_id < 0 or node_id > self.node_nums - 1:
raise ValueError(
f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
)
return self.node_physical_cores[node_id]
def get_node_logical_cores(self, node_id):
if node_id < 0 or node_id > self.node_nums - 1:
raise ValueError(
f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
)
return self.node_logical_cores[node_id]
def get_all_physical_cores(self):
all_cores = []
for cores in self.node_physical_cores:
all_cores.extend(cores)
return all_cores
def get_all_logical_cores(self):
all_cores = []
for cores in self.node_logical_cores:
all_cores.extend(cores)
return all_cores
def numa_aware_check(self, core_list):
"""
Check whether all cores in core_list are in the same NUMA node.
Crossing NUMA nodes reduces performance.
We strongly advise against using cores from different NUMA nodes.
"""
cores_numa_map = self.logical_core_node_map
numa_ids = []
for core in core_list:
numa_id = cores_numa_map[core]
if numa_id not in numa_ids:
numa_ids.append(numa_id)
if len(numa_ids) > 1:
logger.warning(
"Numa Aware: cores:%s on different NUMA nodes:%s. To avoid \
this behavior, please use --ncores-per-instance knob to make sure number of cores is divisible by --ncores-per-\
instance. Alternatively, please use --skip-cross-node-cores knob.",
str(core_list),
str(numa_ids),
)
if len(numa_ids) == 0:
raise RuntimeError(
"invalid number of NUMA nodes; please make sure numa_ids >= 1"
)
return numa_ids
class _Launcher:
r"""Class for launcher."""
msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
{expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
def __init__(self) -> None:
self.cpuinfo = _CPUinfo()
def add_lib_preload(self, lib_type):
"""Enable TCMalloc/JeMalloc/intel OpenMP."""
library_paths = []
if "CONDA_PREFIX" in os.environ:
library_paths.append(f"{os.environ['CONDA_PREFIX']}/lib")
if "VIRTUAL_ENV" in os.environ:
library_paths.append(f"{os.environ['VIRTUAL_ENV']}/lib")
library_paths += [
f"{expanduser('~')}/.local/lib",
"/usr/local/lib",
"/usr/local/lib64",
"/usr/lib",
"/usr/lib64",
]
lib_find = False
lib_set = False
for item in os.getenv("LD_PRELOAD", "").split(":"):
if item.endswith(f"lib{lib_type}.so"):
lib_set = True
break
if not lib_set:
for lib_path in library_paths:
library_file = os.path.join(lib_path, f"lib{lib_type}.so")
matches = glob.glob(library_file)
if len(matches) > 0:
ld_preloads = [f"{matches[0]}", os.getenv("LD_PRELOAD", "")]
os.environ["LD_PRELOAD"] = os.pathsep.join(
[p.strip(os.pathsep) for p in ld_preloads if p]
)
lib_find = True
break
return lib_set or lib_find
def is_numactl_available(self):
numactl_available = False
try:
cmd = ["numactl", "-C", "0", "-m", "0", "hostname"]
r = subprocess.run(
cmd,
env=os.environ,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=False,
)
if r.returncode == 0:
numactl_available = True
except Exception:
pass
return numactl_available
def set_memory_allocator(
self, enable_tcmalloc=True, enable_jemalloc=False, use_default_allocator=False
):
"""
Enable TCMalloc/JeMalloc with LD_PRELOAD and set configuration for JeMalloc.
By default, PTMalloc is used by PyTorch, but TCMalloc and JeMalloc can achieve better
memory reuse and reduce page faults, improving performance.
"""
if enable_tcmalloc and enable_jemalloc:
raise RuntimeError(
"Unable to enable TCMalloc and JEMalloc at the same time."
)
if enable_tcmalloc:
find_tc = self.add_lib_preload(lib_type="tcmalloc")
if not find_tc:
msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge gperftools" to install {{0}}'
logger.warning(msg.format("TCmalloc", "tcmalloc")) # noqa: G001
else:
logger.info("Use TCMalloc memory allocator")
elif enable_jemalloc:
find_je = self.add_lib_preload(lib_type="jemalloc")
if not find_je:
msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge jemalloc" to install {{0}}'
logger.warning(msg.format("Jemalloc", "jemalloc")) # noqa: G001
else:
logger.info("Use JeMalloc memory allocator")
self.set_env(
"MALLOC_CONF",
"oversize_threshold:1,background_thread:true,metadata_thp:auto",
)
elif use_default_allocator:
pass
else:
find_tc = self.add_lib_preload(lib_type="tcmalloc")
if find_tc:
logger.info("Use TCMalloc memory allocator")
return
find_je = self.add_lib_preload(lib_type="jemalloc")
if find_je:
logger.info("Use JeMalloc memory allocator")
return
            logger.warning(
                """Neither TCMalloc nor JeMalloc was found in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib
or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or
%s/.local/lib/ so the LD_PRELOAD environment variable will not be set.
This may degrade performance.""",
                expanduser("~"),
            )
def log_env_var(self, env_var_name=""):
if env_var_name in os.environ:
logger.info("%s=%s", env_var_name, os.environ[env_var_name])
def set_env(self, env_name, env_value):
if not env_value:
logger.warning("%s is None", env_name)
if env_name not in os.environ:
os.environ[env_name] = env_value
elif os.environ[env_name] != env_value:
logger.warning(
"Overriding value with the one set in environment variable: %s. \
Value applied: %s. Value ignored: %s",
env_name,
os.environ[env_name],
env_value,
)
self.log_env_var(env_name)
    # set_kmp_affinity controls whether KMP_AFFINITY is set.
    # When all cores on all nodes are used, including logical cores, setting
    # KMP_AFFINITY disables the logical cores, so in that case KMP_AFFINITY should not be set.
def set_multi_thread_and_allocator(
self,
ncores_per_instance,
disable_iomp=False,
set_kmp_affinity=True,
enable_tcmalloc=True,
enable_jemalloc=False,
use_default_allocator=False,
):
"""
Set multi-thread configuration and enable Intel openMP and TCMalloc/JeMalloc.
By default, GNU openMP and PTMalloc are used in PyTorch. but Intel openMP and TCMalloc/JeMalloc are better alternatives
to get performance benefit.
"""
self.set_memory_allocator(
enable_tcmalloc, enable_jemalloc, use_default_allocator
)
self.set_env("OMP_NUM_THREADS", str(ncores_per_instance))
if not disable_iomp:
find_iomp = self.add_lib_preload(lib_type="iomp5")
if not find_iomp:
msg = f'{self.msg_lib_notfound} you can use "conda install mkl" to install {{0}}'
logger.warning(msg.format("iomp", "iomp5")) # noqa: G001
else:
logger.info("Using Intel OpenMP")
if set_kmp_affinity:
self.set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
self.set_env("KMP_BLOCKTIME", "1")
self.log_env_var("LD_PRELOAD")
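        # With the defaults, an instance typically ends up with an environment
        # like the following (values are illustrative only):
        #   OMP_NUM_THREADS=<ncores_per_instance>
        #   KMP_AFFINITY=granularity=fine,compact,1,0
        #   KMP_BLOCKTIME=1
        #   LD_PRELOAD=.../libiomp5.so:.../libtcmalloc.so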
r"""
Launcher for single instance and multi-instance
"""
def launch(self, args):
cores = []
set_kmp_affinity = True
enable_taskset = False
        if args.core_list:  # user specified which cores to use via --core-list
            cores = [int(x) for x in args.core_list.split(",")]
            if args.ncores_per_instance == -1:
                raise RuntimeError(
                    'please specify "--ncores-per-instance" when passing the --core-list parameter'
                )
elif (
args.ninstances > 1
and args.ncores_per_instance * args.ninstances < len(cores)
):
                logger.warning(
                    "only the first %s cores will be used, "
                    "but you specified %s cores in --core-list",
                    args.ncores_per_instance * args.ninstances,
                    len(cores),
                )
else:
args.ninstances = len(cores) // args.ncores_per_instance
else:
if args.use_logical_core:
if args.node_id != -1:
cores = self.cpuinfo.get_node_logical_cores(args.node_id)
else:
cores = self.cpuinfo.get_all_logical_cores()
# When using all cores on all nodes, including logical cores,
# setting KMP_AFFINITY disables logical cores. Thus, KMP_AFFINITY should not be set.
set_kmp_affinity = False
else:
if args.node_id != -1:
cores = self.cpuinfo.get_node_physical_cores(args.node_id)
else:
cores = self.cpuinfo.get_all_physical_cores()
if (
not args.multi_instance
and args.ninstances == -1
and args.ncores_per_instance == -1
):
args.ninstances = 1
args.ncores_per_instance = len(cores)
elif (
args.multi_instance
and args.ninstances == -1
and args.ncores_per_instance == -1
):
args.throughput_mode = True
elif args.ncores_per_instance == -1 and args.ninstances != -1:
if args.ninstances > len(cores):
                    raise RuntimeError(
                        f"there are {len(cores)} total cores but you specified {args.ninstances} ninstances; "
                        "please make sure ninstances <= total_cores"
                    )
else:
args.ncores_per_instance = len(cores) // args.ninstances
elif args.ncores_per_instance != -1 and args.ninstances == -1:
if not args.skip_cross_node_cores:
args.ninstances = len(cores) // args.ncores_per_instance
else:
ncore_per_node = len(self.cpuinfo.node_physical_cores[0])
num_leftover_cores = ncore_per_node % args.ncores_per_instance
if args.ncores_per_instance > ncore_per_node:
# too many ncores_per_instance to skip cross-node cores
                        logger.warning(
                            "there are %s physical core(s) per NUMA node, but you specified %s "
                            "ncores_per_instance together with skip_cross_node_cores. Please make "
                            "sure --ncores-per-instance <= core(s) per node",
                            ncore_per_node,
                            args.ncores_per_instance,
                        )
sys.exit(-1)
elif num_leftover_cores == 0:
                        # there are no cross-node cores to skip
logger.info(
"--skip-cross-node-cores is set, but there are no cross-node cores."
)
args.ninstances = len(cores) // args.ncores_per_instance
else:
# skip cross-node cores
if args.ninstances != -1:
                            logger.warning(
                                "--skip-cross-node-cores overrides --ninstances; the explicitly "
                                "set --ninstances value will be ignored."
                            )
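                        # Drop the trailing cores of each node that cannot form a
                        # full instance. For example (hypothetically, 28 physical
                        # cores per node and --ncores-per-instance 12), the last 4
                        # cores of every node are removed so that each instance
                        # stays within a single node.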
i = 1
leftover_cores = set()
while ncore_per_node * i <= len(cores):
leftover_cores.update(
cores[
ncore_per_node * i
- num_leftover_cores : ncore_per_node * i
]
)
i += 1
cores = list(set(cores) - leftover_cores)
assert len(cores) % args.ncores_per_instance == 0
args.ninstances = len(cores) // args.ncores_per_instance
else:
if args.ninstances * args.ncores_per_instance > len(cores):
raise RuntimeError(
"Please make sure ninstances * ncores_per_instance <= total_cores"
)
if args.latency_mode:
            logger.warning(
                "--latency-mode overrides --ninstances, --ncores-per-instance, --node-id and "
                "--use-logical-core; these knobs won't take effect even if they are set explicitly."
            )
args.ncores_per_instance = 4
cores = self.cpuinfo.get_all_physical_cores()
args.ninstances = len(cores) // args.ncores_per_instance
if args.throughput_mode:
            logger.warning(
                "--throughput-mode overrides --ninstances, --ncores-per-instance, --node-id and "
                "--use-logical-core; these knobs won't take effect even if they are set explicitly."
            )
args.ninstances = self.cpuinfo.node_nums
cores = self.cpuinfo.get_all_physical_cores()
args.ncores_per_instance = len(cores) // args.ninstances
if args.ninstances > 1 and args.rank != -1:
logger.info(
"assigning %s cores for instance %s",
args.ncores_per_instance,
args.rank,
)
if not args.disable_numactl:
numactl_available = self.is_numactl_available()
if not numactl_available:
if not args.disable_taskset:
                    logger.warning(
                        "Core binding with numactl is not available. Disabling numactl and using taskset instead. "
                        "This may affect performance on multi-socket systems; please use numactl if memory binding is needed."
                    )
args.disable_numactl = True
enable_taskset = True
else:
                    logger.warning(
                        "Core binding with numactl is not available, and --disable-taskset is set. "
                        "Please unset --disable-taskset to fall back to taskset instead of numactl."
                    )
sys.exit(-1)
if not args.disable_taskset:
enable_taskset = True
self.set_multi_thread_and_allocator(
args.ncores_per_instance,
args.disable_iomp,
set_kmp_affinity,
args.enable_tcmalloc,
args.enable_jemalloc,
args.use_default_allocator,
)
entrypoint = ""
launch_args = {}
launch_envs: Dict[int, Dict] = {}
launch_tee = {}
        # check whether we were launched from torchrun with --nproc-per-node <num workers>
local_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))
for i in range(args.ninstances):
cmd = []
cur_process_cores = ""
if not args.disable_numactl or enable_taskset:
if not args.disable_numactl:
cmd = ["numactl"]
elif enable_taskset:
cmd = ["taskset"]
cores = sorted(cores)
if (
args.rank == -1
): # sequentially assign ncores_per_instance to ninstances
core_list = cores[
i
* args.ncores_per_instance : (i + 1)
* args.ncores_per_instance
]
else: # assign ncores_per_instance from rank
core_list = cores[
args.rank
* args.ncores_per_instance : (args.rank + 1)
* args.ncores_per_instance
]
core_ranges: List[Dict] = []
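                # When launched from torchrun (LOCAL_WORLD_SIZE > 1), split this
                # instance's cores evenly across the local ranks so that each
                # worker process gets a disjoint slice.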
if local_size > 1:
total_num_cores = len(core_list)
cores_per_rank = total_num_cores // local_size
assert (
cores_per_rank >= 1
), "At least one core needs to be assigned to each rank"
core_list = core_list[
cores_per_rank * local_rank : cores_per_rank * (local_rank + 1)
]
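                # Fold the sorted core list into contiguous "start-end" ranges,
                # e.g. [0, 1, 2, 5, 6] becomes "0-2,5-6" for numactl -C / taskset -c.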
for core in core_list:
if len(core_ranges) == 0:
range_elem = {"start": core, "end": core}
core_ranges.append(range_elem)
else:
if core - core_ranges[-1]["end"] == 1:
core_ranges[-1]["end"] = core
else:
range_elem = {"start": core, "end": core}
core_ranges.append(range_elem)
for r in core_ranges:
cur_process_cores = f"{cur_process_cores}{r['start']}-{r['end']},"
cur_process_cores = cur_process_cores[:-1]
if not args.disable_numactl:
numa_params = f"-C {cur_process_cores} "
numa_ids = ",".join(
[
str(numa_id)
for numa_id in self.cpuinfo.numa_aware_check(core_list)
]
)
numa_params += f"-m {numa_ids}"
cmd.extend(numa_params.split())
elif enable_taskset:
taskset_params = f"-c {cur_process_cores} "
cmd.extend(taskset_params.split())
with_python = not args.no_python
if with_python:
cmd.append(sys.executable)
cmd.append("-u")
if args.module:
cmd.append("-m")
cmd.append(args.program)
cmd.extend(args.program_args)
cmd_s = " ".join(cmd)
logger.info(cmd_s)
if entrypoint == "":
entrypoint = cmd[0]
del cmd[0]
launch_args[i] = tuple(cmd)
launch_envs[i] = {}
launch_tee[i] = Std.ALL
            if args.rank != -1:  # launch only a single instance, for the given rank
break
ctx = start_processes(
name=args.log_file_prefix,
entrypoint=entrypoint,
args=launch_args,
envs=launch_envs,
logs_specs=_DefaultLogsSpecs(log_dir=args.log_path, tee=launch_tee),
)
ctx.wait()
def _add_memory_allocator_params(parser):
group = parser.add_argument_group("Memory Allocator Parameters")
# allocator control
group.add_argument(
"--enable-tcmalloc",
"--enable_tcmalloc",
action="store_true",
default=False,
help="Enable tcmalloc allocator",
)
group.add_argument(
"--enable-jemalloc",
"--enable_jemalloc",
action="store_true",
default=False,
help="Enable jemalloc allocator",
)
group.add_argument(
"--use-default-allocator",
"--use_default_allocator",
action="store_true",
default=False,
help="Use default memory allocator",
)
def _add_multi_instance_params(parser):
group = parser.add_argument_group("Multi-instance Parameters")
# multi-instance control
group.add_argument(
"--ncores-per-instance",
"--ncores_per_instance",
metavar="\b",
default=-1,
type=int,
help="Cores per instance",
)
group.add_argument(
"--ninstances",
metavar="\b",
default=-1,
type=int,
help="For multi-instance, you should give the cores number you used for per instance.",
)
group.add_argument(
"--skip-cross-node-cores",
"--skip_cross_node_cores",
action="store_true",
default=False,
help="If specified --ncores-per-instance, skips cross-node cores.",
)
group.add_argument(
"--rank",
metavar="\b",
default="-1",
type=int,
help="Specify instance index to assign ncores_per_instance for rank; \
otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to \
https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md",
)
group.add_argument(
"--latency-mode",
"--latency_mode",
action="store_true",
default=False,
help="By default 4 core per instance and use all physical cores",
)
group.add_argument(
"--throughput-mode",
"--throughput_mode",
action="store_true",
default=False,
help="By default one instance per node and use all physical cores",
)
group.add_argument(
"--node-id",
"--node_id",
metavar="\b",
default=-1,
type=int,
help="node id for multi-instance, by default all nodes will be used",
)
group.add_argument(
"--use-logical-core",
"--use_logical_core",
action="store_true",
default=False,
help="Whether only use physical cores",
)
group.add_argument(
"--disable-numactl",
"--disable_numactl",
action="store_true",
default=False,
help="Disable numactl",
)
group.add_argument(
"--disable-taskset",
"--disable_taskset",
action="store_true",
default=False,
help="Disable taskset",
)
group.add_argument(
"--core-list",
"--core_list",
metavar="\b",
default=None,
type=str,
        help='Specify the core list as "core_id,core_id,..."; otherwise, all the cores will be used.',
)
group.add_argument(
"--log-path",
"--log_path",
metavar="\b",
default="",
type=str,
help="The log file directory. Default path is "
", which means disable logging to files.",
)
group.add_argument(
"--log-file-prefix",
"--log_file_prefix",
metavar="\b",
default="run",
type=str,
help="log file prefix",
)
def _add_kmp_iomp_params(parser):
group = parser.add_argument_group("IOMP Parameters")
group.add_argument(
"--disable-iomp",
"--disable_iomp",
action="store_true",
default=False,
help="By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD",
)
def create_args(parser=None):
"""
Parse the command line options.
@retval ArgumentParser
"""
parser.add_argument(
"--multi-instance",
"--multi_instance",
action="store_true",
default=False,
help="Enable multi-instance, by default one instance per node",
)
parser.add_argument(
"-m",
"--module",
default=False,
action="store_true",
help="Changes each process to interpret the launch script "
"as a python module, executing with the same behavior as"
'"python -m".',
)
parser.add_argument(
"--no-python",
"--no_python",
default=False,
action="store_true",
help='Do not prepend the --program script with "python" - just exec '
"it directly. Useful when the script is not a Python script.",
)
_add_memory_allocator_params(parser)
_add_kmp_iomp_params(parser)
_add_multi_instance_params(parser)
# positional
parser.add_argument(
"program",
type=str,
help="The full path to the program/script to be launched. "
"followed by all the arguments for the script",
)
# rest from the training program
parser.add_argument("program_args", nargs=REMAINDER)
def main(args):
env_before = set(os.environ.keys())
if platform.system() in ["Windows", "Darwin"]:
raise RuntimeError(f"{platform.system()} is not supported!!!")
if args.log_path:
os.makedirs(args.log_path, exist_ok=True)
else:
args.log_path = os.devnull
if args.latency_mode and args.throughput_mode:
        raise RuntimeError(
            "--latency-mode and --throughput-mode cannot be set at the same time"
        )
if not args.no_python and not args.program.endswith(".py"):
        raise RuntimeError(
            'For a non-Python script, please use the "--no-python" parameter.'
        )
# Verify LD_PRELOAD
if "LD_PRELOAD" in os.environ:
lst_valid = []
tmp_ldpreload = os.environ["LD_PRELOAD"]
for item in tmp_ldpreload.split(":"):
matches = glob.glob(item)
if len(matches) > 0:
lst_valid.append(item)
else:
logger.warning("%s doesn't exist. Removing it from LD_PRELOAD.", item)
if len(lst_valid) > 0:
os.environ["LD_PRELOAD"] = ":".join(lst_valid)
else:
os.environ["LD_PRELOAD"] = ""
launcher = _Launcher()
launcher.launch(args)
for x in sorted(set(os.environ.keys()) - env_before):
logger.debug("%s=%s", x, os.environ[x])
if __name__ == "__main__":
parser = ArgumentParser(
description="This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable "
"Processors with optimal configurations. Single instance inference, "
"multi-instance inference are enable. To get the peak performance on Intel(R) "
"Xeon(R) Scalable Processors, the script optimizes the configuration "
"of thread and memory management. For thread management, the script configures thread "
"affinity and the preload of Intel OMP library. For memory management, it configures "
"NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc) "
"\n################################# Basic usage ############################# \n"
"\n 1. single instance\n"
"\n >>> python -m torch.backends.xeon.run_cpu python_script args \n"
"\n2. multi-instance \n"
"\n >>> python -m torch.backends.xeon.run_cpu --ninstances xxx "
"--ncores-per-instance xx python_script args\n"
"\n############################################################################# \n",
formatter_class=RawTextHelpFormatter,
)
create_args(parser)
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,29 @@
# mypy: allow-untyped-defs
import sys
import types
import torch
class _XNNPACKEnabled:
def __get__(self, obj, objtype):
return torch._C._is_xnnpack_enabled()
def __set__(self, obj, val):
raise RuntimeError("Assignment not supported")
class XNNPACKEngine(types.ModuleType):
def __init__(self, m, name):
super().__init__(name)
self.m = m
def __getattr__(self, attr):
return self.m.__getattribute__(attr)
enabled = _XNNPACKEnabled()
# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = XNNPACKEngine(sys.modules[__name__], __name__)
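# A usage sketch (assuming this file is installed as torch.backends.xnnpack):
# reading `torch.backends.xnnpack.enabled` reports whether PyTorch was built
# with XNNPACK support, and assigning to it raises RuntimeError by design.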