I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions
--- a/rl/Lib/site-packages/torch/xpu/init.py
+++ b/rl/Lib/site-packages/torch/xpu/init.py
@ -0,0 +1,504 @@
+# mypy: allow-untyped-defs
+r"""
+This package introduces support for the XPU backend, specifically tailored for
+Intel GPU optimization.
+
+This package is lazily initialized, so you can always import it, and use
+:func:`is_available()` to determine if your system supports XPU.
+"""
+import threading
+import traceback
+from functools import lru_cache
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch._C
+from torch import device as _device
+from torch._utils import _dummy_type, _LazySeedTracker
+
+from ._utils import _get_device_index
+from .streams import Event, Stream
+
+
+_initialized = False
+_tls = threading.local()
+_initialization_lock = threading.Lock()
+_queued_calls: List[
+    Tuple[Callable[[], None], List[str]]
+] = []  # don't invoke these until initialization occurs
+_is_in_bad_fork = getattr(torch._C, "_xpu_isInBadFork", lambda: False)
+_device_t = Union[_device, str, int, None]
+_lazy_seed_tracker = _LazySeedTracker()
+default_generators: Tuple[torch._C.Generator] = ()  # type: ignore[assignment]
+
+
+def _is_compiled() -> bool:
+    r"""Return true if compile with XPU support."""
+    return torch._C._has_xpu
+
+
+if _is_compiled():
+    _XpuDeviceProperties = torch._C._XpuDeviceProperties
+    _exchange_device = torch._C._xpu_exchangeDevice
+    _maybe_exchange_device = torch._C._xpu_maybeExchangeDevice
+else:
+    # Define dummy if PyTorch was compiled without XPU
+    _XpuDeviceProperties = _dummy_type("_XpuDeviceProperties")  # type: ignore[assignment, misc]
+
+    def _exchange_device(device: int) -> int:
+        raise NotImplementedError("PyTorch was compiled without XPU support")
+
+    def _maybe_exchange_device(device: int) -> int:
+        raise NotImplementedError("PyTorch was compiled without XPU support")
+
+
+@lru_cache(maxsize=1)
+def device_count() -> int:
+    r"""Return the number of XPU device available."""
+    if not _is_compiled():
+        return 0
+    return torch._C._xpu_getDeviceCount()
+
+
+def is_available() -> bool:
+    r"""Return a bool indicating if XPU is currently available."""
+    # This function nerver throws.
+    return device_count() > 0
+
+
+def is_bf16_supported():
+    r"""Return a bool indicating if the current XPU device supports dtype bfloat16."""
+    return True
+
+
+def is_initialized():
+    r"""Return whether PyTorch's XPU state has been initialized."""
+    return _initialized and not _is_in_bad_fork()
+
+
+def _lazy_call(callable, **kwargs):
+    if is_initialized():
+        callable()
+    else:
+        global _lazy_seed_tracker
+        if kwargs.get("seed_all", False):
+            _lazy_seed_tracker.queue_seed_all(callable, traceback.format_stack())
+        elif kwargs.get("seed", False):
+            _lazy_seed_tracker.queue_seed(callable, traceback.format_stack())
+        else:
+            # Don't store the actual traceback to avoid memory cycle
+            _queued_calls.append((callable, traceback.format_stack()))
+
+
+def init():
+    r"""Initialize PyTorch's XPU state.
+    This is a Python API about lazy initialization that avoids initializing
+    XPU until the first time it is accessed. Does nothing if the XPU state is
+    already initialized.
+    """
+    _lazy_init()
+
+
+def _lazy_init():
+    global _initialized, _queued_calls
+    if is_initialized() or hasattr(_tls, "is_initializing"):
+        return
+    with _initialization_lock:
+        # This test was was protected via GIL. Double-check whether XPU has
+        # already been initialized.
+        if is_initialized():
+            return
+        # Stop promptly upon encountering a bad fork error.
+        if _is_in_bad_fork():
+            raise RuntimeError(
+                "Cannot re-initialize XPU in forked subprocess. To use XPU with "
+                "multiprocessing, you must use the 'spawn' start method"
+            )
+        if not _is_compiled():
+            raise AssertionError("Torch not compiled with XPU enabled")
+        # This function inits XPU backend and detects bad fork processing.
+        torch._C._xpu_init()
+        # Some of the queued calls may reentrantly call _lazy_init(); We need to
+        # just return without initializing in that case.
+        _tls.is_initializing = True
+
+        for calls in _lazy_seed_tracker.get_calls():
+            if calls:
+                _queued_calls.append(calls)
+
+        try:
+            for queued_call, orig_traceback in _queued_calls:
+                try:
+                    queued_call()
+                except Exception as e:
+                    msg = (
+                        f"XPU call failed lazily at initialization with error: {str(e)}\n\n"
+                        f"XPU call was originally invoked at:\n\n{''.join(orig_traceback)}"
+                    )
+                    raise Exception(msg) from e  # noqa: TRY002
+        finally:
+            delattr(_tls, "is_initializing")
+        _initialized = True
+
+
+class _DeviceGuard:
+    def __init__(self, index: int):
+        self.idx = index
+        self.prev_idx = -1
+
+    def __enter__(self):
+        self.prev_idx = torch.xpu._exchange_device(self.idx)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        self.idx = torch.xpu._maybe_exchange_device(self.prev_idx)
+        return False
+
+
+class device:
+    r"""Context-manager that changes the selected device.
+
+    Args:
+        device (torch.device or int or str): device index to select. It's a no-op if
+            this argument is a negative integer or ``None``.
+    """
+
+    def __init__(self, device: Any):
+        self.idx = _get_device_index(device, optional=True)
+        self.prev_idx = -1
+
+    def __enter__(self):
+        self.prev_idx = torch.xpu._exchange_device(self.idx)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        self.idx = torch.xpu._maybe_exchange_device(self.prev_idx)
+        return False
+
+
+class device_of(device):
+    r"""Context-manager that changes the current device to that of given object.
+
+    You can use both tensors and storages as arguments. If a given object is
+    not allocated on a XPU, this is a no-op.
+
+    Args:
+        obj (Tensor or Storage): object allocated on the selected device.
+    """
+
+    def __init__(self, obj):
+        idx = obj.get_device() if obj.is_xpu else -1
+        super().__init__(idx)
+
+
+def set_device(device: _device_t) -> None:
+    r"""Set the current device.
+
+    Args:
+        device (torch.device or int or str): selected device. This function is a
+            no-op if this argument is negative.
+    """
+    _lazy_init()
+    device = _get_device_index(device)
+    if device >= 0:
+        torch._C._xpu_setDevice(device)
+
+
+def get_device_name(device: Optional[_device_t] = None) -> str:
+    r"""Get the name of a device.
+
+    Args:
+        device (torch.device or int or str, optional): device for which to
+            return the name. This function is a no-op if this argument is a
+            negative integer. It uses the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Returns:
+        str: the name of the device
+    """
+    return get_device_properties(device).name
+
+
+@lru_cache(None)
+def get_device_capability(device: Optional[_device_t] = None) -> Dict[str, Any]:
+    r"""Get the xpu capability of a device.
+
+    Args:
+        device (torch.device or int or str, optional): device for which to
+            return the device capability. This function is a no-op if this
+            argument is a negative integer. It uses the current device, given by
+            :func:`~torch.xpu.current_device`, if :attr:`device` is ``None``
+            (default).
+
+    Returns:
+        Dict[str, Any]: the xpu capability dictionary of the device
+    """
+    props = get_device_properties(device)
+    return {
+        prop: getattr(props, prop) for prop in dir(props) if not prop.startswith("__")
+    }
+
+
+def get_device_properties(device: Optional[_device_t] = None) -> _XpuDeviceProperties:
+    r"""Get the properties of a device.
+
+    Args:
+        device (torch.device or int or str): device for which to return the
+            properties of the device.
+
+    Returns:
+        _XpuDeviceProperties: the properties of the device
+    """
+    _lazy_init()
+    device = _get_device_index(device, optional=True)
+    if device < 0 or device >= device_count():
+        raise AssertionError("Invalid device index")
+    return _get_device_properties(device)  # type: ignore[name-defined]  # noqa: F821
+
+
+def current_device() -> int:
+    r"""Return the index of a currently selected device."""
+    _lazy_init()
+    return torch._C._xpu_getDevice()
+
+
+def _get_device(device: Union[int, str, torch.device]) -> torch.device:
+    r"""Return the torch.device type object from the passed in device.
+
+    Args:
+        device (torch.device or int or str): selected device.
+    """
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("xpu", device)
+    return device
+
+
+class StreamContext:
+    r"""Context-manager that selects a given stream.
+
+    All XPU kernels queued within its context will be enqueued on a selected
+    stream.
+
+    Args:
+        Stream (Stream): selected stream. This manager is a no-op if it's
+            ``None``.
+    .. note:: Streams are per-device.
+    """
+    cur_stream: Optional["torch.xpu.Stream"]
+
+    def __init__(self, stream: Optional["torch.xpu.Stream"]):
+        self.stream = stream
+        self.idx = _get_device_index(None, True)
+        if self.idx is None:
+            self.idx = -1
+
+    def __enter__(self):
+        cur_stream = self.stream
+        if cur_stream is None or self.idx == -1:
+            return
+        self.src_prev_stream = torch.xpu.current_stream(None)
+
+        # If the stream is not on the current device, then set the current stream on the device
+        if self.src_prev_stream.device != cur_stream.device:
+            with device(cur_stream.device):
+                self.dst_prev_stream = torch.xpu.current_stream(cur_stream.device)
+        torch.xpu.set_stream(cur_stream)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        cur_stream = self.stream
+        if cur_stream is None or self.idx == -1:
+            return
+
+        # Reset the stream on the original device and destination device
+        if self.src_prev_stream.device != cur_stream.device:
+            torch.xpu.set_stream(self.dst_prev_stream)
+        torch.xpu.set_stream(self.src_prev_stream)
+
+
+def stream(stream: Optional["torch.xpu.Stream"]) -> StreamContext:
+    r"""Wrap around the Context-manager StreamContext that selects a given stream.
+
+    Arguments:
+        stream (Stream): selected stream. This manager is a no-op if it's ``None``.
+    """
+    return StreamContext(stream)
+
+
+def _set_stream_by_id(stream_id, device_index, device_type):
+    r"""set stream specified by the stream id, device index and device type
+
+    Args: stream_id (int): not visible to the user, used to assigned to the specific stream.
+          device_index (int): selected device index.
+          device_type (int): selected device type.
+    """
+    torch._C._xpu_setStream(
+        stream_id=stream_id,
+        device_index=device_index,
+        device_type=device_type,
+    )
+
+
+def set_stream(stream: Stream):
+    r"""Set the current stream.This is a wrapper API to set the stream.
+        Usage of this function is discouraged in favor of the ``stream``
+        context manager.
+
+    Args:
+        stream (Stream): selected stream. This function is a no-op
+            if this argument is ``None``.
+    """
+    if stream is None:
+        return
+    _lazy_init()
+    _set_stream_by_id(
+        stream_id=stream.stream_id,
+        device_index=stream.device_index,
+        device_type=stream.device_type,
+    )
+
+
+def current_stream(device: Optional[_device_t] = None) -> Stream:
+    r"""Return the currently selected :class:`Stream` for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            the currently selected :class:`Stream` for the current device, given
+            by :func:`~torch.xpu.current_device`, if :attr:`device` is ``None``
+            (default).
+    """
+    _lazy_init()
+    streamdata = torch._C._xpu_getCurrentStream(
+        _get_device_index(device, optional=True)
+    )
+    return Stream(
+        stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2]
+    )
+
+
+def synchronize(device: _device_t = None) -> None:
+    r"""Wait for all kernels in all streams on a XPU device to complete.
+
+    Args:
+        device (torch.device or int, optional): device for which to synchronize.
+            It uses the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    _lazy_init()
+    device = _get_device_index(device, optional=True)
+    return torch._C._xpu_synchronize(device)
+
+
+def _get_generator(device: torch.device) -> torch._C.Generator:
+    r"""Return the XPU Generator object for the given device.
+
+    Args:
+        device (torch.device): selected device.
+    """
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    return torch.xpu.default_generators[idx]
+
+
+def _set_rng_state_offset(
+    offset: int, device: Union[int, str, torch.device] = "xpu"
+) -> None:
+    r"""Set the random number generator state offset of the specified GPU.
+
+    Args:
+        offset (int): The desired offset
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'xpu'`` (i.e., ``torch.device('xpu')``, the current XPU device).
+    """
+    final_device = _get_device(device)
+
+    def cb():
+        default_generator = _get_generator(final_device)
+        default_generator.set_offset(offset)
+
+    _lazy_call(cb)
+
+
+def _get_rng_state_offset(device: Union[int, str, torch.device] = "xpu") -> int:
+    r"""Return the random number generator state offset of the specified GPU.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state offset of.
+            Default: ``'xpu'`` (i.e., ``torch.device('xpu')``, the current XPU device).
+
+    .. warning::
+        This function eagerly initializes XPU.
+    """
+    _lazy_init()
+    final_device = _get_device(device)
+    default_generator = _get_generator(final_device)
+    return default_generator.get_offset()
+
+
+# import here to avoid circular import
+from .memory import (
+    empty_cache,
+    max_memory_allocated,
+    max_memory_reserved,
+    memory_allocated,
+    memory_reserved,
+    memory_stats,
+    memory_stats_as_nested_dict,
+    reset_accumulated_memory_stats,
+    reset_peak_memory_stats,
+)
+from .random import (
+    get_rng_state,
+    get_rng_state_all,
+    initial_seed,
+    manual_seed,
+    manual_seed_all,
+    seed,
+    seed_all,
+    set_rng_state,
+    set_rng_state_all,
+)
+
+
+__all__ = [
+    "Event",
+    "Stream",
+    "StreamContext",
+    "current_device",
+    "current_stream",
+    "default_generators",
+    "device",
+    "device_of",
+    "device_count",
+    "empty_cache",
+    "get_device_capability",
+    "get_device_name",
+    "get_device_properties",
+    "get_rng_state",
+    "get_rng_state_all",
+    "get_stream",
+    "init",
+    "initial_seed",
+    "is_available",
+    "is_bf16_supported",
+    "is_initialized",
+    "manual_seed",
+    "manual_seed_all",
+    "max_memory_allocated",
+    "max_memory_reserved",
+    "memory_allocated",
+    "memory_reserved",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "seed",
+    "seed_all",
+    "set_device",
+    "set_rng_state",
+    "set_rng_state_all",
+    "set_stream",
+    "stream",
+    "streams",
+    "synchronize",
+]
--- a/rl/Lib/site-packages/torch/xpu/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/xpu/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/xpu/pycache/_gpu_trace.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/xpu/pycache/_gpu_trace.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/xpu/pycache/_utils.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/xpu/pycache/_utils.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/xpu/pycache/memory.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/xpu/pycache/memory.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/xpu/pycache/random.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/xpu/pycache/random.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/xpu/pycache/streams.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/xpu/pycache/streams.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/xpu/_gpu_trace.py
+++ b/rl/Lib/site-packages/torch/xpu/_gpu_trace.py
@ -0,0 +1,75 @@
+from typing import Callable
+
+from torch._utils import CallbackRegistry
+
+
+EventCreationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU event creation"
+)
+EventDeletionCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU event deletion"
+)
+EventRecordCallbacks: "CallbackRegistry[int, int]" = CallbackRegistry(
+    "XPU event record"
+)
+EventWaitCallbacks: "CallbackRegistry[int, int]" = CallbackRegistry(
+    "XPU event wait"
+)
+MemoryAllocationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU memory allocation"
+)
+MemoryDeallocationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU memory deallocation"
+)
+StreamCreationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU stream creation"
+)
+DeviceSynchronizationCallbacks: "CallbackRegistry[[]]" = CallbackRegistry(
+    "XPU device synchronization"
+)
+StreamSynchronizationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU stream synchronization"
+)
+EventSynchronizationCallbacks: "CallbackRegistry[int]" = CallbackRegistry(
+    "XPU event synchronization"
+)
+
+
+def register_callback_for_event_creation(cb: Callable[[int], None]) -> None:
+    EventCreationCallbacks.add_callback(cb)
+
+
+def register_callback_for_event_deletion(cb: Callable[[int], None]) -> None:
+    EventDeletionCallbacks.add_callback(cb)
+
+
+def register_callback_for_event_record(cb: Callable[[int, int], None]) -> None:
+    EventRecordCallbacks.add_callback(cb)
+
+
+def register_callback_for_event_wait(cb: Callable[[int, int], None]) -> None:
+    EventWaitCallbacks.add_callback(cb)
+
+
+def register_callback_for_memory_allocation(cb: Callable[[int], None]) -> None:
+    MemoryAllocationCallbacks.add_callback(cb)
+
+
+def register_callback_for_memory_deallocation(cb: Callable[[int], None]) -> None:
+    MemoryDeallocationCallbacks.add_callback(cb)
+
+
+def register_callback_for_stream_creation(cb: Callable[[int], None]) -> None:
+    StreamCreationCallbacks.add_callback(cb)
+
+
+def register_callback_for_device_synchronization(cb: Callable[[], None]) -> None:
+    DeviceSynchronizationCallbacks.add_callback(cb)
+
+
+def register_callback_for_stream_synchronization(cb: Callable[[int], None]) -> None:
+    StreamSynchronizationCallbacks.add_callback(cb)
+
+
+def register_callback_for_event_synchronization(cb: Callable[[int], None]) -> None:
+    EventSynchronizationCallbacks.add_callback(cb)
--- a/rl/Lib/site-packages/torch/xpu/_utils.py
+++ b/rl/Lib/site-packages/torch/xpu/_utils.py
@ -0,0 +1,39 @@
+from typing import Any
+
+import torch
+
+# The _get_device_index has been moved to torch.utils._get_device_index
+from torch._utils import _get_device_index as _torch_get_device_index
+
+
+def _get_device_index(
+    device: Any, optional: bool = False, allow_cpu: bool = False
+) -> int:
+    r"""Get the device index from :attr:`device`, which can be a torch.device
+    object, a Python integer, or ``None``.
+
+    If :attr:`device` is a torch.device object, returns the device index if it
+    is a XPU device. Note that for a XPU device without a specified index,
+    i.e., ``torch.device('xpu')``, this will return the current default XPU
+    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
+    CPU devices will be accepted and ``-1`` will be returned in this case.
+
+    If :attr:`device` is a Python integer, it is returned as is.
+
+    If :attr:`device` is ``None``, this will return the current default XPU
+    device if :attr:`optional` is ``True``.
+    """
+    if isinstance(device, int):
+        return device
+    if isinstance(device, str):
+        device = torch.device(device)
+    if isinstance(device, torch.device):
+        if allow_cpu:
+            if device.type not in ["xpu", "cpu"]:
+                raise ValueError(f"Expected a xpu or cpu device, but got: {device}")
+        elif device.type != "xpu":
+            raise ValueError(f"Expected a xpu device, but got: {device}")
+    if not torch.jit.is_scripting():
+        if isinstance(device, torch.xpu.device):
+            return device.idx
+    return _torch_get_device_index(device, optional, allow_cpu)
--- a/rl/Lib/site-packages/torch/xpu/memory.py
+++ b/rl/Lib/site-packages/torch/xpu/memory.py
@ -0,0 +1,191 @@
+import collections
+from typing import Any, Dict, Union
+
+import torch
+from torch.types import Device
+
+from . import _get_device_index, is_initialized
+
+
+_device_t = Union[Device, str, int, None]
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that those can be used in other XPU application.
+
+    .. note::
+        :func:`~torch.xpu.empty_cache` doesn't increase the amount of XPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of XPU memory in certain cases.
+    """
+    if is_initialized():
+        torch._C._xpu_emptyCache()
+
+
+def reset_peak_memory_stats(device: _device_t = None) -> None:
+    r"""Reset the "peak" stats tracked by the XPU memory allocator.
+
+    See :func:`~torch.xpu.memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._xpu_resetPeakMemoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: _device_t = None) -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the XPU memory allocator.
+
+    See :func:`~torch.xpu.memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._xpu_resetAccumulatedMemoryStats(device)
+
+
+def memory_stats_as_nested_dict(device: _device_t = None) -> Dict[str, Any]:
+    r"""Return the result of :func:`~torch.xpu.memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    device = _get_device_index(device, optional=True)
+    return torch._C._xpu_memoryStats(device)
+
+
+def memory_stats(device: _device_t = None) -> Dict[str, Any]:
+    r"""Return a dictionary of XPU memory allocator statistics for a given device.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of allocated memory.
+    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of reserved memory.
+    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of active memory.
+    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      memory requested by client code, compare this with allocated_bytes to check if
+      allocation rounding adds too much overhead.
+
+    For these core statistics, values are broken down as follows.
+
+    Pool type:
+
+    - ``all``: combined statistics across all memory pools.
+    - ``large_pool``: statistics for the large allocation pool (for size >= 1MB allocations).
+    - ``small_pool``: statistics for the small allocation pool (for size < 1MB allocations).
+
+    Metric type:
+
+    - ``current``: current value of this metric.
+    - ``peak``: maximum value of this metric.
+    - ``allocated``: historical total increase in this metric.
+    - ``freed``: historical total decrease in this metric.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix: str, obj: Any) -> None:
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = memory_stats_as_nested_dict(device=device)
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
+
+def memory_allocated(device: _device_t = None) -> int:
+    r"""Return the current GPU memory occupied by tensors in bytes for a given device.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        This is likely less than the amount shown in `xpu-smi` since some
+        unused memory can be held by the caching allocator and some context
+        needs to be created on GPU.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.current", 0)
+
+
+def max_memory_allocated(device: _device_t = None) -> int:
+    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.
+
+    By default, this returns the peak allocated memory since the beginning of
+    this program. :func:`~torch.xpu.reset_peak_memory_stats` can be used to
+    reset the starting point in tracking this metric. For example, these two
+    functions can measure the peak allocated memory usage of each iteration in a
+    training loop.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
+
+def memory_reserved(device: _device_t = None) -> int:
+    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: _device_t = None) -> int:
+    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+    By default, this returns the peak cached memory since the beginning of this
+    program. :func:`~torch.xpu.reset_peak_memory_stats` can be used to reset
+    the starting point in tracking this metric. For example, these two functions
+    can measure the peak cached memory amount of each iteration in a training
+    loop.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.xpu.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+__all__ = [
+    "empty_cache",
+    "max_memory_allocated",
+    "max_memory_reserved",
+    "memory_allocated",
+    "memory_reserved",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+]
--- a/rl/Lib/site-packages/torch/xpu/random.py
+++ b/rl/Lib/site-packages/torch/xpu/random.py
@ -0,0 +1,178 @@
+# mypy: allow-untyped-defs
+from typing import Iterable, List, Union
+
+import torch
+from torch import Tensor
+
+from . import _lazy_call, _lazy_init, current_device, device_count
+
+
+def get_rng_state(device: Union[int, str, torch.device] = "xpu") -> Tensor:
+    r"""Return the random number generator state of the specified GPU as a ByteTensor.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state of.
+            Default: ``'xpu'`` (i.e., ``torch.device('xpu')``, the current XPU device).
+
+    .. warning::
+        This function eagerly initializes XPU.
+    """
+    _lazy_init()
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("xpu", device)
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    default_generator = torch.xpu.default_generators[idx]
+    return default_generator.get_state()
+
+
+def get_rng_state_all() -> List[Tensor]:
+    r"""Return a list of ByteTensor representing the random number states of all devices."""
+    results = []
+    for i in range(device_count()):
+        results.append(get_rng_state(i))
+    return results
+
+
+def set_rng_state(
+    new_state: Tensor, device: Union[int, str, torch.device] = "xpu"
+) -> None:
+    r"""Set the random number generator state of the specified GPU.
+
+    Args:
+        new_state (torch.ByteTensor): The desired state
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'xpu'`` (i.e., ``torch.device('xpu')``, the current XPU device).
+    """
+    with torch._C._DisableFuncTorch():
+        new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("xpu", device)
+
+    def cb():
+        idx = device.index
+        if idx is None:
+            idx = current_device()
+        default_generator = torch.xpu.default_generators[idx]
+        default_generator.set_state(new_state_copy)
+
+    _lazy_call(cb)
+
+
+def set_rng_state_all(new_states: Iterable[Tensor]) -> None:
+    r"""Set the random number generator state of all devices.
+
+    Args:
+        new_states (Iterable of torch.ByteTensor): The desired state for each device.
+    """
+    for i, state in enumerate(new_states):
+        set_rng_state(state, i)
+
+
+def manual_seed(seed: int) -> None:
+    r"""Set the seed for generating random numbers for the current GPU.
+
+    It's safe to call this function if XPU is not available; in that case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+
+    .. warning::
+        If you are working with a multi-GPU model, this function is insufficient
+        to get determinism.  To seed all GPUs, use :func:`manual_seed_all`.
+    """
+    seed = int(seed)
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.xpu.default_generators[idx]
+        default_generator.manual_seed(seed)
+
+    _lazy_call(cb, seed=True)
+
+
+def manual_seed_all(seed: int) -> None:
+    r"""Set the seed for generating random numbers on all GPUs.
+
+    It's safe to call this function if XPU is not available; in that case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+    """
+    seed = int(seed)
+
+    def cb():
+        for i in range(device_count()):
+            default_generator = torch.xpu.default_generators[i]
+            default_generator.manual_seed(seed)
+
+    _lazy_call(cb, seed_all=True)
+
+
+def seed() -> None:
+    r"""Set the seed for generating random numbers to a random number for the current GPU.
+
+    It's safe to call this function if XPU is not available; in that case, it is silently ignored.
+
+    .. warning::
+        If you are working with a multi-GPU model, this function will only initialize
+        the seed on one GPU.  To initialize all GPUs, use :func:`seed_all`.
+    """
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.xpu.default_generators[idx]
+        default_generator.seed()
+
+    _lazy_call(cb)
+
+
+def seed_all() -> None:
+    r"""Set the seed for generating random numbers to a random number on all GPUs.
+
+    It's safe to call this function if XPU is not available; in that case, it is silently ignored.
+    """
+
+    def cb():
+        random_seed = 0
+        seeded = False
+        for i in range(device_count()):
+            default_generator = torch.xpu.default_generators[i]
+            if not seeded:
+                default_generator.seed()
+                random_seed = default_generator.initial_seed()
+                seeded = True
+            else:
+                default_generator.manual_seed(random_seed)
+
+    _lazy_call(cb)
+
+
+def initial_seed() -> int:
+    r"""Return the current random seed of the current GPU.
+
+    .. warning::
+        This function eagerly initializes XPU.
+    """
+    _lazy_init()
+    idx = current_device()
+    default_generator = torch.xpu.default_generators[idx]
+    return default_generator.initial_seed()
+
+
+__all__ = [
+    "get_rng_state",
+    "get_rng_state_all",
+    "set_rng_state",
+    "set_rng_state_all",
+    "manual_seed",
+    "manual_seed_all",
+    "seed",
+    "seed_all",
+    "initial_seed",
+]
--- a/rl/Lib/site-packages/torch/xpu/streams.py
+++ b/rl/Lib/site-packages/torch/xpu/streams.py
@ -0,0 +1,171 @@
+# mypy: allow-untyped-defs
+import ctypes
+
+import torch
+from torch._streambase import _EventBase, _StreamBase
+
+from .._utils import _dummy_type
+
+
+if not hasattr(torch._C, "_XpuStreamBase"):
+    # Define dummy base classes
+    torch._C.__dict__["_XpuStreamBase"] = _dummy_type("_XpuStreamBase")
+    torch._C.__dict__["_XpuEventBase"] = _dummy_type("_XpuEventBase")
+
+
+class Stream(torch._C._XpuStreamBase, _StreamBase):
+    r"""Wrapper around a XPU stream.
+
+    A XPU stream is a linear sequence of execution that belongs to a specific
+    device, independent from other streams.
+
+    Args:
+        device(torch.device or int, optional): a device on which to allocate
+            the stream. If :attr:`device` is ``None`` (default) or a negative
+            integer, this will use the current device.
+        priority(int, optional): priority of the stream, should be 0 or
+            negative, where negative numbers indicate higher priority. By default,
+            streams have priority 0.
+    """
+
+    def __new__(cls, device=None, priority=0, **kwargs):
+        # setting device manager is expensive, so we avoid it unless necessary
+        if device is None or ("stream_id" in kwargs and "device_index" in kwargs):
+            return super().__new__(cls, priority=priority, **kwargs)
+        else:
+            with torch.xpu.device(device):
+                return super().__new__(cls, priority=priority, **kwargs)
+
+    def wait_event(self, event) -> None:
+        r"""Make all future work submitted to the stream wait for an event.
+
+        Args:
+            event (torch.xpu.Event): an event to wait for.
+        """
+        event.wait(self)
+
+    def wait_stream(self, stream) -> None:
+        r"""Synchronize with another stream.
+
+        All future work submitted to this stream will wait until all kernels
+        submitted to a given stream at the time of call complete.
+
+        Args:
+            stream (Stream): a stream to synchronize.
+        """
+        self.wait_event(stream.record_event())
+
+    def record_event(self, event=None):
+        r"""Record an event.
+
+        Args:
+            event (torch.xpu.Event, optional): event to record. If not given, a new one
+                will be allocated.
+
+        Returns:
+            Recorded event.
+        """
+        if event is None:
+            event = Event()
+        event.record(self)
+        return event
+
+    def query(self) -> bool:
+        r"""Check if all the work submitted has been completed.
+
+        Returns:
+            A boolean indicating if all kernels in this stream are completed.
+        """
+        return super().query()
+
+    def synchronize(self) -> None:
+        r"""Wait for all the kernels in this stream to complete."""
+        super().synchronize()
+
+    @property
+    def _as_parameter_(self):
+        return ctypes.c_void_p(self.sycl_queue)
+
+    def __eq__(self, o):
+        if isinstance(o, Stream):
+            return super().__eq__(o)
+        return False
+
+    def __hash__(self):
+        return hash((self.sycl_queue, self.device))
+
+    def __repr__(self):
+        return f"torch.xpu.Stream(device={self.device} sycl_queue={self.sycl_queue:#x})"
+
+
+class Event(torch._C._XpuEventBase, _EventBase):
+    r"""Wrapper around a XPU event.
+
+    XPU events are synchronization markers that can be used to monitor the
+    device's progress, and to synchronize XPU streams.
+
+    The underlying XPU events are lazily initialized when the event is first
+    recorded. After creation, only streams on the same device may record the
+    event. However, streams on any device can wait on the event.
+
+    Args:
+        enable_timing (bool, optional): indicates if the event should measure time
+            (default: ``False``)
+    """
+
+    def __new__(cls, enable_timing=False):
+        return super().__new__(cls, enable_timing=enable_timing)
+
+    def record(self, stream=None) -> None:
+        r"""Record the event in a given stream.
+
+        Uses ``torch.xpu.current_stream()`` if no stream is specified. The
+        stream's device must match the event's device.
+        """
+        if stream is None:
+            stream = torch.xpu.current_stream()
+        super().record(stream)
+
+    def wait(self, stream=None) -> None:
+        r"""Make all future work submitted to the given stream wait for this event.
+
+        Use ``torch.xpu.current_stream()`` if no stream is specified.
+        """
+        if stream is None:
+            stream = torch.xpu.current_stream()
+        super().wait(stream)
+
+    def query(self) -> bool:
+        r"""Check if all work currently captured by event has completed.
+
+        Returns:
+            A boolean indicating if all work currently captured by event has
+            completed.
+        """
+        return super().query()
+
+    def elapsed_time(self, end_event):
+        r"""Return the time elapsed.
+
+        Time reported in milliseconds after the event was recorded and
+        before the end_event was recorded.
+        """
+        return super().elapsed_time(end_event)
+
+    def synchronize(self) -> None:
+        r"""Wait for the event to complete.
+
+        Waits until the completion of all work currently captured in this event.
+        This prevents the CPU thread from proceeding until the event completes.
+        """
+        super().synchronize()
+
+    @property
+    def _as_parameter_(self):
+        return ctypes.c_void_p(self.sycl_event)
+
+    def __repr__(self):
+        if self.sycl_event:
+            return f"torch.xpu.Event(sycl_event={self.sycl_event:#x})"
+        else:
+            return "torch.xpu.Event(uninitialized)"