I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File: c10/xpu/XPUCachingAllocator.h

@@ -0,0 +1,27 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/xpu/XPUStream.h>
namespace c10::xpu::XPUCachingAllocator {
C10_XPU_API Allocator* get();
C10_XPU_API void init(DeviceIndex device_count);
C10_XPU_API void emptyCache();
C10_XPU_API void resetPeakStats(DeviceIndex device);
C10_XPU_API void resetAccumulatedStats(DeviceIndex device);
C10_XPU_API c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
DeviceIndex device);
C10_XPU_API void* raw_alloc(size_t size);
C10_XPU_API void raw_delete(void* ptr);
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
} // namespace c10::xpu::XPUCachingAllocator
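A minimal usage sketch of this allocator interface, assuming an initialized XPU runtime with at least one device; the demo function name is illustrative:

#include <c10/xpu/XPUCachingAllocator.h>
#include <c10/xpu/XPUFunctions.h>

void allocator_demo() {
  namespace alloc = c10::xpu::XPUCachingAllocator;
  alloc::init(c10::xpu::device_count()); // set up per-device pools once
  void* buf = alloc::raw_alloc(1024);    // device memory, served from the cache
  // If `buf` were consumed on a non-default stream, recordStream() (which
  // takes the owning c10::DataPtr) would keep the block out of the free pool
  // until that stream's pending work completes.
  alloc::raw_delete(buf); // return the block to the cache, not the driver
  alloc::emptyCache();    // release all cached, unused blocks to the driver
}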

View File: c10/xpu/XPUDeviceProp.h

@@ -0,0 +1,188 @@
#pragma once
#include <c10/xpu/XPUMacros.h>
#include <sycl/sycl.hpp>
namespace c10::xpu {
#define AT_FORALL_XPU_DEVICE_PROPERTIES(_) \
/* the device name of this SYCL device. */ \
_(name) \
\
/* the device type associated with the device. */ \
_(device_type) \
\
/* the vendor of this SYCL device. */ \
_(vendor) \
\
/* a backend-defined driver version as a std::string. */ \
_(driver_version) \
\
/* the SYCL version as a std::string in the form <major>.<minor> */ \
_(version) \
\
/* true if the SYCL device is available. Otherwise, return false. */ \
_(is_available) \
\
/* the maximum size in bytes of the arguments that can be passed to a \
* kernel. */ \
_(max_parameter_size) \
\
/* the number of parallel compute units available to the device. */ \
_(max_compute_units) \
\
/* the maximum dimensions that specify the global and local work-item IDs \
* used by the data parallel execution model. */ \
_(max_work_item_dimensions) \
\
/* the maximum number of work-items that are permitted in a work-group \
* executing a kernel on a single compute unit. */ \
_(max_work_group_size) \
\
/* the maximum number of subgroups in a work-group for any kernel executed \
* on the device. */ \
_(max_num_sub_groups) \
\
/* a std::vector of size_t containing the set of sub-group sizes supported \
* by the device. */ \
_(sub_group_sizes) \
\
/* the maximum configured clock frequency of this SYCL device in MHz. */ \
_(max_clock_frequency) \
\
/* the default compute device address space size specified as an unsigned \
* integer value in bits. Must return either 32 or 64. */ \
_(address_bits) \
\
/* the maximum size of memory object allocation in bytes. */ \
_(max_mem_alloc_size) \
\
/* the minimum value in bits of the largest supported SYCL built-in data \
* type if this SYCL device is not of device type \
* sycl::info::device_type::custom. */ \
_(mem_base_addr_align) \
\
/* a std::vector of info::fp_config describing the half/single/double \
* precision floating-point capability of this SYCL device. */ \
_(half_fp_config) \
_(single_fp_config) \
_(double_fp_config) \
\
/* the size of global device memory in bytes. */ \
_(global_mem_size) \
\
/* the type of global memory cache supported. */ \
_(global_mem_cache_type) \
\
/* the size of global memory cache in bytes. */ \
_(global_mem_cache_size) \
\
/* the size of global memory cache line in bytes. */ \
_(global_mem_cache_line_size) \
\
/* the type of local memory supported. */ \
_(local_mem_type) \
\
/* the size of local memory arena in bytes. */ \
_(local_mem_size) \
\
/* the maximum number of sub-devices that can be created when this device is \
* partitioned. */ \
_(partition_max_sub_devices) \
\
/* the resolution of device timer in nanoseconds. */ \
_(profiling_timer_resolution) \
\
/* the preferred native vector width size for built-in scalar types that can \
* be put into vectors. */ \
_(preferred_vector_width_char) \
_(preferred_vector_width_short) \
_(preferred_vector_width_int) \
_(preferred_vector_width_long) \
_(preferred_vector_width_float) \
_(preferred_vector_width_double) \
_(preferred_vector_width_half) \
\
/* the native ISA vector width. The vector width is defined as the number of \
* scalar elements that can be stored in the vector. */ \
_(native_vector_width_char) \
_(native_vector_width_short) \
_(native_vector_width_int) \
_(native_vector_width_long) \
_(native_vector_width_float) \
_(native_vector_width_double) \
_(native_vector_width_half)
#define AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(_) \
/* the number of EUs associated with the Intel GPU. */ \
_(gpu_eu_count, 512) \
\
/* the number of EUs in a subslice. */ \
_(gpu_eu_count_per_subslice, 8) \
\
/* the SIMD width of an EU on the GPU. */ \
_(gpu_eu_simd_width, 8) \
\
/* the number of hardware threads per EU on the GPU. */ \
_(gpu_hw_threads_per_eu, 8)
#define AT_FORALL_XPU_DEVICE_ASPECT(_) \
/* sycl::half is supported on device. */ \
_(fp16) \
\
/* double is supported on device. */ \
_(fp64) \
\
/* 64-bit atomic operation is supported on device. */ \
_(atomic64)
#define AT_FORALL_XPU_EXP_CL_ASPECT(_) \
/* conversion between single-precision 32-bit floating-point values and \
* 16-bit bfloat16 values is supported on device. */ \
_(bfloat16_conversions) \
\
/* specialized hardware to compute MMA is supported on device. */ \
_(subgroup_matrix_multiply_accumulate) \
\
/* specialized hardware to compute MMA for 32-bit floating-point is \
* supported on device. */ \
_(subgroup_matrix_multiply_accumulate_tensor_float32) \
\
/* block read operations for efficient matrix multiplication are supported on \
* device. */ \
_(subgroup_2d_block_io)
#define _DEFINE_SYCL_PROP(ns, property, member) \
ns::property::return_type member;
#define DEFINE_DEVICE_PROP(property) \
_DEFINE_SYCL_PROP(sycl::info::device, property, property)
#define DEFINE_PLATFORM_PROP(property, member) \
_DEFINE_SYCL_PROP(sycl::info::platform, property, member)
#define DEFINE_EXT_DEVICE_PROP(property, ...) \
_DEFINE_SYCL_PROP(sycl::ext::intel::info::device, property, property)
#define DEFINE_DEVICE_ASPECT(member) bool has_##member;
struct C10_XPU_API DeviceProp {
AT_FORALL_XPU_DEVICE_PROPERTIES(DEFINE_DEVICE_PROP);
// the platform name.
DEFINE_PLATFORM_PROP(name, platform_name);
AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP);
AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT);
AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT);
};
#undef _DEFINE_SYCL_PROP
#undef DEFINE_DEVICE_PROP
#undef DEFINE_PLATFORM_PROP
#undef DEFINE_EXT_DEVICE_PROP
#undef DEFINE_DEVICE_ASPECT
} // namespace c10::xpu
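For orientation, a sketch of what the X-macros generate: each DEFINE_DEVICE_PROP(property) invocation declares one strongly typed member via the descriptor's return_type, so DeviceProp has roughly this shape (abridged; types for the Intel extension properties are assumptions):

#include <cstdint>
#include <string>

struct DevicePropSketch {      // hypothetical name, abridged shape of DeviceProp
  std::string name;            // sycl::info::device::name::return_type
  uint32_t max_compute_units;  // sycl::info::device::max_compute_units::return_type
  std::string platform_name;   // from DEFINE_PLATFORM_PROP(name, platform_name)
  uint32_t gpu_eu_count;       // from DEFINE_EXT_DEVICE_PROP (assumed integral type)
  bool has_fp16, has_fp64, has_atomic64; // from DEFINE_DEVICE_ASPECT
};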

View File: c10/xpu/XPUException.h

@@ -0,0 +1,22 @@
#pragma once
#include <c10/util/Exception.h>
#include <sycl/sycl.hpp>
namespace c10::xpu {
static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) {
if (el.size() == 0) {
return;
}
for (const auto& e : el) {
try {
std::rethrow_exception(e);
} catch (sycl::exception& e) {
TORCH_WARN("SYCL Exception: ", e.what());
}
}
  // No exception is in flight at this point, so this bare rethrow calls
  // std::terminate(): asynchronous SYCL errors are warned about above and
  // then treated as fatal.
  throw;
};
} // namespace c10::xpu
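Since SYCL surfaces asynchronous errors only through a handler supplied at queue construction, a queue wired to this handler would look roughly like this (a sketch; device selection simplified):

#include <c10/xpu/XPUException.h>
#include <sycl/sycl.hpp>

sycl::queue make_queue(const sycl::device& dev) {
  // Errors raised by already-submitted work reach asyncHandler at the next
  // synchronization point, e.g. queue::wait_and_throw().
  return sycl::queue(dev, c10::xpu::asyncHandler);
}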

View File: c10/xpu/XPUFunctions.h

@@ -0,0 +1,35 @@
#pragma once
#include <c10/core/Device.h>
#include <c10/xpu/XPUDeviceProp.h>
#include <c10/xpu/XPUMacros.h>
// The naming convention used here matches that of torch.xpu.
namespace c10::xpu {
// Log a warning only once if no devices are detected.
C10_XPU_API DeviceIndex device_count();
// Throws an error if no devices are detected.
C10_XPU_API DeviceIndex device_count_ensure_non_zero();
C10_XPU_API DeviceIndex current_device();
C10_XPU_API void set_device(DeviceIndex device);
C10_XPU_API DeviceIndex exchange_device(DeviceIndex device);
C10_XPU_API DeviceIndex maybe_exchange_device(DeviceIndex to_device);
C10_XPU_API sycl::device& get_raw_device(DeviceIndex device);
C10_XPU_API sycl::context& get_device_context();
C10_XPU_API void get_device_properties(
DeviceProp* device_prop,
DeviceIndex device);
C10_XPU_API DeviceIndex get_device_idx_from_pointer(void* ptr);
} // namespace c10::xpu
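A short sketch composing these functions, assuming at least one visible XPU device; the function name is illustrative:

#include <c10/xpu/XPUFunctions.h>

void enumerate_devices() {
  const auto n = c10::xpu::device_count_ensure_non_zero(); // throws if none
  for (c10::DeviceIndex i = 0; i < n; ++i) {
    c10::xpu::DeviceProp prop;
    c10::xpu::get_device_properties(&prop, i);
    // prop.name, prop.global_mem_size, prop.has_fp64, ... are now populated.
  }
  // Switch to device 0, then restore whatever device was current before.
  const auto prev = c10::xpu::exchange_device(0);
  c10::xpu::set_device(prev);
}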

View File: c10/xpu/XPUMacros.h

@@ -0,0 +1,33 @@
#pragma once
#ifndef C10_USING_CUSTOM_GENERATED_MACROS
#include <c10/xpu/impl/xpu_cmake_macros.h>
#endif
// See c10/macros/Export.h for a detailed explanation of what the function
// of these macros is. We need one set of macros for every separate library
// we build.
#ifdef _WIN32
#if defined(C10_XPU_BUILD_SHARED_LIBS)
#define C10_XPU_EXPORT __declspec(dllexport)
#define C10_XPU_IMPORT __declspec(dllimport)
#else
#define C10_XPU_EXPORT
#define C10_XPU_IMPORT
#endif
#else // _WIN32
#if defined(__GNUC__)
#define C10_XPU_EXPORT __attribute__((__visibility__("default")))
#else // defined(__GNUC__)
#define C10_XPU_EXPORT
#endif // defined(__GNUC__)
#define C10_XPU_IMPORT C10_XPU_EXPORT
#endif // _WIN32
// This macro is used by libc10_xpu.so
#ifdef C10_XPU_BUILD_MAIN_LIB
#define C10_XPU_API C10_XPU_EXPORT
#else
#define C10_XPU_API C10_XPU_IMPORT
#endif
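To make the branches concrete, a sketch of how one declaration resolves (the symbol name is hypothetical):

#define C10_XPU_BUILD_MAIN_LIB
#include <c10/xpu/XPUMacros.h>
// Inside libc10_xpu (macro defined), C10_XPU_API is C10_XPU_EXPORT:
// __declspec(dllexport) on Windows, default visibility on Linux/GCC.
// In a consumer translation unit (macro undefined) it is C10_XPU_IMPORT,
// which on Windows becomes __declspec(dllimport) and on non-Windows
// platforms is defined to the same thing as the export form.
C10_XPU_API void hypothetical_symbol(); // hypothetical declaration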

View File: c10/xpu/XPUStream.h

@@ -0,0 +1,189 @@
#pragma once
#include <c10/core/Stream.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/xpu/XPUFunctions.h>
namespace c10::xpu {
/*
* Note [Stream Management]
*
 * An XPUStream is an abstraction of an actual SYCL queue in which SYCL
 * kernels can execute. Currently, there are several pools per device to
 * manage SYCL queues, and a device's pools are lazily created.
*
* There are two pools per device. The first pool contains "normal priority"
 * queues. The second pool is the "high priority" queues. There are 32 queues
 * per pool per device, and when a queue is requested one of these queues is
* returned round-robin. That is, the first queue requested is at index 0, the
* second at index 1... to index 31, then index 0 again.
*
* This means that if 33 queues are requested, the first and last queues
* requested are actually the same queue (under the covers) and kernels enqueued
* on them cannot run concurrently.
*
* It is safe to enqueue a kernel on the same queue from two different
 * threads, as the SYCL specification describes.
*/
static constexpr int max_compile_time_stream_priorities = 2;
/*
* This serves as a wrapper around c10::Stream and acts as a representation for
* a SYCL queue, which allows asynchronous execution of XPU tasks.
*/
class C10_XPU_API XPUStream {
public:
enum Unchecked { UNCHECKED };
/// Construct an XPUStream from a Stream. This construction is checked, and
/// will raise an error if the Stream is not, in fact, an XPU stream.
explicit XPUStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::XPU);
}
/// Construct an XPUStream from a Stream with no error checking.
explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const XPUStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const XPUStream& other) const noexcept {
return unwrap() != other.unwrap();
}
/// Implicit conversion to sycl::queue&.
operator sycl::queue&() const {
return queue();
}
/// Implicit conversion to Stream (a.k.a., forget that the stream is an
/// XPU stream).
operator Stream() const {
return unwrap();
}
/// Get the XPU device type that this stream is associated with.
DeviceType device_type() const {
return DeviceType::XPU;
}
/// Get the XPU device index that this stream is associated with.
DeviceIndex device_index() const {
return stream_.device_index();
}
/// Get the full Device that this stream is associated with. The Device is
/// guaranteed to be an XPU device.
Device device() const {
return Device(DeviceType::XPU, device_index());
}
/// Return the stream ID corresponding to this particular stream. A StreamId
/// is an int64_t generated from the stream's type and index.
StreamId id() const {
return stream_.id();
}
/// Return true if all enqueued tasks in this stream have been completed,
/// otherwise return false.
bool query() const {
return queue().ext_oneapi_empty();
}
/// Performs a blocking wait for the completion of all enqueued tasks in this
/// stream.
void synchronize() const {
queue().wait_and_throw();
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_stream_synchronization(
c10::kXPU, reinterpret_cast<uintptr_t>(&queue()));
}
}
/// Return the priority that this stream is associated with. Lower numbers
/// represent higher priority.
int priority() const;
/// Explicit conversion to sycl::queue&.
sycl::queue& queue() const;
/// Explicit conversion to Stream.
Stream unwrap() const {
return stream_;
}
/// Reversibly pack an XPUStream into a struct representation. The XPUStream
/// can be unpacked using unpack3().
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
/// Unpack a XPUStream from the 3 fields generated by pack3().
static XPUStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return XPUStream(Stream::unpack3(stream_id, device_index, device_type));
}
/// Return the range of priority **supported by PyTorch**.
static std::tuple<int, int> priority_range() {
return std::make_tuple(0, -max_compile_time_stream_priorities + 1);
}
private:
Stream stream_;
};
/**
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream from the highest priority pool by setting
* isHighPriority to true for a specific device.
*/
C10_XPU_API XPUStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
/**
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream by setting a priority value for a specific device.
 * The lower the priority number, the higher the priority.
*/
C10_XPU_API XPUStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
/**
* Get the current XPU stream, for the passed XPU device, or for the current
* device if no device index is passed.
*/
C10_XPU_API XPUStream getCurrentXPUStream(DeviceIndex device = -1);
/**
* Set the current stream on the device of the passed in stream to be the passed
* in stream.
*/
C10_XPU_API void setCurrentXPUStream(XPUStream stream);
C10_XPU_API std::ostream& operator<<(std::ostream& stream, const XPUStream& s);
/**
* Block all reserved SYCL queues in the stream pools on the device, and wait
 * for them to synchronize.
*/
C10_XPU_API void syncStreamsOnDevice(DeviceIndex device = -1);
} // namespace c10::xpu
namespace std {
template <>
struct hash<c10::xpu::XPUStream> {
size_t operator()(c10::xpu::XPUStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std
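A short sketch of the stream API in use, assuming an XPU device is present; the function name is illustrative:

#include <c10/xpu/XPUStream.h>

void stream_demo() {
  // Streams come from per-device pools in round-robin order; see
  // Note [Stream Management] above. After 32 requests a pool wraps around.
  c10::xpu::XPUStream s = c10::xpu::getStreamFromPool(/*isHighPriority=*/true);
  c10::xpu::setCurrentXPUStream(s);
  sycl::queue& q = s; // implicit conversion to the underlying SYCL queue
  (void)q;            // enqueue SYCL work here
  if (!s.query()) {   // work still pending?
    s.synchronize();  // block until this stream drains
  }
  c10::xpu::syncStreamsOnDevice(); // or drain every pooled queue on the device
}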

View File: c10/xpu/impl/XPUGuardImpl.h

@@ -0,0 +1,179 @@
#pragma once
#include <c10/core/DeviceGuard.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/xpu/XPUCachingAllocator.h>
#include <c10/xpu/XPUFunctions.h>
#include <c10/xpu/XPUStream.h>
#include <vector>
namespace c10::xpu::impl {
struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
static constexpr DeviceType static_type = kXPU;
XPUGuardImpl() = default;
explicit XPUGuardImpl(DeviceType t) {
TORCH_INTERNAL_ASSERT(t == kXPU);
}
DeviceType type() const override {
return kXPU;
}
Device exchangeDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_xpu());
const auto old_device_index = c10::xpu::exchange_device(d.index());
return Device(kXPU, old_device_index);
}
Device getDevice() const override {
const auto device = c10::xpu::current_device();
return Device(kXPU, device);
}
void setDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_xpu());
c10::xpu::set_device(d.index());
}
void uncheckedSetDevice(Device d) const noexcept override {
c10::xpu::set_device(d.index());
}
Stream getStream(Device d) const noexcept override {
return getCurrentXPUStream(d.index()).unwrap();
}
Stream getNewStream(Device d, int priority = 0) const override {
return getStreamFromPool(priority, d.index());
}
Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
const override {
return getStreamFromPool(isHighPriority, d.index());
}
// NB: These do NOT set the current device
Stream exchangeStream(Stream s) const noexcept override {
const XPUStream stream(s);
const auto old_stream = getCurrentXPUStream(s.device().index());
setCurrentXPUStream(stream);
return old_stream.unwrap();
}
DeviceIndex deviceCount() const noexcept override {
return c10::xpu::device_count();
}
// Event-related functions
void destroyEvent(void* event, const DeviceIndex device_index)
const noexcept override {
if (!event)
return;
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(
c10::kXPU, reinterpret_cast<uintptr_t>(event));
}
delete reinterpret_cast<sycl::event*>(event);
}
void record(
void** event,
const Stream& stream,
const DeviceIndex device_index,
const EventFlag flag) const override {
TORCH_CHECK(
device_index == -1 || device_index == stream.device_index(),
"Event device index ",
device_index,
" does not match recording stream's device index ",
stream.device_index(),
".");
auto* xpu_event = reinterpret_cast<sycl::event*>(*event);
const XPUStream xpu_stream{stream};
// Delete the event previously recorded.
if (xpu_event)
delete xpu_event;
xpu_event = new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier());
*event = reinterpret_cast<void*>(xpu_event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(
c10::kXPU,
reinterpret_cast<uintptr_t>(xpu_event),
reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
}
}
void block(void* event, const Stream& stream) const override {
if (!event)
return;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
std::vector<sycl::event> event_list{*xpu_event};
const XPUStream xpu_stream(stream);
xpu_stream.queue().ext_oneapi_submit_barrier(event_list);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(
c10::kXPU,
reinterpret_cast<uintptr_t>(xpu_event),
reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
}
}
bool queryEvent(void* event) const override {
using namespace sycl::info;
if (!event)
return true;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
return xpu_event->get_info<event::command_execution_status>() ==
event_command_status::complete;
}
// Stream-related functions
bool queryStream(const Stream& stream) const override {
const XPUStream xpu_stream{stream};
return xpu_stream.query();
}
void synchronizeStream(const Stream& stream) const override {
const XPUStream xpu_stream{stream};
xpu_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kXPU, reinterpret_cast<uintptr_t>(xpu_event));
}
xpu_event->wait_and_throw();
}
void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
const override {
const XPUStream xpu_stream{stream};
XPUCachingAllocator::recordStream(data_ptr, xpu_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "elapsedTime is not supported by XPU backend.");
}
};
} // namespace c10::xpu::impl
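For context, a sketch of how device-generic code reaches this impl once it is registered for kXPU (the registration lives elsewhere, outside this diff); the function name is illustrative:

#include <c10/core/DeviceGuard.h>

void run_on_device_one() {
  // Constructing the guard calls XPUGuardImpl::exchangeDevice() for
  // device 1; the destructor restores the previous device via setDevice().
  c10::DeviceGuard guard(c10::Device(c10::kXPU, 1));
  // ... enqueue work on device 1 ...
}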