I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/CachingHostAllocator.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/CachingHostAllocator.h
@ -0,0 +1,23 @@
+#pragma once
+
+#include <ATen/core/CachingHostAllocator.h>
+#include <ATen/xpu/XPUEvent.h>
+#include <c10/core/Allocator.h>
+#include <c10/xpu/XPUStream.h>
+
+namespace at::xpu {
+
+// Returns the c10::Allocator that HostAlloc() in this header allocates from.
+// Defined out of line.
+TORCH_XPU_API c10::Allocator* getCachingHostAllocator();
+
+// NOTE(review): presumably records an event on `stream` for the host block
+// identified by `ptr`/`ctx`; the meaning of the bool result is not visible
+// from this header -- confirm against the implementation.
+TORCH_XPU_API bool CachingHostAllocator_recordEvent(
+    void* ptr,
+    void* ctx,
+    c10::xpu::XPUStream stream);
+
+// NOTE(review): presumably releases the blocks cached by the host allocator
+// -- confirm against the implementation.
+TORCH_XPU_API void CachingHostAllocator_emptyCache();
+
+// Allocates `size` bytes through the allocator returned by
+// getCachingHostAllocator() and hands back the resulting DataPtr.
+inline TORCH_XPU_API at::DataPtr HostAlloc(size_t size) {
+  c10::Allocator* const host_allocator = getCachingHostAllocator();
+  return host_allocator->allocate(size);
+}
+
+} // namespace at::xpu
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/PinnedMemoryAllocator.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/PinnedMemoryAllocator.h
@ -0,0 +1,11 @@
+#pragma once
+
+#include <ATen/xpu/CachingHostAllocator.h>
+#include <c10/core/Allocator.h>
+
+namespace at::xpu {
+
+// The XPU pinned-memory allocator is the caching host allocator: this simply
+// forwards to getCachingHostAllocator().
+inline TORCH_XPU_API at::Allocator* getPinnedMemoryAllocator() {
+  at::Allocator* const pinned_allocator = getCachingHostAllocator();
+  return pinned_allocator;
+}
+} // namespace at::xpu
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/XPUContext.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/XPUContext.h
@ -0,0 +1,20 @@
+#pragma once
+
+#include <ATen/Context.h>
+#include <c10/xpu/XPUFunctions.h>
+#include <c10/xpu/XPUStream.h>
+
+namespace at::xpu {
+
+// Returns true exactly when c10::xpu::device_count() reports a positive
+// number of devices; this is a runtime query, not a compile-time flag.
+inline bool is_available() {
+  const auto num_devices = c10::xpu::device_count();
+  return num_devices > 0;
+}
+
+// Device property accessors, defined out of line.
+// NOTE(review): ownership and lifetime of the returned DeviceProp pointers
+// are not visible from this header -- confirm before caching or freeing them.
+TORCH_XPU_API DeviceProp* getCurrentDeviceProperties();
+
+// Same as above, for an explicitly given device index.
+TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device);
+
+// NOTE(review): presumably maps a device index to a global device index;
+// the numbering scheme is not visible here -- confirm against the
+// implementation.
+TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device);
+
+} // namespace at::xpu
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/XPUDevice.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/XPUDevice.h
@ -0,0 +1,13 @@
+#pragma once
+
+#include <ATen/Context.h>
+#include <c10/xpu/XPUFunctions.h>
+
+namespace at::xpu {
+
+// Builds an XPU Device whose index is whatever
+// c10::xpu::get_device_idx_from_pointer() reports for `ptr`.
+inline Device getDeviceFromPtr(void* ptr) {
+  const auto owning_index = c10::xpu::get_device_idx_from_pointer(ptr);
+  return Device{c10::DeviceType::XPU, owning_index};
+}
+
+} // namespace at::xpu
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/XPUEvent.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/XPUEvent.h
@ -0,0 +1,166 @@
+#pragma once
+#include <ATen/xpu/XPUContext.h>
+
+#include <optional>
+
+namespace at::xpu {
+
+/*
+ * XPUEvents are movable, non-copyable wrappers around a SYCL event. An
+ * XPUEvent is constructed lazily when first recorded. It has a device, and
+ * this device is acquired from the first recording stream. Later streams that
+ * record the event must match the same device.
+ *
+ * Currently, XPUEvent does NOT support exporting an inter-process event from
+ * another process via inter-process communication (IPC). This means that
+ * sharing event handles between different processes is not available, which
+ * could impact applications that rely on cross-process synchronization and
+ * communication.
+ */
+// Move-only handle to a SYCL barrier event recorded on an XPUStream.
+//
+// No sycl::event is held until the first record()/recordOnce(); isCreated()
+// reports whether one exists. Creation, record, wait, synchronization and
+// deletion are reported to the c10::impl::GPUTrace interpreter when one is
+// installed.
+struct TORCH_XPU_API XPUEvent {
+  // Constructors
+  // `enable_timing` is only stored here; it is consulted by elapsed_time().
+  XPUEvent(bool enable_timing = false) noexcept
+      : enable_timing_{enable_timing} {}
+
+  // Reports the deletion to the GPU trace hook if an event is currently held.
+  ~XPUEvent() {
+    if (isCreated()) {
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_deletion(
+            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
+      }
+    }
+  }
+
+  // Copying is disabled; ownership of the underlying event can only be moved.
+  XPUEvent(const XPUEvent&) = delete;
+  XPUEvent& operator=(const XPUEvent&) = delete;
+
+  XPUEvent(XPUEvent&& other) = default;
+  XPUEvent& operator=(XPUEvent&& other) = default;
+
+  // Implicit conversion to the underlying SYCL event. Same precondition as
+  // event(): the event must have been recorded.
+  operator sycl::event&() const {
+    return event();
+  }
+
+  // Device the event was first recorded on, or std::nullopt if the event has
+  // not been recorded yet.
+  std::optional<at::Device> device() const {
+    if (isCreated()) {
+      return at::Device(at::kXPU, device_index_);
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  // True once a sycl::event is held, i.e. after the first record().
+  inline bool isCreated() const {
+    return (event_.get() != nullptr);
+  }
+
+  // Raw device index; stays at its initial value of -1 until the first
+  // record().
+  DeviceIndex device_index() const {
+    return device_index_;
+  }
+
+  // Returns the underlying SYCL event. Fails with a TORCH_CHECK error when
+  // the event has not been recorded yet, rather than dereferencing the null
+  // event_ pointer (which would be undefined behavior).
+  sycl::event& event() const {
+    TORCH_CHECK(
+        isCreated(),
+        "XPUEvent has not been recorded yet; no SYCL event is available.");
+    return *event_;
+  }
+
+  // Returns true if no event has been recorded, or if the recorded event's
+  // command execution status is `complete`.
+  bool query() const {
+    using namespace sycl::info;
+    if (!isCreated()) {
+      return true;
+    }
+
+    return event().get_info<event::command_execution_status>() ==
+        event_command_status::complete;
+  }
+
+  // Records the event on the stream returned by getCurrentXPUStream().
+  void record() {
+    record(getCurrentXPUStream());
+  }
+
+  // Records the event on `stream` only if it has never been recorded.
+  void recordOnce(const XPUStream& stream) {
+    if (!isCreated()) {
+      record(stream);
+    }
+  }
+
+  // Records the event on `stream` by submitting a barrier to its queue.
+  //
+  // The first record binds the event to the stream's device and reports an
+  // event creation to the trace hook. Later records must come from a stream
+  // on the same device (TORCH_CHECK otherwise) and replace the held event.
+  // Every call reports an event record to the trace hook.
+  void record(const XPUStream& stream) {
+    if (!isCreated()) {
+      device_index_ = stream.device_index();
+      event_ = std::make_unique<sycl::event>(
+          stream.queue().ext_oneapi_submit_barrier());
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_creation(
+            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
+      }
+    } else {
+      TORCH_CHECK(
+          device_index_ == stream.device_index(),
+          "Event device ",
+          device_index_,
+          " does not match recording stream's device ",
+          stream.device_index(),
+          ".");
+      // Drop the previous event before submitting the new barrier.
+      event_.reset();
+      event_ = std::make_unique<sycl::event>(
+          stream.queue().ext_oneapi_submit_barrier());
+    }
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_record(
+          at::kXPU,
+          reinterpret_cast<uintptr_t>(event_.get()),
+          reinterpret_cast<uintptr_t>(&stream.queue()));
+    }
+  }
+
+  // Submits a barrier on `stream` that depends on this event. Does nothing
+  // if the event has not been recorded.
+  void block(const XPUStream& stream) {
+    if (isCreated()) {
+      std::vector<sycl::event> event_list{event()};
+      // Make this stream wait until event_ is completed.
+      stream.queue().ext_oneapi_submit_barrier(event_list);
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_wait(
+            at::kXPU,
+            reinterpret_cast<uintptr_t>(event_.get()),
+            reinterpret_cast<uintptr_t>(&stream.queue()));
+      }
+    }
+  }
+
+  // Not supported yet: after validating that both events are recorded,
+  // completed and were created with timing enabled, this always fails with
+  // TORCH_CHECK_NOT_IMPLEMENTED and therefore never returns a value.
+  float elapsed_time(const XPUEvent& other) const {
+    TORCH_CHECK(
+        isCreated() && other.isCreated(),
+        "Both events must be recorded before calculating elapsed time.");
+    TORCH_CHECK(
+        query() && other.query(),
+        "Both events must be completed before calculating elapsed time.");
+    TORCH_CHECK(
+        enable_timing_ && other.enable_timing_,
+        "Both events must be created with argument 'enable_timing=True'.");
+    // TODO: provides the ability to time the execution of commands in a SYCL
+    // queue without enabling profiling on the entire queue
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "elapsed_time is not supported by XPUEvent.");
+  }
+
+  // Reports the synchronization to the trace hook, then waits on the event
+  // via wait_and_throw(). Does nothing if the event has not been recorded.
+  void synchronize() const {
+    if (isCreated()) {
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_synchronization(
+            at::kXPU, reinterpret_cast<uintptr_t>(event_.get()));
+      }
+      event().wait_and_throw();
+    }
+  }
+
+ private:
+  // Only read by elapsed_time().
+  bool enable_timing_ = false;
+  // Set from the recording stream on the first record(); -1 before that.
+  DeviceIndex device_index_ = -1;
+  // Only need to track the last event, as events in an in-order queue are
+  // executed sequentially.
+  std::unique_ptr<sycl::event> event_;
+};
+
+} // namespace at::xpu
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/XPUGeneratorImpl.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/XPUGeneratorImpl.h
@ -0,0 +1,39 @@
+#pragma once
+
+#include <ATen/core/Generator.h>
+
+namespace at {
+
+// GeneratorImpl subclass for the XPU backend. Only declarations are visible
+// in this header; the member functions are defined out of line.
+struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl {
+  // Constructors
+  // NOTE(review): the default device_index of -1 presumably selects the
+  // current device -- confirm against the implementation.
+  XPUGeneratorImpl(DeviceIndex device_index = -1);
+  ~XPUGeneratorImpl() override = default;
+
+  // XPUGeneratorImpl methods
+  // Non-virtual clone returning the concrete type; presumably built on the
+  // private clone_impl() override below -- confirm.
+  std::shared_ptr<XPUGeneratorImpl> clone() const;
+  // Overrides of the GeneratorImpl seed/offset/state interface.
+  void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
+  // XPU-specific (non-virtual) Philox offset accessors.
+  void set_philox_offset_per_thread(uint64_t offset);
+  uint64_t philox_offset_per_thread() const;
+  // NOTE(review): presumably returns the (seed, offset) pair and advances the
+  // offset by `increment` -- confirm against the implementation.
+  std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);
+  static c10::DeviceType device_type();
+
+ private:
+  XPUGeneratorImpl* clone_impl() const override;
+  // Initialized to default_rng_seed_val.
+  uint64_t seed_ = default_rng_seed_val;
+  // Initialized to 0.
+  uint64_t philox_offset_per_thread_ = 0;
+};
+
+namespace xpu::detail {
+
+// NOTE(review): presumably returns the per-device default generator, with -1
+// meaning the current device -- confirm against the implementation.
+TORCH_XPU_API const Generator& getDefaultXPUGenerator(DeviceIndex device = -1);
+
+// Returns a Generator by value (unlike the reference returned above).
+// NOTE(review): presumably a freshly created generator for `device` -- confirm.
+TORCH_XPU_API Generator createXPUGenerator(DeviceIndex device = -1);
+
+} // namespace xpu::detail
+} // namespace at
--- a/rl/Lib/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h
+++ b/rl/Lib/site-packages/torch/include/ATen/xpu/detail/XPUHooks.h
@ -0,0 +1,26 @@
+#pragma once
+
+#include <ATen/detail/XPUHooksInterface.h>
+
+namespace at::xpu::detail {
+
+// The real implementation of XPUHooksInterface. Every member function here
+// overrides a virtual of at::XPUHooksInterface and is defined out of line.
+struct XPUHooks : public at::XPUHooksInterface {
+  // The XPUHooksArgs argument is accepted but unused.
+  XPUHooks(at::XPUHooksArgs) {}
+  void initXPU() const override;
+  bool hasXPU() const override;
+  std::string showConfig() const override;
+  int32_t getGlobalIdxFromDevice(const at::Device& device) const override;
+  // NOTE(review): default arguments on virtual functions are taken from the
+  // static type used at the call site; confirm these defaults match the ones
+  // declared in XPUHooksInterface.
+  Generator getXPUGenerator(DeviceIndex device_index = -1) const override;
+  const Generator& getDefaultXPUGenerator(
+      DeviceIndex device_index = -1) const override;
+  Device getDeviceFromPtr(void* data) const override;
+  c10::DeviceIndex getNumGPUs() const override;
+  DeviceIndex current_device() const override;
+  void deviceSynchronize(DeviceIndex device_index) const override;
+  Allocator* getPinnedMemoryAllocator() const override;
+  bool isPinnedPtr(const void* data) const override;
+  bool hasPrimaryContext(DeviceIndex device_index) const override;
+};
+
+} // namespace at::xpu::detail