I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

@@ -0,0 +1,31 @@
#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#endif
namespace c10::cuda {
#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS
template <typename Iter, typename Scalar>
__forceinline__ __device__ Iter
lower_bound(Iter start, Iter end, Scalar value) {
return thrust::lower_bound(thrust::device, start, end, value);
}
#else
// thrust::lower_bound is broken on device, see
// https://github.com/NVIDIA/thrust/issues/1734 Implementation inspired by
// https://github.com/pytorch/pytorch/blob/805120ab572efef66425c9f595d9c6c464383336/aten/src/ATen/native/cuda/Bucketization.cu#L28
template <typename Iter, typename Scalar>
__device__ Iter lower_bound(Iter start, Iter end, Scalar value) {
while (start < end) {
auto mid = start + ((end - start) >> 1);
if (*mid < value) {
start = mid + 1;
} else {
end = mid;
}
}
return end;
}
#endif // THRUST_DEVICE_LOWER_BOUND_WORKS
} // namespace c10::cuda
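// Illustrative sketch (not part of this header): how the device-side
// lower_bound helper above might be used from a kernel to find the insertion
// index of `value` in a sorted array. The kernel name and launch geometry are
// hypothetical.
__global__ void find_insertion_index_example(
    const float* sorted, int n, float value, int* out_index) {
  const float* it = c10::cuda::lower_bound(sorted, sorted + n, value);
  *out_index = static_cast<int>(it - sorted);
}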

@@ -0,0 +1,124 @@
#pragma once
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
namespace c10::cuda::CUDACachingAllocator {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
static size_t max_split_size() {
return instance().m_max_split_size;
}
static double garbage_collection_threshold() {
return instance().m_garbage_collection_threshold;
}
static bool expandable_segments() {
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
if (instance().m_expandable_segments) {
TORCH_WARN_ONCE("expandable_segments not supported on this platform");
}
return false;
#else
return instance().m_expandable_segments;
#endif
}
static bool release_lock_on_cudamalloc() {
return instance().m_release_lock_on_cudamalloc;
}
/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
}
static size_t pinned_num_register_threads() {
return instance().m_pinned_num_register_threads;
}
static size_t pinned_max_register_threads() {
// Based on benchmark results, we see better allocation performance with
// 8 threads. However, future systems may need more threads, so we cap the
// limit at 128 threads.
return 128;
}
// This is used to round up allocation sizes to the nearest power-of-2 division.
// See roundup_power2_next_division for more details.
// As an example, if we want 4 divisions between successive powers of 2, this can
// be done using the env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size);
static std::vector<size_t> roundup_power2_divisions() {
return instance().m_roundup_power2_divisions;
}
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
return instance().m_last_allocator_settings;
}
static CUDAAllocatorConfig& instance() {
static CUDAAllocatorConfig* s_instance = ([]() {
auto inst = new CUDAAllocatorConfig();
const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
inst->parseArgs(env);
return inst;
})();
return *s_instance;
}
void parseArgs(const char* env);
private:
CUDAAllocatorConfig();
static void lexArgs(const char* env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
size_t parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);
std::atomic<size_t> m_max_split_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<bool> m_expandable_segments;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
};
// General caching allocator utilities
C10_CUDA_API void setAllocatorSettings(const std::string& env);
} // namespace c10::cuda::CUDACachingAllocator
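// Illustrative sketch (not part of this header): reading a few of the knobs
// parsed from PYTORCH_CUDA_ALLOC_CONF, e.g.
// PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256,roundup_power2_divisions:4.
// The function name is hypothetical.
inline void inspect_allocator_config_example() {
  using c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig;
  const size_t max_split = CUDAAllocatorConfig::max_split_size();
  const double gc_threshold =
      CUDAAllocatorConfig::garbage_collection_threshold();
  const std::string settings = CUDAAllocatorConfig::last_allocator_settings();
  (void)max_split;
  (void)gc_threshold;
  (void)settings;
}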

@@ -0,0 +1,499 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
namespace c10 {
// The caching allocator will execute every registered callback if it is unable
// to find a free block inside the already allocated area.
class C10_CUDA_API FreeMemoryCallback {
public:
virtual ~FreeMemoryCallback() = default;
virtual bool Execute() = 0;
};
C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
} // namespace c10
//
// TODO: Turn this into an honest to goodness class. I briefly attempted to do
// this, but it was a bit irritating to figure out how to also correctly
// apply pimpl pattern so I didn't have to leak any internal implementation
// details in the header (CUDACachingAllocator could be made a pimpl, but
// you also need to appropriately define a class which is a subclass
// of Allocator. Not impossible, but required a bit more surgery than
// I wanted to do at the time.)
//
// Why is this using a namespace rather than old-style THCCachingAllocator_
// prefix? Mostly because it made the HIPify rules easier to write; _ is
// not counted as a word boundary, so you would otherwise have to list each
// of these functions.
namespace c10::cuda::CUDACachingAllocator {
// Preserved only for BC reasons
// NOLINTNEXTLINE(misc-unused-using-decls)
using c10::CachingDeviceAllocator::DeviceStats;
extern const size_t kLargeBuffer;
typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
// Struct containing info of an allocation block (i.e. a fractional part of a
// cudaMalloc).
struct BlockInfo {
size_t size = 0;
size_t requested_size = 0;
int32_t gc_counter = 0;
bool allocated = false;
bool active = false;
std::shared_ptr<GatheredContext>
context_when_allocated; // per-watcher context
};
// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
struct SegmentInfo {
c10::DeviceIndex device = 0;
size_t address = 0;
size_t total_size = 0;
size_t requested_size = 0; // unrounded, actually requested size
size_t allocated_size = 0;
size_t active_size = 0;
cudaStream_t stream = nullptr;
bool is_large = false;
bool is_expandable = false;
MempoolId_t owner_private_pool_id = {0, 0};
std::vector<BlockInfo> blocks;
std::shared_ptr<GatheredContext> context_when_allocated;
};
struct AllocatorState {
virtual ~AllocatorState() = default;
};
union trace_time_ {
time_t t_;
approx_time_t approx_t_;
};
struct TraceEntry {
enum Action {
ALLOC, // API call made to the caching allocator for new memory
FREE_REQUESTED, // API call made to the caching allocator to free memory
FREE_COMPLETED, // The allocator might have to delay a free because
// it is still in use on another stream via record_stream
// This event is generated when a free actually completes.
SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
// defragment or empty_caches)
SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments)
SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments)
SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
// events
OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
// bytes reported by cuda)
};
TraceEntry(
Action action,
c10::DeviceIndex device,
size_t addr,
size_t size,
cudaStream_t stream,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr)
: action_(action),
device_(device),
addr_(addr),
context_(std::move(context)),
stream_(stream),
size_(size) {
time_.approx_t_ = time;
}
Action action_;
c10::DeviceIndex device_;
size_t addr_; // for OOM, this is the amount of free bytes reported by cuda
std::shared_ptr<GatheredContext> context_;
cudaStream_t stream_{};
size_t size_;
trace_time_ time_{};
};
// Calls made by record_function will save annotations
struct AnnotationEntry {
AnnotationEntry(c10::DeviceIndex device, approx_time_t time)
: device_(device) {
time_.approx_t_ = time;
}
void recordUserMetadata(const std::string& name, std::string value) {
metadata_[name] = std::move(value);
}
c10::DeviceIndex device_;
trace_time_ time_{};
std::unordered_map<std::string, std::string> metadata_;
};
struct AllocatorConfigInfo {
double garbage_collection_threshold;
size_t max_split_size;
size_t pinned_num_register_threads;
bool expandable_segments;
bool release_lock_on_malloc;
bool pinned_use_host_register;
std::string last_allocator_settings;
std::vector<size_t> roundup_power2_divisions;
};
struct SnapshotInfo {
std::vector<SegmentInfo> segments;
std::vector<std::vector<TraceEntry>> device_traces;
std::vector<AnnotationEntry> external_annotations;
AllocatorConfigInfo config_metadata;
};
// returns the pointers freed in the pool
// and the pointers allocated. Note: a pointer
// may appear in both freed and allocated
struct CheckpointDelta {
std::vector<void*> ptrs_freed;
std::vector<at::DataPtr> dataptrs_allocd;
};
enum struct RecordContext {
NEVER = 0,
STATE = 1, // only keep stacks for active allocations
ALLOC = 2, // additionally keep stacks for allocations in the trace history
ALL = 3, // additionally record stacks for when something is freed
};
using OutOfMemoryObserver = std::function<void(
int64_t device,
size_t allocated,
size_t device_total,
size_t device_free)>;
using AllocatorTraceTracker = std::function<void(const TraceEntry&)>;
struct ShareableHandle {
ptrdiff_t offset;
std::string handle;
};
class CUDAAllocator : public Allocator {
public:
virtual void* raw_alloc(size_t nbytes) = 0;
virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
virtual void raw_delete(void* ptr) = 0;
virtual void init(int device_count) = 0;
virtual bool initialized() = 0;
virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
virtual void emptyCache() = 0;
virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device) = 0;
virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
virtual SnapshotInfo snapshot() = 0;
virtual void beginAllocateToPool(
c10::DeviceIndex device,
MempoolId_t mempool_id,
std::function<bool(cudaStream_t)> filter) = 0;
virtual void endAllocateToPool(
c10::DeviceIndex device,
MempoolId_t mempool_id) = 0;
virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0;
// returns true if the allocated blocks are equal to expected live allocations
virtual bool checkPoolLiveAllocations(
c10::DeviceIndex device,
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) {
TORCH_CHECK(
false,
name(),
" does not yet support checkPoolLiveAllocations. "
"If you need it, please file an issue describing your use case.");
}
virtual ShareableHandle shareIpcHandle(void* ptr) = 0;
virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
virtual bool isHistoryEnabled() {
TORCH_CHECK(
false,
name(),
" does not yet support recordHistory. "
"If you need it, please file an issue describing your use case.");
}
virtual void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
RecordContext when) = 0;
virtual void recordAnnotation(
const std::vector<std::pair<std::string, std::string>>& md){};
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
// per-device allocator lock is held. Any additional locks taken from within
// the callback must be proven to always have the lock order that never
// triggers a deadlock. In particular, Python's GIL may be held when
// calling the allocator so it is unsafe to try to acquire the GIL in this
// callback.
virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0;
virtual void enablePeerAccess(
c10::DeviceIndex dev,
c10::DeviceIndex dev_to_access) = 0;
// memory not allocated from cudaMalloc cannot be copied
// across devices using cudaMemcpyAsync if peer to peer access is disabled.
// instead it requires cudaMemcpyAsyncPeer
// with P2P Enabled, all combinations work
// with P2P Disabled:
// cudaMalloc cudaMallocAsync/cuMemMap
// cudaMemcpyAsyncPeer works works
// cudaMemcpyAsync works error
// This function chooses to use the Peer version of memcpy if required,
// based on the devices on which dst/src were allocated.
virtual cudaError_t memcpyAsync(
void* dst,
int dstDevice,
const void* src,
int srcDevice,
size_t count,
cudaStream_t stream,
bool p2p_enabled) = 0;
virtual std::shared_ptr<AllocatorState> getCheckpointState(
c10::DeviceIndex device,
MempoolId_t id) = 0;
virtual CheckpointDelta setCheckpointPoolState(
c10::DeviceIndex device,
std::shared_ptr<AllocatorState> pps) = 0;
virtual std::string name() = 0;
};
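// Illustrative sketch (not part of this header): attaching a trace tracker to a
// CUDAAllocator so every allocator event can be observed. As noted above
// attachAllocatorTraceTracker, the callback runs while the per-device allocator
// lock is held, so it must not acquire locks (or the GIL) that could invert the
// lock order. The function name is hypothetical.
inline void attach_logging_tracker_example(CUDAAllocator* alloc) {
  alloc->attachAllocatorTraceTracker([](const TraceEntry& e) {
    // e.action_ identifies the event (ALLOC, FREE_COMPLETED, SEGMENT_ALLOC, ...);
    // e.size_ and e.addr_ describe the block involved.
    (void)e;
  });
}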
// Allocator object, statically initialized
// See BackendInitializer in CUDACachingAllocator.cpp.
// Atomic loads on x86 are just normal loads,
// (atomic stores are different), so reading this value
// is no different than loading a pointer.
C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;
inline CUDAAllocator* get() {
return allocator.load();
}
// Called directly by clients.
inline void* raw_alloc(size_t nbytes) {
return get()->raw_alloc(nbytes);
}
inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
return get()->raw_alloc_with_stream(nbytes, stream);
}
inline void raw_delete(void* ptr) {
return get()->raw_delete(ptr);
}
inline void init(int device_count) {
return get()->init(device_count);
}
inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
return get()->setMemoryFraction(fraction, device);
}
inline void emptyCache() {
return get()->emptyCache();
}
inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
return get()->cacheInfo(device, largestBlock);
}
inline void* getBaseAllocation(void* ptr, size_t* size) {
return get()->getBaseAllocation(ptr, size);
}
inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
return get()->recordStream(dataPtr, stream);
}
inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device) {
return get()->getDeviceStats(device);
}
inline void resetAccumulatedStats(c10::DeviceIndex device) {
return get()->resetAccumulatedStats(device);
}
inline void resetPeakStats(c10::DeviceIndex device) {
return get()->resetPeakStats(device);
}
inline SnapshotInfo snapshot() {
return get()->snapshot();
}
inline std::shared_ptr<AllocatorState> getCheckpointState(
c10::DeviceIndex device,
MempoolId_t id) {
return get()->getCheckpointState(device, id);
}
inline CheckpointDelta setCheckpointPoolState(
c10::DeviceIndex device,
std::shared_ptr<AllocatorState> pps) {
return get()->setCheckpointPoolState(device, std::move(pps));
}
// CUDAGraph interactions
inline void beginAllocateToPool(
c10::DeviceIndex device,
MempoolId_t mempool_id,
std::function<bool(cudaStream_t)> filter) {
get()->beginAllocateToPool(device, mempool_id, std::move(filter));
}
inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
get()->endAllocateToPool(device, mempool_id);
}
inline void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
RecordContext when) {
return get()->recordHistory(
enabled, context_recorder, alloc_trace_max_entries, when);
}
inline void recordAnnotation(
const std::vector<std::pair<std::string, std::string>>& md) {
return get()->recordAnnotation(md);
}
inline bool isHistoryEnabled() {
return get()->isHistoryEnabled();
}
inline bool checkPoolLiveAllocations(
c10::DeviceIndex device,
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) {
return get()->checkPoolLiveAllocations(
device, mempool_id, expected_live_allocations);
}
inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
return get()->attachOutOfMemoryObserver(std::move(observer));
}
inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
return get()->attachAllocatorTraceTracker(std::move(tracker));
}
inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
return get()->releasePool(device, mempool_id);
}
// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
return get()->getIpcDevPtr(std::move(handle));
}
inline ShareableHandle shareIpcHandle(void* ptr) {
return get()->shareIpcHandle(ptr);
}
inline std::string name() {
return get()->name();
}
inline cudaError_t memcpyAsync(
void* dst,
int dstDevice,
const void* src,
int srcDevice,
size_t count,
cudaStream_t stream,
bool p2p_enabled) {
return get()->memcpyAsync(
dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
}
inline void enablePeerAccess(
c10::DeviceIndex dev,
c10::DeviceIndex dev_to_access) {
return get()->enablePeerAccess(dev, dev_to_access);
}
} // namespace c10::cuda::CUDACachingAllocator
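// Illustrative sketch (not part of this header): a raw allocation round-trip
// through the convenience wrappers above. raw_alloc hands out device memory on
// the current device and raw_delete typically returns it to the allocator's
// cache rather than freeing it immediately. Function name and size are
// hypothetical.
inline void raw_alloc_roundtrip_example() {
  void* p = c10::cuda::CUDACachingAllocator::raw_alloc(1024);
  // ... use `p` on the current stream ...
  c10::cuda::CUDACachingAllocator::raw_delete(p);
}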
namespace c10::cuda {
// MemPool represents a pool of memory in a caching allocator. Currently,
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
//
// An allocator pointer can be passed to the MemPool to define how the
// allocations should be done in the pool. For example: using a different
// system allocator such as ncclMemAlloc.
struct C10_CUDA_API MemPool {
MemPool(
CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
bool is_user_created = true);
MempoolId_t id();
CUDACachingAllocator::CUDAAllocator* allocator();
private:
static std::atomic<CaptureId_t> uid_;
static std::atomic<CaptureId_t> uuid_;
CUDACachingAllocator::CUDAAllocator* allocator_;
bool is_user_created_;
MempoolId_t id_;
};
// MemPoolContext holds the currently active pool and stashes the previous
// pool. On deletion it makes the previous pool active.
struct C10_CUDA_API MemPoolContext {
MemPoolContext(MemPool* mempool);
~MemPoolContext();
// getActiveMemPool() can be used to get the currently active pool.
// For instance: in CUDACachingAllocator, we can route allocations
// to a user provided allocator, by doing:
//
// auto active_pool = MemPoolContext::getActiveMemPool();
// if (active_pool && active_pool->allocator()) {
// ptr = active_pool->allocator()->raw_alloc(size);
// }
//
static MemPool* getActiveMemPool();
private:
MemPool* prev_mempool_;
};
} // namespace c10::cuda
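// Illustrative sketch (not part of this header): creating a user MemPool and
// making it the active pool for a scope via MemPoolContext, so that the caching
// allocator can route allocations made in that scope to the pool (see the
// getActiveMemPool() comment above). The function name is hypothetical.
inline void mempool_scope_example() {
  c10::cuda::MemPool pool; // defaults to the built-in caching allocator
  c10::cuda::MemPoolContext ctx(&pool);
  // ... allocations made here may be routed to `pool` ...
} // the previous active pool is restored when `ctx` is destroyed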

@@ -0,0 +1,96 @@
#pragma once
#include <c10/cuda/CUDAException.h>
#include <c10/macros/Macros.h>
namespace c10::cuda {
#ifdef TORCH_USE_CUDA_DSA
// Copy string from `src` to `dst`
static __device__ void dstrcpy(char* dst, const char* src) {
int i = 0;
// Copy string from source to destination, ensuring that it
// isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
*dst++ = *src++;
}
*dst = '\0';
}
static __device__ void dsa_add_new_assertion_failure(
DeviceAssertionsData* assertions_data,
const char* assertion_msg,
const char* filename,
const char* function_name,
const int line_number,
const uint32_t caller,
const dim3 block_id,
const dim3 thread_id) {
// `assertions_data` may be nullptr if device-side assertion checking
// is disabled at run-time. If it is disabled at compile time this
// function will never be called
if (!assertions_data) {
return;
}
// Atomically increment so other threads can fail at the same time
// Note that incrementing this means that the CPU can observe that
// a failure has happened and can begin to respond before we've
// written information about that failure out to the buffer.
const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
// At this point we've run out of assertion buffer space.
// We could print a message about this, but that'd get
// spammy if a lot of threads did it, so we just silently
// ignore any other assertion failures. In most cases the
// failures will all probably be analogous anyway.
return;
}
// Write information about the assertion failure to memory.
// Note that this occurs only after the `assertion_count`
// increment broadcasts that there's been a problem.
auto& self = assertions_data->assertions[nid];
dstrcpy(self.assertion_msg, assertion_msg);
dstrcpy(self.filename, filename);
dstrcpy(self.function_name, function_name);
self.line_number = line_number;
self.caller = caller;
self.block_id[0] = block_id.x;
self.block_id[1] = block_id.y;
self.block_id[2] = block_id.z;
self.thread_id[0] = thread_id.x;
self.thread_id[1] = thread_id.y;
self.thread_id[2] = thread_id.z;
}
// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
// so you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
// arguments of the kernel and therefore accessible.
#define CUDA_KERNEL_ASSERT2(condition) \
do { \
if (C10_UNLIKELY(!(condition))) { \
/* Has an atomic element so threads can fail at the same time */ \
c10::cuda::dsa_add_new_assertion_failure( \
assertions_data, \
C10_STRINGIZE(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
assertion_caller_id, \
blockIdx, \
threadIdx); \
/* Now that the kernel has failed we early exit the kernel, but */ \
/* otherwise keep going and rely on the host to check UVM and */ \
/* determine we've had a problem */ \
return; \
} \
} while (false)
#else
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif
} // namespace c10::cuda
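// Illustrative sketch (not part of this header): a kernel instrumented for
// device-side assertions. TORCH_DSA_KERNEL_ARGS (from CUDADeviceAssertionHost.h,
// pulled in via CUDAException.h) injects the `assertions_data` and
// `assertion_caller_id` parameters that CUDA_KERNEL_ASSERT2 expects. The kernel
// name and arguments are hypothetical.
__global__ void dsa_example_kernel(const int* data, int n, TORCH_DSA_KERNEL_ARGS) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Records a failure (without stopping the kernel) when data[i] is negative.
    CUDA_KERNEL_ASSERT2(data[i] >= 0);
  }
}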

@@ -0,0 +1,164 @@
#pragma once
#include <c10/cuda/CUDAMacros.h>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>
#ifdef USE_CUDA
#define TORCH_USE_CUDA_DSA
#endif
/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
namespace c10::cuda {
/// Holds information about any device-side assertions that fail.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionData {
/// Stringification of the assertion
// NOLINTNEXTLINE(*-c-arrays)
char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
/// File the assertion was in
// NOLINTNEXTLINE(*-c-arrays)
char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
/// Name of the function the assertion was in
// NOLINTNEXTLINE(*-c-arrays)
char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
/// Line number the assertion was at
int line_number{};
/// Number uniquely identifying the kernel launch that triggered the assertion
uint32_t caller{};
/// block_id of the thread that failed the assertion
// NOLINTNEXTLINE(*-c-arrays)
int32_t block_id[3]{};
/// thread_id of the thread that failed the assertion
// NOLINTNEXTLINE(*-c-arrays)
int32_t thread_id[3]{};
};
/// Used to hold assertions generated by the device
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
/// Total number of assertions found; a subset of these will be recorded
/// in `assertions`
int32_t assertion_count{};
/// An array of assertions that will be written to in a race-free manner
// NOLINTNEXTLINE(*-c-arrays)
DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
};
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures
struct CUDAKernelLaunchInfo {
/// Filename of the code where the kernel was launched from
const char* launch_filename;
/// Function from which the kernel was launched
const char* launch_function;
/// Line number of where the code was launched from
uint32_t launch_linenum;
/// Backtrace of where the kernel was launched from, only populated if
/// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
std::string launch_stacktrace;
/// Kernel that was launched
const char* kernel_name;
/// Device the kernel was launched on
int device;
/// Stream the kernel was launched on
int32_t stream;
/// A number that uniquely identifies the kernel launch
uint64_t generation_number;
};
/// Circular buffer used to hold information about kernel launches. This is
/// later used to reconstruct how a device-side kernel assertion failure
/// occurred. CUDAKernelLaunchRegistry is used as a singleton.
class C10_CUDA_API CUDAKernelLaunchRegistry {
private:
/// Assume that this is the max number of kernel launches that might ever be
/// enqueued across all streams on a single device
static constexpr int max_kernel_launches = 1024;
/// How many kernel launch infos we've inserted. Used to ensure that circular
/// queue doesn't provide false information by always increasing, but also to
/// mark where we are inserting into the queue
#ifdef TORCH_USE_CUDA_DSA
uint64_t generation_number = 0;
#endif
/// Shared mutex between writer and accessor to ensure multi-threaded safety.
mutable std::mutex read_write_mutex;
/// Used to prevent race conditions in GPU memory allocation
mutable std::mutex gpu_alloc_mutex;
/// Pointer to managed memory keeping track of device-side assertions. There
/// is one entry for each possible device the process might work with. Unused
/// entries are nullptrs. We could also use an unordered_set here, but this
/// vector design will be faster and the wasted memory is small since we
/// expect the number of GPUs per node will always be small
std::vector<
std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
uvm_assertions;
/// A single circular buffer holds information about every kernel launch the
/// process makes across all devices.
std::vector<CUDAKernelLaunchInfo> kernel_launches;
bool check_env_for_enable_launch_stacktracing() const;
bool check_env_for_dsa_enabled() const;
public:
CUDAKernelLaunchRegistry();
/// Register a new kernel launch and obtain a generation number back to be
/// passed to the kernel
uint32_t insert(
const char* launch_filename,
const char* launch_function,
const uint32_t launch_linenum,
const char* kernel_name,
const int32_t stream_id);
/// Get copies of the kernel launch registry and each device's assertion
/// failure buffer so they can be inspected without raising race conditions
std::
pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
snapshot() const;
/// Get a pointer to the current device's assertion failure buffer. If no such
/// buffer exists then one is created. This means that the first kernel launch
/// made on each device will be slightly slower because memory allocations are
/// required
DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
/// Gets the global singleton of the registry
static CUDAKernelLaunchRegistry& get_singleton_ref();
/// If not all devices support DSA, we disable it
const bool do_all_devices_support_managed_memory = false;
/// Whether or not to gather stack traces when launching kernels
bool gather_launch_stacktrace = false;
/// Whether or not host-side DSA is enabled or disabled at run-time
/// Note: Device-side code cannot be enabled/disabled at run-time
bool enabled_at_runtime = false;
/// Whether or not a device has indicated a failure
bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
const bool enabled_at_compile_time = true;
#else
const bool enabled_at_compile_time = false;
#endif
};
std::string c10_retrieve_device_side_assertion_info();
} // namespace c10::cuda
// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these.
#define TORCH_DSA_KERNEL_ARGS \
[[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
[[maybe_unused]] uint32_t assertion_caller_id
// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id

@@ -0,0 +1,100 @@
#pragma once
#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAMiscFunctions.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <cuda.h>
// Note [CHECK macro]
// ~~~~~~~~~~~~~~~~~~
// This is a macro so that AT_ERROR can get accurate __LINE__
// and __FILE__ information. We could split this into a short
// macro and a function implementation if we pass along __LINE__
// and __FILE__, but no one has found this worth doing.
// Used to denote errors from CUDA framework.
// This needs to be declared here instead of in util/Exception.h for proper
// conversion during hipify.
namespace c10 {
class C10_CUDA_API CUDAError : public c10::Error {
using Error::Error;
};
} // namespace c10
#define C10_CUDA_CHECK(EXPR) \
do { \
const cudaError_t __err = EXPR; \
c10::cuda::c10_cuda_check_implementation( \
static_cast<int32_t>(__err), \
__FILE__, \
__func__, /* Line number data type not well-defined between \
compilers, so we perform an explicit cast */ \
static_cast<uint32_t>(__LINE__), \
true); \
} while (0)
#define C10_CUDA_CHECK_WARN(EXPR) \
do { \
const cudaError_t __err = EXPR; \
if (C10_UNLIKELY(__err != cudaSuccess)) { \
auto error_unused C10_UNUSED = cudaGetLastError(); \
(void)error_unused; \
TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
} \
} while (0)
// Indicates that a CUDA error is handled in a non-standard way
#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR
// Intentionally ignore a CUDA error
#define C10_CUDA_IGNORE_ERROR(EXPR) \
do { \
const cudaError_t __err = EXPR; \
if (C10_UNLIKELY(__err != cudaSuccess)) { \
cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \
(void)error_unused; \
} \
} while (0)
// Clear the last CUDA error
#define C10_CUDA_CLEAR_ERROR() \
do { \
cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \
(void)error_unused; \
} while (0)
// This should be used directly after every kernel launch to ensure
// the launch happened correctly and provide an early, close-to-source
// diagnostic if it didn't.
#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError())
/// Launches a CUDA kernel, appending to it all the information needed to handle
/// device-side assertion failures. Checks that the launch was successful.
#define TORCH_DSA_KERNEL_LAUNCH( \
kernel, blocks, threads, shared_mem, stream, ...) \
do { \
auto& launch_registry = \
c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref(); \
kernel<<<blocks, threads, shared_mem, stream>>>( \
__VA_ARGS__, \
launch_registry.get_uvm_assertions_ptr_for_current_device(), \
launch_registry.insert( \
__FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \
C10_CUDA_KERNEL_LAUNCH_CHECK(); \
} while (0)
namespace c10::cuda {
/// In the event of a CUDA failure, formats a nice error message about that
/// failure and also checks for device-side assertion failures
C10_CUDA_API void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const int line_number,
const bool include_device_assertions);
} // namespace c10::cuda
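// Illustrative launch-site sketch (not part of this header): launching a
// DSA-instrumented kernel, e.g. one declared as
//   __global__ void dsa_example_kernel(const int* data, int n, TORCH_DSA_KERNEL_ARGS);
// TORCH_DSA_KERNEL_LAUNCH appends the assertion buffer and a per-launch id to
// the user arguments, then runs C10_CUDA_KERNEL_LAUNCH_CHECK. Names and launch
// geometry are hypothetical; `stream` is a c10::cuda::CUDAStream.
//
//   TORCH_DSA_KERNEL_LAUNCH(
//       dsa_example_kernel,
//       /*blocks=*/32,
//       /*threads=*/256,
//       /*shared_mem=*/0,
//       stream,
//       data,
//       n);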

@@ -0,0 +1,116 @@
#pragma once
// This header provides C++ wrappers around commonly used CUDA API functions.
// The benefit of using C++ here is that we can raise an exception in the
// event of an error, rather than explicitly pass around error codes. This
// leads to more natural APIs.
//
// The naming convention used here matches the naming convention of torch.cuda
#include <c10/core/Device.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <cuda_runtime_api.h>
namespace c10::cuda {
// NB: In the past, we were inconsistent about whether or not this reported
// an error if there were driver problems or not. Based on experience
// interacting with users, it seems that people basically ~never want this
// function to fail; it should just return zero if things are not working.
// Oblige them.
// It still might log a warning the first time it's invoked.
C10_CUDA_API DeviceIndex device_count() noexcept;
// Version of device_count that throws if no devices are detected
C10_CUDA_API DeviceIndex device_count_ensure_non_zero();
C10_CUDA_API DeviceIndex current_device();
C10_CUDA_API void set_device(DeviceIndex device);
C10_CUDA_API void device_synchronize();
C10_CUDA_API void warn_or_error_on_sync();
// Raw CUDA device management functions
C10_CUDA_API cudaError_t GetDeviceCount(int* dev_count);
C10_CUDA_API cudaError_t GetDevice(DeviceIndex* device);
C10_CUDA_API cudaError_t SetDevice(DeviceIndex device);
C10_CUDA_API cudaError_t MaybeSetDevice(DeviceIndex device);
C10_CUDA_API DeviceIndex ExchangeDevice(DeviceIndex device);
C10_CUDA_API DeviceIndex MaybeExchangeDevice(DeviceIndex device);
C10_CUDA_API void SetTargetDevice();
enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
// this is a holder for c10 global state (similar to at GlobalContext)
// currently it's used to store cuda synchronization warning state,
// but can be expanded to hold other related global state, e.g. to
// record stream usage
class WarningState {
public:
void set_sync_debug_mode(SyncDebugMode l) {
sync_debug_mode = l;
}
SyncDebugMode get_sync_debug_mode() {
return sync_debug_mode;
}
private:
SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED;
};
C10_CUDA_API __inline__ WarningState& warning_state() {
static WarningState warning_state_;
return warning_state_;
}
// the subsequent functions are defined in the header because for performance
// reasons we want them to be inline
C10_CUDA_API void __inline__ memcpy_and_sync(
void* dst,
const void* src,
int64_t nbytes,
cudaMemcpyKind kind,
cudaStream_t stream) {
if (C10_UNLIKELY(
warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
warn_or_error_on_sync();
}
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_stream_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(stream));
}
#if defined(TORCH_HIP_VERSION) && (TORCH_HIP_VERSION >= 301)
C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
#else
C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
C10_CUDA_CHECK(cudaStreamSynchronize(stream));
#endif
}
C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) {
if (C10_UNLIKELY(
warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
warn_or_error_on_sync();
}
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_stream_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(stream));
}
C10_CUDA_CHECK(cudaStreamSynchronize(stream));
}
C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index);
C10_CUDA_API std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext();
} // namespace c10::cuda
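// Illustrative sketch (not part of this header): a blocking host-to-device copy
// using the helper above, plus opting into warnings whenever a synchronizing
// call is made. The function name and arguments are hypothetical.
inline void copy_to_device_blocking_example(
    void* device_dst,
    const void* host_src,
    int64_t nbytes,
    cudaStream_t stream) {
  // Warn (instead of staying silent) on synchronizing calls from now on.
  c10::cuda::warning_state().set_sync_debug_mode(
      c10::cuda::SyncDebugMode::L_WARN);
  // Asynchronous copy followed by a stream synchronization (see memcpy_and_sync).
  c10::cuda::memcpy_and_sync(
      device_dst, host_src, nbytes, cudaMemcpyHostToDevice, stream);
}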

@@ -0,0 +1,77 @@
#pragma once
#include <c10/cuda/CUDAStream.h>
#include <iostream>
#include <utility>
// CUDA Graphs utils used by c10 and aten.
// aten/cuda/CUDAGraphsUtils.cuh adds utils used by aten only.
namespace c10::cuda {
using CaptureId_t = unsigned long long;
// first is set if the instance is created by CUDAGraph::capture_begin.
// second is set if the instance is created by at::cuda::graph_pool_handle.
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
// RAII guard for "cudaStreamCaptureMode", a thread-local value
// that controls the error-checking strictness of a capture.
struct C10_CUDA_API CUDAStreamCaptureModeGuard {
CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
: strictness_(desired) {
C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_));
}
~CUDAStreamCaptureModeGuard() {
C10_CUDA_CHECK_WARN(cudaThreadExchangeStreamCaptureMode(&strictness_));
}
private:
cudaStreamCaptureMode strictness_;
};
// Protects against enum cudaStreamCaptureStatus implementation changes.
// Some compilers seem not to like static_assert without the messages.
static_assert(
int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) == 0,
"unexpected int(cudaStreamCaptureStatusNone) value");
static_assert(
int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive) == 1,
"unexpected int(cudaStreamCaptureStatusActive) value");
static_assert(
int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2,
"unexpected int(cudaStreamCaptureStatusInvalidated) value");
enum class CaptureStatus : int {
None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone),
Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive),
Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated)
};
inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
switch (status) {
case CaptureStatus::None:
os << "cudaStreamCaptureStatusNone";
break;
case CaptureStatus::Active:
os << "cudaStreamCaptureStatusActive";
break;
case CaptureStatus::Invalidated:
os << "cudaStreamCaptureStatusInvalidated";
break;
default:
TORCH_INTERNAL_ASSERT(
false, "Unknown CUDA graph CaptureStatus", int(status));
}
return os;
}
// Use this version where you're sure a CUDA context exists already.
inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
C10_CUDA_CHECK(
cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
return CaptureStatus(is_capturing);
}
} // namespace c10::cuda
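// Illustrative sketch (not part of this header): deciding whether it is safe to
// issue a synchronizing operation, i.e. the current stream is not being
// captured into a CUDA graph. The function name is hypothetical.
inline bool is_safe_to_synchronize_example() {
  return c10::cuda::currentStreamCaptureStatusMayInitCtx() ==
      c10::cuda::CaptureStatus::None;
}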

@@ -0,0 +1,301 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/core/impl/InlineDeviceGuard.h>
#include <c10/core/impl/InlineStreamGuard.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/impl/CUDAGuardImpl.h>
namespace c10::cuda {
// This code is kind of boilerplatey. See Note [Whither the DeviceGuard
// boilerplate]
/// A variant of DeviceGuard that is specialized for CUDA. It accepts
/// integer indices (interpreting them as CUDA devices) and is a little
/// more efficient than DeviceGuard (it compiles to straight line
/// cudaSetDevice/cudaGetDevice calls); however, it can only be used
/// from code that links against CUDA directly.
struct CUDAGuard {
/// No default constructor; see Note [Omitted default constructor from RAII]
explicit CUDAGuard() = delete;
/// Set the current CUDA device to the passed device index.
explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {}
/// Sets the current CUDA device to the passed device. Errors if the passed
/// device is not a CUDA device.
explicit CUDAGuard(Device device) : guard_(device) {}
// Copy is not allowed
CUDAGuard(const CUDAGuard&) = delete;
CUDAGuard& operator=(const CUDAGuard&) = delete;
// Move is not allowed (there is no uninitialized state)
CUDAGuard(CUDAGuard&& other) = delete;
CUDAGuard& operator=(CUDAGuard&& other) = delete;
/// Sets the CUDA device to the given device. Errors if the given device
/// is not a CUDA device.
void set_device(Device device) {
guard_.set_device(device);
}
/// Sets the CUDA device to the given device. Errors if the given device
/// is not a CUDA device. (This method is provided for uniformity with
/// DeviceGuard).
void reset_device(Device device) {
guard_.reset_device(device);
}
/// Sets the CUDA device to the given device index.
void set_index(DeviceIndex device_index) {
guard_.set_index(device_index);
}
/// Returns the device that was set upon construction of the guard
Device original_device() const {
return guard_.original_device();
}
/// Returns the last device that was set via `set_device`, if any, otherwise
/// the device passed during construction.
Device current_device() const {
return guard_.current_device();
}
private:
/// The guard for the current device.
c10::impl::InlineDeviceGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of OptionalDeviceGuard that is specialized for CUDA. See
/// CUDAGuard for when you can use this.
struct OptionalCUDAGuard {
/// Create an uninitialized OptionalCUDAGuard.
explicit OptionalCUDAGuard() : guard_() {}
/// Set the current CUDA device to the passed Device, if it is not nullopt.
explicit OptionalCUDAGuard(std::optional<Device> device_opt)
: guard_(device_opt) {}
/// Set the current CUDA device to the passed device index, if it is not
/// nullopt
explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt)
: guard_(device_index_opt) {}
// Copy is not allowed
OptionalCUDAGuard(const OptionalCUDAGuard&) = delete;
OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete;
// See Note [Move construction for RAII guards is tricky]
OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete;
// See Note [Move assignment for RAII guards is tricky]
OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete;
/// Sets the CUDA device to the given device, initializing the guard if it
/// is not already initialized. Errors if the given device is not a CUDA
/// device.
void set_device(Device device) {
guard_.set_device(device);
}
/// Sets the CUDA device to the given device, initializing the guard if it is
/// not already initialized. Errors if the given device is not a CUDA device.
/// (This method is provided for uniformity with OptionalDeviceGuard).
void reset_device(Device device) {
guard_.reset_device(device);
}
/// Sets the CUDA device to the given device index, initializing the guard if
/// it is not already initialized.
void set_index(DeviceIndex device_index) {
guard_.set_index(device_index);
}
/// Returns the device that was set immediately prior to initialization of the
/// guard, or nullopt if the guard is uninitialized.
std::optional<Device> original_device() const {
return guard_.original_device();
}
/// Returns the most recent device that was set using this device guard,
/// either from construction, or via set_device, if the guard is initialized,
/// or nullopt if the guard is uninitialized.
std::optional<Device> current_device() const {
return guard_.current_device();
}
/// Restore the original CUDA device, resetting this guard to uninitialized
/// state.
void reset() {
guard_.reset();
}
private:
c10::impl::InlineOptionalDeviceGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of StreamGuard that is specialized for CUDA. See CUDAGuard
/// for when you can use this.
struct CUDAStreamGuard {
/// No default constructor, see Note [Omitted default constructor from RAII]
explicit CUDAStreamGuard() = delete;
/// Set the current CUDA device to the device associated with the passed
/// stream, and set the current CUDA stream on that device to the passed
/// stream. Errors if the Stream is not a CUDA stream.
explicit CUDAStreamGuard(Stream stream) : guard_(stream) {}
/// Copy is disallowed
CUDAStreamGuard(const CUDAStreamGuard&) = delete;
CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete;
/// Move is disallowed, as CUDAStreamGuard does not have an uninitialized
/// state, which is required for moves on types with nontrivial destructors.
CUDAStreamGuard(CUDAStreamGuard&& other) = delete;
CUDAStreamGuard& operator=(CUDAStreamGuard&& other) = delete;
/// Resets the currently set stream to the original stream and
/// the currently set device to the original device. Then,
/// set the current device to the device associated with the passed stream,
/// and set the current stream on that device to the passed stream.
/// Errors if the stream passed is not a CUDA stream.
///
/// NOTE: this implementation may skip some stream/device setting if
/// it can prove that it is unnecessary.
///
/// WARNING: reset_stream does NOT preserve previously set streams on
/// different devices. If you need to set streams on multiple devices
/// on CUDA, use CUDAMultiStreamGuard instead.
void reset_stream(Stream stream) {
guard_.reset_stream(stream);
}
/// Returns the CUDA stream that was set at the time the guard was
/// constructed.
CUDAStream original_stream() const {
return CUDAStream(CUDAStream::UNCHECKED, guard_.original_stream());
}
/// Returns the most recent CUDA stream that was set using this device guard,
/// either from construction, or via set_stream.
CUDAStream current_stream() const {
return CUDAStream(CUDAStream::UNCHECKED, guard_.current_stream());
}
/// Returns the most recent CUDA device that was set using this device guard,
/// either from construction, or via set_device/reset_device/set_index.
Device current_device() const {
return guard_.current_device();
}
/// Returns the CUDA device that was set at the most recent reset_stream(),
/// or otherwise the device at construction time.
Device original_device() const {
return guard_.original_device();
}
private:
c10::impl::InlineStreamGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of OptionalStreamGuard that is specialized for CUDA. See
/// CUDAGuard for when you can use this.
struct OptionalCUDAStreamGuard {
/// Create an uninitialized guard.
explicit OptionalCUDAStreamGuard() : guard_() {}
/// Set the current CUDA device to the device associated with the passed
/// stream, and set the current CUDA stream on that device to the passed
/// stream. Errors if the Stream is not a CUDA stream.
explicit OptionalCUDAStreamGuard(Stream stream) : guard_(stream) {}
/// Set the current device to the device associated with the passed stream,
/// and set the current stream on that device to the passed stream,
/// if the passed stream is not nullopt.
explicit OptionalCUDAStreamGuard(std::optional<Stream> stream_opt)
: guard_(stream_opt) {}
/// Copy is disallowed
OptionalCUDAStreamGuard(const OptionalCUDAStreamGuard&) = delete;
OptionalCUDAStreamGuard& operator=(const OptionalCUDAStreamGuard&) = delete;
// See Note [Move construction for RAII guards is tricky]
OptionalCUDAStreamGuard(OptionalCUDAStreamGuard&& other) = delete;
// See Note [Move assignment for RAII guards is tricky]
OptionalCUDAStreamGuard& operator=(OptionalCUDAStreamGuard&& other) = delete;
/// Resets the currently set CUDA stream to the original stream and
/// the currently set device to the original device. Then,
/// set the current device to the device associated with the passed stream,
/// and set the current stream on that device to the passed stream.
/// Initializes the guard if it was not previously initialized.
void reset_stream(Stream stream) {
guard_.reset_stream(stream);
}
/// Returns the CUDA stream that was set at the time the guard was most
/// recently initialized, or nullopt if the guard is uninitialized.
std::optional<CUDAStream> original_stream() const {
auto r = guard_.original_stream();
if (r.has_value()) {
return std::make_optional(CUDAStream(CUDAStream::UNCHECKED, r.value()));
} else {
return std::nullopt;
}
}
/// Returns the most recent CUDA stream that was set using this stream guard,
/// either from construction, or via reset_stream, if the guard is
/// initialized, or nullopt if the guard is uninitialized.
std::optional<CUDAStream> current_stream() const {
auto r = guard_.current_stream();
if (r.has_value()) {
return std::make_optional(CUDAStream(CUDAStream::UNCHECKED, r.value()));
} else {
return std::nullopt;
}
}
/// Restore the original CUDA device and stream, resetting this guard to
/// uninitialized state.
void reset() {
guard_.reset();
}
private:
c10::impl::InlineOptionalStreamGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of MultiStreamGuard that is specialized for CUDA.
struct CUDAMultiStreamGuard {
explicit CUDAMultiStreamGuard(ArrayRef<CUDAStream> streams)
: guard_(unwrapStreams(streams)) {}
/// Copy is disallowed
CUDAMultiStreamGuard(const CUDAMultiStreamGuard&) = delete;
CUDAMultiStreamGuard& operator=(const CUDAMultiStreamGuard&) = delete;
// See Note [Move construction for RAII guards is tricky]
CUDAMultiStreamGuard(CUDAMultiStreamGuard&& other) = delete;
// See Note [Move assignment for RAII guards is tricky]
CUDAMultiStreamGuard& operator=(CUDAMultiStreamGuard&& other) = delete;
private:
c10::impl::InlineMultiStreamGuard<impl::CUDAGuardImpl> guard_;
static std::vector<Stream> unwrapStreams(ArrayRef<CUDAStream> cudaStreams) {
std::vector<Stream> streams;
streams.reserve(cudaStreams.size());
for (const CUDAStream& cudaStream : cudaStreams) {
streams.push_back(cudaStream);
}
return streams;
}
};
} // namespace c10::cuda
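// Illustrative sketch (not part of this header): scoping device and stream
// changes with the guards above. The function name is hypothetical; the
// original device and stream are restored when the guard is destroyed.
inline void run_on_stream_example(c10::cuda::CUDAStream stream) {
  // Sets the current device to the stream's device and makes `stream` current.
  c10::cuda::CUDAStreamGuard guard(stream);
  // ... enqueue work on the now-current stream ...
}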

@@ -0,0 +1,51 @@
#pragma once
#ifndef C10_USING_CUSTOM_GENERATED_MACROS
// We have not yet modified the AMD HIP build to generate this file so
// we add an extra option to specifically ignore it.
#ifndef C10_CUDA_NO_CMAKE_CONFIGURE_FILE
#include <c10/cuda/impl/cuda_cmake_macros.h>
#endif // C10_CUDA_NO_CMAKE_CONFIGURE_FILE
#endif
// See c10/macros/Export.h for a detailed explanation of what the function
// of these macros are. We need one set of macros for every separate library
// we build.
#ifdef _WIN32
#if defined(C10_CUDA_BUILD_SHARED_LIBS)
#define C10_CUDA_EXPORT __declspec(dllexport)
#define C10_CUDA_IMPORT __declspec(dllimport)
#else
#define C10_CUDA_EXPORT
#define C10_CUDA_IMPORT
#endif
#else // _WIN32
#if defined(__GNUC__)
#define C10_CUDA_EXPORT __attribute__((__visibility__("default")))
#else // defined(__GNUC__)
#define C10_CUDA_EXPORT
#endif // defined(__GNUC__)
#define C10_CUDA_IMPORT C10_CUDA_EXPORT
#endif // _WIN32
// This one is being used by libc10_cuda.so
#ifdef C10_CUDA_BUILD_MAIN_LIB
#define C10_CUDA_API C10_CUDA_EXPORT
#else
#define C10_CUDA_API C10_CUDA_IMPORT
#endif
/**
* The maximum number of GPUs that we recognize. Increasing this beyond the
* initial limit of 16 broke Caffe2 testing, hence the ifdef guards.
* This value cannot be more than 128 because our DeviceIndex is a uint8_t.
*/
#ifdef FBCODE_CAFFE2
// fbcode depends on this value being 16
#define C10_COMPILE_TIME_MAX_GPUS 16
#else
#define C10_COMPILE_TIME_MAX_GPUS 120
#endif

@@ -0,0 +1,152 @@
#pragma once
/* This file defines math functions compatible across different gpu
* platforms (currently CUDA and HIP).
*/
#if defined(__CUDACC__) || defined(__HIPCC__)
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __HIPCC__
#define __MATH_FUNCTIONS_DECL__ inline C10_DEVICE
#else /* __HIPCC__ */
#ifdef __CUDACC_RTC__
#define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE
#else /* __CUDACC_RTC__ */
#define __MATH_FUNCTIONS_DECL__ inline C10_HOST_DEVICE
#endif /* __CUDACC_RTC__ */
#endif /* __HIPCC__ */
namespace c10::cuda::compat {
__MATH_FUNCTIONS_DECL__ float abs(float x) {
return ::fabsf(x);
}
__MATH_FUNCTIONS_DECL__ double abs(double x) {
return ::fabs(x);
}
__MATH_FUNCTIONS_DECL__ float exp(float x) {
return ::expf(x);
}
__MATH_FUNCTIONS_DECL__ double exp(double x) {
return ::exp(x);
}
__MATH_FUNCTIONS_DECL__ float ceil(float x) {
return ::ceilf(x);
}
__MATH_FUNCTIONS_DECL__ double ceil(double x) {
return ::ceil(x);
}
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return ::copysignf(x, y);
#else
// std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
// (e.g. Jetson), see PyTorch PR #51834
// This host function needs to be here for the compiler but is never used
TORCH_INTERNAL_ASSERT(
false, "CUDAMathCompat copysign should not run on the CPU");
#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return ::copysign(x, y);
#else
// see above
TORCH_INTERNAL_ASSERT(
false, "CUDAMathCompat copysign should not run on the CPU");
#endif
}
__MATH_FUNCTIONS_DECL__ float floor(float x) {
return ::floorf(x);
}
__MATH_FUNCTIONS_DECL__ double floor(double x) {
return ::floor(x);
}
__MATH_FUNCTIONS_DECL__ float log(float x) {
return ::logf(x);
}
__MATH_FUNCTIONS_DECL__ double log(double x) {
return ::log(x);
}
__MATH_FUNCTIONS_DECL__ float log1p(float x) {
return ::log1pf(x);
}
__MATH_FUNCTIONS_DECL__ double log1p(double x) {
return ::log1p(x);
}
__MATH_FUNCTIONS_DECL__ float max(float x, float y) {
return ::fmaxf(x, y);
}
__MATH_FUNCTIONS_DECL__ double max(double x, double y) {
return ::fmax(x, y);
}
__MATH_FUNCTIONS_DECL__ float min(float x, float y) {
return ::fminf(x, y);
}
__MATH_FUNCTIONS_DECL__ double min(double x, double y) {
return ::fmin(x, y);
}
__MATH_FUNCTIONS_DECL__ float pow(float x, float y) {
return ::powf(x, y);
}
__MATH_FUNCTIONS_DECL__ double pow(double x, double y) {
return ::pow(x, y);
}
__MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) {
return ::sincosf(x, sptr, cptr);
}
__MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) {
return ::sincos(x, sptr, cptr);
}
__MATH_FUNCTIONS_DECL__ float sqrt(float x) {
return ::sqrtf(x);
}
__MATH_FUNCTIONS_DECL__ double sqrt(double x) {
return ::sqrt(x);
}
__MATH_FUNCTIONS_DECL__ float rsqrt(float x) {
return ::rsqrtf(x);
}
__MATH_FUNCTIONS_DECL__ double rsqrt(double x) {
return ::rsqrt(x);
}
__MATH_FUNCTIONS_DECL__ float tan(float x) {
return ::tanf(x);
}
__MATH_FUNCTIONS_DECL__ double tan(double x) {
return ::tan(x);
}
__MATH_FUNCTIONS_DECL__ float tanh(float x) {
return ::tanhf(x);
}
__MATH_FUNCTIONS_DECL__ double tanh(double x) {
return ::tanh(x);
}
__MATH_FUNCTIONS_DECL__ float normcdf(float x) {
return ::normcdff(x);
}
__MATH_FUNCTIONS_DECL__ double normcdf(double x) {
return ::normcdf(x);
}
} // namespace c10::cuda::compat
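// Illustrative sketch (not part of this header): using the compat wrappers so
// the same arithmetic compiles under both CUDA and HIP. The kernel name and
// clamp bounds are hypothetical.
__global__ void clamped_exp_example(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    const float clamped =
        c10::cuda::compat::min(c10::cuda::compat::max(in[i], -10.0f), 10.0f);
    out[i] = c10::cuda::compat::exp(clamped);
  }
}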
#endif

@@ -0,0 +1,12 @@
#pragma once
// this file is to avoid circular dependency between CUDAFunctions.h and
// CUDAException.h
#include <c10/cuda/CUDAMacros.h>
#include <mutex>
namespace c10::cuda {
C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
C10_CUDA_API std::mutex* getFreeMutex();
} // namespace c10::cuda

@@ -0,0 +1,268 @@
#pragma once
#include <cuda_runtime_api.h>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
/*
* Stream pool note.
*
* A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
* are backed by cuStreams, but they use several pools to minimize the costs
* associated with creating, retaining, and destroying cuStreams.
*
* There are three pools per device, and a device's pools are lazily created.
*
* The first pool contains only the default stream. When the default stream
* is requested it's returned.
*
* The second pool is the "low priority" or "default priority" streams. In
* HIP builds there is no distinction between streams in this pool and streams
* in the third pool (below). There are 32 of these streams per device, and
* when a stream is requested one of these streams is returned round-robin.
* That is, the first stream requested is at index 0, the second at index 1...
* to index 31, then index 0 again.
*
* This means that if 33 low priority streams are requested, the first and
* last streams requested are actually the same stream (under the covers)
* and kernels enqueued on them cannot run concurrently.
*
* The third pool is the "high priority" streams. The third pool acts like
* the second pool except the streams are created with a higher priority.
*
* These pools suggest that stream users should prefer many short-lived streams,
* as the cost of acquiring and releasing streams is effectively zero. If
* many longer-lived streams are required in performance critical scenarios
* then the functionality here may need to be extended to allow, for example,
* "reserving" a subset of the pool so that other streams do not accidentally
* overlap the performance critical streams.
*
* Note: although the notion of "current stream for device" is thread local
* (every OS thread has a separate current stream, as one might expect),
* the stream pool is global across all threads; stream 0 is always stream 0
* no matter which thread you use it on. Multiple threads can synchronize
* on the same stream. Although the CUDA documentation is not very clear
* on the matter, streams are thread safe; e.g., it is safe to enqueue
* a kernel on the same stream from two different threads.
*/
namespace c10::cuda {
static constexpr int max_compile_time_stream_priorities = 4;
// Value object representing a CUDA stream. This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a CUDAStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a CUDA stream.
explicit CUDAStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
}
/// Construct a CUDAStream from a Stream with no error checking.
/// This constructor uses the "named" constructor idiom, and can
/// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const CUDAStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const CUDAStream& other) const noexcept {
return unwrap() != other.unwrap();
}
/// Implicit conversion to cudaStream_t.
operator cudaStream_t() const {
return stream();
}
/// Implicit conversion to Stream (a.k.a., forget that the stream is a
/// CUDA stream).
operator Stream() const {
return unwrap();
}
/// Used to avoid baking in device type explicitly to Python-side API.
DeviceType device_type() const {
return DeviceType::CUDA;
}
/// Get the CUDA device index that this stream is associated with.
DeviceIndex device_index() const {
return stream_.device_index();
}
/// Get the full Device that this stream is associated with. The Device
/// is guaranteed to be a CUDA device.
Device device() const {
return Device(DeviceType::CUDA, device_index());
}
/// Return the stream ID corresponding to this particular stream.
StreamId id() const {
return stream_.id();
}
bool query() const {
DeviceGuard guard{stream_.device()};
cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
void synchronize() const {
DeviceGuard guard{stream_.device()};
c10::cuda::stream_synchronize(stream());
}
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
return priority;
}
/// Explicit conversion to cudaStream_t.
cudaStream_t stream() const;
/// Explicit conversion to Stream.
Stream unwrap() const {
return stream_;
}
/// Reversibly pack a CUDAStream into a struct representation.
/// Previously the stream's data was packed into a single int64_t,
/// as it was assumed the fields would not require more than
/// 64 bits of storage in total.
/// See https://github.com/pytorch/pytorch/issues/75854
/// for more information regarding newer platforms that may violate
/// this assumption.
///
/// The CUDAStream can be unpacked using unpack3().
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
// Unpack a CUDAStream from the 3 fields generated by pack3().
static CUDAStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
}
static std::tuple<int, int> priority_range() {
// Note: this returns the range of priority **supported by PyTorch**, not
// the range of priority **supported by CUDA**. The former is a subset of
// the latter.
int least_priority = 0, greatest_priority = 0;
C10_CUDA_CHECK(
cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
#ifdef USE_ROCM
// See Note [HIP stream priorities]
TORCH_INTERNAL_ASSERT(
least_priority == 1, "Unexpected HIP stream priority range");
least_priority = 0;
#else
TORCH_INTERNAL_ASSERT(
least_priority == 0, "Unexpected CUDA stream priority range");
#endif
TORCH_INTERNAL_ASSERT(
greatest_priority <= -1, "Unexpected CUDA stream priority range");
greatest_priority = std::max(
-c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
return std::make_tuple(least_priority, greatest_priority);
}
// Deleted for now; use CUDAEvent::block instead
// void synchronize_with(const CUDAEvent& event) const;
private:
Stream stream_;
};
/**
* Get a new stream from the CUDA stream pool. You can think of this
* as "creating" a new stream, but no such creation actually happens;
* instead, streams are preallocated from the pool and returned in a
* round-robin fashion.
*
* You can request a stream from the high priority pool by setting
 * isHighPriority to true, or a stream for a specific device by setting device
 * (which defaults to the current CUDA device).
*/
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
// no default priority to disambiguate overloads
C10_API CUDAStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
/**
 * Get a CUDAStream from an externally allocated one.
 *
 * This is mainly for interoperability with different libraries where we
 * want to operate on a non-torch allocated stream for data exchange or
 * similar purposes.
*/
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
/**
* Get the default CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The default stream is
* where most computation occurs when you aren't explicitly using
* streams.
*/
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
/**
* Get the current CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The current CUDA stream
* will usually be the default CUDA stream for the device, but it may
* be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
* or 'CUDAStreamGuard'.
*/
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
/**
* Set the current stream on the device of the passed in stream to be
* the passed in stream. Yes, you read that right: this function
* has *nothing* to do with the current device: it toggles the current
* stream of the device of the passed stream.
*
* Confused? Avoid using this function; prefer using 'CUDAStreamGuard' instead
* (which will switch both your current device and current stream in the way you
* expect, and reset it back to its original state afterwards).
*/
C10_API void setCurrentCUDAStream(CUDAStream stream);
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
} // namespace c10::cuda
namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
size_t operator()(c10::cuda::CUDAStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std
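
// A minimal sketch of the pool behaviour described in the stream pool note
// above (device index and kernel launches elided; the function name is
// illustrative). "Getting" a stream never creates a new cudaStream_t; it
// hands back one of the preallocated pool entries round-robin, and making it
// current is what routes subsequently enqueued work onto it.
void stream_pool_example() {
  // One stream from the default-priority pool, one from the high-priority pool.
  c10::cuda::CUDAStream a =
      c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
  c10::cuda::CUDAStream b =
      c10::cuda::getStreamFromPool(/*isHighPriority=*/true);
  c10::cuda::CUDAStream prev = c10::cuda::getCurrentCUDAStream();
  c10::cuda::setCurrentCUDAStream(a);  // kernels enqueued now run on `a`
  // ... launch kernels ...
  a.synchronize();                     // wait for the work queued on `a`
  c10::cuda::setCurrentCUDAStream(prev);  // restore; real code should prefer
                                          // CUDAStreamGuard, as noted above
  (void)b;
}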

View File

@ -0,0 +1,63 @@
#pragma once
#include <cuda.h>
#define NVML_NO_UNVERSIONED_FUNC_DEFS
#include <nvml.h>
#define C10_CUDA_DRIVER_CHECK(EXPR) \
do { \
CUresult __err = EXPR; \
if (__err != CUDA_SUCCESS) { \
const char* err_str; \
CUresult get_error_str_err C10_UNUSED = \
c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
if (get_error_str_err != CUDA_SUCCESS) { \
AT_ERROR("CUDA driver error: unknown error"); \
} else { \
AT_ERROR("CUDA driver error: ", err_str); \
} \
} \
} while (0)
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
_(cuMemGetAllocationGranularity) \
_(cuMemExportToShareableHandle) \
_(cuMemImportFromShareableHandle) \
_(cuGetErrorString)
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_12030(_) \
_(cuMulticastAddDevice) \
_(cuMulticastBindMem) \
_(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
#define C10_NVML_DRIVER_API(_) \
_(nvmlInit_v2) \
_(nvmlDeviceGetHandleByPciBusId_v2) \
_(nvmlDeviceGetNvLinkRemoteDeviceType) \
_(nvmlDeviceGetNvLinkRemotePciInfo_v2) \
_(nvmlDeviceGetComputeRunningProcesses)
namespace c10::cuda {
struct DriverAPI {
#define CREATE_MEMBER(name) decltype(&name) name##_;
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
C10_NVML_DRIVER_API(CREATE_MEMBER)
#undef CREATE_MEMBER
static DriverAPI* get();
static void* get_nvml_handle();
};
} // namespace c10::cuda
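
// A minimal usage sketch: each table entry above is a function pointer named
// after the driver symbol plus a trailing underscore, and calls are wrapped
// in C10_CUDA_DRIVER_CHECK. The particular attribute queried here is only an
// illustration.
void driver_api_example() {
  c10::cuda::DriverAPI* api = c10::cuda::DriverAPI::get();
  int max_threads = 0;
  C10_CUDA_DRIVER_CHECK(api->cuDeviceGetAttribute_(
      &max_threads,
      CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
      /*dev=*/0));  // driver CUdevice handles are device ordinals; 0 = device 0
}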

View File

@ -0,0 +1,249 @@
#pragma once
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/core/Device.h>
#include <c10/core/DeviceType.h>
#include <c10/core/Stream.h>
#include <c10/core/impl/PyInterpreter.h>
#include <cuda_runtime_api.h>
#include <cstdint>
#include <optional>
namespace c10::cuda::impl {
struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
static constexpr DeviceType static_type = DeviceType::CUDA;
CUDAGuardImpl() = default;
explicit CUDAGuardImpl(DeviceType t) {
TORCH_INTERNAL_ASSERT(t == DeviceType::CUDA);
}
DeviceType type() const override {
return DeviceType::CUDA;
}
Device exchangeDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_cuda());
auto old_device_index = c10::cuda::ExchangeDevice(d.index());
return Device(DeviceType::CUDA, old_device_index);
}
Device getDevice() const override {
DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return Device(DeviceType::CUDA, device);
}
std::optional<Device> uncheckedGetDevice() const noexcept {
DeviceIndex device{-1};
const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device));
C10_CUDA_CHECK_WARN(err);
if (err != cudaSuccess) {
return std::nullopt;
}
return Device(DeviceType::CUDA, device);
}
void setDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_cuda());
C10_CUDA_CHECK(c10::cuda::SetDevice(d.index()));
}
void uncheckedSetDevice(Device d) const noexcept override {
C10_CUDA_CHECK_WARN(c10::cuda::MaybeSetDevice(d.index()));
}
Stream getStream(Device d) const noexcept override {
return getCurrentCUDAStream(d.index()).unwrap();
}
Stream getDefaultStream(Device d) const override {
return getDefaultCUDAStream(d.index());
}
Stream getNewStream(Device d, int priority = 0) const override {
return getStreamFromPool(priority, d.index());
}
Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
const override {
return getStreamFromPool(isHighPriority, d.index());
}
// NB: These do NOT set the current device
Stream exchangeStream(Stream s) const noexcept override {
CUDAStream cs(s);
auto old_stream = getCurrentCUDAStream(s.device().index());
setCurrentCUDAStream(cs);
return old_stream.unwrap();
}
DeviceIndex deviceCount() const noexcept override {
return device_count();
}
// Event-related functions
void createEvent(cudaEvent_t* cuda_event, const EventFlag flag) const {
// Maps PyTorch's Event::Flag to CUDA flag
auto cuda_flag = cudaEventDefault;
switch (flag) {
case EventFlag::PYTORCH_DEFAULT:
cuda_flag = cudaEventDisableTiming;
break;
case EventFlag::BACKEND_DEFAULT:
cuda_flag = cudaEventDefault;
break;
default:
TORCH_CHECK(false, "CUDA event received unknown flag");
}
C10_CUDA_CHECK(cudaEventCreateWithFlags(cuda_event, cuda_flag));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_creation(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
}
void destroyEvent(void* event, const DeviceIndex device_index)
const noexcept override {
if (!event)
return;
auto cuda_event = static_cast<cudaEvent_t>(event);
DeviceIndex orig_device{-1};
C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device));
C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
C10_CUDA_CHECK_WARN(cudaEventDestroy(cuda_event));
C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(orig_device));
}
void record(
void** event,
const Stream& stream,
const DeviceIndex device_index,
const EventFlag flag) const override {
TORCH_CHECK(
device_index == -1 || device_index == stream.device_index(),
"Event device index ",
device_index,
" does not match recording stream's device index ",
stream.device_index(),
".");
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(*event);
CUDAStream cuda_stream{stream};
// Moves to stream's device to record
const auto orig_device = getDevice();
setDevice(stream.device());
// Creates the event (lazily)
if (!cuda_event)
createEvent(&cuda_event, flag);
C10_CUDA_CHECK(cudaEventRecord(cuda_event, cuda_stream));
// Makes the void* point to the (possibly just allocated) CUDA event
*event = cuda_event;
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(
c10::kCUDA,
reinterpret_cast<uintptr_t>(cuda_event),
reinterpret_cast<uintptr_t>(cuda_stream.stream()));
}
// Resets device
setDevice(orig_device);
}
void block(void* event, const Stream& stream) const override {
if (!event)
return;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
CUDAStream cuda_stream{stream};
const auto orig_device = getDevice();
setDevice(stream.device());
C10_CUDA_CHECK(cudaStreamWaitEvent(
cuda_stream,
cuda_event,
/*flags (must be zero)=*/0));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(
c10::kCUDA,
reinterpret_cast<uintptr_t>(cuda_event),
reinterpret_cast<uintptr_t>(cuda_stream.stream()));
}
setDevice(orig_device);
}
// May be called from any device
bool queryEvent(void* event) const override {
if (!event)
return true;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
// Note: cudaEventQuery can be safely called from any device
const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event));
if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return (err == cudaSuccess);
}
// Stream-related functions
bool queryStream(const Stream& stream) const override {
CUDAStream cuda_stream{stream};
return cuda_stream.query();
}
void synchronizeStream(const Stream& stream) const override {
CUDAStream cuda_stream{stream};
cuda_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
// Note: cudaEventSynchronize can be safely called from any device
C10_CUDA_CHECK(cudaEventSynchronize(cuda_event));
}
void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
const override {
CUDAStream cuda_stream{stream};
CUDACachingAllocator::recordStream(data_ptr, cuda_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK(
event1 && event2,
"Both events must be recorded before calculating elapsed time.");
// Even though cudaEventElapsedTime can be safely called from any device,
// if the current device is not initialized, the call will create a new
// CUDA context, which consumes a significant amount of memory.
DeviceIndex orig_device{-1};
C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device));
C10_CUDA_CHECK(c10::cuda::SetDevice(device_index));
cudaEvent_t cuda_event1 = static_cast<cudaEvent_t>(event1);
cudaEvent_t cuda_event2 = static_cast<cudaEvent_t>(event2);
float time_ms = 0;
// Raises cudaErrorNotReady if either event has been recorded but has not yet
// completed.
C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, cuda_event1, cuda_event2));
C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device));
return static_cast<double>(time_ms);
}
};
} // namespace c10::cuda::impl
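
// A minimal sketch of how this impl is reached in practice. CUDAGuardImpl is
// not used directly; it is registered for DeviceType::CUDA elsewhere in the
// library (not shown here, an assumption), and the generic c10 guards dispatch
// to it. The include and the device index below are illustrative.
#include <c10/core/DeviceGuard.h>

void guard_impl_example() {
  // Construction routes through CUDAGuardImpl::exchangeDevice; the previous
  // device is restored when the guard is destroyed.
  c10::DeviceGuard guard(c10::Device(c10::DeviceType::CUDA, 1));
  // ... run work that must execute with device 1 current ...
}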

View File

@ -0,0 +1,9 @@
#pragma once
#include <c10/cuda/CUDAMacros.h>
namespace c10::cuda::impl {
C10_CUDA_API int c10_cuda_test();
}