I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

@@ -0,0 +1,31 @@
#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#endif
namespace c10::cuda {
#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS
template <typename Iter, typename Scalar>
__forceinline__ __device__ Iter
lower_bound(Iter start, Iter end, Scalar value) {
return thrust::lower_bound(thrust::device, start, end, value);
}
#else
// thrust::lower_bound is broken on device, see
// https://github.com/NVIDIA/thrust/issues/1734 Implementation inspired by
// https://github.com/pytorch/pytorch/blob/805120ab572efef66425c9f595d9c6c464383336/aten/src/ATen/native/cuda/Bucketization.cu#L28
template <typename Iter, typename Scalar>
__device__ Iter lower_bound(Iter start, Iter end, Scalar value) {
while (start < end) {
auto mid = start + ((end - start) >> 1);
if (*mid < value) {
start = mid + 1;
} else {
end = mid;
}
}
return end;
}
#endif // THRUST_DEVICE_LOWER_BOUND_WORKS
} // namespace c10::cuda
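// Illustrative sketch (not part of this header): how the device-side
// lower_bound helper above might be used from a kernel to find the insertion
// index of `value` in a sorted array. The kernel name and launch geometry are
// hypothetical.
__global__ void find_insertion_index_example(
    const float* sorted, int n, float value, int* out_index) {
  const float* it = c10::cuda::lower_bound(sorted, sorted + n, value);
  *out_index = static_cast<int>(it - sorted);
}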

@@ -0,0 +1,124 @@
#pragma once
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
namespace c10::cuda::CUDACachingAllocator {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
static size_t max_split_size() {
return instance().m_max_split_size;
}
static double garbage_collection_threshold() {
return instance().m_garbage_collection_threshold;
}
static bool expandable_segments() {
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
if (instance().m_expandable_segments) {
TORCH_WARN_ONCE("expandable_segments not supported on this platform");
}
return false;
#else
return instance().m_expandable_segments;
#endif
}
static bool release_lock_on_cudamalloc() {
return instance().m_release_lock_on_cudamalloc;
}
/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
}
static size_t pinned_num_register_threads() {
return instance().m_pinned_num_register_threads;
}
static size_t pinned_max_register_threads() {
// Based on benchmark results, we see better allocation performance with
// 8 threads. However, future systems may need more threads, so we cap the
// limit at 128 threads.
return 128;
}
// This is used to round up allocation sizes to the nearest power-of-2 division.
// See roundup_power2_next_division for more details.
// As an example, if we want 4 divisions between successive powers of 2, this can
// be done using the env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size);
static std::vector<size_t> roundup_power2_divisions() {
return instance().m_roundup_power2_divisions;
}
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
return instance().m_last_allocator_settings;
}
static CUDAAllocatorConfig& instance() {
static CUDAAllocatorConfig* s_instance = ([]() {
auto inst = new CUDAAllocatorConfig();
const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
inst->parseArgs(env);
return inst;
})();
return *s_instance;
}
void parseArgs(const char* env);
private:
CUDAAllocatorConfig();
static void lexArgs(const char* env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
size_t parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);
std::atomic<size_t> m_max_split_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<bool> m_expandable_segments;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
};
// General caching allocator utilities
C10_CUDA_API void setAllocatorSettings(const std::string& env);
} // namespace c10::cuda::CUDACachingAllocator
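// Illustrative sketch (not part of this header): reading a few of the knobs
// parsed from PYTORCH_CUDA_ALLOC_CONF, e.g.
// PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256,roundup_power2_divisions:4.
// The function name is hypothetical.
inline void inspect_allocator_config_example() {
  using c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig;
  const size_t max_split = CUDAAllocatorConfig::max_split_size();
  const double gc_threshold =
      CUDAAllocatorConfig::garbage_collection_threshold();
  const std::string settings = CUDAAllocatorConfig::last_allocator_settings();
  (void)max_split;
  (void)gc_threshold;
  (void)settings;
}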

@@ -0,0 +1,499 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
namespace c10 {
// The caching allocator will execute every registered callback if it is unable
// to find a free block inside the already allocated area.
class C10_CUDA_API FreeMemoryCallback {
public:
virtual ~FreeMemoryCallback() = default;
virtual bool Execute() = 0;
};
C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
} // namespace c10
//
// TODO: Turn this into an honest to goodness class. I briefly attempted to do
// this, but it was a bit irritating to figure out how to also correctly
// apply pimpl pattern so I didn't have to leak any internal implementation
// details in the header (CUDACachingAllocator could be made a pimpl, but
// you also need to appropriately define a class which is a subclass
// of Allocator. Not impossible, but required a bit more surgery than
// I wanted to do at the time.)
//
// Why is this using a namespace rather than old-style THCCachingAllocator_
// prefix? Mostly because it made the HIPify rules easier to write; _ is
// not counted as a word boundary, so you would otherwise have to list each
// of these functions.
namespace c10::cuda::CUDACachingAllocator {
// Preserved only for BC reasons
// NOLINTNEXTLINE(misc-unused-using-decls)
using c10::CachingDeviceAllocator::DeviceStats;
extern const size_t kLargeBuffer;
typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
// Struct containing info of an allocation block (i.e. a fractional part of a
// cudaMalloc).
struct BlockInfo {
size_t size = 0;
size_t requested_size = 0;
int32_t gc_counter = 0;
bool allocated = false;
bool active = false;
std::shared_ptr<GatheredContext>
context_when_allocated; // per-watcher context
};
// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
struct SegmentInfo {
c10::DeviceIndex device = 0;
size_t address = 0;
size_t total_size = 0;
size_t requested_size = 0; // unrounded, actually requested size
size_t allocated_size = 0;
size_t active_size = 0;
cudaStream_t stream = nullptr;
bool is_large = false;
bool is_expandable = false;
MempoolId_t owner_private_pool_id = {0, 0};
std::vector<BlockInfo> blocks;
std::shared_ptr<GatheredContext> context_when_allocated;
};
struct AllocatorState {
virtual ~AllocatorState() = default;
};
union trace_time_ {
time_t t_;
approx_time_t approx_t_;
};
struct TraceEntry {
enum Action {
ALLOC, // API call made to the caching allocator for new memory
FREE_REQUESTED, // API call made to the caching allocator to free memory
FREE_COMPLETED, // The allocator might have to delay a free because
// it is still in use on another stream via record_stream
// This event is generated when a free actually completes.
SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
// defragment or empty_caches)
SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments)
SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments)
SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
// events
OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
// bytes reported by cuda)
};
TraceEntry(
Action action,
c10::DeviceIndex device,
size_t addr,
size_t size,
cudaStream_t stream,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr)
: action_(action),
device_(device),
addr_(addr),
context_(std::move(context)),
stream_(stream),
size_(size) {
time_.approx_t_ = time;
}
Action action_;
c10::DeviceIndex device_;
size_t addr_; // for OOM, this is the amount of free bytes reported by cuda
std::shared_ptr<GatheredContext> context_;
cudaStream_t stream_{};
size_t size_;
trace_time_ time_{};
};
// Calls made by record_function will save annotations
struct AnnotationEntry {
AnnotationEntry(c10::DeviceIndex device, approx_time_t time)
: device_(device) {
time_.approx_t_ = time;
}
void recordUserMetadata(const std::string& name, std::string value) {
metadata_[name] = std::move(value);
}
c10::DeviceIndex device_;
trace_time_ time_{};
std::unordered_map<std::string, std::string> metadata_;
};
struct AllocatorConfigInfo {
double garbage_collection_threshold;
size_t max_split_size;
size_t pinned_num_register_threads;
bool expandable_segments;
bool release_lock_on_malloc;
bool pinned_use_host_register;
std::string last_allocator_settings;
std::vector<size_t> roundup_power2_divisions;
};
struct SnapshotInfo {
std::vector<SegmentInfo> segments;
std::vector<std::vector<TraceEntry>> device_traces;
std::vector<AnnotationEntry> external_annotations;
AllocatorConfigInfo config_metadata;
};
// returns the pointers freed in the pool
// and the pointers allocated. Note: a pointer
// may appear in both freed and allocated
struct CheckpointDelta {
std::vector<void*> ptrs_freed;
std::vector<at::DataPtr> dataptrs_allocd;
};
enum struct RecordContext {
NEVER = 0,
STATE = 1, // only keep stacks for active allocations
ALLOC = 2, // additionally keep stacks for allocations in the trace history
ALL = 3, // additionally record stacks for when something is freed
};
using OutOfMemoryObserver = std::function<void(
int64_t device,
size_t allocated,
size_t device_total,
size_t device_free)>;
using AllocatorTraceTracker = std::function<void(const TraceEntry&)>;
struct ShareableHandle {
ptrdiff_t offset;
std::string handle;
};
class CUDAAllocator : public Allocator {
public:
virtual void* raw_alloc(size_t nbytes) = 0;
virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
virtual void raw_delete(void* ptr) = 0;
virtual void init(int device_count) = 0;
virtual bool initialized() = 0;
virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
virtual void emptyCache() = 0;
virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device) = 0;
virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
virtual SnapshotInfo snapshot() = 0;
virtual void beginAllocateToPool(
c10::DeviceIndex device,
MempoolId_t mempool_id,
std::function<bool(cudaStream_t)> filter) = 0;
virtual void endAllocateToPool(
c10::DeviceIndex device,
MempoolId_t mempool_id) = 0;
virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0;
// returns true if the allocated blocks are equal to expected live allocations
virtual bool checkPoolLiveAllocations(
c10::DeviceIndex device,
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) {
TORCH_CHECK(
false,
name(),
" does not yet support checkPoolLiveAllocations. "
"If you need it, please file an issue describing your use case.");
}
virtual ShareableHandle shareIpcHandle(void* ptr) = 0;
virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
virtual bool isHistoryEnabled() {
TORCH_CHECK(
false,
name(),
" does not yet support recordHistory. "
"If you need it, please file an issue describing your use case.");
}
virtual void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
RecordContext when) = 0;
virtual void recordAnnotation(
const std::vector<std::pair<std::string, std::string>>& md){};
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
// per-device allocator lock is held. Any additional locks taken from within
// the callback must be proven to always have the lock order that never
// triggers a deadlock. In particular, Python's GIL may be held when
// calling the allocator so it is unsafe to try to acquire the GIL in this
// callback.
virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0;
virtual void enablePeerAccess(
c10::DeviceIndex dev,
c10::DeviceIndex dev_to_access) = 0;
// memory not allocated from cudaMalloc cannot be copied
// across devices using cudaMemcpyAsync if peer to peer access is disabled.
// instead it requires cudaMemcpyAsyncPeer
// with P2P Enabled, all combinations work
// with P2P Disabled:
// cudaMalloc cudaMallocAsync/cuMemMap
// cudaMemcpyAsyncPeer works works
// cudaMemcpyAsync works error
// This function chooses to use the Peer version of memcpy if required,
// based on the devices on which dst/src were allocated.
virtual cudaError_t memcpyAsync(
void* dst,
int dstDevice,
const void* src,
int srcDevice,
size_t count,
cudaStream_t stream,
bool p2p_enabled) = 0;
virtual std::shared_ptr<AllocatorState> getCheckpointState(
c10::DeviceIndex device,
MempoolId_t id) = 0;
virtual CheckpointDelta setCheckpointPoolState(
c10::DeviceIndex device,
std::shared_ptr<AllocatorState> pps) = 0;
virtual std::string name() = 0;
};
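// Illustrative sketch (not part of this header): attaching a trace tracker to a
// CUDAAllocator so every allocator event can be observed. As noted above
// attachAllocatorTraceTracker, the callback runs while the per-device allocator
// lock is held, so it must not acquire locks (or the GIL) that could invert the
// lock order. The function name is hypothetical.
inline void attach_logging_tracker_example(CUDAAllocator* alloc) {
  alloc->attachAllocatorTraceTracker([](const TraceEntry& e) {
    // e.action_ identifies the event (ALLOC, FREE_COMPLETED, SEGMENT_ALLOC, ...);
    // e.size_ and e.addr_ describe the block involved.
    (void)e;
  });
}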
// Allocator object, statically initialized
// See BackendInitializer in CUDACachingAllocator.cpp.
// Atomic loads on x86 are just normal loads,
// (atomic stores are different), so reading this value
// is no different than loading a pointer.
C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;
inline CUDAAllocator* get() {
return allocator.load();
}
// Called directly by clients.
inline void* raw_alloc(size_t nbytes) {
return get()->raw_alloc(nbytes);
}
inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
return get()->raw_alloc_with_stream(nbytes, stream);
}
inline void raw_delete(void* ptr) {
return get()->raw_delete(ptr);
}
inline void init(int device_count) {
return get()->init(device_count);
}
inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
return get()->setMemoryFraction(fraction, device);
}
inline void emptyCache() {
return get()->emptyCache();
}
inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
return get()->cacheInfo(device, largestBlock);
}
inline void* getBaseAllocation(void* ptr, size_t* size) {
return get()->getBaseAllocation(ptr, size);
}
inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
return get()->recordStream(dataPtr, stream);
}
inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device) {
return get()->getDeviceStats(device);
}
inline void resetAccumulatedStats(c10::DeviceIndex device) {
return get()->resetAccumulatedStats(device);
}
inline void resetPeakStats(c10::DeviceIndex device) {
return get()->resetPeakStats(device);
}
inline SnapshotInfo snapshot() {
return get()->snapshot();
}
inline std::shared_ptr<AllocatorState> getCheckpointState(
c10::DeviceIndex device,
MempoolId_t id) {
return get()->getCheckpointState(device, id);
}
inline CheckpointDelta setCheckpointPoolState(
c10::DeviceIndex device,
std::shared_ptr<AllocatorState> pps) {
return get()->setCheckpointPoolState(device, std::move(pps));
}
// CUDAGraph interactions
inline void beginAllocateToPool(
c10::DeviceIndex device,
MempoolId_t mempool_id,
std::function<bool(cudaStream_t)> filter) {
get()->beginAllocateToPool(device, mempool_id, std::move(filter));
}
inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
get()->endAllocateToPool(device, mempool_id);
}
inline void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
RecordContext when) {
return get()->recordHistory(
enabled, context_recorder, alloc_trace_max_entries, when);
}
inline void recordAnnotation(
const std::vector<std::pair<std::string, std::string>>& md) {
return get()->recordAnnotation(md);
}
inline bool isHistoryEnabled() {
return get()->isHistoryEnabled();
}
inline bool checkPoolLiveAllocations(
c10::DeviceIndex device,
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) {
return get()->checkPoolLiveAllocations(
device, mempool_id, expected_live_allocations);
}
inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
return get()->attachOutOfMemoryObserver(std::move(observer));
}
inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
return get()->attachAllocatorTraceTracker(std::move(tracker));
}
inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
return get()->releasePool(device, mempool_id);
}
// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
return get()->getIpcDevPtr(std::move(handle));
}
inline ShareableHandle shareIpcHandle(void* ptr) {
return get()->shareIpcHandle(ptr);
}
inline std::string name() {
return get()->name();
}
inline cudaError_t memcpyAsync(
void* dst,
int dstDevice,
const void* src,
int srcDevice,
size_t count,
cudaStream_t stream,
bool p2p_enabled) {
return get()->memcpyAsync(
dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
}
inline void enablePeerAccess(
c10::DeviceIndex dev,
c10::DeviceIndex dev_to_access) {
return get()->enablePeerAccess(dev, dev_to_access);
}
} // namespace c10::cuda::CUDACachingAllocator
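// Illustrative sketch (not part of this header): a raw allocation round-trip
// through the convenience wrappers above. raw_alloc hands out device memory on
// the current device and raw_delete typically returns it to the allocator's
// cache rather than freeing it immediately. Function name and size are
// hypothetical.
inline void raw_alloc_roundtrip_example() {
  void* p = c10::cuda::CUDACachingAllocator::raw_alloc(1024);
  // ... use `p` on the current stream ...
  c10::cuda::CUDACachingAllocator::raw_delete(p);
}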
namespace c10::cuda {
// MemPool represents a pool of memory in a caching allocator. Currently,
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
//
// An allocator pointer can be passed to the MemPool to define how the
// allocations should be done in the pool. For example: using a different
// system allocator such as ncclMemAlloc.
struct C10_CUDA_API MemPool {
MemPool(
CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
bool is_user_created = true);
MempoolId_t id();
CUDACachingAllocator::CUDAAllocator* allocator();
private:
static std::atomic<CaptureId_t> uid_;
static std::atomic<CaptureId_t> uuid_;
CUDACachingAllocator::CUDAAllocator* allocator_;
bool is_user_created_;
MempoolId_t id_;
};
// MemPoolContext holds the currently active pool and stashes the previous
// pool. On deletion it makes the previous pool active.
struct C10_CUDA_API MemPoolContext {
MemPoolContext(MemPool* mempool);
~MemPoolContext();
// getActiveMemPool() can be used to get the currently active pool.
// For instance: in CUDACachingAllocator, we can route allocations
// to a user provided allocator, by doing:
//
// auto active_pool = MemPoolContext::getActiveMemPool();
// if (active_pool && active_pool->allocator()) {
// ptr = active_pool->allocator()->raw_alloc(size);
// }
//
static MemPool* getActiveMemPool();
private:
MemPool* prev_mempool_;
};
} // namespace c10::cuda
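// Illustrative sketch (not part of this header): creating a user MemPool and
// making it the active pool for a scope via MemPoolContext, so that the caching
// allocator can route allocations made in that scope to the pool (see the
// getActiveMemPool() comment above). The function name is hypothetical.
inline void mempool_scope_example() {
  c10::cuda::MemPool pool; // defaults to the built-in caching allocator
  c10::cuda::MemPoolContext ctx(&pool);
  // ... allocations made here may be routed to `pool` ...
} // the previous active pool is restored when `ctx` is destroyed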

@@ -0,0 +1,96 @@
#pragma once
#include <c10/cuda/CUDAException.h>
#include <c10/macros/Macros.h>
namespace c10::cuda {
#ifdef TORCH_USE_CUDA_DSA
// Copy string from `src` to `dst`
static __device__ void dstrcpy(char* dst, const char* src) {
int i = 0;
// Copy string from source to destination, ensuring that it
// isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
*dst++ = *src++;
}
*dst = '\0';
}
static __device__ void dsa_add_new_assertion_failure(
DeviceAssertionsData* assertions_data,
const char* assertion_msg,
const char* filename,
const char* function_name,
const int line_number,
const uint32_t caller,
const dim3 block_id,
const dim3 thread_id) {
// `assertions_data` may be nullptr if device-side assertion checking
// is disabled at run-time. If it is disabled at compile time this
// function will never be called
if (!assertions_data) {
return;
}
// Atomically increment so other threads can fail at the same time
// Note that incrementing this means that the CPU can observe that
// a failure has happened and can begin to respond before we've
// written information about that failure out to the buffer.
const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
// At this point we've run out of assertion buffer space.
// We could print a message about this, but that'd get
// spammy if a lot of threads did it, so we just silently
// ignore any other assertion failures. In most cases the
// failures will all probably be analogous anyway.
return;
}
// Write information about the assertion failure to memory.
// Note that this occurs only after the `assertion_count`
// increment broadcasts that there's been a problem.
auto& self = assertions_data->assertions[nid];
dstrcpy(self.assertion_msg, assertion_msg);
dstrcpy(self.filename, filename);
dstrcpy(self.function_name, function_name);
self.line_number = line_number;
self.caller = caller;
self.block_id[0] = block_id.x;
self.block_id[1] = block_id.y;
self.block_id[2] = block_id.z;
self.thread_id[0] = thread_id.x;
self.thread_id[1] = thread_id.y;
self.thread_id[2] = thread_id.z;
}
// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
// so you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
// arguments of the kernel and therefore accessible.
#define CUDA_KERNEL_ASSERT2(condition) \
do { \
if (C10_UNLIKELY(!(condition))) { \
/* Has an atomic element so threads can fail at the same time */ \
c10::cuda::dsa_add_new_assertion_failure( \
assertions_data, \
C10_STRINGIZE(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
assertion_caller_id, \
blockIdx, \
threadIdx); \
/* Now that the kernel has failed we early exit the kernel, but */ \
/* otherwise keep going and rely on the host to check UVM and */ \
/* determine we've had a problem */ \
return; \
} \
} while (false)
#else
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif
} // namespace c10::cuda
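// Illustrative sketch (not part of this header): a kernel instrumented for
// device-side assertions. TORCH_DSA_KERNEL_ARGS (from CUDADeviceAssertionHost.h,
// pulled in via CUDAException.h) injects the `assertions_data` and
// `assertion_caller_id` parameters that CUDA_KERNEL_ASSERT2 expects. The kernel
// name and arguments are hypothetical.
__global__ void dsa_example_kernel(const int* data, int n, TORCH_DSA_KERNEL_ARGS) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Records a failure (without stopping the kernel) when data[i] is negative.
    CUDA_KERNEL_ASSERT2(data[i] >= 0);
  }
}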

@@ -0,0 +1,164 @@
#pragma once
#include <c10/cuda/CUDAMacros.h>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <utility>
#include <vector>
#ifdef USE_CUDA
#define TORCH_USE_CUDA_DSA
#endif
/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
namespace c10::cuda {
/// Holds information about any device-side assertions that fail.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionData {
/// Stringification of the assertion
// NOLINTNEXTLINE(*-c-arrays)
char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
/// File the assertion was in
// NOLINTNEXTLINE(*-c-arrays)
char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
/// Name of the function the assertion was in
// NOLINTNEXTLINE(*-c-arrays)
char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
/// Line number the assertion was at
int line_number{};
/// Number uniquely identifying the kernel launch that triggered the assertion
uint32_t caller{};
/// block_id of the thread that failed the assertion
// NOLINTNEXTLINE(*-c-arrays)
int32_t block_id[3]{};
/// thread_id of the thread that failed the assertion
// NOLINTNEXTLINE(*-c-arrays)
int32_t thread_id[3]{};
};
/// Used to hold assertions generated by the device
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
/// Total number of assertions found; a subset of these will be recorded
/// in `assertions`
int32_t assertion_count{};
/// An array of assertions that will be written to in a race-free manner
// NOLINTNEXTLINE(*-c-arrays)
DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
};
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures
struct CUDAKernelLaunchInfo {
/// Filename of the code where the kernel was launched from
const char* launch_filename;
/// Function from which the kernel was launched
const char* launch_function;
/// Line number of where the code was launched from
uint32_t launch_linenum;
/// Backtrace of where the kernel was launched from, only populated if
/// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
std::string launch_stacktrace;
/// Kernel that was launched
const char* kernel_name;
/// Device the kernel was launched on
int device;
/// Stream the kernel was launched on
int32_t stream;
/// A number that uniquely identifies the kernel launch
uint64_t generation_number;
};
/// Circular buffer used to hold information about kernel launches. This is
/// later used to reconstruct how a device-side kernel assertion failure
/// occurred. CUDAKernelLaunchRegistry is used as a singleton.
class C10_CUDA_API CUDAKernelLaunchRegistry {
private:
/// Assume that this is the max number of kernel launches that might ever be
/// enqueued across all streams on a single device
static constexpr int max_kernel_launches = 1024;
/// How many kernel launch infos we've inserted. Used to ensure that circular
/// queue doesn't provide false information by always increasing, but also to
/// mark where we are inserting into the queue
#ifdef TORCH_USE_CUDA_DSA
uint64_t generation_number = 0;
#endif
/// Shared mutex between writer and accessor to ensure multi-threaded safety.
mutable std::mutex read_write_mutex;
/// Used to prevent race conditions in GPU memory allocation
mutable std::mutex gpu_alloc_mutex;
/// Pointer to managed memory keeping track of device-side assertions. There
/// is one entry for each possible device the process might work with. Unused
/// entries are nullptrs. We could also use an unordered_set here, but this
/// vector design will be faster and the wasted memory is small since we
/// expect the number of GPUs per node will always be small
std::vector<
std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
uvm_assertions;
/// A single circular buffer holds information about every kernel launch the
/// process makes across all devices.
std::vector<CUDAKernelLaunchInfo> kernel_launches;
bool check_env_for_enable_launch_stacktracing() const;
bool check_env_for_dsa_enabled() const;
public:
CUDAKernelLaunchRegistry();
/// Register a new kernel launch and obtain a generation number back to be
/// passed to the kernel
uint32_t insert(
const char* launch_filename,
const char* launch_function,
const uint32_t launch_linenum,
const char* kernel_name,
const int32_t stream_id);
/// Get copies of the kernel launch registry and each device's assertion
/// failure buffer so they can be inspected without raising race conditions
std::
pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
snapshot() const;
/// Get a pointer to the current device's assertion failure buffer. If no such
/// buffer exists then one is created. This means that the first kernel launch
/// made on each device will be slightly slower because memory allocations are
/// required
DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
/// Gets the global singleton of the registry
static CUDAKernelLaunchRegistry& get_singleton_ref();
/// If not all devices support DSA, we disable it
const bool do_all_devices_support_managed_memory = false;
/// Whether or not to gather stack traces when launching kernels
bool gather_launch_stacktrace = false;
/// Whether or not host-side DSA is enabled or disabled at run-time
/// Note: Device-side code cannot be enabled/disabled at run-time
bool enabled_at_runtime = false;
/// Whether or not a device has indicated a failure
bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
const bool enabled_at_compile_time = true;
#else
const bool enabled_at_compile_time = false;
#endif
};
std::string c10_retrieve_device_side_assertion_info();
} // namespace c10::cuda
// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these.
#define TORCH_DSA_KERNEL_ARGS \
[[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
[[maybe_unused]] uint32_t assertion_caller_id
// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id

@@ -0,0 +1,100 @@
#pragma once
#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAMiscFunctions.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <cuda.h>
// Note [CHECK macro]
// ~~~~~~~~~~~~~~~~~~
// This is a macro so that AT_ERROR can get accurate __LINE__
// and __FILE__ information. We could split this into a short
// macro and a function implementation if we pass along __LINE__
// and __FILE__, but no one has found this worth doing.
// Used to denote errors from CUDA framework.
// This needs to be declared here instead of in util/Exception.h for proper
// conversion during hipify.
namespace c10 {
class C10_CUDA_API CUDAError : public c10::Error {
using Error::Error;
};
} // namespace c10
#define C10_CUDA_CHECK(EXPR) \
do { \
const cudaError_t __err = EXPR; \
c10::cuda::c10_cuda_check_implementation( \
static_cast<int32_t>(__err), \
__FILE__, \
__func__, /* Line number data type not well-defined between \
compilers, so we perform an explicit cast */ \
static_cast<uint32_t>(__LINE__), \
true); \
} while (0)
#define C10_CUDA_CHECK_WARN(EXPR) \
do { \
const cudaError_t __err = EXPR; \
if (C10_UNLIKELY(__err != cudaSuccess)) { \
auto error_unused C10_UNUSED = cudaGetLastError(); \
(void)error_unused; \
TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
} \
} while (0)
// Indicates that a CUDA error is handled in a non-standard way
#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR
// Intentionally ignore a CUDA error
#define C10_CUDA_IGNORE_ERROR(EXPR) \
do { \
const cudaError_t __err = EXPR; \
if (C10_UNLIKELY(__err != cudaSuccess)) { \
cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \
(void)error_unused; \
} \
} while (0)
// Clear the last CUDA error
#define C10_CUDA_CLEAR_ERROR() \
do { \
cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \
(void)error_unused; \
} while (0)
// This should be used directly after every kernel launch to ensure
// the launch happened correctly and provide an early, close-to-source
// diagnostic if it didn't.
#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError())
/// Launches a CUDA kernel, appending to it all the information needed to handle
/// device-side assertion failures. Checks that the launch was successful.
#define TORCH_DSA_KERNEL_LAUNCH( \
kernel, blocks, threads, shared_mem, stream, ...) \
do { \
auto& launch_registry = \
c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref(); \
kernel<<<blocks, threads, shared_mem, stream>>>( \
__VA_ARGS__, \
launch_registry.get_uvm_assertions_ptr_for_current_device(), \
launch_registry.insert( \
__FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \
C10_CUDA_KERNEL_LAUNCH_CHECK(); \
} while (0)
namespace c10::cuda {
/// In the event of a CUDA failure, formats a nice error message about that
/// failure and also checks for device-side assertion failures
C10_CUDA_API void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const int line_number,
const bool include_device_assertions);
} // namespace c10::cuda
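// Illustrative launch-site sketch (not part of this header): launching a
// DSA-instrumented kernel, e.g. one declared as
//   __global__ void dsa_example_kernel(const int* data, int n, TORCH_DSA_KERNEL_ARGS);
// TORCH_DSA_KERNEL_LAUNCH appends the assertion buffer and a per-launch id to
// the user arguments, then runs C10_CUDA_KERNEL_LAUNCH_CHECK. Names and launch
// geometry are hypothetical; `stream` is a c10::cuda::CUDAStream.
//
//   TORCH_DSA_KERNEL_LAUNCH(
//       dsa_example_kernel,
//       /*blocks=*/32,
//       /*threads=*/256,
//       /*shared_mem=*/0,
//       stream,
//       data,
//       n);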

@@ -0,0 +1,116 @@
#pragma once
// This header provides C++ wrappers around commonly used CUDA API functions.
// The benefit of using C++ here is that we can raise an exception in the
// event of an error, rather than explicitly pass around error codes. This
// leads to more natural APIs.
//
// The naming convention used here matches the naming convention of torch.cuda
#include <c10/core/Device.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <cuda_runtime_api.h>
namespace c10::cuda {
// NB: In the past, we were inconsistent about whether or not this reported
// an error if there were driver problems or not. Based on experience
// interacting with users, it seems that people basically ~never want this
// function to fail; it should just return zero if things are not working.
// Oblige them.
// It still might log a warning the first time it's invoked.
C10_CUDA_API DeviceIndex device_count() noexcept;
// Version of device_count that throws if no devices are detected
C10_CUDA_API DeviceIndex device_count_ensure_non_zero();
C10_CUDA_API DeviceIndex current_device();
C10_CUDA_API void set_device(DeviceIndex device);
C10_CUDA_API void device_synchronize();
C10_CUDA_API void warn_or_error_on_sync();
// Raw CUDA device management functions
C10_CUDA_API cudaError_t GetDeviceCount(int* dev_count);
C10_CUDA_API cudaError_t GetDevice(DeviceIndex* device);
C10_CUDA_API cudaError_t SetDevice(DeviceIndex device);
C10_CUDA_API cudaError_t MaybeSetDevice(DeviceIndex device);
C10_CUDA_API DeviceIndex ExchangeDevice(DeviceIndex device);
C10_CUDA_API DeviceIndex MaybeExchangeDevice(DeviceIndex device);
C10_CUDA_API void SetTargetDevice();
enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
// this is a holder for c10 global state (similar to at GlobalContext)
// currently it's used to store cuda synchronization warning state,
// but can be expanded to hold other related global state, e.g. to
// record stream usage
class WarningState {
public:
void set_sync_debug_mode(SyncDebugMode l) {
sync_debug_mode = l;
}
SyncDebugMode get_sync_debug_mode() {
return sync_debug_mode;
}
private:
SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED;
};
C10_CUDA_API __inline__ WarningState& warning_state() {
static WarningState warning_state_;
return warning_state_;
}
// the subsequent functions are defined in the header because for performance
// reasons we want them to be inline
C10_CUDA_API void __inline__ memcpy_and_sync(
void* dst,
const void* src,
int64_t nbytes,
cudaMemcpyKind kind,
cudaStream_t stream) {
if (C10_UNLIKELY(
warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
warn_or_error_on_sync();
}
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_stream_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(stream));
}
#if defined(TORCH_HIP_VERSION) && (TORCH_HIP_VERSION >= 301)
C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream));
#else
C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream));
C10_CUDA_CHECK(cudaStreamSynchronize(stream));
#endif
}
C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) {
if (C10_UNLIKELY(
warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) {
warn_or_error_on_sync();
}
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_stream_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(stream));
}
C10_CUDA_CHECK(cudaStreamSynchronize(stream));
}
C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index);
C10_CUDA_API std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext();
} // namespace c10::cuda
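// Illustrative sketch (not part of this header): a blocking host-to-device copy
// using the helper above, plus opting into warnings whenever a synchronizing
// call is made. The function name and arguments are hypothetical.
inline void copy_to_device_blocking_example(
    void* device_dst,
    const void* host_src,
    int64_t nbytes,
    cudaStream_t stream) {
  // Warn (instead of staying silent) on synchronizing calls from now on.
  c10::cuda::warning_state().set_sync_debug_mode(
      c10::cuda::SyncDebugMode::L_WARN);
  // Asynchronous copy followed by a stream synchronization (see memcpy_and_sync).
  c10::cuda::memcpy_and_sync(
      device_dst, host_src, nbytes, cudaMemcpyHostToDevice, stream);
}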

@@ -0,0 +1,77 @@
#pragma once
#include <c10/cuda/CUDAStream.h>
#include <iostream>
#include <utility>
// CUDA Graphs utils used by c10 and aten.
// aten/cuda/CUDAGraphsUtils.cuh adds utils used by aten only.
namespace c10::cuda {
using CaptureId_t = unsigned long long;
// first is set if the instance is created by CUDAGraph::capture_begin.
// second is set if the instance is created by at::cuda::graph_pool_handle.
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
// RAII guard for "cudaStreamCaptureMode", a thread-local value
// that controls the error-checking strictness of a capture.
struct C10_CUDA_API CUDAStreamCaptureModeGuard {
CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired)
: strictness_(desired) {
C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_));
}
~CUDAStreamCaptureModeGuard() {
C10_CUDA_CHECK_WARN(cudaThreadExchangeStreamCaptureMode(&strictness_));
}
private:
cudaStreamCaptureMode strictness_;
};
// Protects against enum cudaStreamCaptureStatus implementation changes.
// Some compilers seem not to like static_assert without the messages.
static_assert(
int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) == 0,
"unexpected int(cudaStreamCaptureStatusNone) value");
static_assert(
int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive) == 1,
"unexpected int(cudaStreamCaptureStatusActive) value");
static_assert(
int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2,
"unexpected int(cudaStreamCaptureStatusInvalidated) value");
enum class CaptureStatus : int {
None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone),
Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive),
Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated)
};
inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
switch (status) {
case CaptureStatus::None:
os << "cudaStreamCaptureStatusNone";
break;
case CaptureStatus::Active:
os << "cudaStreamCaptureStatusActive";
break;
case CaptureStatus::Invalidated:
os << "cudaStreamCaptureStatusInvalidated";
break;
default:
TORCH_INTERNAL_ASSERT(
false, "Unknown CUDA graph CaptureStatus", int(status));
}
return os;
}
// Use this version where you're sure a CUDA context exists already.
inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone};
C10_CUDA_CHECK(
cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
return CaptureStatus(is_capturing);
}
} // namespace c10::cuda
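// Illustrative sketch (not part of this header): deciding whether it is safe to
// issue a synchronizing operation, i.e. the current stream is not being
// captured into a CUDA graph. The function name is hypothetical.
inline bool is_safe_to_synchronize_example() {
  return c10::cuda::currentStreamCaptureStatusMayInitCtx() ==
      c10::cuda::CaptureStatus::None;
}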

@@ -0,0 +1,301 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/core/impl/InlineDeviceGuard.h>
#include <c10/core/impl/InlineStreamGuard.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/impl/CUDAGuardImpl.h>
namespace c10::cuda {
// This code is kind of boilerplatey. See Note [Whither the DeviceGuard
// boilerplate]
/// A variant of DeviceGuard that is specialized for CUDA. It accepts
/// integer indices (interpreting them as CUDA devices) and is a little
/// more efficient than DeviceGuard (it compiles to straight line
/// cudaSetDevice/cudaGetDevice calls); however, it can only be used
/// from code that links against CUDA directly.
struct CUDAGuard {
/// No default constructor; see Note [Omitted default constructor from RAII]
explicit CUDAGuard() = delete;
/// Set the current CUDA device to the passed device index.
explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {}
/// Sets the current CUDA device to the passed device. Errors if the passed
/// device is not a CUDA device.
explicit CUDAGuard(Device device) : guard_(device) {}
// Copy is not allowed
CUDAGuard(const CUDAGuard&) = delete;
CUDAGuard& operator=(const CUDAGuard&) = delete;
// Move is not allowed (there is no uninitialized state)
CUDAGuard(CUDAGuard&& other) = delete;
CUDAGuard& operator=(CUDAGuard&& other) = delete;
/// Sets the CUDA device to the given device. Errors if the given device
/// is not a CUDA device.
void set_device(Device device) {
guard_.set_device(device);
}
/// Sets the CUDA device to the given device. Errors if the given device
/// is not a CUDA device. (This method is provided for uniformity with
/// DeviceGuard).
void reset_device(Device device) {
guard_.reset_device(device);
}
/// Sets the CUDA device to the given device index.
void set_index(DeviceIndex device_index) {
guard_.set_index(device_index);
}
/// Returns the device that was set upon construction of the guard
Device original_device() const {
return guard_.original_device();
}
/// Returns the last device that was set via `set_device`, if any, otherwise
/// the device passed during construction.
Device current_device() const {
return guard_.current_device();
}
private:
/// The guard for the current device.
c10::impl::InlineDeviceGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of OptionalDeviceGuard that is specialized for CUDA. See
/// CUDAGuard for when you can use this.
struct OptionalCUDAGuard {
/// Create an uninitialized OptionalCUDAGuard.
explicit OptionalCUDAGuard() : guard_() {}
/// Set the current CUDA device to the passed Device, if it is not nullopt.
explicit OptionalCUDAGuard(std::optional<Device> device_opt)
: guard_(device_opt) {}
/// Set the current CUDA device to the passed device index, if it is not
/// nullopt
explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt)
: guard_(device_index_opt) {}
// Copy is not allowed
OptionalCUDAGuard(const OptionalCUDAGuard&) = delete;
OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete;
// See Note [Move construction for RAII guards is tricky]
OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete;
// See Note [Move assignment for RAII guards is tricky]
OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete;
/// Sets the CUDA device to the given device, initializing the guard if it
/// is not already initialized. Errors if the given device is not a CUDA
/// device.
void set_device(Device device) {
guard_.set_device(device);
}
/// Sets the CUDA device to the given device, initializing the guard if it is
/// not already initialized. Errors if the given device is not a CUDA device.
/// (This method is provided for uniformity with OptionalDeviceGuard).
void reset_device(Device device) {
guard_.reset_device(device);
}
/// Sets the CUDA device to the given device index, initializing the guard if
/// it is not already initialized.
void set_index(DeviceIndex device_index) {
guard_.set_index(device_index);
}
/// Returns the device that was set immediately prior to initialization of the
/// guard, or nullopt if the guard is uninitialized.
std::optional<Device> original_device() const {
return guard_.original_device();
}
/// Returns the most recent device that was set using this device guard,
/// either from construction, or via set_device, if the guard is initialized,
/// or nullopt if the guard is uninitialized.
std::optional<Device> current_device() const {
return guard_.current_device();
}
/// Restore the original CUDA device, resetting this guard to uninitialized
/// state.
void reset() {
guard_.reset();
}
private:
c10::impl::InlineOptionalDeviceGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of StreamGuard that is specialized for CUDA. See CUDAGuard
/// for when you can use this.
struct CUDAStreamGuard {
/// No default constructor, see Note [Omitted default constructor from RAII]
explicit CUDAStreamGuard() = delete;
/// Set the current CUDA device to the device associated with the passed
/// stream, and set the current CUDA stream on that device to the passed
/// stream. Errors if the Stream is not a CUDA stream.
explicit CUDAStreamGuard(Stream stream) : guard_(stream) {}
/// Copy is disallowed
CUDAStreamGuard(const CUDAStreamGuard&) = delete;
CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete;
/// Move is disallowed, as CUDAStreamGuard does not have an uninitialized
/// state, which is required for moves on types with nontrivial destructors.
CUDAStreamGuard(CUDAStreamGuard&& other) = delete;
CUDAStreamGuard& operator=(CUDAStreamGuard&& other) = delete;
/// Resets the currently set stream to the original stream and
/// the currently set device to the original device. Then,
/// set the current device to the device associated with the passed stream,
/// and set the current stream on that device to the passed stream.
/// Errors if the stream passed is not a CUDA stream.
///
/// NOTE: this implementation may skip some stream/device setting if
/// it can prove that it is unnecessary.
///
/// WARNING: reset_stream does NOT preserve previously set streams on
/// different devices. If you need to set streams on multiple devices
/// on CUDA, use CUDAMultiStreamGuard instead.
void reset_stream(Stream stream) {
guard_.reset_stream(stream);
}
/// Returns the CUDA stream that was set at the time the guard was
/// constructed.
CUDAStream original_stream() const {
return CUDAStream(CUDAStream::UNCHECKED, guard_.original_stream());
}
/// Returns the most recent CUDA stream that was set using this device guard,
/// either from construction, or via set_stream.
CUDAStream current_stream() const {
return CUDAStream(CUDAStream::UNCHECKED, guard_.current_stream());
}
/// Returns the most recent CUDA device that was set using this device guard,
/// either from construction, or via set_device/reset_device/set_index.
Device current_device() const {
return guard_.current_device();
}
/// Returns the CUDA device that was set at the most recent reset_stream(),
/// or otherwise the device at construction time.
Device original_device() const {
return guard_.original_device();
}
private:
c10::impl::InlineStreamGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of OptionalStreamGuard that is specialized for CUDA. See
/// CUDAGuard for when you can use this.
struct OptionalCUDAStreamGuard {
/// Create an uninitialized guard.
explicit OptionalCUDAStreamGuard() : guard_() {}
/// Set the current CUDA device to the device associated with the passed
/// stream, and set the current CUDA stream on that device to the passed
/// stream. Errors if the Stream is not a CUDA stream.
explicit OptionalCUDAStreamGuard(Stream stream) : guard_(stream) {}
/// Set the current device to the device associated with the passed stream,
/// and set the current stream on that device to the passed stream,
/// if the passed stream is not nullopt.
explicit OptionalCUDAStreamGuard(std::optional<Stream> stream_opt)
: guard_(stream_opt) {}
/// Copy is disallowed
OptionalCUDAStreamGuard(const OptionalCUDAStreamGuard&) = delete;
OptionalCUDAStreamGuard& operator=(const OptionalCUDAStreamGuard&) = delete;
// See Note [Move construction for RAII guards is tricky]
OptionalCUDAStreamGuard(OptionalCUDAStreamGuard&& other) = delete;
// See Note [Move assignment for RAII guards is tricky]
OptionalCUDAStreamGuard& operator=(OptionalCUDAStreamGuard&& other) = delete;
/// Resets the currently set CUDA stream to the original stream and
/// the currently set device to the original device. Then,
/// set the current device to the device associated with the passed stream,
/// and set the current stream on that device to the passed stream.
/// Initializes the guard if it was not previously initialized.
void reset_stream(Stream stream) {
guard_.reset_stream(stream);
}
/// Returns the CUDA stream that was set at the time the guard was most
/// recently initialized, or nullopt if the guard is uninitialized.
std::optional<CUDAStream> original_stream() const {
auto r = guard_.original_stream();
if (r.has_value()) {
return std::make_optional(CUDAStream(CUDAStream::UNCHECKED, r.value()));
} else {
return std::nullopt;
}
}
/// Returns the most recent CUDA stream that was set using this stream guard,
/// either from construction, or via reset_stream, if the guard is
/// initialized, or nullopt if the guard is uninitialized.
std::optional<CUDAStream> current_stream() const {
auto r = guard_.current_stream();
if (r.has_value()) {
return std::make_optional(CUDAStream(CUDAStream::UNCHECKED, r.value()));
} else {
return std::nullopt;
}
}
/// Restore the original CUDA device and stream, resetting this guard to
/// uninitialized state.
void reset() {
guard_.reset();
}
private:
c10::impl::InlineOptionalStreamGuard<impl::CUDAGuardImpl> guard_;
};
/// A variant of MultiStreamGuard that is specialized for CUDA.
struct CUDAMultiStreamGuard {
explicit CUDAMultiStreamGuard(ArrayRef<CUDAStream> streams)
: guard_(unwrapStreams(streams)) {}
/// Copy is disallowed
CUDAMultiStreamGuard(const CUDAMultiStreamGuard&) = delete;
CUDAMultiStreamGuard& operator=(const CUDAMultiStreamGuard&) = delete;
// See Note [Move construction for RAII guards is tricky]
CUDAMultiStreamGuard(CUDAMultiStreamGuard&& other) = delete;
// See Note [Move assignment for RAII guards is tricky]
CUDAMultiStreamGuard& operator=(CUDAMultiStreamGuard&& other) = delete;
private:
c10::impl::InlineMultiStreamGuard<impl::CUDAGuardImpl> guard_;
static std::vector<Stream> unwrapStreams(ArrayRef<CUDAStream> cudaStreams) {
std::vector<Stream> streams;
streams.reserve(cudaStreams.size());
for (const CUDAStream& cudaStream : cudaStreams) {
streams.push_back(cudaStream);
}
return streams;
}
};
} // namespace c10::cuda
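// Illustrative sketch (not part of this header): scoping device and stream
// changes with the guards above. The function name is hypothetical; the
// original device and stream are restored when the guard is destroyed.
inline void run_on_stream_example(c10::cuda::CUDAStream stream) {
  // Sets the current device to the stream's device and makes `stream` current.
  c10::cuda::CUDAStreamGuard guard(stream);
  // ... enqueue work on the now-current stream ...
}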

@@ -0,0 +1,51 @@
#pragma once
#ifndef C10_USING_CUSTOM_GENERATED_MACROS
// We have not yet modified the AMD HIP build to generate this file so
// we add an extra option to specifically ignore it.
#ifndef C10_CUDA_NO_CMAKE_CONFIGURE_FILE
#include <c10/cuda/impl/cuda_cmake_macros.h>
#endif // C10_CUDA_NO_CMAKE_CONFIGURE_FILE
#endif
// See c10/macros/Export.h for a detailed explanation of what the function
// of these macros are. We need one set of macros for every separate library
// we build.
#ifdef _WIN32
#if defined(C10_CUDA_BUILD_SHARED_LIBS)
#define C10_CUDA_EXPORT __declspec(dllexport)
#define C10_CUDA_IMPORT __declspec(dllimport)
#else
#define C10_CUDA_EXPORT
#define C10_CUDA_IMPORT
#endif
#else // _WIN32
#if defined(__GNUC__)
#define C10_CUDA_EXPORT __attribute__((__visibility__("default")))
#else // defined(__GNUC__)
#define C10_CUDA_EXPORT
#endif // defined(__GNUC__)
#define C10_CUDA_IMPORT C10_CUDA_EXPORT
#endif // _WIN32
// This one is being used by libc10_cuda.so
#ifdef C10_CUDA_BUILD_MAIN_LIB
#define C10_CUDA_API C10_CUDA_EXPORT
#else
#define C10_CUDA_API C10_CUDA_IMPORT
#endif
/**
* The maximum number of GPUs that we recognize. Increasing this beyond the
* initial limit of 16 broke Caffe2 testing, hence the ifdef guards.
* This value cannot be more than 128 because our DeviceIndex is a uint8_t.
*/
#ifdef FBCODE_CAFFE2
// fbcode depends on this value being 16
#define C10_COMPILE_TIME_MAX_GPUS 16
#else
#define C10_COMPILE_TIME_MAX_GPUS 120
#endif

@@ -0,0 +1,152 @@
#pragma once
/* This file defines math functions compatible across different gpu
* platforms (currently CUDA and HIP).
*/
#if defined(__CUDACC__) || defined(__HIPCC__)
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __HIPCC__
#define __MATH_FUNCTIONS_DECL__ inline C10_DEVICE
#else /* __HIPCC__ */
#ifdef __CUDACC_RTC__
#define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE
#else /* __CUDACC_RTC__ */
#define __MATH_FUNCTIONS_DECL__ inline C10_HOST_DEVICE
#endif /* __CUDACC_RTC__ */
#endif /* __HIPCC__ */
namespace c10::cuda::compat {
__MATH_FUNCTIONS_DECL__ float abs(float x) {
return ::fabsf(x);
}
__MATH_FUNCTIONS_DECL__ double abs(double x) {
return ::fabs(x);
}
__MATH_FUNCTIONS_DECL__ float exp(float x) {
return ::expf(x);
}
__MATH_FUNCTIONS_DECL__ double exp(double x) {
return ::exp(x);
}
__MATH_FUNCTIONS_DECL__ float ceil(float x) {
return ::ceilf(x);
}
__MATH_FUNCTIONS_DECL__ double ceil(double x) {
return ::ceil(x);
}
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return ::copysignf(x, y);
#else
// std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
// (e.g. Jetson), see PyTorch PR #51834
// This host function needs to be here for the compiler but is never used
TORCH_INTERNAL_ASSERT(
false, "CUDAMathCompat copysign should not run on the CPU");
#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return ::copysign(x, y);
#else
// see above
TORCH_INTERNAL_ASSERT(
false, "CUDAMathCompat copysign should not run on the CPU");
#endif
}
__MATH_FUNCTIONS_DECL__ float floor(float x) {
return ::floorf(x);
}
__MATH_FUNCTIONS_DECL__ double floor(double x) {
return ::floor(x);
}
__MATH_FUNCTIONS_DECL__ float log(float x) {
return ::logf(x);
}
__MATH_FUNCTIONS_DECL__ double log(double x) {
return ::log(x);
}
__MATH_FUNCTIONS_DECL__ float log1p(float x) {
return ::log1pf(x);
}
__MATH_FUNCTIONS_DECL__ double log1p(double x) {
return ::log1p(x);
}
__MATH_FUNCTIONS_DECL__ float max(float x, float y) {
return ::fmaxf(x, y);
}
__MATH_FUNCTIONS_DECL__ double max(double x, double y) {
return ::fmax(x, y);
}
__MATH_FUNCTIONS_DECL__ float min(float x, float y) {
return ::fminf(x, y);
}
__MATH_FUNCTIONS_DECL__ double min(double x, double y) {
return ::fmin(x, y);
}
__MATH_FUNCTIONS_DECL__ float pow(float x, float y) {
return ::powf(x, y);
}
__MATH_FUNCTIONS_DECL__ double pow(double x, double y) {
return ::pow(x, y);
}
__MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) {
return ::sincosf(x, sptr, cptr);
}
__MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) {
return ::sincos(x, sptr, cptr);
}
__MATH_FUNCTIONS_DECL__ float sqrt(float x) {
return ::sqrtf(x);
}
__MATH_FUNCTIONS_DECL__ double sqrt(double x) {
return ::sqrt(x);
}
__MATH_FUNCTIONS_DECL__ float rsqrt(float x) {
return ::rsqrtf(x);
}
__MATH_FUNCTIONS_DECL__ double rsqrt(double x) {
return ::rsqrt(x);
}
__MATH_FUNCTIONS_DECL__ float tan(float x) {
return ::tanf(x);
}
__MATH_FUNCTIONS_DECL__ double tan(double x) {
return ::tan(x);
}
__MATH_FUNCTIONS_DECL__ float tanh(float x) {
return ::tanhf(x);
}
__MATH_FUNCTIONS_DECL__ double tanh(double x) {
return ::tanh(x);
}
__MATH_FUNCTIONS_DECL__ float normcdf(float x) {
return ::normcdff(x);
}
__MATH_FUNCTIONS_DECL__ double normcdf(double x) {
return ::normcdf(x);
}
} // namespace c10::cuda::compat
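// Illustrative sketch (not part of this header): using the compat wrappers so
// the same arithmetic compiles under both CUDA and HIP. The kernel name and
// clamp bounds are hypothetical.
__global__ void clamped_exp_example(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    const float clamped =
        c10::cuda::compat::min(c10::cuda::compat::max(in[i], -10.0f), 10.0f);
    out[i] = c10::cuda::compat::exp(clamped);
  }
}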
#endif

@@ -0,0 +1,12 @@
#pragma once
// this file is to avoid circular dependency between CUDAFunctions.h and
// CUDAException.h
#include <c10/cuda/CUDAMacros.h>
#include <mutex>
namespace c10::cuda {
C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
C10_CUDA_API std::mutex* getFreeMutex();
} // namespace c10::cuda

@@ -0,0 +1,268 @@
#pragma once
#include <cuda_runtime_api.h>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
/*
* Stream pool note.
*
* A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
* are backed by cuStreams, but they use several pools to minimize the costs
* associated with creating, retaining, and destroying cuStreams.
*
* There are three pools per device, and a device's pools are lazily created.
*
* The first pool contains only the default stream. When the default stream
* is requested it's returned.
*
* The second pool is the "low priority" or "default priority" streams. In
* HIP builds there is no distinction between streams in this pool and streams
* in the third pool (below). There are 32 of these streams per device, and
* when a stream is requested one of these streams is returned round-robin.
* That is, the first stream requested is at index 0, the second at index 1...
* to index 31, then index 0 again.
*
* This means that if 33 low priority streams are requested, the first and
* last streams requested are actually the same stream (under the covers)
* and kernels enqueued on them cannot run concurrently.
*
* The third pool is the "high priority" streams. The third pool acts like
* the second pool except the streams are created with a higher priority.
*
* These pools suggest that stream users should prefer many short-lived streams,
* as the cost of acquiring and releasing streams is effectively zero. If
* many longer-lived streams are required in performance critical scenarios
* then the functionality here may need to be extended to allow, for example,
* "reserving" a subset of the pool so that other streams do not accidentally
* overlap the performance critical streams.
*
* Note: although the notion of "current stream for device" is thread local
* (every OS thread has a separate current stream, as one might expect),
* the stream pool is global across all threads; stream 0 is always stream 0
* no matter which thread you use it on. Multiple threads can synchronize
* on the same stream. Although the CUDA documentation is not very clear
* on the matter, streams are thread safe; e.g., it is safe to enqueue
* a kernel on the same stream from two different threads.
*/
namespace c10::cuda {
static constexpr int max_compile_time_stream_priorities = 4;
// Value object representing a CUDA stream. This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a CUDAStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a CUDA stream.
explicit CUDAStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
}
/// Construct a CUDAStream from a Stream with no error checking.
/// This constructor uses the "named" constructor idiom, and can
/// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const CUDAStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const CUDAStream& other) const noexcept {
return unwrap() != other.unwrap();
}
/// Implicit conversion to cudaStream_t.
operator cudaStream_t() const {
return stream();
}
/// Implicit conversion to Stream (a.k.a., forget that the stream is a
/// CUDA stream).
operator Stream() const {
return unwrap();
}
/// Used to avoid baking in device type explicitly to Python-side API.
DeviceType device_type() const {
return DeviceType::CUDA;
}
/// Get the CUDA device index that this stream is associated with.
DeviceIndex device_index() const {
return stream_.device_index();
}
/// Get the full Device that this stream is associated with. The Device
/// is guaranteed to be a CUDA device.
Device device() const {
return Device(DeviceType::CUDA, device_index());
}
/// Return the stream ID corresponding to this particular stream.
StreamId id() const {
return stream_.id();
}
bool query() const {
DeviceGuard guard{stream_.device()};
cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
void synchronize() const {
DeviceGuard guard{stream_.device()};
c10::cuda::stream_synchronize(stream());
}
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
return priority;
}
/// Explicit conversion to cudaStream_t.
cudaStream_t stream() const;
/// Explicit conversion to Stream.
Stream unwrap() const {
return stream_;
}
/// Reversibly pack a CUDAStream into a struct representation.
/// Previously the stream's data was packed into a single int64_t,
/// as it was assumed the fields would not require more than
/// 64 bits of storage in total.
/// See https://github.com/pytorch/pytorch/issues/75854
/// for more information regarding newer platforms that may violate
/// this assumption.
///
/// The CUDAStream can be unpacked using unpack3().
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
// Unpack a CUDAStream from the 3 fields generated by pack3().
static CUDAStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
}
static std::tuple<int, int> priority_range() {
// Note: this returns the range of priority **supported by PyTorch**, not
// the range of priority **supported by CUDA**. The former is a subset of
// the latter.
int least_priority = 0, greatest_priority = 0;
C10_CUDA_CHECK(
cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
#ifdef USE_ROCM
// See Note [HIP stream priorities]
TORCH_INTERNAL_ASSERT(
least_priority == 1, "Unexpected HIP stream priority range");
least_priority = 0;
#else
TORCH_INTERNAL_ASSERT(
least_priority == 0, "Unexpected CUDA stream priority range");
#endif
TORCH_INTERNAL_ASSERT(
greatest_priority <= -1, "Unexpected CUDA stream priority range");
greatest_priority = std::max(
-c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
return std::make_tuple(least_priority, greatest_priority);
}
// Deleted for now; use CUDAEvent::block instead
// void synchronize_with(const CUDAEvent& event) const;
private:
Stream stream_;
};
/**
* Get a new stream from the CUDA stream pool. You can think of this
* as "creating" a new stream, but no such creation actually happens;
* instead, streams are preallocated from the pool and returned in a
* round-robin fashion.
*
* You can request a stream from the high priority pool by setting
 * isHighPriority to true, or a stream for a specific device by setting device
 * (which defaults to the current CUDA device).
*/
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
// no default priority to disambiguate overloads
C10_API CUDAStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
/**
 * Get a CUDAStream from an externally allocated one.
 *
 * This is mainly for interoperability with different libraries where we
 * want to operate on a non-torch allocated stream for data exchange or
 * similar purposes.
*/
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
/**
* Get the default CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The default stream is
* where most computation occurs when you aren't explicitly using
* streams.
*/
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
/**
* Get the current CUDA stream, for the passed CUDA device, or for the
* current device if no device index is passed. The current CUDA stream
* will usually be the default CUDA stream for the device, but it may
* be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
* or 'CUDAStreamGuard'.
*/
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
/**
* Set the current stream on the device of the passed in stream to be
* the passed in stream. Yes, you read that right: this function
* has *nothing* to do with the current device: it toggles the current
* stream of the device of the passed stream.
*
* Confused? Avoid using this function; prefer using 'CUDAStreamGuard' instead
* (which will switch both your current device and current stream in the way you
* expect, and reset it back to its original state afterwards).
*/
C10_API void setCurrentCUDAStream(CUDAStream stream);
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
} // namespace c10::cuda
namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
size_t operator()(c10::cuda::CUDAStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std
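
// A minimal sketch of the pool behaviour described in the stream pool note
// above (device index and kernel launches elided; the function name is
// illustrative). "Getting" a stream never creates a new cudaStream_t; it
// hands back one of the preallocated pool entries round-robin, and making it
// current is what routes subsequently enqueued work onto it.
void stream_pool_example() {
  // One stream from the default-priority pool, one from the high-priority pool.
  c10::cuda::CUDAStream a =
      c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
  c10::cuda::CUDAStream b =
      c10::cuda::getStreamFromPool(/*isHighPriority=*/true);
  c10::cuda::CUDAStream prev = c10::cuda::getCurrentCUDAStream();
  c10::cuda::setCurrentCUDAStream(a);  // kernels enqueued now run on `a`
  // ... launch kernels ...
  a.synchronize();                     // wait for the work queued on `a`
  c10::cuda::setCurrentCUDAStream(prev);  // restore; real code should prefer
                                          // CUDAStreamGuard, as noted above
  (void)b;
}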

View File

@ -0,0 +1,63 @@
#pragma once
#include <cuda.h>
#define NVML_NO_UNVERSIONED_FUNC_DEFS
#include <nvml.h>
#define C10_CUDA_DRIVER_CHECK(EXPR) \
do { \
CUresult __err = EXPR; \
if (__err != CUDA_SUCCESS) { \
const char* err_str; \
CUresult get_error_str_err C10_UNUSED = \
c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
if (get_error_str_err != CUDA_SUCCESS) { \
AT_ERROR("CUDA driver error: unknown error"); \
} else { \
AT_ERROR("CUDA driver error: ", err_str); \
} \
} \
} while (0)
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuDeviceGetAttribute) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
_(cuMemGetAllocationGranularity) \
_(cuMemExportToShareableHandle) \
_(cuMemImportFromShareableHandle) \
_(cuGetErrorString)
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_12030(_) \
_(cuMulticastAddDevice) \
_(cuMulticastBindMem) \
_(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
#define C10_NVML_DRIVER_API(_) \
_(nvmlInit_v2) \
_(nvmlDeviceGetHandleByPciBusId_v2) \
_(nvmlDeviceGetNvLinkRemoteDeviceType) \
_(nvmlDeviceGetNvLinkRemotePciInfo_v2) \
_(nvmlDeviceGetComputeRunningProcesses)
namespace c10::cuda {
struct DriverAPI {
#define CREATE_MEMBER(name) decltype(&name) name##_;
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
C10_NVML_DRIVER_API(CREATE_MEMBER)
#undef CREATE_MEMBER
static DriverAPI* get();
static void* get_nvml_handle();
};
} // namespace c10::cuda
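
// A minimal usage sketch: each table entry above is a function pointer named
// after the driver symbol plus a trailing underscore, and calls are wrapped
// in C10_CUDA_DRIVER_CHECK. The particular attribute queried here is only an
// illustration.
void driver_api_example() {
  c10::cuda::DriverAPI* api = c10::cuda::DriverAPI::get();
  int max_threads = 0;
  C10_CUDA_DRIVER_CHECK(api->cuDeviceGetAttribute_(
      &max_threads,
      CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
      /*dev=*/0));  // driver CUdevice handles are device ordinals; 0 = device 0
}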

View File

@ -0,0 +1,249 @@
#pragma once
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/core/Device.h>
#include <c10/core/DeviceType.h>
#include <c10/core/Stream.h>
#include <c10/core/impl/PyInterpreter.h>
#include <cuda_runtime_api.h>
#include <cstdint>
#include <optional>
namespace c10::cuda::impl {
struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
static constexpr DeviceType static_type = DeviceType::CUDA;
CUDAGuardImpl() = default;
explicit CUDAGuardImpl(DeviceType t) {
TORCH_INTERNAL_ASSERT(t == DeviceType::CUDA);
}
DeviceType type() const override {
return DeviceType::CUDA;
}
Device exchangeDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_cuda());
auto old_device_index = c10::cuda::ExchangeDevice(d.index());
return Device(DeviceType::CUDA, old_device_index);
}
Device getDevice() const override {
DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return Device(DeviceType::CUDA, device);
}
std::optional<Device> uncheckedGetDevice() const noexcept {
DeviceIndex device{-1};
const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device));
C10_CUDA_CHECK_WARN(err);
if (err != cudaSuccess) {
return std::nullopt;
}
return Device(DeviceType::CUDA, device);
}
void setDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_cuda());
C10_CUDA_CHECK(c10::cuda::SetDevice(d.index()));
}
void uncheckedSetDevice(Device d) const noexcept override {
C10_CUDA_CHECK_WARN(c10::cuda::MaybeSetDevice(d.index()));
}
Stream getStream(Device d) const noexcept override {
return getCurrentCUDAStream(d.index()).unwrap();
}
Stream getDefaultStream(Device d) const override {
return getDefaultCUDAStream(d.index());
}
Stream getNewStream(Device d, int priority = 0) const override {
return getStreamFromPool(priority, d.index());
}
Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
const override {
return getStreamFromPool(isHighPriority, d.index());
}
// NB: These do NOT set the current device
Stream exchangeStream(Stream s) const noexcept override {
CUDAStream cs(s);
auto old_stream = getCurrentCUDAStream(s.device().index());
setCurrentCUDAStream(cs);
return old_stream.unwrap();
}
DeviceIndex deviceCount() const noexcept override {
return device_count();
}
// Event-related functions
void createEvent(cudaEvent_t* cuda_event, const EventFlag flag) const {
// Maps PyTorch's Event::Flag to CUDA flag
auto cuda_flag = cudaEventDefault;
switch (flag) {
case EventFlag::PYTORCH_DEFAULT:
cuda_flag = cudaEventDisableTiming;
break;
case EventFlag::BACKEND_DEFAULT:
cuda_flag = cudaEventDefault;
break;
default:
TORCH_CHECK(false, "CUDA event received unknown flag");
}
C10_CUDA_CHECK(cudaEventCreateWithFlags(cuda_event, cuda_flag));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_creation(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
}
void destroyEvent(void* event, const DeviceIndex device_index)
const noexcept override {
if (!event)
return;
auto cuda_event = static_cast<cudaEvent_t>(event);
DeviceIndex orig_device{-1};
C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device));
C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
C10_CUDA_CHECK_WARN(cudaEventDestroy(cuda_event));
C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(orig_device));
}
void record(
void** event,
const Stream& stream,
const DeviceIndex device_index,
const EventFlag flag) const override {
TORCH_CHECK(
device_index == -1 || device_index == stream.device_index(),
"Event device index ",
device_index,
" does not match recording stream's device index ",
stream.device_index(),
".");
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(*event);
CUDAStream cuda_stream{stream};
// Moves to stream's device to record
const auto orig_device = getDevice();
setDevice(stream.device());
// Creates the event (lazily)
if (!cuda_event)
createEvent(&cuda_event, flag);
C10_CUDA_CHECK(cudaEventRecord(cuda_event, cuda_stream));
// Makes the void* point to the (possibly just allocated) CUDA event
*event = cuda_event;
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(
c10::kCUDA,
reinterpret_cast<uintptr_t>(cuda_event),
reinterpret_cast<uintptr_t>(cuda_stream.stream()));
}
// Resets device
setDevice(orig_device);
}
void block(void* event, const Stream& stream) const override {
if (!event)
return;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
CUDAStream cuda_stream{stream};
const auto orig_device = getDevice();
setDevice(stream.device());
C10_CUDA_CHECK(cudaStreamWaitEvent(
cuda_stream,
cuda_event,
/*flags (must be zero)=*/0));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(
c10::kCUDA,
reinterpret_cast<uintptr_t>(cuda_event),
reinterpret_cast<uintptr_t>(cuda_stream.stream()));
}
setDevice(orig_device);
}
// May be called from any device
bool queryEvent(void* event) const override {
if (!event)
return true;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
// Note: cudaEventQuery can be safely called from any device
const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event));
if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return (err == cudaSuccess);
}
// Stream-related functions
bool queryStream(const Stream& stream) const override {
CUDAStream cuda_stream{stream};
return cuda_stream.query();
}
void synchronizeStream(const Stream& stream) const override {
CUDAStream cuda_stream{stream};
cuda_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
cudaEvent_t cuda_event = static_cast<cudaEvent_t>(event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kCUDA, reinterpret_cast<uintptr_t>(cuda_event));
}
// Note: cudaEventSynchronize can be safely called from any device
C10_CUDA_CHECK(cudaEventSynchronize(cuda_event));
}
void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
const override {
CUDAStream cuda_stream{stream};
CUDACachingAllocator::recordStream(data_ptr, cuda_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK(
event1 && event2,
"Both events must be recorded before calculating elapsed time.");
// Even though cudaEventElapsedTime can be safely called from any device,
// if the current device is not initialized, the call will create a new
// CUDA context, which consumes a significant amount of memory.
DeviceIndex orig_device{-1};
C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device));
C10_CUDA_CHECK(c10::cuda::SetDevice(device_index));
cudaEvent_t cuda_event1 = static_cast<cudaEvent_t>(event1);
cudaEvent_t cuda_event2 = static_cast<cudaEvent_t>(event2);
float time_ms = 0;
// Raises cudaErrorNotReady if either event has been recorded but has not yet
// completed.
C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, cuda_event1, cuda_event2));
C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device));
return static_cast<double>(time_ms);
}
};
} // namespace c10::cuda::impl
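
// A minimal sketch of how this impl is reached in practice. CUDAGuardImpl is
// not used directly; it is registered for DeviceType::CUDA elsewhere in the
// library (not shown here, an assumption), and the generic c10 guards dispatch
// to it. The include and the device index below are illustrative.
#include <c10/core/DeviceGuard.h>

void guard_impl_example() {
  // Construction routes through CUDAGuardImpl::exchangeDevice; the previous
  // device is restored when the guard is destroyed.
  c10::DeviceGuard guard(c10::Device(c10::DeviceType::CUDA, 1));
  // ... run work that must execute with device 1 current ...
}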

View File

@ -0,0 +1,9 @@
#pragma once
#include <c10/cuda/CUDAMacros.h>
namespace c10::cuda::impl {
C10_CUDA_API int c10_cuda_test();
}