I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File: c10/xpu/XPUCachingAllocator.h

@@ -0,0 +1,27 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/xpu/XPUStream.h>
namespace c10::xpu::XPUCachingAllocator {
C10_XPU_API Allocator* get();
C10_XPU_API void init(DeviceIndex device_count);
C10_XPU_API void emptyCache();
C10_XPU_API void resetPeakStats(DeviceIndex device);
C10_XPU_API void resetAccumulatedStats(DeviceIndex device);
C10_XPU_API c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
DeviceIndex device);
C10_XPU_API void* raw_alloc(size_t size);
C10_XPU_API void raw_delete(void* ptr);
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
} // namespace c10::xpu::XPUCachingAllocator
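A minimal usage sketch of this allocator interface, assuming an initialized XPU runtime with at least one device; the demo function name is illustrative:

#include <c10/xpu/XPUCachingAllocator.h>
#include <c10/xpu/XPUFunctions.h>

void allocator_demo() {
  namespace alloc = c10::xpu::XPUCachingAllocator;
  alloc::init(c10::xpu::device_count()); // set up per-device pools once
  void* buf = alloc::raw_alloc(1024);    // device memory, served from the cache
  // If `buf` were consumed on a non-default stream, recordStream() (which
  // takes the owning c10::DataPtr) would keep the block out of the free pool
  // until that stream's pending work completes.
  alloc::raw_delete(buf); // return the block to the cache, not the driver
  alloc::emptyCache();    // release all cached, unused blocks to the driver
}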

View File: c10/xpu/XPUDeviceProp.h

@@ -0,0 +1,188 @@
#pragma once
#include <c10/xpu/XPUMacros.h>
#include <sycl/sycl.hpp>
namespace c10::xpu {
#define AT_FORALL_XPU_DEVICE_PROPERTIES(_) \
/* the device name of this SYCL device. */ \
_(name) \
\
/* the device type associated with the device. */ \
_(device_type) \
\
/* the vendor of this SYCL device. */ \
_(vendor) \
\
/* a backend-defined driver version as a std::string. */ \
_(driver_version) \
\
/* the SYCL version as a std::string in the form <major>.<minor> */ \
_(version) \
\
/* true if the SYCL device is available. Otherwise, return false. */ \
_(is_available) \
\
/* the maximum size in bytes of the arguments that can be passed to a \
* kernel. */ \
_(max_parameter_size) \
\
/* the number of parallel compute units available to the device. */ \
_(max_compute_units) \
\
/* the maximum dimensions that specify the global and local work-item IDs \
* used by the data parallel execution model. */ \
_(max_work_item_dimensions) \
\
/* the maximum number of work-items that are permitted in a work-group \
* executing a kernel on a single compute unit. */ \
_(max_work_group_size) \
\
/* the maximum number of subgroups in a work-group for any kernel executed \
* on the device. */ \
_(max_num_sub_groups) \
\
/* a std::vector of size_t containing the set of sub-group sizes supported \
* by the device. */ \
_(sub_group_sizes) \
\
/* the maximum configured clock frequency of this SYCL device in MHz. */ \
_(max_clock_frequency) \
\
/* the default compute device address space size specified as an unsigned \
* integer value in bits. Must return either 32 or 64. */ \
_(address_bits) \
\
/* the maximum size of memory object allocation in bytes. */ \
_(max_mem_alloc_size) \
\
/* the minimum value in bits of the largest supported SYCL built-in data \
* type if this SYCL device is not of device type \
* sycl::info::device_type::custom. */ \
_(mem_base_addr_align) \
\
/* a std::vector of info::fp_config describing the half/single/double \
* precision floating-point capability of this SYCL device. */ \
_(half_fp_config) \
_(single_fp_config) \
_(double_fp_config) \
\
/* the size of global device memory in bytes. */ \
_(global_mem_size) \
\
/* the type of global memory cache supported. */ \
_(global_mem_cache_type) \
\
/* the size of global memory cache in bytes. */ \
_(global_mem_cache_size) \
\
/* the size of global memory cache line in bytes. */ \
_(global_mem_cache_line_size) \
\
/* the type of local memory supported. */ \
_(local_mem_type) \
\
/* the size of local memory arena in bytes. */ \
_(local_mem_size) \
\
/* the maximum number of sub-devices that can be created when this device is \
* partitioned. */ \
_(partition_max_sub_devices) \
\
/* the resolution of device timer in nanoseconds. */ \
_(profiling_timer_resolution) \
\
/* the preferred native vector width size for built-in scalar types that can \
* be put into vectors. */ \
_(preferred_vector_width_char) \
_(preferred_vector_width_short) \
_(preferred_vector_width_int) \
_(preferred_vector_width_long) \
_(preferred_vector_width_float) \
_(preferred_vector_width_double) \
_(preferred_vector_width_half) \
\
/* the native ISA vector width. The vector width is defined as the number of \
* scalar elements that can be stored in the vector. */ \
_(native_vector_width_char) \
_(native_vector_width_short) \
_(native_vector_width_int) \
_(native_vector_width_long) \
_(native_vector_width_float) \
_(native_vector_width_double) \
_(native_vector_width_half)
#define AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(_) \
/* the number of EUs associated with the Intel GPU. */ \
_(gpu_eu_count, 512) \
\
/* the number of EUs in a subslice. */ \
_(gpu_eu_count_per_subslice, 8) \
\
/* the SIMD width of an EU on the GPU. */ \
_(gpu_eu_simd_width, 8) \
\
/* the number of hardware threads per EU on the GPU. */ \
_(gpu_hw_threads_per_eu, 8)
#define AT_FORALL_XPU_DEVICE_ASPECT(_) \
/* sycl::half is supported on device. */ \
_(fp16) \
\
/* double is supported on device. */ \
_(fp64) \
\
/* 64-bit atomic operation is supported on device. */ \
_(atomic64)
#define AT_FORALL_XPU_EXP_CL_ASPECT(_) \
/* conversion between single-precision 32-bit floating-point values and \
* 16-bit bfloat16 values is supported on device. */ \
_(bfloat16_conversions) \
\
/* specialized hardware to compute MMA is supported on device. */ \
_(subgroup_matrix_multiply_accumulate) \
\
/* specialized hardware to compute MMA for 32-bit floating-point is \
* supported on device. */ \
_(subgroup_matrix_multiply_accumulate_tensor_float32) \
\
/* block read operations for efficient matrix multiplication are supported on \
* device. */ \
_(subgroup_2d_block_io)
#define _DEFINE_SYCL_PROP(ns, property, member) \
ns::property::return_type member;
#define DEFINE_DEVICE_PROP(property) \
_DEFINE_SYCL_PROP(sycl::info::device, property, property)
#define DEFINE_PLATFORM_PROP(property, member) \
_DEFINE_SYCL_PROP(sycl::info::platform, property, member)
#define DEFINE_EXT_DEVICE_PROP(property, ...) \
_DEFINE_SYCL_PROP(sycl::ext::intel::info::device, property, property)
#define DEFINE_DEVICE_ASPECT(member) bool has_##member;
struct C10_XPU_API DeviceProp {
AT_FORALL_XPU_DEVICE_PROPERTIES(DEFINE_DEVICE_PROP);
// the platform name.
DEFINE_PLATFORM_PROP(name, platform_name);
AT_FORALL_XPU_EXT_DEVICE_PROPERTIES(DEFINE_EXT_DEVICE_PROP);
AT_FORALL_XPU_DEVICE_ASPECT(DEFINE_DEVICE_ASPECT);
AT_FORALL_XPU_EXP_CL_ASPECT(DEFINE_DEVICE_ASPECT);
};
#undef _DEFINE_SYCL_PROP
#undef DEFINE_DEVICE_PROP
#undef DEFINE_PLATFORM_PROP
#undef DEFINE_EXT_DEVICE_PROP
#undef DEFINE_DEVICE_ASPECT
} // namespace c10::xpu
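For orientation, a sketch of what the X-macros generate: each DEFINE_DEVICE_PROP(property) invocation declares one strongly typed member via the descriptor's return_type, so DeviceProp has roughly this shape (abridged; types for the Intel extension properties are assumptions):

#include <cstdint>
#include <string>

struct DevicePropSketch {      // hypothetical name, abridged shape of DeviceProp
  std::string name;            // sycl::info::device::name::return_type
  uint32_t max_compute_units;  // sycl::info::device::max_compute_units::return_type
  std::string platform_name;   // from DEFINE_PLATFORM_PROP(name, platform_name)
  uint32_t gpu_eu_count;       // from DEFINE_EXT_DEVICE_PROP (assumed integral type)
  bool has_fp16, has_fp64, has_atomic64; // from DEFINE_DEVICE_ASPECT
};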

View File: c10/xpu/XPUException.h

@@ -0,0 +1,22 @@
#pragma once
#include <c10/util/Exception.h>
#include <sycl/sycl.hpp>
namespace c10::xpu {
static inline sycl::async_handler asyncHandler = [](sycl::exception_list el) {
if (el.size() == 0) {
return;
}
for (const auto& e : el) {
try {
std::rethrow_exception(e);
} catch (sycl::exception& e) {
TORCH_WARN("SYCL Exception: ", e.what());
}
}
  // No exception is in flight at this point, so this bare rethrow calls
  // std::terminate(): asynchronous SYCL errors are warned about above and
  // then treated as fatal.
  throw;
};
} // namespace c10::xpu
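Since SYCL surfaces asynchronous errors only through a handler supplied at queue construction, a queue wired to this handler would look roughly like this (a sketch; device selection simplified):

#include <c10/xpu/XPUException.h>
#include <sycl/sycl.hpp>

sycl::queue make_queue(const sycl::device& dev) {
  // Errors raised by already-submitted work reach asyncHandler at the next
  // synchronization point, e.g. queue::wait_and_throw().
  return sycl::queue(dev, c10::xpu::asyncHandler);
}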

View File: c10/xpu/XPUFunctions.h

@@ -0,0 +1,35 @@
#pragma once
#include <c10/core/Device.h>
#include <c10/xpu/XPUDeviceProp.h>
#include <c10/xpu/XPUMacros.h>
// The naming convention used here matches that of torch.xpu.
namespace c10::xpu {
// Log a warning only once if no devices are detected.
C10_XPU_API DeviceIndex device_count();
// Throws an error if no devices are detected.
C10_XPU_API DeviceIndex device_count_ensure_non_zero();
C10_XPU_API DeviceIndex current_device();
C10_XPU_API void set_device(DeviceIndex device);
C10_XPU_API DeviceIndex exchange_device(DeviceIndex device);
C10_XPU_API DeviceIndex maybe_exchange_device(DeviceIndex to_device);
C10_XPU_API sycl::device& get_raw_device(DeviceIndex device);
C10_XPU_API sycl::context& get_device_context();
C10_XPU_API void get_device_properties(
DeviceProp* device_prop,
DeviceIndex device);
C10_XPU_API DeviceIndex get_device_idx_from_pointer(void* ptr);
} // namespace c10::xpu
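A short sketch composing these functions, assuming at least one visible XPU device; the function name is illustrative:

#include <c10/xpu/XPUFunctions.h>

void enumerate_devices() {
  const auto n = c10::xpu::device_count_ensure_non_zero(); // throws if none
  for (c10::DeviceIndex i = 0; i < n; ++i) {
    c10::xpu::DeviceProp prop;
    c10::xpu::get_device_properties(&prop, i);
    // prop.name, prop.global_mem_size, prop.has_fp64, ... are now populated.
  }
  // Switch to device 0, then restore whatever device was current before.
  const auto prev = c10::xpu::exchange_device(0);
  c10::xpu::set_device(prev);
}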

View File: c10/xpu/XPUMacros.h

@@ -0,0 +1,33 @@
#pragma once
#ifndef C10_USING_CUSTOM_GENERATED_MACROS
#include <c10/xpu/impl/xpu_cmake_macros.h>
#endif
// See c10/macros/Export.h for a detailed explanation of what the function
// of these macros is. We need one set of macros for every separate library
// we build.
#ifdef _WIN32
#if defined(C10_XPU_BUILD_SHARED_LIBS)
#define C10_XPU_EXPORT __declspec(dllexport)
#define C10_XPU_IMPORT __declspec(dllimport)
#else
#define C10_XPU_EXPORT
#define C10_XPU_IMPORT
#endif
#else // _WIN32
#if defined(__GNUC__)
#define C10_XPU_EXPORT __attribute__((__visibility__("default")))
#else // defined(__GNUC__)
#define C10_XPU_EXPORT
#endif // defined(__GNUC__)
#define C10_XPU_IMPORT C10_XPU_EXPORT
#endif // _WIN32
// This macro is used by libc10_xpu.so
#ifdef C10_XPU_BUILD_MAIN_LIB
#define C10_XPU_API C10_XPU_EXPORT
#else
#define C10_XPU_API C10_XPU_IMPORT
#endif
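To make the branches concrete, a sketch of how one declaration resolves (the symbol name is hypothetical):

#define C10_XPU_BUILD_MAIN_LIB
#include <c10/xpu/XPUMacros.h>
// Inside libc10_xpu (macro defined), C10_XPU_API is C10_XPU_EXPORT:
// __declspec(dllexport) on Windows, default visibility on Linux/GCC.
// In a consumer translation unit (macro undefined) it is C10_XPU_IMPORT,
// which on Windows becomes __declspec(dllimport) and on non-Windows
// platforms is defined to the same thing as the export form.
C10_XPU_API void hypothetical_symbol(); // hypothetical declaration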

View File: c10/xpu/XPUStream.h

@@ -0,0 +1,189 @@
#pragma once
#include <c10/core/Stream.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/xpu/XPUFunctions.h>
namespace c10::xpu {
/*
* Note [Stream Management]
*
 * An XPUStream is an abstraction of an actual SYCL queue in which SYCL
 * kernels can execute. Currently, there are several pools per device to
 * manage SYCL queues, and a device's pools are lazily created.
*
* There are two pools per device. The first pool contains "normal priority"
 * queues. The second pool is the "high priority" queues. There are 32 queues
 * per pool per device, and when a queue is requested one of these queues is
* returned round-robin. That is, the first queue requested is at index 0, the
* second at index 1... to index 31, then index 0 again.
*
* This means that if 33 queues are requested, the first and last queues
* requested are actually the same queue (under the covers) and kernels enqueued
* on them cannot run concurrently.
*
* It is safe to enqueue a kernel on the same queue from two different
 * threads, as the SYCL specification describes.
*/
static constexpr int max_compile_time_stream_priorities = 2;
/*
* This serves as a wrapper around c10::Stream and acts as a representation for
* a SYCL queue, which allows asynchronous execution of XPU tasks.
*/
class C10_XPU_API XPUStream {
public:
enum Unchecked { UNCHECKED };
/// Construct an XPUStream from a Stream. This construction is checked, and
/// will raise an error if the Stream is not, in fact, an XPU stream.
explicit XPUStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::XPU);
}
/// Construct an XPUStream from a Stream with no error checking.
explicit XPUStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const XPUStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const XPUStream& other) const noexcept {
return unwrap() != other.unwrap();
}
/// Implicit conversion to sycl::queue&.
operator sycl::queue&() const {
return queue();
}
/// Implicit conversion to Stream (a.k.a., forget that the stream is an
/// XPU stream).
operator Stream() const {
return unwrap();
}
/// Get the XPU device type that this stream is associated with.
DeviceType device_type() const {
return DeviceType::XPU;
}
/// Get the XPU device index that this stream is associated with.
DeviceIndex device_index() const {
return stream_.device_index();
}
/// Get the full Device that this stream is associated with. The Device is
/// guaranteed to be an XPU device.
Device device() const {
return Device(DeviceType::XPU, device_index());
}
/// Return the stream ID corresponding to this particular stream. A StreamId
/// is an int64_t generated from the stream's type and index.
StreamId id() const {
return stream_.id();
}
/// Return true if all enqueued tasks in this stream have been completed,
/// otherwise return false.
bool query() const {
return queue().ext_oneapi_empty();
}
/// Performs a blocking wait for the completion of all enqueued tasks in this
/// stream.
void synchronize() const {
queue().wait_and_throw();
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_stream_synchronization(
c10::kXPU, reinterpret_cast<uintptr_t>(&queue()));
}
}
/// Return the priority that this stream is associated with. Lower numbers
/// represent higher priority.
int priority() const;
/// Explicit conversion to sycl::queue&.
sycl::queue& queue() const;
/// Explicit conversion to Stream.
Stream unwrap() const {
return stream_;
}
/// Reversibly pack an XPUStream into a struct representation. The XPUStream
/// can be unpacked using unpack3().
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
/// Unpack a XPUStream from the 3 fields generated by pack3().
static XPUStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return XPUStream(Stream::unpack3(stream_id, device_index, device_type));
}
/// Return the range of priority **supported by PyTorch**.
static std::tuple<int, int> priority_range() {
return std::make_tuple(0, -max_compile_time_stream_priorities + 1);
}
private:
Stream stream_;
};
/**
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream from the highest priority pool by setting
* isHighPriority to true for a specific device.
*/
C10_XPU_API XPUStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
/**
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream by setting a priority value for a specific device.
 * The lower the priority number, the higher the priority.
*/
C10_XPU_API XPUStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
/**
* Get the current XPU stream, for the passed XPU device, or for the current
* device if no device index is passed.
*/
C10_XPU_API XPUStream getCurrentXPUStream(DeviceIndex device = -1);
/**
* Set the current stream on the device of the passed in stream to be the passed
* in stream.
*/
C10_XPU_API void setCurrentXPUStream(XPUStream stream);
C10_XPU_API std::ostream& operator<<(std::ostream& stream, const XPUStream& s);
/**
* Block all reserved SYCL queues in the stream pools on the device, and wait
 * for them to synchronize.
*/
C10_XPU_API void syncStreamsOnDevice(DeviceIndex device = -1);
} // namespace c10::xpu
namespace std {
template <>
struct hash<c10::xpu::XPUStream> {
size_t operator()(c10::xpu::XPUStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std
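A short sketch of the stream API in use, assuming an XPU device is present; the function name is illustrative:

#include <c10/xpu/XPUStream.h>

void stream_demo() {
  // Streams come from per-device pools in round-robin order; see
  // Note [Stream Management] above. After 32 requests a pool wraps around.
  c10::xpu::XPUStream s = c10::xpu::getStreamFromPool(/*isHighPriority=*/true);
  c10::xpu::setCurrentXPUStream(s);
  sycl::queue& q = s; // implicit conversion to the underlying SYCL queue
  (void)q;            // enqueue SYCL work here
  if (!s.query()) {   // work still pending?
    s.synchronize();  // block until this stream drains
  }
  c10::xpu::syncStreamsOnDevice(); // or drain every pooled queue on the device
}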

View File: c10/xpu/impl/XPUGuardImpl.h

@@ -0,0 +1,179 @@
#pragma once
#include <c10/core/DeviceGuard.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/GPUTrace.h>
#include <c10/xpu/XPUCachingAllocator.h>
#include <c10/xpu/XPUFunctions.h>
#include <c10/xpu/XPUStream.h>
#include <vector>
namespace c10::xpu::impl {
struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
static constexpr DeviceType static_type = kXPU;
XPUGuardImpl() = default;
explicit XPUGuardImpl(DeviceType t) {
TORCH_INTERNAL_ASSERT(t == kXPU);
}
DeviceType type() const override {
return kXPU;
}
Device exchangeDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_xpu());
const auto old_device_index = c10::xpu::exchange_device(d.index());
return Device(kXPU, old_device_index);
}
Device getDevice() const override {
const auto device = c10::xpu::current_device();
return Device(kXPU, device);
}
void setDevice(Device d) const override {
TORCH_INTERNAL_ASSERT(d.is_xpu());
c10::xpu::set_device(d.index());
}
void uncheckedSetDevice(Device d) const noexcept override {
c10::xpu::set_device(d.index());
}
Stream getStream(Device d) const noexcept override {
return getCurrentXPUStream(d.index()).unwrap();
}
Stream getNewStream(Device d, int priority = 0) const override {
return getStreamFromPool(priority, d.index());
}
Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false)
const override {
return getStreamFromPool(isHighPriority, d.index());
}
// NB: These do NOT set the current device
Stream exchangeStream(Stream s) const noexcept override {
const XPUStream stream(s);
const auto old_stream = getCurrentXPUStream(s.device().index());
setCurrentXPUStream(stream);
return old_stream.unwrap();
}
DeviceIndex deviceCount() const noexcept override {
return c10::xpu::device_count();
}
// Event-related functions
void destroyEvent(void* event, const DeviceIndex device_index)
const noexcept override {
if (!event)
return;
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(
c10::kXPU, reinterpret_cast<uintptr_t>(event));
}
delete reinterpret_cast<sycl::event*>(event);
}
void record(
void** event,
const Stream& stream,
const DeviceIndex device_index,
const EventFlag flag) const override {
TORCH_CHECK(
device_index == -1 || device_index == stream.device_index(),
"Event device index ",
device_index,
" does not match recording stream's device index ",
stream.device_index(),
".");
auto* xpu_event = reinterpret_cast<sycl::event*>(*event);
const XPUStream xpu_stream{stream};
// Delete the event previously recorded.
if (xpu_event)
delete xpu_event;
xpu_event = new sycl::event(xpu_stream.queue().ext_oneapi_submit_barrier());
*event = reinterpret_cast<void*>(xpu_event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(
c10::kXPU,
reinterpret_cast<uintptr_t>(xpu_event),
reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
}
}
void block(void* event, const Stream& stream) const override {
if (!event)
return;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
std::vector<sycl::event> event_list{*xpu_event};
const XPUStream xpu_stream(stream);
xpu_stream.queue().ext_oneapi_submit_barrier(event_list);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(
c10::kXPU,
reinterpret_cast<uintptr_t>(xpu_event),
reinterpret_cast<uintptr_t>(&xpu_stream.queue()));
}
}
bool queryEvent(void* event) const override {
using namespace sycl::info;
if (!event)
return true;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
return xpu_event->get_info<event::command_execution_status>() ==
event_command_status::complete;
}
// Stream-related functions
bool queryStream(const Stream& stream) const override {
const XPUStream xpu_stream{stream};
return xpu_stream.query();
}
void synchronizeStream(const Stream& stream) const override {
const XPUStream xpu_stream{stream};
xpu_stream.synchronize();
}
void synchronizeEvent(void* event) const override {
if (!event)
return;
auto* xpu_event = reinterpret_cast<sycl::event*>(event);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(
c10::kXPU, reinterpret_cast<uintptr_t>(xpu_event));
}
xpu_event->wait_and_throw();
}
void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream)
const override {
const XPUStream xpu_stream{stream};
XPUCachingAllocator::recordStream(data_ptr, xpu_stream);
}
double elapsedTime(void* event1, void* event2, const DeviceIndex device_index)
const override {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "elapsedTime is not supported by XPU backend.");
}
};
} // namespace c10::xpu::impl
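For context, a sketch of how device-generic code reaches this impl once it is registered for kXPU (the registration lives elsewhere, outside this diff); the function name is illustrative:

#include <c10/core/DeviceGuard.h>

void run_on_device_one() {
  // Constructing the guard calls XPUGuardImpl::exchangeDevice() for
  // device 1; the destructor restores the previous device via setDevice().
  c10::DeviceGuard guard(c10::Device(c10::kXPU, 1));
  // ... enqueue work on device 1 ...
}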