/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
 * \brief Compatibility with C++
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 80

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif

/* Fixed-width integer types (int32_t, uint8_t, ...) and size_t / NULL. */
#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/*!
 * \brief The device type in DLDevice.
 *
 * When compiled as C++ the underlying type is fixed to int32_t; in C the
 * compiler chooses the underlying type of the enum.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used for quickly testing an extension device.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. Call to oneAPI runtime is required to determine the device
   * type, the USM allocation type and the sycl context it is bound to.
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
} DLDeviceType;

/*!
 * \brief A Device for Tensor and operator.
 */
typedef struct {
  /*! \brief The device type used in the device. */
  DLDeviceType device_type;
  /*!
   * \brief The device index.
   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
   */
  int32_t device_id;
} DLDevice;

/*!
 * \brief The type code options DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be
   * well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
} DLDataTypeCode;

/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow
 * the native endian-ness. An explicit error message should be raised when
 * attempting to export an array with non-native endianness.
 *
 *  Examples
 *   - float: type_code = 2, bits = 32, lanes = 1
 *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
 *   - int8: type_code = 0, bits = 8, lanes = 1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
 *     convention, the underlying storage size of bool is 8 bits)
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;

/*!
 * \brief Plain C Tensor object, does not manage memory.
 */
typedef struct {
  /*!
   * \brief The data pointer points to the allocated data. This will be CUDA
   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
   * types. This pointer is always aligned to 256 bytes as in CUDA. The
   * `byte_offset` field should be used to point to the beginning of the data.
   *
   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
   * (after which this note will be updated); at the moment it is recommended
   * to not rely on the data pointer being correctly aligned.
   *
   * For given DLTensor, the size of memory required to store the contents of
   * data is calculated as follows:
   *
   * \code{.c}
   * static inline size_t GetDataSize(const DLTensor* t) {
   *   size_t size = 1;
   *   for (int32_t i = 0; i < t->ndim; ++i) {
   *     size *= t->shape[i];
   *   }
   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
   *   return size;
   * }
   * \endcode
   */
  void* data;
  /*! \brief The device of the tensor */
  DLDevice device;
  /*! \brief Number of dimensions */
  int32_t ndim;
  /*! \brief The data type of the pointer */
  DLDataType dtype;
  /*! \brief The shape of the tensor */
  const int64_t* shape;
  /*!
   * \brief strides of the tensor (in number of elements, not bytes)
   *  can be NULL, indicating tensor is compact and row-majored.
   */
  const int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
} DLTensor;

/*!
 * \brief C Tensor object, manage memory of DLTensor. This data structure is
 *  intended to facilitate the borrowing of DLTensor by another framework. It is
 *  not meant to transfer the tensor. When the borrowing framework doesn't need
 *  the tensor, it should call the deleter to notify the host that the resource
 *  is no longer needed.
 */
typedef struct DLManagedTensor {
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
  /*!
   * \brief the context of the original host framework of DLManagedTensor in
   *  which DLManagedTensor is used in the framework. It can also be NULL.
   */
  void* manager_ctx;
  /*!
   * \brief Destructor, taking a pointer to this DLManagedTensor - this should
   *  be called to destruct manager_ctx which holds the DLManagedTensor. It can
   *  be NULL if there is no way for the caller to provide a reasonable
   *  destructor. The destructor deletes the argument self as well.
   */
  void (*deleter)(struct DLManagedTensor* self);
} DLManagedTensor;

#ifdef __cplusplus
}  /* DLPACK_EXTERN_C */
#endif

#endif  /* DLPACK_DLPACK_H_ */