233 lines
6.7 KiB
C
233 lines
6.7 KiB
C
|
/*!
 *  Copyright (c) 2017 by Contributors
 * \file dlpack.h
 * \brief The common header of DLPack.
 */
|
||
|
#ifndef DLPACK_DLPACK_H_
|
||
|
#define DLPACK_DLPACK_H_
|
||
|
|
||
|
/*!
 * \brief Compatibility with C++.
 *
 * Expands to `extern "C"` when compiled as C++ (i.e. when __cplusplus is
 * defined) and to nothing when compiled as C.
 */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif
|
||
|
|
||
|
/*! \brief The current version of dlpack */
#define DLPACK_VERSION 80
|
||
|
|
||
|
/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1
|
||
|
|
||
|
/*!
 * \brief DLPACK_DLL prefix for windows.
 *
 * - _WIN32 with DLPACK_EXPORTS defined: __declspec(dllexport)
 * - _WIN32 without DLPACK_EXPORTS:      __declspec(dllimport)
 * - any other platform:                 expands to nothing
 */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif
|
||
|
|
||
|
#include <stdint.h>
|
||
|
#include <stddef.h>
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
/*!
 * \brief The device type in DLDevice.
 *
 * When compiled as C++ the underlying type is fixed to int32_t; when compiled
 * as C it is a plain enum. The values 5 and 6 are not assigned here.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used to quickly test an extension device.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. Call to oneAPI runtime is required to determine the device
   * type, the USM allocation type and the sycl context it is bound to.
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
} DLDeviceType;
|
||
|
|
||
|
/*!
|
||
|
* \brief A Device for Tensor and operator.
|
||
|
*/
|
||
|
typedef struct {
|
||
|
/*! \brief The device type used in the device. */
|
||
|
DLDeviceType device_type;
|
||
|
/*!
|
||
|
* \brief The device index.
|
||
|
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
|
||
|
*/
|
||
|
int32_t device_id;
|
||
|
} DLDevice;
|
||
|
|
||
|
/*!
 * \brief The type code options of DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
} DLDataTypeCode;
|
||
|
|
||
|
/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow the
 * native endian-ness. An explicit error message should be raised when attempting to
 * export an array with non-native endianness.
 *
 * Examples
 *  - float: type_code = 2, bits = 32, lanes = 1
 *  - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
 *  - int8: type_code = 0, bits = 8, lanes = 1
 *  - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 *  - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
 *    convention, the underlying storage size of bool is 8 bits)
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;
|
||
|
|
||
|
/*!
|
||
|
* \brief Plain C Tensor object, does not manage memory.
|
||
|
*/
|
||
|
typedef struct {
|
||
|
/*!
|
||
|
* \brief The data pointer points to the allocated data. This will be CUDA
|
||
|
* device pointer or cl_mem handle in OpenCL. It may be opaque on some device
|
||
|
* types. This pointer is always aligned to 256 bytes as in CUDA. The
|
||
|
* `byte_offset` field should be used to point to the beginning of the data.
|
||
|
*
|
||
|
* Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
|
||
|
* TVM, perhaps others) do not adhere to this 256 byte aligment requirement
|
||
|
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
|
||
|
* (after which this note will be updated); at the moment it is recommended
|
||
|
* to not rely on the data pointer being correctly aligned.
|
||
|
*
|
||
|
* For given DLTensor, the size of memory required to store the contents of
|
||
|
* data is calculated as follows:
|
||
|
*
|
||
|
* \code{.c}
|
||
|
* static inline size_t GetDataSize(const DLTensor* t) {
|
||
|
* size_t size = 1;
|
||
|
* for (tvm_index_t i = 0; i < t->ndim; ++i) {
|
||
|
* size *= t->shape[i];
|
||
|
* }
|
||
|
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
|
||
|
* return size;
|
||
|
* }
|
||
|
* \endcode
|
||
|
*/
|
||
|
void* data;
|
||
|
/*! \brief The device of the tensor */
|
||
|
DLDevice device;
|
||
|
/*! \brief Number of dimensions */
|
||
|
int32_t ndim;
|
||
|
/*! \brief The data type of the pointer*/
|
||
|
DLDataType dtype;
|
||
|
/*! \brief The shape of the tensor */
|
||
|
const int64_t* shape;
|
||
|
/*!
|
||
|
* \brief strides of the tensor (in number of elements, not bytes)
|
||
|
* can be NULL, indicating tensor is compact and row-majored.
|
||
|
*/
|
||
|
const int64_t* strides;
|
||
|
/*! \brief The offset in bytes to the beginning pointer to data */
|
||
|
uint64_t byte_offset;
|
||
|
} DLTensor;
|
||
|
|
||
|
/*!
|
||
|
* \brief C Tensor object, manage memory of DLTensor. This data structure is
|
||
|
* intended to facilitate the borrowing of DLTensor by another framework. It is
|
||
|
* not meant to transfer the tensor. When the borrowing framework doesn't need
|
||
|
* the tensor, it should call the deleter to notify the host that the resource
|
||
|
* is no longer needed.
|
||
|
*/
|
||
|
typedef struct DLManagedTensor {
|
||
|
/*! \brief DLTensor which is being memory managed */
|
||
|
DLTensor dl_tensor;
|
||
|
/*! \brief the context of the original host framework of DLManagedTensor in
|
||
|
* which DLManagedTensor is used in the framework. It can also be NULL.
|
||
|
*/
|
||
|
void * manager_ctx;
|
||
|
/*! \brief Destructor signature void (*)(void*) - this should be called
|
||
|
* to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
|
||
|
* if there is no way for the caller to provide a reasonable destructor.
|
||
|
* The destructors deletes the argument self as well.
|
||
|
*/
|
||
|
void (*deleter)(struct DLManagedTensor * self);
|
||
|
} DLManagedTensor;
|
||
|
#ifdef __cplusplus
|
||
|
} // DLPACK_EXTERN_C
|
||
|
#endif
|
||
|
#endif // DLPACK_DLPACK_H_
|