diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp index 0c3237dc9..e7a852085 100644 --- a/include/infinicore/adaptor/aten_adaptor.hpp +++ b/include/infinicore/adaptor/aten_adaptor.hpp @@ -6,9 +6,9 @@ #include #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) -#include -#include #include +#include +#include #endif namespace infinicore::adaptor { diff --git a/include/infinicore/nn.hpp b/include/infinicore/nn.hpp index b927b294b..4fb2d55c3 100644 --- a/include/infinicore/nn.hpp +++ b/include/infinicore/nn.hpp @@ -1,5 +1,6 @@ #pragma once #include "nn/embedding.hpp" +#include "nn/layernorm.hpp" #include "nn/linear.hpp" #include "nn/rmsnorm.hpp" diff --git a/include/infinicore/nn/embedding.hpp b/include/infinicore/nn/embedding.hpp index 1c8a29966..50a387325 100644 --- a/include/infinicore/nn/embedding.hpp +++ b/include/infinicore/nn/embedding.hpp @@ -1,7 +1,7 @@ #pragma once -#include "module.hpp" #include "../ops.hpp" +#include "module.hpp" #include namespace infinicore::nn { @@ -78,10 +78,10 @@ class Embedding : public Module { INFINICORE_NN_PARAMETER(weight); private: - size_t num_embeddings_; // Vocabulary size - size_t embedding_dim_; // Embedding dimension - std::optional padding_idx_; // Optional padding index - DataType dtype_; // Data type for embedding weights + size_t num_embeddings_; // Vocabulary size + size_t embedding_dim_; // Embedding dimension + std::optional padding_idx_; // Optional padding index + DataType dtype_; // Data type for embedding weights }; } // namespace infinicore::nn diff --git a/include/infinicore/nn/layernorm.hpp b/include/infinicore/nn/layernorm.hpp new file mode 100644 index 000000000..687487cfa --- /dev/null +++ b/include/infinicore/nn/layernorm.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include "../ops.hpp" +#include "module.hpp" + +namespace infinicore::nn { + +/** + * @brief Layer Normalization + * + * Applies LayerNorm over the last dimension. 
+ * + * Formula: y = (x - mean) / sqrt(var + eps) * weight + bias + */ +class LayerNorm : public Module { +public: + /** + * @brief Construct a LayerNorm layer + * + * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size) + * @param eps Small constant for numerical stability (default: 1e-5) + * @param dtype Data type for the weight/bias (default: DataType::F32) + * @param device Device to create the parameters on + */ + LayerNorm(size_t normalized_shape, + double eps = 1e-5, + const DataType &dtype = DataType::F32, + const Device &device = Device()); + + /** + * @brief Forward pass: apply LayerNorm + * + * @param x Input tensor of shape (*, normalized_shape) + * @return Normalized tensor with same shape as input + */ + Tensor forward(const Tensor &x) const; + + // Module information + size_t normalized_shape() const { return normalized_shape_; } + double eps() const { return eps_; } + DataType dtype() const { return dtype_; } + + // String representation + std::string extra_repr() const; + + // Accessors for parameters + Tensor weight() const { return weight_; } + Tensor bias() const { return bias_; } + +protected: + INFINICORE_NN_PARAMETER(weight); + INFINICORE_NN_PARAMETER(bias); + +private: + size_t normalized_shape_; + double eps_; + DataType dtype_; +}; + +} // namespace infinicore::nn diff --git a/include/infinicore/nn/module.hpp b/include/infinicore/nn/module.hpp index 76c531afc..ae12d092a 100644 --- a/include/infinicore/nn/module.hpp +++ b/include/infinicore/nn/module.hpp @@ -3,10 +3,10 @@ #include "../tensor.hpp" #include "parameter.hpp" +#include #include #include #include -#include namespace infinicore::nn { class Module { diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 18741c402..6019c3a9b 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -14,14 +14,19 @@ #include "ops/binary_cross_entropy_with_logits.hpp" #include "ops/causal_softmax.hpp" #include "ops/cdist.hpp" 
+#include "ops/conv2d.hpp" #include "ops/cross_entropy.hpp" #include "ops/embedding.hpp" #include "ops/flash_attention.hpp" #include "ops/fmin.hpp" #include "ops/fmod.hpp" +#include "ops/gelu.hpp" +#include "ops/gelutanh.hpp" #include "ops/hardswish.hpp" #include "ops/hardtanh.hpp" #include "ops/kv_caching.hpp" +#include "ops/layer_norm.hpp" +#include "ops/linear.hpp" #include "ops/matmul.hpp" #include "ops/ones.hpp" #include "ops/paged_attention.hpp" @@ -29,11 +34,14 @@ #include "ops/paged_caching.hpp" #include "ops/per_tensor_dequant_i8.hpp" #include "ops/per_tensor_quant_i8.hpp" +#include "ops/quickgelu.hpp" #include "ops/random_sample.hpp" #include "ops/rearrange.hpp" #include "ops/reciprocal.hpp" +#include "ops/relu.hpp" #include "ops/rms_norm.hpp" #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/silu_and_mul.hpp" +#include "ops/softmax.hpp" #include "ops/swiglu.hpp" diff --git a/include/infinicore/ops/conv2d.hpp b/include/infinicore/ops/conv2d.hpp new file mode 100644 index 000000000..f1dda90ac --- /dev/null +++ b/include/infinicore/ops/conv2d.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +#include +#include + +namespace infinicore::op { +class Conv2d { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, + const size_t *, const size_t *, const size_t *, size_t); + static void execute(Tensor output, + Tensor input, + Tensor weight, + Tensor bias, + const size_t *pads, + const size_t *strides, + const size_t *dilations, + size_t n); + static common::OpDispatcher &dispatcher(); +}; + +Tensor conv2d(Tensor input, + Tensor weight, + Tensor bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations); +void conv2d_(Tensor output, + Tensor input, + Tensor weight, + Tensor bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations); +} // namespace infinicore::op diff --git a/include/infinicore/ops/gelu.hpp 
b/include/infinicore/ops/gelu.hpp new file mode 100644 index 000000000..5e8c61347 --- /dev/null +++ b/include/infinicore/ops/gelu.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Gelu { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor gelu(Tensor input); +void gelu_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infinicore/ops/gelutanh.hpp b/include/infinicore/ops/gelutanh.hpp new file mode 100644 index 000000000..c968c81fd --- /dev/null +++ b/include/infinicore/ops/gelutanh.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class GeluTanh { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor gelu_tanh(Tensor input); +void gelu_tanh_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infinicore/ops/layer_norm.hpp b/include/infinicore/ops/layer_norm.hpp new file mode 100644 index 000000000..ad8187bdc --- /dev/null +++ b/include/infinicore/ops/layer_norm.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class LayerNorm { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, float); + static void execute(Tensor output, + Tensor input_standardization, + Tensor input_std_deviation, + Tensor input, + Tensor weight, + Tensor bias, + float epsilon); + static common::OpDispatcher &dispatcher(); +}; + +Tensor layer_norm(Tensor input, Tensor weight, Tensor bias, float epsilon = 1e-5f); +void layer_norm_(Tensor output, + Tensor input_standardization, + Tensor input_std_deviation, + Tensor input, + Tensor weight, + Tensor bias, + float epsilon = 
1e-5f); +} // namespace infinicore::op diff --git a/include/infinicore/ops/mha_kvcache.hpp b/include/infinicore/ops/mha_kvcache.hpp index 2769e4e39..69a11f61c 100644 --- a/include/infinicore/ops/mha_kvcache.hpp +++ b/include/infinicore/ops/mha_kvcache.hpp @@ -22,14 +22,14 @@ namespace infinicore::op { INFINICORE_GRAPH_OP_CLASS( MhaKVCache, - Tensor, // out - const Tensor &, // q - const Tensor &, // k_cache - const Tensor &, // v_cache - const Tensor &, // seqlens_k - const Tensor &, // block_table - std::optional, // alibi_slopes - float); // scale + Tensor, // out + const Tensor &, // q + const Tensor &, // k_cache + const Tensor &, // v_cache + const Tensor &, // seqlens_k + const Tensor &, // block_table + std::optional, // alibi_slopes + float); // scale Tensor mha_kvcache(const Tensor &q, const Tensor &k_cache, diff --git a/include/infinicore/ops/quickgelu.hpp b/include/infinicore/ops/quickgelu.hpp new file mode 100644 index 000000000..b67d7f71a --- /dev/null +++ b/include/infinicore/ops/quickgelu.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class QuickGelu { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor quick_gelu(Tensor input); +void quick_gelu_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infinicore/ops/relu.hpp b/include/infinicore/ops/relu.hpp new file mode 100644 index 000000000..59f2b26f5 --- /dev/null +++ b/include/infinicore/ops/relu.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Relu { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor relu(Tensor input); +void relu_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git 
a/include/infinicore/ops/softmax.hpp b/include/infinicore/ops/softmax.hpp new file mode 100644 index 000000000..c96ab1810 --- /dev/null +++ b/include/infinicore/ops/softmax.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Softmax { +public: + using schema = void (*)(Tensor, Tensor, int); + static void execute(Tensor output, Tensor input, int axis); + static common::OpDispatcher &dispatcher(); +}; + +Tensor softmax(Tensor input, int axis = -1); +void softmax_(Tensor output, Tensor input, int axis = -1); +} // namespace infinicore::op diff --git a/include/infinicore/quantization/compressed_tensors.hpp b/include/infinicore/quantization/compressed_tensors.hpp index 0e3e45512..0549c1f00 100644 --- a/include/infinicore/quantization/compressed_tensors.hpp +++ b/include/infinicore/quantization/compressed_tensors.hpp @@ -9,7 +9,7 @@ class CompressedTensors : public BaseQuantization { // information and support multiple quantization schemes. public: explicit CompressedTensors(const nlohmann::json &quant_config) - : BaseQuantization(quant_config) {}; + : BaseQuantization(quant_config){}; infinicore::quantization::QuantScheme get_quant_scheme() const override { diff --git a/include/infinicore/quantization/none_quantizaiton.hpp b/include/infinicore/quantization/none_quantizaiton.hpp index be5e4b377..5009f0adc 100644 --- a/include/infinicore/quantization/none_quantizaiton.hpp +++ b/include/infinicore/quantization/none_quantizaiton.hpp @@ -9,7 +9,7 @@ class NoneQuantization : public BaseQuantization { // information and support multiple quantization schemes. 
public: explicit NoneQuantization(const nlohmann::json &quant_config) - : BaseQuantization(quant_config) {}; + : BaseQuantization(quant_config){}; infinicore::quantization::QuantScheme get_quant_scheme() const override { diff --git a/include/infiniop.h b/include/infiniop.h index fe9537876..08fd4ad21 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -38,6 +38,7 @@ #include "infiniop/ops/fmin.h" #include "infiniop/ops/fmod.h" #include "infiniop/ops/gelu.h" +#include "infiniop/ops/gelutanh.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" @@ -66,6 +67,7 @@ #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/quant/per_channel_quant_int8.h" #include "infiniop/ops/quant/per_tensor_quant_int8.h" +#include "infiniop/ops/quickgelu.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/reciprocal.h" diff --git a/include/infiniop/ops/add.h b/include/infiniop/ops/add.h index 2797f28f8..8a7592c1c 100644 --- a/include/infiniop/ops/add.h +++ b/include/infiniop/ops/add.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopAddDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, - infiniopAddDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); __INFINI_C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); __INFINI_C __export infiniStatus_t 
infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc); diff --git a/include/infiniop/ops/add_rms_norm.h b/include/infiniop/ops/add_rms_norm.h index 6c7584957..2cde1db76 100644 --- a/include/infiniop/ops/add_rms_norm.h +++ b/include/infiniop/ops/add_rms_norm.h @@ -18,14 +18,14 @@ __INFINI_C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor( __INFINI_C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - void *residual_out, - const void *a, - const void *b, - const void *weight, - void *stream); + void *workspace, + size_t workspace_size, + void *y, + void *residual_out, + const void *a, + const void *b, + const void *weight, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc); diff --git a/include/infiniop/ops/attention.h b/include/infiniop/ops/attention.h index d49b5668c..21a911489 100644 --- a/include/infiniop/ops/attention.h +++ b/include/infiniop/ops/attention.h @@ -8,27 +8,27 @@ typedef struct InfiniopDescriptor *infiniopAttentionDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle, - infiniopAttentionDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t q_desc, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, - infiniopTensorDescriptor_t k_cache_desc, - infiniopTensorDescriptor_t v_cache_desc, - size_t pos); + infiniopAttentionDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t q_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, + infiniopTensorDescriptor_t k_cache_desc, + infiniopTensorDescriptor_t v_cache_desc, + size_t pos); __INFINI_C __export infiniStatus_t 
infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *out, - const void *q, - const void *k, - const void *v, - void *k_cache, - void *v_cache, - void *stream); + void *workspace, + size_t workspace_size, + void *out, + const void *q, + const void *k, + const void *v, + void *k_cache, + void *v_cache, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc); #endif diff --git a/include/infiniop/ops/clip.h b/include/infiniop/ops/clip.h index b007164bb..965932ad8 100644 --- a/include/infiniop/ops/clip.h +++ b/include/infiniop/ops/clip.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopClipDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, - infiniopClipDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - infiniopTensorDescriptor_t min_val, - infiniopTensorDescriptor_t max_val); + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t min_val, + infiniopTensorDescriptor_t max_val); __INFINI_C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - const void *min_val, - const void *max_val, - void *stream); + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *min_val, + const void *max_val, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); diff --git a/include/infiniop/ops/conv.h b/include/infiniop/ops/conv.h index efbc72357..eea64f5e9 100644 --- a/include/infiniop/ops/conv.h 
+++ b/include/infiniop/ops/conv.h @@ -6,15 +6,15 @@ typedef struct InfiniopDescriptor *infiniopConvDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle, - infiniopConvDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t w_desc, - infiniopTensorDescriptor_t b_desc, - void *pads, - void *strides, - void *dilations, - size_t n); + infiniopConvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc, + void *pads, + void *strides, + void *dilations, + size_t n); __INFINI_C __export infiniStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, size_t *size); diff --git a/include/infiniop/ops/dequantize_awq.h b/include/infiniop/ops/dequantize_awq.h index bb495e2d1..f4cdc4bbe 100644 --- a/include/infiniop/ops/dequantize_awq.h +++ b/include/infiniop/ops/dequantize_awq.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopDequantizeAWQDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateDequantizeAWQDescriptor(infiniopHandle_t handle, - infiniopDequantizeAWQDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t qweight_desc, - infiniopTensorDescriptor_t scales_desc, - infiniopTensorDescriptor_t zeros_desc); + infiniopDequantizeAWQDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t qweight_desc, + infiniopTensorDescriptor_t scales_desc, + infiniopTensorDescriptor_t zeros_desc); __INFINI_C __export infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopDequantizeAWQ(infiniopDequantizeAWQDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *out, - const void *qweight, - const void *scales, - const void *zeros, - void *stream); 
+ void *workspace, + size_t workspace_size, + void *out, + const void *qweight, + const void *scales, + const void *zeros, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc); diff --git a/include/infiniop/ops/embedding.h b/include/infiniop/ops/embedding.h index 318dcd33e..5528be131 100644 --- a/include/infiniop/ops/embedding.h +++ b/include/infiniop/ops/embedding.h @@ -23,4 +23,3 @@ __INFINI_C __export infiniStatus_t infiniopDestroyEmbeddingDescriptor( infiniopEmbeddingDescriptor_t desc); #endif - diff --git a/include/infiniop/ops/gelu.h b/include/infiniop/ops/gelu.h index 52aa26c56..3ddc21df8 100644 --- a/include/infiniop/ops/gelu.h +++ b/include/infiniop/ops/gelu.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle, - infiniopGeluDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t intput); + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); __INFINI_C __export infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *intput, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc); diff --git a/include/infiniop/ops/gelutanh.h b/include/infiniop/ops/gelutanh.h new file mode 100644 index 000000000..9e94514b8 --- /dev/null +++ b/include/infiniop/ops/gelutanh.h @@ -0,0 +1,43 @@ +#ifndef __INFINIOP_GELUTANH_API_H__ +#define __INFINIOP_GELUTANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor 
*infiniopGeluTanhDescriptor_t; + +/** + * Create GELU-Tanh descriptor + * + * y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) + */ +__INFINI_C __export infiniStatus_t infiniopCreateGeluTanhDescriptor( + infiniopHandle_t handle, + infiniopGeluTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +/** + * Query workspace size + */ +__INFINI_C __export infiniStatus_t infiniopGetGeluTanhWorkspaceSize( + infiniopGeluTanhDescriptor_t desc, + size_t *size); + +/** + * Launch GELU-Tanh operator + */ +__INFINI_C __export infiniStatus_t infiniopGeluTanh( + infiniopGeluTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +/** + * Destroy descriptor + */ +__INFINI_C __export infiniStatus_t infiniopDestroyGeluTanhDescriptor( + infiniopGeluTanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gemm.h b/include/infiniop/ops/gemm.h index 3d4d4dc0c..430e37003 100644 --- a/include/infiniop/ops/gemm.h +++ b/include/infiniop/ops/gemm.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopGemmDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle, - infiniopGemmDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc); + infiniopGemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); __INFINI_C __export infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - void const *a, - void const *b, - float alpha, - float beta, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + void const *a, + void const *b, + float alpha, + float beta, + void *stream); __INFINI_C 
__export infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc); diff --git a/include/infiniop/ops/int8_gemm.h b/include/infiniop/ops/int8_gemm.h index 1ebdb0a22..bc96c50a3 100644 --- a/include/infiniop/ops/int8_gemm.h +++ b/include/infiniop/ops/int8_gemm.h @@ -6,26 +6,26 @@ typedef InfiniopDescriptor *infiniopI8GemmDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle, - infiniopI8GemmDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t out_desc, - infiniopTensorDescriptor_t bias_desc, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t x_scale_desc, - infiniopTensorDescriptor_t weights_desc, - infiniopTensorDescriptor_t weights_scale_desc); + infiniopI8GemmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t bias_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t weights_desc, + infiniopTensorDescriptor_t weights_scale_desc); __INFINI_C __export infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *out, - const void *bias, - const void *x, - const void *x_scale, - const void *weights, - const void *weights_scale, - void *stream); + void *workspace, + size_t workspace_size, + void *out, + const void *bias, + const void *x, + const void *x_scale, + const void *weights, + const void *weights_scale, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t desc); diff --git a/include/infiniop/ops/kv_caching.h b/include/infiniop/ops/kv_caching.h index 12b86caa0..1e091cf37 100644 --- a/include/infiniop/ops/kv_caching.h +++ b/include/infiniop/ops/kv_caching.h @@ -17,14 +17,14 @@ __INFINI_C __export infiniStatus_t infiniopCreateKVCachingDescriptor( __INFINI_C 
__export infiniStatus_t infiniopGetKVCachingWorkspaceSize(infiniopKVCachingDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopKVCaching(infiniopKVCachingDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *k_cache, - void *v_cache, - const void *k, - const void *v, - const void *past_kv_lengths, - void *stream); + void *workspace, + size_t workspace_size, + void *k_cache, + void *v_cache, + const void *k, + const void *v, + const void *past_kv_lengths, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(infiniopKVCachingDescriptor_t desc); diff --git a/include/infiniop/ops/layer_norm.h b/include/infiniop/ops/layer_norm.h index 9f984d50a..140d18f02 100644 --- a/include/infiniop/ops/layer_norm.h +++ b/include/infiniop/ops/layer_norm.h @@ -19,15 +19,15 @@ __INFINI_C __export infiniStatus_t infiniopCreateLayerNormDescriptor( __INFINI_C __export infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopLayerNorm(infiniopLayerNormDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - void *input_standardization, - void *input_std_deviation, - const void *input, - const void *weight, - const void *bias, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + void *input_standardization, + void *input_std_deviation, + const void *input, + const void *weight, + const void *bias, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc); diff --git a/include/infiniop/ops/log_softmax.h b/include/infiniop/ops/log_softmax.h index 249e97bd5..8e2bef0f3 100644 --- a/include/infiniop/ops/log_softmax.h +++ b/include/infiniop/ops/log_softmax.h @@ -6,19 +6,19 @@ typedef struct InfiniopDescriptor *infiniopLogSoftmaxDescriptor_t; __INFINI_C __export infiniStatus_t 
infiniopCreateLogSoftmaxDescriptor(infiniopHandle_t handle, - infiniopLogSoftmaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - int dim); + infiniopLogSoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + int dim); __INFINI_C __export infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopLogSoftmax(infiniopLogSoftmaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescriptor_t desc); diff --git a/include/infiniop/ops/lp_norm.h b/include/infiniop/ops/lp_norm.h index b4a32a71f..1022fdb85 100644 --- a/include/infiniop/ops/lp_norm.h +++ b/include/infiniop/ops/lp_norm.h @@ -17,11 +17,11 @@ __INFINI_C __export infiniStatus_t infiniopCreateLPNormDescriptor( __INFINI_C __export infiniStatus_t infiniopGetLPNormWorkspaceSize(infiniopLPNormDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopLPNorm(infiniopLPNormDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyLPNormDescriptor(infiniopLPNormDescriptor_t desc); diff --git a/include/infiniop/ops/mul.h b/include/infiniop/ops/mul.h index b00e2b718..e3e48c39d 100644 --- a/include/infiniop/ops/mul.h +++ b/include/infiniop/ops/mul.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopMulDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, - infiniopMulDescriptor_t *desc_ptr, - 
infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); __INFINI_C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc); diff --git a/include/infiniop/ops/ones.h b/include/infiniop/ops/ones.h index 439679e51..e66b0f884 100644 --- a/include/infiniop/ops/ones.h +++ b/include/infiniop/ops/ones.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopOnesDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateOnesDescriptor(infiniopHandle_t handle, - infiniopOnesDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); + infiniopOnesDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); __INFINI_C __export infiniStatus_t infiniopGetOnesWorkspaceSize(infiniopOnesDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopOnes(infiniopOnesDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyOnesDescriptor(infiniopOnesDescriptor_t desc); diff --git a/include/infiniop/ops/quant/per_channel_quant_int8.h b/include/infiniop/ops/quant/per_channel_quant_int8.h index 343039693..faa5efd78 100644 --- a/include/infiniop/ops/quant/per_channel_quant_int8.h +++ 
b/include/infiniop/ops/quant/per_channel_quant_int8.h @@ -6,22 +6,22 @@ typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t; __INFINI_C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle, - infiniopPerChannelQuantI8Descriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_packed_desc, - infiniopTensorDescriptor_t x_scale_desc, - infiniopTensorDescriptor_t x_zero_desc, - infiniopTensorDescriptor_t x_desc); + infiniopPerChannelQuantI8Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc); __INFINI_C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc, - void *workspace, - size_t workspace_size, - void *x_packed, - void *x_scale, - void *x_zero, - const void *x, - void *stream); + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc); diff --git a/include/infiniop/ops/quickgelu.h b/include/infiniop/ops/quickgelu.h new file mode 100644 index 000000000..be66f9495 --- /dev/null +++ b/include/infiniop/ops/quickgelu.h @@ -0,0 +1,42 @@ +#ifndef __INFINIOP_QUICKGELU_API_H__ +#define __INFINIOP_QUICKGELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopQuickGeluDescriptor_t; + +/** + * Create QuickGELU descriptor + * y = x * sigmoid(1.702 * x) + */ +__INFINI_C __export infiniStatus_t infiniopCreateQuickGeluDescriptor( + infiniopHandle_t handle, + infiniopQuickGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +/** + * Query workspace size + */ 
+__INFINI_C __export infiniStatus_t infiniopGetQuickGeluWorkspaceSize( + infiniopQuickGeluDescriptor_t desc, + size_t *size); + +/** + * Launch QuickGELU operator + */ +__INFINI_C __export infiniStatus_t infiniopQuickGelu( + infiniopQuickGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +/** + * Destroy descriptor + */ +__INFINI_C __export infiniStatus_t infiniopDestroyQuickGeluDescriptor( + infiniopQuickGeluDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/relu.h b/include/infiniop/ops/relu.h index adaa8c6e0..8b129badd 100644 --- a/include/infiniop/ops/relu.h +++ b/include/infiniop/ops/relu.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopReluDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateReluDescriptor(infiniopHandle_t handle, - infiniopReluDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); + infiniopReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); __INFINI_C __export infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopRelu(infiniopReluDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc); diff --git a/include/infiniop/ops/rms_norm.h b/include/infiniop/ops/rms_norm.h index dc2ae6629..f90da65e4 100644 --- a/include/infiniop/ops/rms_norm.h +++ b/include/infiniop/ops/rms_norm.h @@ -16,7 +16,7 @@ __INFINI_C __export infiniStatus_t infiniopCreateRMSNormDescriptor( __INFINI_C __export infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void 
*workspace, size_t workspace_size, - void *y, const void *x, const void *w, void *stream); + void *y, const void *x, const void *w, void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc); diff --git a/include/infiniop/ops/sigmoid.h b/include/infiniop/ops/sigmoid.h index 2bd48b286..461fbdf7a 100644 --- a/include/infiniop/ops/sigmoid.h +++ b/include/infiniop/ops/sigmoid.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopSigmoidDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateSigmoidDescriptor(infiniopHandle_t handle, - infiniopSigmoidDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); + infiniopSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); __INFINI_C __export infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopSigmoid(infiniopSigmoidDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc); diff --git a/include/infiniop/ops/silu.h b/include/infiniop/ops/silu.h index 4541f184c..f864962c2 100644 --- a/include/infiniop/ops/silu.h +++ b/include/infiniop/ops/silu.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopSiluDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateSiluDescriptor(infiniopHandle_t handle, - infiniopSiluDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t intput); + infiniopSiluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); __INFINI_C __export infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size); __INFINI_C __export 
infiniStatus_t infiniopSilu(infiniopSiluDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *intput, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc); diff --git a/include/infiniop/ops/sub.h b/include/infiniop/ops/sub.h index c85870f74..3a516c30c 100644 --- a/include/infiniop/ops/sub.h +++ b/include/infiniop/ops/sub.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopSubDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, - infiniopSubDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); __INFINI_C __export infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopSub(infiniopSubDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc); diff --git a/include/infiniop/ops/swiglu.h b/include/infiniop/ops/swiglu.h index 2fcaa2144..e11749c6b 100644 --- a/include/infiniop/ops/swiglu.h +++ b/include/infiniop/ops/swiglu.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopSwiGLUDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle, - infiniopSwiGLUDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc); + infiniopSwiGLUDescriptor_t *desc_ptr, + 
infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); __INFINI_C __export infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - void const *a, - void const *b, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + void const *a, + void const *b, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc); diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h index acca5a6b4..e623d854c 100644 --- a/include/infiniop/ops/tanh.h +++ b/include/infiniop/ops/tanh.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, - infiniopTanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __INFINI_C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); diff --git a/include/infiniop/ops/topkrouter.h b/include/infiniop/ops/topkrouter.h index f3bbb5d3f..1d56a17cb 100644 --- a/include/infiniop/ops/topkrouter.h +++ b/include/infiniop/ops/topkrouter.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopTopkrouterDescriptor_t; __INFINI_C __export infiniStatus_t 
infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, - infiniopTopkrouterDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc); + infiniopTopkrouterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc); __INFINI_C __export infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *values, - void *indices, - const void *x, - const void *correction_bias, - const float routed_scaling_factor, - const size_t topk, - void *stream); + void *workspace, + size_t workspace_size, + void *values, + void *indices, + const void *x, + const void *correction_bias, + const float routed_scaling_factor, + const size_t topk, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc); diff --git a/include/infiniop/ops/topksoftmax.h b/include/infiniop/ops/topksoftmax.h index f91ef8354..74ea10d69 100644 --- a/include/infiniop/ops/topksoftmax.h +++ b/include/infiniop/ops/topksoftmax.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopTopksoftmaxDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, - infiniopTopksoftmaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_desc); + infiniopTopksoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc); __INFINI_C __export infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *values, - void *indices, - const void *x, - const size_t topk, - const int norm, - void *stream); + void *workspace, + size_t 
workspace_size, + void *values, + void *indices, + const void *x, + const size_t topk, + const int norm, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc); diff --git a/include/infiniop/ops/zeros.h b/include/infiniop/ops/zeros.h index 63a3e46d4..5d7ce03d5 100644 --- a/include/infiniop/ops/zeros.h +++ b/include/infiniop/ops/zeros.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopZerosDescriptor_t; __INFINI_C __export infiniStatus_t infiniopCreateZerosDescriptor(infiniopHandle_t handle, - infiniopZerosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); + infiniopZerosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); __INFINI_C __export infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc, size_t *size); __INFINI_C __export infiniStatus_t infiniopZeros(infiniopZerosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); __INFINI_C __export infiniStatus_t infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc); diff --git a/src/infinicore/nn/layernorm.cc b/src/infinicore/nn/layernorm.cc new file mode 100644 index 000000000..ddb17bc9d --- /dev/null +++ b/src/infinicore/nn/layernorm.cc @@ -0,0 +1,28 @@ +#include "infinicore/nn/layernorm.hpp" + +namespace infinicore::nn { + +LayerNorm::LayerNorm(size_t normalized_shape, + double eps, + const DataType &dtype, + const Device &device) + : normalized_shape_(normalized_shape), + eps_(eps), + dtype_(dtype) { + INFINICORE_NN_PARAMETER_INIT(weight, ({normalized_shape_}, dtype_, device)); + INFINICORE_NN_PARAMETER_INIT(bias, ({normalized_shape_}, dtype_, device)); + auto weight_init = infinicore::Tensor::ones({normalized_shape_}, dtype_, device); + auto bias_init = 
infinicore::Tensor::zeros({normalized_shape_}, dtype_, device); + weight_->copy_from(weight_init); + bias_->copy_from(bias_init); +} + +Tensor LayerNorm::forward(const Tensor &x) const { + return infinicore::op::layer_norm(x, weight_, bias_, static_cast(eps_)); +} + +std::string LayerNorm::extra_repr() const { + return "normalized_shape=" + std::to_string(normalized_shape_) + ", eps=" + std::to_string(eps_) + ", dtype=" + infinicore::toString(dtype_); +} + +} // namespace infinicore::nn diff --git a/src/infinicore/ops/conv2d/conv2d.cc b/src/infinicore/ops/conv2d/conv2d.cc new file mode 100644 index 000000000..6ea76087d --- /dev/null +++ b/src/infinicore/ops/conv2d/conv2d.cc @@ -0,0 +1,67 @@ +#include "infinicore/ops/conv2d.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Conv2d::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Conv2d::execute(Tensor output, + Tensor input, + Tensor weight, + Tensor bias, + const size_t *pads, + const size_t *strides, + const size_t *dilations, + size_t n) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input, weight, bias); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Conv2d implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, weight, bias, pads, strides, dilations, n); +} + +Tensor conv2d(Tensor input, + Tensor weight, + Tensor bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations) { + // Output shape should be pre-computed by caller; allocate a conservative placeholder. + // This helper is rarely used in performance-critical paths. 
+ Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + conv2d_(output, input, weight, bias, pads, strides, dilations); + return output; +} + +void conv2d_(Tensor output, + Tensor input, + Tensor weight, + Tensor bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations) { + if (pads.size() != strides.size() || pads.size() != dilations.size()) { + throw std::runtime_error("conv2d_: pads/strides/dilations must have the same size"); + } + Conv2d::execute(output, + input, + weight, + bias, + pads.data(), + strides.data(), + dilations.data(), + pads.size()); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/conv2d/conv2d_infiniop.cc b/src/infinicore/ops/conv2d/conv2d_infiniop.cc new file mode 100644 index 000000000..c15c14fdd --- /dev/null +++ b/src/infinicore/ops/conv2d/conv2d_infiniop.cc @@ -0,0 +1,69 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/conv2d.hpp" +#include + +namespace infinicore::op::conv2d_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopConvDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyConvDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, + Tensor input, + Tensor weight, + Tensor bias, + const size_t *pads, + const size_t *strides, + const size_t *dilations, + size_t n) { + size_t seed = hash_combine(output, input, weight, bias, n); + for (size_t i = 0; i < n; ++i) { + hash_combine(seed, pads[i], strides[i], dilations[i]); + } + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopConvDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateConvDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), input->desc(), weight->desc(), 
+ bias ? bias->desc() : nullptr, + const_cast(pads), + const_cast(strides), + const_cast(dilations), + n)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetConvWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopConv( + desc, workspace->data(), workspace_size, + output->data(), + input->data(), + weight->data(), + bias ? bias->data() : nullptr, + context::getStream())); +} + +static bool registered = []() { + Conv2d::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::conv2d_impl::infiniop diff --git a/src/infinicore/ops/gelu/gelu.cc b/src/infinicore/ops/gelu/gelu.cc new file mode 100644 index 000000000..612a2ecee --- /dev/null +++ b/src/infinicore/ops/gelu/gelu.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/gelu.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Gelu::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Gelu::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Gelu implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor gelu(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + gelu_(output, input); + return output; +} + +void gelu_(Tensor output, Tensor input) { + Gelu::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/gelu/gelu_infiniop.cc b/src/infinicore/ops/gelu/gelu_infiniop.cc new file mode 100644 index 
000000000..6294a05c2 --- /dev/null +++ b/src/infinicore/ops/gelu/gelu_infiniop.cc @@ -0,0 +1,50 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/gelu.hpp" +#include + +namespace infinicore::op::gelu_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopGeluDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyGeluDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopGeluDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateGeluDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetGeluWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopGelu( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Gelu::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::gelu_impl::infiniop diff --git a/src/infinicore/ops/gelutanh/gelutanh.cc b/src/infinicore/ops/gelutanh/gelutanh.cc new file mode 100644 index 000000000..b6bae39dd --- /dev/null +++ b/src/infinicore/ops/gelutanh/gelutanh.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/gelutanh.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &GeluTanh::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void GeluTanh::execute(Tensor output, Tensor input) { + 
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No GeluTanh implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor gelu_tanh(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + gelu_tanh_(output, input); + return output; +} + +void gelu_tanh_(Tensor output, Tensor input) { + GeluTanh::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/gelutanh/gelutanh_infiniop.cc b/src/infinicore/ops/gelutanh/gelutanh_infiniop.cc new file mode 100644 index 000000000..fb13ca98a --- /dev/null +++ b/src/infinicore/ops/gelutanh/gelutanh_infiniop.cc @@ -0,0 +1,50 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/gelutanh.hpp" +#include + +namespace infinicore::op::gelutanh_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopGeluTanhDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyGeluTanhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopGeluTanhDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateGeluTanhDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetGeluTanhWorkspaceSize(desc, &workspace_size)); + std::shared_ptr 
workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopGeluTanh( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + GeluTanh::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::gelutanh_impl::infiniop diff --git a/src/infinicore/ops/layer_norm/layer_norm.cc b/src/infinicore/ops/layer_norm/layer_norm.cc new file mode 100644 index 000000000..55d2ccd73 --- /dev/null +++ b/src/infinicore/ops/layer_norm/layer_norm.cc @@ -0,0 +1,62 @@ +#include "infinicore/ops/layer_norm.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &LayerNorm::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void LayerNorm::execute(Tensor output, + Tensor input_standardization, + Tensor input_std_deviation, + Tensor input, + Tensor weight, + Tensor bias, + float epsilon) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input_standardization, input_std_deviation, input, weight, bias); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No LayerNorm implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input_standardization, input_std_deviation, input, weight, bias, epsilon); +} + +Tensor layer_norm(Tensor input, Tensor weight, Tensor bias, float epsilon) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + + if (shape.empty()) { + throw std::runtime_error("layer_norm: input must have at least one dimension"); + } + + Shape std_shape = shape; + std_shape.pop_back(); + if (std_shape.empty()) { + std_shape.push_back(1); + } + + auto input_standardization = Tensor::empty(shape, input->dtype(), 
input->device()); + auto input_std_deviation = Tensor::empty(std_shape, input->dtype(), input->device()); + layer_norm_(output, input_standardization, input_std_deviation, input, weight, bias, epsilon); + return output; +} + +void layer_norm_(Tensor output, + Tensor input_standardization, + Tensor input_std_deviation, + Tensor input, + Tensor weight, + Tensor bias, + float epsilon) { + LayerNorm::execute(output, input_standardization, input_std_deviation, input, weight, bias, epsilon); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/layer_norm/layer_norm_infiniop.cc b/src/infinicore/ops/layer_norm/layer_norm_infiniop.cc new file mode 100644 index 000000000..6ef64ac90 --- /dev/null +++ b/src/infinicore/ops/layer_norm/layer_norm_infiniop.cc @@ -0,0 +1,68 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/layer_norm.hpp" +#include + +namespace infinicore::op::layer_norm_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopLayerNormDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyLayerNormDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, + Tensor input_standardization, + Tensor input_std_deviation, + Tensor input, + Tensor weight, + Tensor bias, + float epsilon) { + size_t seed = hash_combine(output, input_standardization, input_std_deviation, input, weight, bias, epsilon); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopLayerNormDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateLayerNormDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), + input_standardization->desc(), + input_std_deviation->desc(), + input->desc(), + weight ? weight->desc() : nullptr, + bias ? 
bias->desc() : nullptr, + epsilon)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetLayerNormWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopLayerNorm( + desc, workspace->data(), workspace_size, + output->data(), + input_standardization->data(), + input_std_deviation->data(), + input->data(), + weight ? weight->data() : nullptr, + bias ? bias->data() : nullptr, + context::getStream())); +} + +static bool registered = []() { + LayerNorm::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::layer_norm_impl::infiniop diff --git a/src/infinicore/ops/quickgelu/quickgelu.cc b/src/infinicore/ops/quickgelu/quickgelu.cc new file mode 100644 index 000000000..4ab46c49a --- /dev/null +++ b/src/infinicore/ops/quickgelu/quickgelu.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/quickgelu.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &QuickGelu::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void QuickGelu::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No QuickGelu implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor quick_gelu(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + quick_gelu_(output, input); + return output; +} + +void quick_gelu_(Tensor output, Tensor input) { + QuickGelu::execute(output, input); +} +} // namespace infinicore::op diff --git 
a/src/infinicore/ops/quickgelu/quickgelu_infiniop.cc b/src/infinicore/ops/quickgelu/quickgelu_infiniop.cc new file mode 100644 index 000000000..3b18c315a --- /dev/null +++ b/src/infinicore/ops/quickgelu/quickgelu_infiniop.cc @@ -0,0 +1,50 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/quickgelu.hpp" +#include + +namespace infinicore::op::quickgelu_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopQuickGeluDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyQuickGeluDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopQuickGeluDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateQuickGeluDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetQuickGeluWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopQuickGelu( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + QuickGelu::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::quickgelu_impl::infiniop diff --git a/src/infinicore/ops/relu/relu.cc b/src/infinicore/ops/relu/relu.cc new file mode 100644 index 000000000..dceb618b1 --- /dev/null +++ b/src/infinicore/ops/relu/relu.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/relu.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + 
+common::OpDispatcher &Relu::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Relu::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Relu implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor relu(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + relu_(output, input); + return output; +} + +void relu_(Tensor output, Tensor input) { + Relu::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/relu/relu_infiniop.cc b/src/infinicore/ops/relu/relu_infiniop.cc new file mode 100644 index 000000000..dc80535a8 --- /dev/null +++ b/src/infinicore/ops/relu/relu_infiniop.cc @@ -0,0 +1,50 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/relu.hpp" +#include + +namespace infinicore::op::relu_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopReluDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyReluDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopReluDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateReluDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + 
INFINICORE_CHECK_ERROR(infiniopGetReluWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopRelu( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Relu::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::relu_impl::infiniop diff --git a/src/infinicore/ops/softmax/softmax.cc b/src/infinicore/ops/softmax/softmax.cc new file mode 100644 index 000000000..1856f61de --- /dev/null +++ b/src/infinicore/ops/softmax/softmax.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/softmax.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Softmax::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Softmax::execute(Tensor output, Tensor input, int axis) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Softmax implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input, axis); +} + +Tensor softmax(Tensor input, int axis) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + softmax_(output, input, axis); + return output; +} + +void softmax_(Tensor output, Tensor input, int axis) { + Softmax::execute(output, input, axis); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/softmax/softmax_infiniop.cc b/src/infinicore/ops/softmax/softmax_infiniop.cc new file mode 100644 index 000000000..2c1dfd6e5 --- /dev/null +++ b/src/infinicore/ops/softmax/softmax_infiniop.cc @@ -0,0 +1,50 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" 
+#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/softmax.hpp" +#include <infiniop.h> + +namespace infinicore::op::softmax_impl::infiniop { + +thread_local common::OpCache<size_t, infiniopSoftmaxDescriptor_t> caches( + 100, // capacity + [](infiniopSoftmaxDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroySoftmaxDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input, int axis) { + size_t seed = hash_combine(output, input, axis); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopSoftmaxDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateSoftmaxDescriptor( + context::getInfiniopHandle(device), &desc, + output->desc(), input->desc(), axis)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetSoftmaxWorkspaceSize(desc, &workspace_size)); + std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopSoftmax( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Softmax::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::softmax_impl::infiniop diff --git a/src/infiniop/ops/conv/operator.cc b/src/infiniop/ops/conv/operator.cc index 5ccfa4180..425351c4f 100644 --- a/src/infiniop/ops/conv/operator.cc +++ b/src/infiniop/ops/conv/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/conv_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API) #include "nvidia/conv_nvidia.cuh" #endif @@ -48,6 +48,9 @@ __INFINI_C __export infiniStatus_t 
infiniopCreateConvDescriptor(infiniopHandle_t #ifdef ENABLE_ALI_API CREATE(INFINI_DEVICE_ALI, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -82,6 +85,9 @@ infiniopGetConvWorkspaceSize( #ifdef ENABLE_ALI_API GET(INFINI_DEVICE_ALI, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -124,6 +130,9 @@ __INFINI_C infiniStatus_t infiniopConv( #ifdef ENABLE_ALI_API CALCULATE(INFINI_DEVICE_ALI, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -154,6 +163,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) { #ifdef ENABLE_ALI_API DELETE(INFINI_DEVICE_ALI, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc index e08464382..bdcdd8bd6 100644 --- a/src/infiniop/ops/gelu/operator.cc +++ b/src/infiniop/ops/gelu/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/gelu_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API) #include "nvidia/gelu_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -43,6 +43,9 @@ __INFINI_C infiniStatus_t infiniopCreateGeluDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -80,6 +83,9 @@ __INFINI_C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t #ifdef ENABLE_QY_API 
GET(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -125,6 +131,9 @@ __INFINI_C infiniStatus_t infiniopGelu( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -164,6 +173,9 @@ infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc new file mode 100644 index 000000000..a9d41522c --- /dev/null +++ b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.cc @@ -0,0 +1,52 @@ +#include "gelutanh_cpu.h" + +namespace op::gelutanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, x_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case 
INFINI_DTYPE_F32: + return _device_info->calculate<GeluTanhOp, float>(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<GeluTanhOp, double>(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<GeluTanhOp, bf16_t>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::gelutanh::cpu diff --git a/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h new file mode 100644 index 000000000..540ff01a8 --- /dev/null +++ b/src/infiniop/ops/gelutanh/cpu/gelutanh_cpu.h @@ -0,0 +1,26 @@ +#ifndef __GELUTANH_CPU_H__ +#define __GELUTANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +#include <cmath> + +ELEMENTWISE_DESCRIPTOR(gelutanh, cpu) + +namespace op::gelutanh::cpu { +typedef struct GeluTanhOp {
public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + // y = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) + constexpr T alpha = static_cast<T>(0.7978845608); // sqrt(2/pi) + constexpr T beta = static_cast<T>(0.044715); + T inner = alpha * (x + beta * x * x * x); + return x * static_cast<T>(0.5) * (static_cast<T>(1) + std::tanh(inner)); + } +} GeluTanhOp; +} // namespace op::gelutanh::cpu + +#endif // __GELUTANH_CPU_H__ diff --git a/src/infiniop/ops/gelutanh/cuda/kernel.cuh b/src/infiniop/ops/gelutanh/cuda/kernel.cuh new file mode 100644 index 000000000..eddfe1b5a --- /dev/null +++ b/src/infiniop/ops/gelutanh/cuda/kernel.cuh @@ -0,0 +1,58 @@ +#ifndef __GELUTANH_CUDA_H__ +#define __GELUTANH_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include <cmath> +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::gelutanh::cuda { + +typedef struct GeluTanhOp {
public: + static constexpr size_t num_inputs = 1; + + // GELU-Tanh constants + // static constexpr float alpha = std::sqrt(2.0 / M_PI); + // static constexpr float beta = 0.044715f; + static constexpr float alpha = 0.7978845608f; // sqrt(2/pi) + static constexpr 
float beta = 0.044715f; + // f32 tanh helper + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // half2 -> float2 + float2 vf = __half22float2(x); + float inner_x0 = alpha * (vf.x + beta * vf.x * vf.x * vf.x); + float inner_x1 = alpha * (vf.y + beta * vf.y * vf.y * vf.y); + float2 vr = make_float2(tanh_f32_func(inner_x0) * 0.5f + 0.5f, + tanh_f32_func(inner_x1) * 0.5f + 0.5f); + return __hmul2(x, __float22half2_rn(vr)); // y = x * 0.5 * (1 + tanh(...)) + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + float inner = alpha * (xf + beta * xf * xf * xf); + float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner)); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float inner = alpha * (xf + beta * xf * xf * xf); + float yf = xf * 0.5f * (1.0f + tanh_f32_func(inner)); + return __float2bfloat16(yf); + } else if constexpr (std::is_same_v) { + float inner = alpha * (x + beta * x * x * x); + return x * 0.5f * (1.0f + tanh_f32_func(inner)); + } else { // double + double inner = alpha * (x + beta * x * x * x); + return x * 0.5 * (1.0 + std::tanh(inner)); + } + } + +} GeluTanhOp; + +} // namespace op::gelutanh::cuda + +#endif // __GELUTANH_CUDA_H__ diff --git a/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu new file mode 100644 index 000000000..e51b09606 --- /dev/null +++ b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cu @@ -0,0 +1,70 @@ +#include "../cuda/kernel.cuh" +#include "gelutanh_nvidia.cuh" + +namespace op::gelutanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = 
out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluTanhOp, half>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluTanhOp, __nv_bfloat16>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluTanhOp, float>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluTanhOp, double>( + _info, workspace, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gelutanh::nvidia diff --git a/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh new file mode 100644 index 000000000..3155a7af1 --- /dev/null +++ b/src/infiniop/ops/gelutanh/nvidia/gelutanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELUTANH_CUDA_API_H__ +#define __GELUTANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelutanh, nvidia) + +#endif // __GELUTANH_CUDA_API_H__ diff --git a/src/infiniop/ops/gelutanh/operator.cc b/src/infiniop/ops/gelutanh/operator.cc new file mode 100644 index 
000000000..9c241e155 --- /dev/null +++ b/src/infiniop/ops/gelutanh/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gelutanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelutanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/gelutanh_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreateGeluTanhDescriptor( + infiniopHandle_t handle, + infiniopGeluTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelutanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetGeluTanhWorkspaceSize(infiniopGeluTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t 
infiniopGeluTanh( + infiniopGeluTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyGeluTanhDescriptor(infiniopGeluTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/layer_norm/operator.cc b/src/infiniop/ops/layer_norm/operator.cc index d3b5e76c3..b99088e23 100644 --- a/src/infiniop/ops/layer_norm/operator.cc +++ b/src/infiniop/ops/layer_norm/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/layer_norm_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || 
defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API) #include "nvidia/layer_norm_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -52,6 +52,9 @@ __INFINI_C infiniStatus_t infiniopCreateLayerNormDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax); #endif @@ -85,6 +88,9 @@ __INFINI_C infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDes #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax); #endif @@ -138,6 +144,9 @@ __INFINI_C infiniStatus_t infiniopLayerNorm( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax); #endif @@ -171,6 +180,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, metax); #endif diff --git a/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc new file mode 100644 index 000000000..d89304eb0 --- /dev/null +++ b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.cc @@ -0,0 +1,52 @@ +#include "quickgelu_cpu.h" + +namespace op::quickgelu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + 
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, x_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::quickgelu::cpu diff --git a/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h new file mode 100644 index 000000000..de80a7af4 --- /dev/null +++ b/src/infiniop/ops/quickgelu/cpu/quickgelu_cpu.h @@ -0,0 +1,25 @@ +#ifndef __QUICKGELU_CPU_H__ +#define __QUICKGELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +#include + +ELEMENTWISE_DESCRIPTOR(quickgelu, cpu) + +namespace op::quickgelu::cpu { +typedef struct QuickGeluOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + // quickgelu(x) = x * sigmoid(1.702 * x) + constexpr T alpha = static_cast(1.702); + T ax = alpha * x; + return x / (static_cast(1) + std::exp(-ax)); + } +} QuickGeluOp; +} // namespace op::quickgelu::cpu + +#endif // __QUICKGELU_CPU_H__ diff --git a/src/infiniop/ops/quickgelu/cuda/kernel.cuh b/src/infiniop/ops/quickgelu/cuda/kernel.cuh new file mode 100644 index 000000000..2c13c4b9d --- /dev/null +++ b/src/infiniop/ops/quickgelu/cuda/kernel.cuh @@ -0,0 +1,60 @@ +#ifndef __QUICKGELU_CUDA_H__ +#define 
__QUICKGELU_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +namespace op::quickgelu::cuda { + +typedef struct QuickGeluOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + // quickgelu(x) = x * sigmoid(1.702 * x) + + constexpr float alpha = 1.702f; + + if constexpr (std::is_same_v) { + half2 ax = __hmul2(make_half2(alpha, alpha), x); + half2 denominator = __hadd2(make_half2(1, 1), h2exp(__hneg2(ax))); + half2 sigmoid = h2rcp(denominator); + return __hmul2(x, sigmoid); + + } else if constexpr (std::is_same_v) { + half ax = __hmul(__float2half(alpha), x); + half denominator = __hadd(__float2half(1.0f), hexp(__hneg(ax))); + half sigmoid = hrcp(denominator); + return __hmul(x, sigmoid); + + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float ax = alpha * xf; + float s = 1.0f / (1.0f + __expf(-ax)); + return __float2bfloat16(xf * s); + + } else if constexpr (std::is_same_v) { + float ax = alpha * x; + float s; + if (ax >= 0.0f) { + float z = expf(-ax); + s = 1.0f / (1.0f + z); + } else { + float z = expf(ax); + s = z / (1.0f + z); + } + return x * s; + + } else { // double + double ax = static_cast(alpha) * x; + return x / (1.0 + exp(-ax)); + } + } + +} QuickGeluOp; + +} // namespace op::quickgelu::cuda + +#endif // __QUICKGELU_CUDA_H__ diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu new file mode 100644 index 000000000..387e08ecb --- /dev/null +++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cu @@ -0,0 +1,70 @@ +#include "../cuda/kernel.cuh" +#include "quickgelu_nvidia.cuh" + +namespace op::quickgelu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = 
reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::QuickGeluOp, half>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::QuickGeluOp, __nv_bfloat16>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::QuickGeluOp, float>( + _info, workspace, output, inputs, stream); + + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::QuickGeluOp, double>( + _info, workspace, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::quickgelu::nvidia diff --git a/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh new file mode 100644 index 000000000..f6125c778 --- /dev/null +++ b/src/infiniop/ops/quickgelu/nvidia/quickgelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __QUICKGELU_CUDA_API_H__ +#define __QUICKGELU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(quickgelu, nvidia) + +#endif // __QUICKGELU_CUDA_API_H__ diff --git a/src/infiniop/ops/quickgelu/operator.cc 
b/src/infiniop/ops/quickgelu/operator.cc new file mode 100644 index 000000000..f85a3e49a --- /dev/null +++ b/src/infiniop/ops/quickgelu/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/quickgelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/quickgelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/quickgelu_nvidia.cuh" +#endif + +__INFINI_C infiniStatus_t infiniopCreateQuickGeluDescriptor( + infiniopHandle_t handle, + infiniopQuickGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::quickgelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetQuickGeluWorkspaceSize(infiniopQuickGeluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopQuickGelu( + infiniopQuickGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyQuickGeluDescriptor(infiniopQuickGeluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/relu/operator.cc b/src/infiniop/ops/relu/operator.cc index ee594832b..59b6f8ada 100644 --- a/src/infiniop/ops/relu/operator.cc +++ b/src/infiniop/ops/relu/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/relu_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || 
defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API) #include "nvidia/relu_nvidia.cuh" #endif #ifdef ENABLE_METAX_API @@ -42,6 +42,9 @@ __INFINI_C infiniStatus_t infiniopCreateReluDescriptor( #ifdef ENABLE_QY_API CREATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED CREATE(INFINI_DEVICE_METAX, metax); @@ -78,6 +81,9 @@ __INFINI_C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t #ifdef ENABLE_QY_API GET(INFINI_DEVICE_QY, nvidia) #endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED GET(INFINI_DEVICE_METAX, metax) @@ -122,6 +128,9 @@ __INFINI_C infiniStatus_t infiniopRelu( #ifdef ENABLE_QY_API CALCULATE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED CALCULATE(INFINI_DEVICE_METAX, metax); @@ -160,6 +169,9 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) { #ifdef ENABLE_QY_API DELETE(INFINI_DEVICE_QY, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif #ifdef ENABLE_METAX_API #ifdef ENABLE_NINETOOTHED DELETE(INFINI_DEVICE_METAX, metax); diff --git a/src/infiniop/ops/sigmoid/operator.cc b/src/infiniop/ops/sigmoid/operator.cc index 854be8c41..6d416b130 100644 --- a/src/infiniop/ops/sigmoid/operator.cc +++ b/src/infiniop/ops/sigmoid/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/sigmoid_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) #include "nvidia/sigmoid_nvidia.cuh" #endif @@ -40,6 +40,9 @@ __INFINI_C 
infiniStatus_t infiniopCreateSigmoidDescriptor( #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -70,6 +73,9 @@ __INFINI_C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescrip #endif #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia) #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -109,6 +115,10 @@ __INFINI_C infiniStatus_t infiniopSigmoid( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -141,6 +151,10 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif + default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc index dbbb90544..254cd0db7 100644 --- a/src/infiniop/ops/tanh/operator.cc +++ b/src/infiniop/ops/tanh/operator.cc @@ -5,7 +5,7 @@ #ifdef ENABLE_CPU_API #include "cpu/tanh_cpu.h" #endif -#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API) #include "nvidia/tanh_nvidia.cuh" #endif // #ifdef ENABLE_METAX_API @@ -43,7 +43,9 @@ __INFINI_C infiniStatus_t infiniopCreateTanhDescriptor( #ifdef ENABLE_ALI_API CREATE(INFINI_DEVICE_ALI, nvidia); #endif - +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif // #ifdef ENABLE_METAX_API // CREATE(INFINI_DEVICE_METAX, 
metax); // #endif @@ -78,7 +80,9 @@ __INFINI_C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t #ifdef ENABLE_ALI_API GET(INFINI_DEVICE_ALI, nvidia); #endif - +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif // #ifdef ENABLE_METAX_API // GET(INFINI_DEVICE_METAX, metax); // #endif @@ -120,7 +124,9 @@ __INFINI_C infiniStatus_t infiniopTanh( #ifdef ENABLE_ALI_API CALCULATE(INFINI_DEVICE_ALI, nvidia); #endif - +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif // #ifdef ENABLE_METAX_API // CALCULATE(INFINI_DEVICE_METAX, metax); // #endif @@ -157,7 +163,9 @@ infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { #ifdef ENABLE_ALI_API DELETE(INFINI_DEVICE_ALI, nvidia); #endif - +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif // #ifdef ENABLE_METAX_API // DELETE(INFINI_DEVICE_METAX, metax); // #endif diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index 602fb190d..6f575017e 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -49,9 +49,9 @@ target("infiniop-nvidia") end else add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror") - add_cuflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) add_cuflags("--extended-lambda") - add_culdflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC", {force = true}) add_cxflags("-fPIC") add_cxxflags("-fPIC") add_cflags("-fPIC") @@ -95,8 +95,8 @@ target("infinirt-nvidia") add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler") add_cxxflags("/FS") else - add_cuflags("-Xcompiler=-fPIC") - add_culdflags("-Xcompiler=-fPIC") + add_cuflags("-Xcompiler=-fPIC", {force = true}) + add_culdflags("-Xcompiler=-fPIC", {force = true}) add_cxflags("-fPIC") add_cxxflags("-fPIC") end