From b37636c109d79979343b4ee0639675a2b18ea189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=96=E6=B3=89laiquan?= <167113918+LaiQuan-conquer@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:29:21 +0800 Subject: [PATCH 1/2] issue/1031 merge T1-1-2 --- include/infiniop.h | 5 + include/infiniop/ops/avg_pool3d.h | 27 ++ include/infiniop/ops/dot.h | 26 ++ include/infiniop/ops/histc.h | 28 ++ include/infiniop/ops/log10.h | 24 ++ include/infiniop/ops/log1p.h | 24 ++ .../ops/avg_pool3d/cpu/avg_pool3d_cpu.cc | 254 ++++++++++++++++++ .../ops/avg_pool3d/cpu/avg_pool3d_cpu.h | 64 +++++ .../ops/avg_pool3d/metax/avg_pool3d_metax.h | 49 ++++ .../avg_pool3d/metax/avg_pool3d_metax.maca | 219 +++++++++++++++ .../ops/avg_pool3d/moore/avg_pool3d_moore.h | 44 +++ .../ops/avg_pool3d/moore/avg_pool3d_moore.mu | 52 ++++ .../avg_pool3d/nvidia/avg_pool3d_nvidia.cu | 195 ++++++++++++++ .../avg_pool3d/nvidia/avg_pool3d_nvidia.cuh | 46 ++++ src/infiniop/ops/avg_pool3d/operator.cc | 163 +++++++++++ src/infiniop/ops/dot/cpu/dot_cpu.cc | 105 ++++++++ src/infiniop/ops/dot/cpu/dot_cpu.h | 49 ++++ src/infiniop/ops/dot/cuda/kernel.cuh | 40 +++ src/infiniop/ops/dot/metax/dot_metax.h | 47 ++++ src/infiniop/ops/dot/metax/dot_metax.maca | 106 ++++++++ src/infiniop/ops/dot/moore/dot_moore.h | 47 ++++ src/infiniop/ops/dot/moore/dot_moore.mu | 106 ++++++++ src/infiniop/ops/dot/nvidia/dot_nvidia.cu | 121 +++++++++ src/infiniop/ops/dot/nvidia/dot_nvidia.cuh | 48 ++++ src/infiniop/ops/dot/operator.cc | 160 +++++++++++ src/infiniop/ops/histc/cpu/histc_cpu.cc | 150 +++++++++++ src/infiniop/ops/histc/cpu/histc_cpu.h | 60 +++++ src/infiniop/ops/histc/cuda/kernel.cuh | 49 ++++ src/infiniop/ops/histc/metax/histc_metax.h | 53 ++++ src/infiniop/ops/histc/metax/histc_metax.maca | 105 ++++++++ src/infiniop/ops/histc/moore/histc_moore.h | 53 ++++ src/infiniop/ops/histc/moore/histc_moore.mu | 105 ++++++++ src/infiniop/ops/histc/nvidia/histc_nvidia.cu | 119 ++++++++ .../ops/histc/nvidia/histc_nvidia.cuh | 53 ++++ 
src/infiniop/ops/histc/operator.cc | 163 +++++++++++ src/infiniop/ops/log10/cpu/log10_cpu.cc | 53 ++++ src/infiniop/ops/log10/cpu/log10_cpu.h | 20 ++ src/infiniop/ops/log10/cuda/kernel.cuh | 31 +++ src/infiniop/ops/log10/log10.h | 8 + src/infiniop/ops/log10/metax/log10_metax.h | 8 + src/infiniop/ops/log10/metax/log10_metax.maca | 60 +++++ src/infiniop/ops/log10/moore/log10_moore.h | 8 + src/infiniop/ops/log10/moore/log10_moore.mu | 61 +++++ .../ops/log10/moore/log10_moore_kernel.h | 39 +++ src/infiniop/ops/log10/nvidia/log10_nvidia.cu | 59 ++++ .../ops/log10/nvidia/log10_nvidia.cuh | 8 + src/infiniop/ops/log10/operator.cc | 157 +++++++++++ src/infiniop/ops/log1p/cpu/log1p_cpu.cc | 53 ++++ src/infiniop/ops/log1p/cpu/log1p_cpu.h | 20 ++ src/infiniop/ops/log1p/cuda/kernel.cuh | 29 ++ src/infiniop/ops/log1p/log1p.h | 8 + src/infiniop/ops/log1p/metax/log1p_metax.h | 8 + src/infiniop/ops/log1p/metax/log1p_metax.maca | 59 ++++ src/infiniop/ops/log1p/moore/log1p_moore.h | 8 + src/infiniop/ops/log1p/moore/log1p_moore.mu | 61 +++++ .../ops/log1p/moore/log1p_moore_kernel.h | 39 +++ src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu | 59 ++++ .../ops/log1p/nvidia/log1p_nvidia.cuh | 8 + src/infiniop/ops/log1p/operator.cc | 157 +++++++++++ test/infiniop/avg_pool3d.py | 153 +++++++++++ test/infiniop/dot.py | 138 ++++++++++ test/infiniop/histc.py | 153 +++++++++++ test/infiniop/libinfiniop/op_register.py | 169 +++++++++++- test/infiniop/libinfiniop/utils.py | 31 ++- test/infiniop/log10.py | 179 ++++++++++++ test/infiniop/log1p.py | 162 +++++++++++ 66 files changed, 4960 insertions(+), 5 deletions(-) create mode 100644 include/infiniop/ops/avg_pool3d.h create mode 100644 include/infiniop/ops/dot.h create mode 100644 include/infiniop/ops/histc.h create mode 100644 include/infiniop/ops/log10.h create mode 100644 include/infiniop/ops/log1p.h create mode 100644 src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc create mode 100644 src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h create mode 
100644 src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h create mode 100644 src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca create mode 100644 src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h create mode 100644 src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu create mode 100644 src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu create mode 100644 src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh create mode 100644 src/infiniop/ops/avg_pool3d/operator.cc create mode 100644 src/infiniop/ops/dot/cpu/dot_cpu.cc create mode 100644 src/infiniop/ops/dot/cpu/dot_cpu.h create mode 100644 src/infiniop/ops/dot/cuda/kernel.cuh create mode 100644 src/infiniop/ops/dot/metax/dot_metax.h create mode 100644 src/infiniop/ops/dot/metax/dot_metax.maca create mode 100644 src/infiniop/ops/dot/moore/dot_moore.h create mode 100644 src/infiniop/ops/dot/moore/dot_moore.mu create mode 100644 src/infiniop/ops/dot/nvidia/dot_nvidia.cu create mode 100644 src/infiniop/ops/dot/nvidia/dot_nvidia.cuh create mode 100644 src/infiniop/ops/dot/operator.cc create mode 100644 src/infiniop/ops/histc/cpu/histc_cpu.cc create mode 100644 src/infiniop/ops/histc/cpu/histc_cpu.h create mode 100644 src/infiniop/ops/histc/cuda/kernel.cuh create mode 100644 src/infiniop/ops/histc/metax/histc_metax.h create mode 100644 src/infiniop/ops/histc/metax/histc_metax.maca create mode 100644 src/infiniop/ops/histc/moore/histc_moore.h create mode 100644 src/infiniop/ops/histc/moore/histc_moore.mu create mode 100644 src/infiniop/ops/histc/nvidia/histc_nvidia.cu create mode 100644 src/infiniop/ops/histc/nvidia/histc_nvidia.cuh create mode 100644 src/infiniop/ops/histc/operator.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.h create mode 100644 src/infiniop/ops/log10/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log10/log10.h create mode 100644 src/infiniop/ops/log10/metax/log10_metax.h create mode 100644 
src/infiniop/ops/log10/metax/log10_metax.maca create mode 100644 src/infiniop/ops/log10/moore/log10_moore.h create mode 100644 src/infiniop/ops/log10/moore/log10_moore.mu create mode 100644 src/infiniop/ops/log10/moore/log10_moore_kernel.h create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cu create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cuh create mode 100644 src/infiniop/ops/log10/operator.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.h create mode 100644 src/infiniop/ops/log1p/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log1p/log1p.h create mode 100644 src/infiniop/ops/log1p/metax/log1p_metax.h create mode 100644 src/infiniop/ops/log1p/metax/log1p_metax.maca create mode 100644 src/infiniop/ops/log1p/moore/log1p_moore.h create mode 100644 src/infiniop/ops/log1p/moore/log1p_moore.mu create mode 100644 src/infiniop/ops/log1p/moore/log1p_moore_kernel.h create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh create mode 100644 src/infiniop/ops/log1p/operator.cc create mode 100644 test/infiniop/avg_pool3d.py create mode 100644 test/infiniop/dot.py create mode 100644 test/infiniop/histc.py create mode 100644 test/infiniop/log10.py create mode 100644 test/infiniop/log1p.py diff --git a/include/infiniop.h b/include/infiniop.h index 5ddd7d297..630244afd 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -18,6 +18,7 @@ #include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/avg_pool1d.h" +#include "infiniop/ops/avg_pool3d.h" #include "infiniop/ops/binary_cross_entropy_with_logits.h" #include "infiniop/ops/block_diag.h" #include "infiniop/ops/broadcast_to.h" @@ -32,6 +33,7 @@ #include "infiniop/ops/diff.h" #include "infiniop/ops/digamma.h" #include "infiniop/ops/dist.h" +#include "infiniop/ops/dot.h" #include "infiniop/ops/embedding.h" #include 
"infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" @@ -46,6 +48,7 @@ #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" #include "infiniop/ops/hinge_embedding_loss.h" +#include "infiniop/ops/histc.h" #include "infiniop/ops/huber_loss.h" #include "infiniop/ops/hypot.h" #include "infiniop/ops/index_add.h" @@ -58,6 +61,8 @@ #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/ldexp.h" #include "infiniop/ops/lerp.h" +#include "infiniop/ops/log10.h" +#include "infiniop/ops/log1p.h" #include "infiniop/ops/log_softmax.h" #include "infiniop/ops/logaddexp.h" #include "infiniop/ops/logaddexp2.h" diff --git a/include/infiniop/ops/avg_pool3d.h b/include/infiniop/ops/avg_pool3d.h new file mode 100644 index 000000000..7170e7765 --- /dev/null +++ b/include/infiniop/ops/avg_pool3d.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_AVG_POOL3D_API_H__ +#define __INFINIOP_AVG_POOL3D_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAvgPool3dDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAvgPool3dDescriptor(infiniopHandle_t handle, + infiniopAvgPool3dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + void *kernel_size, + void *stride, + void *padding); + +__C __export infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAvgPool3d(infiniopAvgPool3dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/dot.h b/include/infiniop/ops/dot.h new file mode 100644 index 000000000..7fa0d2659 --- /dev/null +++ b/include/infiniop/ops/dot.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DOT_API_H__ +#define __INFINIOP_DOT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor 
*infiniopDotDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDotDescriptor(infiniopHandle_t handle, + infiniopDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDot(infiniopDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/histc.h b/include/infiniop/ops/histc.h new file mode 100644 index 000000000..cd4695d4e --- /dev/null +++ b/include/infiniop/ops/histc.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_HISTC_API_H__ +#define __INFINIOP_HISTC_API_H__ + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopHistcDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHistcDescriptor(infiniopHandle_t handle, + infiniopHistcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t bins, + double min_val, + double max_val); + +__C __export infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHistc(infiniopHistcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log10.h b/include/infiniop/ops/log10.h new file mode 100644 index 000000000..7c105dcc3 --- /dev/null +++ b/include/infiniop/ops/log10.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG10_API_H__ +#define __INFINIOP_LOG10_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLog10Descriptor_t; + +__C __export 
infiniStatus_t infiniopCreateLog10Descriptor(infiniopHandle_t handle, + infiniopLog10Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog10(infiniopLog10Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log1p.h b/include/infiniop/ops/log1p.h new file mode 100644 index 000000000..ebd608e9d --- /dev/null +++ b/include/infiniop/ops/log1p.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG1P_API_H__ +#define __INFINIOP_LOG1P_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLog1pDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLog1pDescriptor(infiniopHandle_t handle, + infiniopLog1pDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog1p(infiniopLog1pDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc new file mode 100644 index 000000000..c280a2d92 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc @@ -0,0 +1,254 @@ +#include "avg_pool3d_cpu.h" +#include "../../../../utils.h" +#include +#include +#include +#include + +namespace op::avg_pool3d::cpu { + +utils::Result AvgPool3dInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + void 
*kernel_size, + void *stride, + void *padding) { + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 5 || y_shape.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t channels = x_shape[1]; + size_t input_d = x_shape[2]; + size_t input_h = x_shape[3]; + size_t input_w = x_shape[4]; + + // Parse kernel_size + size_t kernel_d, kernel_h, kernel_w; + if (kernel_size) { + size_t *ks = reinterpret_cast(kernel_size); + if (ks[0] == 0 || ks[1] == 0 || ks[2] == 0) { + return INFINI_STATUS_BAD_PARAM; + } + kernel_d = ks[0]; + kernel_h = ks[1]; + kernel_w = ks[2]; + } else { + return INFINI_STATUS_BAD_PARAM; + } + + // Parse stride (default to kernel_size if not provided) + size_t stride_d, stride_h, stride_w; + if (stride) { + size_t *s = reinterpret_cast(stride); + if (s[0] == 0 || s[1] == 0 || s[2] == 0) { + return INFINI_STATUS_BAD_PARAM; + } + stride_d = s[0]; + stride_h = s[1]; + stride_w = s[2]; + } else { + stride_d = kernel_d; + stride_h = kernel_h; + stride_w = kernel_w; + } + + // Parse padding + size_t pad_d, pad_h, pad_w; + if (padding) { + size_t *p = reinterpret_cast(padding); + // Assume it's always a tuple of 3 values for 3D pooling + pad_d = p[0]; + pad_h = p[1]; + pad_w = p[2]; + } else { + pad_d = pad_h = pad_w = 0; + } + + // Calculate output dimensions. Guard against unsigned underflow when kernel > input + 2*pad. 
+ if (pad_d > (std::numeric_limits::max() - input_d) / 2 || + pad_h > (std::numeric_limits::max() - input_h) / 2 || + pad_w > (std::numeric_limits::max() - input_w) / 2) { + return INFINI_STATUS_BAD_PARAM; + } + size_t effective_d = input_d + 2 * pad_d; + size_t effective_h = input_h + 2 * pad_h; + size_t effective_w = input_w + 2 * pad_w; + if (kernel_d > effective_d || kernel_h > effective_h || kernel_w > effective_w) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_d = (effective_d - kernel_d) / stride_d + 1; + size_t output_h = (effective_h - kernel_h) / stride_h + 1; + size_t output_w = (effective_w - kernel_w) / stride_w + 1; + + // Verify output shape + if (y_shape[0] != batch || y_shape[1] != channels || + y_shape[2] != output_d || y_shape[3] != output_h || y_shape[4] != output_w) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + AvgPool3dInfo info; + info.batch = batch; + info.channels = channels; + info.input_d = input_d; + info.input_h = input_h; + info.input_w = input_w; + info.output_d = output_d; + info.output_h = output_h; + info.output_w = output_w; + info.kernel_d = kernel_d; + info.kernel_h = kernel_h; + info.kernel_w = kernel_w; + info.stride_d = stride_d; + info.stride_h = stride_h; + info.stride_w = stride_w; + info.pad_d = pad_d; + info.pad_h = pad_h; + info.pad_w = pad_w; + info.input_strides = x_desc->strides(); + info.output_strides = y_desc->strides(); + + if (info.input_strides.size() != x_shape.size() || info.output_strides.size() != y_shape.size()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + // Reject broadcasted (0-stride) or negative strides for dimensions that are actually indexed. + // The kernel computes indices using size_t, so negative strides would underflow and go OOB. 
+ for (size_t i = 0; i < x_shape.size(); ++i) { + if (x_shape[i] > 1 && info.input_strides[i] <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + } + for (size_t i = 0; i < y_shape.size(); ++i) { + if (y_shape[i] > 1 && info.output_strides[i] <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + } + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto info_result = AvgPool3dInfo::create(x_desc, y_desc, kernel_size, stride, padding); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void avg_pool3d_impl( + const AvgPool3dInfo &info, + T *y, + const T *x) { + + const size_t kernel_size = info.kernel_d * info.kernel_h * info.kernel_w; + using Tacc = std::conditional_t, double, float>; + const Tacc inv_kernel_size = Tacc(1) / static_cast(kernel_size); + +#pragma omp parallel for collapse(2) + for (ptrdiff_t b = 0; b < static_cast(info.batch); ++b) { + for (ptrdiff_t c = 0; c < static_cast(info.channels); ++c) { + for (size_t od = 0; od < info.output_d; ++od) { + for (size_t oh = 0; oh < info.output_h; ++oh) { + for (size_t ow = 0; ow < info.output_w; ++ow) { + Tacc sum = Tacc(0); + + // Calculate input window + ptrdiff_t id_start = + static_cast(od) * static_cast(info.stride_d) - + static_cast(info.pad_d); + ptrdiff_t ih_start = + static_cast(oh) * static_cast(info.stride_h) - + static_cast(info.pad_h); + ptrdiff_t iw_start = + static_cast(ow) * static_cast(info.stride_w) - + 
static_cast(info.pad_w); + + for (size_t kd = 0; kd < info.kernel_d; ++kd) { + for (size_t kh = 0; kh < info.kernel_h; ++kh) { + for (size_t kw = 0; kw < info.kernel_w; ++kw) { + ptrdiff_t id = id_start + static_cast(kd); + ptrdiff_t ih = ih_start + static_cast(kh); + ptrdiff_t iw = iw_start + static_cast(kw); + + // Check bounds (accounting for padding) + if (id >= 0 && id < static_cast(info.input_d) && + ih >= 0 && ih < static_cast(info.input_h) && + iw >= 0 && iw < static_cast(info.input_w)) { + size_t x_idx = b * info.input_strides[0] + + c * info.input_strides[1] + + static_cast(id) * info.input_strides[2] + + static_cast(ih) * info.input_strides[3] + + static_cast(iw) * info.input_strides[4]; + sum += utils::cast(x[x_idx]); + } + } + } + } + + size_t y_idx = b * info.output_strides[0] + + c * info.output_strides[1] + + od * info.output_strides[2] + + oh * info.output_strides[3] + + ow * info.output_strides[4]; + // Match torch.nn.functional.avg_pool3d default behavior (count_include_pad=True): + // padding contributes zeros but still counts in the divisor. 
+ y[y_idx] = utils::cast(sum * inv_kernel_size); + } + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::avg_pool3d::cpu diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h new file mode 100644 index 000000000..f01aeb5cf --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h @@ -0,0 +1,64 @@ +#ifndef __AVG_POOL3D_CPU_H__ +#define __AVG_POOL3D_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::avg_pool3d::cpu { + +struct AvgPool3dInfo { + size_t batch; + size_t channels; + size_t input_d, input_h, input_w; + size_t output_d, output_h, output_w; + size_t kernel_d, kernel_h, kernel_w; + size_t stride_d, stride_h, stride_w; + size_t pad_d, pad_h, pad_w; + std::vector input_strides; + std::vector output_strides; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + void *kernel_size, + void *stride, + void *padding); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + AvgPool3dInfo _info; + + Descriptor(infiniDtype_t dtype, AvgPool3dInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} 
+ +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::cpu + +#endif // __AVG_POOL3D_CPU_H__ diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h new file mode 100644 index 000000000..b4277393e --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h @@ -0,0 +1,49 @@ +#ifndef __AVG_POOL3D_METAX_H__ +#define __AVG_POOL3D_METAX_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#ifdef ENABLE_METAX_MC_API +#include +#else +#include +#endif + +namespace op::avg_pool3d::metax { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + std::unique_ptr _opaque; + infiniDtype_t _dtype; + + Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id); + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const; + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::metax + +#endif // __AVG_POOL3D_METAX_H__ diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca new file mode 100644 index 000000000..c7955812a --- /dev/null +++ 
b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca @@ -0,0 +1,219 @@ +#include "avg_pool3d_metax.h" +#include "../../../../utils.h" + +namespace op::avg_pool3d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +#ifdef ENABLE_METAX_MC_API + mcdnnTensorDescriptor_t x_desc = nullptr; + mcdnnTensorDescriptor_t y_desc = nullptr; + mcdnnPoolingDescriptor_t pool_desc = nullptr; +#else + hcdnnTensorDescriptor_t x_desc = nullptr; + hcdnnTensorDescriptor_t y_desc = nullptr; + hcdnnPoolingDescriptor_t pool_desc = nullptr; +#endif + size_t workspace_size = 0; + + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + + ~Opaque() { +#ifdef ENABLE_METAX_MC_API + if (x_desc) mcdnnDestroyTensorDescriptor(x_desc); + if (y_desc) mcdnnDestroyTensorDescriptor(y_desc); + if (pool_desc) mcdnnDestroyPoolingDescriptor(pool_desc); +#else + if (x_desc) hcdnnDestroyTensorDescriptor(x_desc); + if (y_desc) hcdnnDestroyTensorDescriptor(y_desc); + if (pool_desc) hcdnnDestroyPoolingDescriptor(pool_desc); +#endif + } +}; + +Descriptor::Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(std::move(opaque)), + _dtype(dtype) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + auto metax_handle = reinterpret_cast(handle); + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 5 || y_shape.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t *ks = reinterpret_cast(kernel_size); + size_t kernel_d = ks[0], kernel_h = ks[1], kernel_w = ks[2]; + + size_t stride_d, 
stride_h, stride_w; + if (stride) { + size_t *s = reinterpret_cast(stride); + stride_d = s[0]; + stride_h = s[1]; + stride_w = s[2]; + } else { + stride_d = kernel_d; + stride_h = kernel_h; + stride_w = kernel_w; + } + + size_t pad_d, pad_h, pad_w; + if (padding) { + size_t *p = reinterpret_cast(padding); + pad_d = p[0]; + pad_h = p[1]; + pad_w = p[2]; + } else { + pad_d = pad_h = pad_w = 0; + } + + auto opaque = std::make_unique(metax_handle->internal()); + + // Create hcdnn descriptors +#ifdef ENABLE_METAX_MC_API + CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->x_desc)); + CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->y_desc)); + CHECK_MCDNN(mcdnnCreatePoolingDescriptor(&opaque->pool_desc)); +#else + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->x_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->y_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&opaque->pool_desc)); +#endif + + // Set tensor descriptors + int n = static_cast(x_shape[0]); + int c = static_cast(x_shape[1]); + int d = static_cast(x_shape[2]); + int h = static_cast(x_shape[3]); + int w = static_cast(x_shape[4]); + int out_d = static_cast(y_shape[2]); + int out_h = static_cast(y_shape[3]); + int out_w = static_cast(y_shape[4]); + + int input_dims[5] = {n, c, d, h, w}; + int input_strides[5] = { + static_cast(c * d * h * w), + static_cast(d * h * w), + static_cast(h * w), + static_cast(w), + 1 + }; + + int output_dims[5] = {n, c, out_d, out_h, out_w}; + int output_strides[5] = { + static_cast(c * out_d * out_h * out_w), + static_cast(out_d * out_h * out_w), + static_cast(out_h * out_w), + static_cast(out_w), + 1 + }; + + hcdnnDataType_t hcdnn_dtype = device::metax::getHcdnnDtype(dtype); +#ifdef ENABLE_METAX_MC_API + CHECK_MCDNN(mcdnnSetTensorNdDescriptor( + opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); + CHECK_MCDNN(mcdnnSetTensorNdDescriptor( + opaque->y_desc, hcdnn_dtype, 5, output_dims, output_strides)); + + // Set pooling descriptor + int window_dims[3] = 
{static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; + int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; + int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), static_cast(stride_w)}; + + CHECK_MCDNN(mcdnnSetPoolingNdDescriptor( + opaque->pool_desc, + MCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + MCDNN_NOT_PROPAGATE_NAN, + 3, window_dims, padding_dims, stride_dims)); +#else + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + opaque->y_desc, hcdnn_dtype, 5, output_dims, output_strides)); + + // Set pooling descriptor + int window_dims[3] = {static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; + int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; + int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), static_cast(stride_w)}; + + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + opaque->pool_desc, + HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + HCDNN_NOT_PROPAGATE_NAN, + 3, window_dims, padding_dims, stride_dims)); +#endif + + *desc_ptr = new Descriptor(dtype, std::move(opaque), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _opaque->workspace_size; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + return _opaque->internal->useMcdnn(hc_stream, [&](hcdnnHandle_t hcdnn_handle) { + const void *alpha = nullptr; + const void *beta = nullptr; + if (_dtype == INFINI_DTYPE_F32) { + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } else if (_dtype == INFINI_DTYPE_F64) { + static const double alpha_val = 1.0, beta_val = 0.0; + alpha = &alpha_val; + beta = &beta_val; + } else { + static const float 
alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } + +#ifdef ENABLE_METAX_MC_API + CHECK_MCDNN(mcdnnPoolingForward( + hcdnn_handle, + _opaque->pool_desc, + alpha, + _opaque->x_desc, x, + beta, + _opaque->y_desc, y)); +#else + CHECK_MCDNN(hcdnnPoolingForward( + hcdnn_handle, + _opaque->pool_desc, + alpha, + _opaque->x_desc, x, + beta, + _opaque->y_desc, y)); +#endif + + return INFINI_STATUS_SUCCESS; + }); +} + +} // namespace op::avg_pool3d::metax diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h new file mode 100644 index 000000000..526d36543 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h @@ -0,0 +1,44 @@ +#ifndef __AVG_POOL3D_MOORE_H__ +#define __AVG_POOL3D_MOORE_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_handle.h" +#include + +namespace op::avg_pool3d::moore { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + std::unique_ptr _opaque; + infiniDtype_t _dtype; + + Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id); + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const; + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::moore + +#endif // __AVG_POOL3D_MOORE_H__ diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu new file mode 100644 index 000000000..5e1cfd4a6 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu @@ -0,0 +1,52 @@ +#include 
"avg_pool3d_moore.h" +#include "../../../../utils.h" +// MOORE uses CUDA-compatible API, so we can reuse NVIDIA implementation +// by including the NVIDIA source and adapting stream types + +namespace op::avg_pool3d::moore { + +// MOORE platform uses musa API which is CUDA-compatible +// We can reuse the NVIDIA implementation structure +// For now, return NOT_IMPLEMENTED as a placeholder +// Full implementation would require adapting NVIDIA code to use musaStream_t + +struct Descriptor::Opaque {}; + +Descriptor::Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(std::move(opaque)), + _dtype(dtype) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + // MOORE implementation would be similar to NVIDIA but using musa API + // For now, delegate to a CPU fallback or implement custom kernel + // This is a simplified placeholder - full implementation needed + return INFINI_STATUS_NOT_IMPLEMENTED; +} + +size_t Descriptor::workspaceSize() const { + return 0; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + // Placeholder - full implementation needed + return INFINI_STATUS_NOT_IMPLEMENTED; +} + +} // namespace op::avg_pool3d::moore diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu new file mode 100644 index 000000000..ce78f7bd2 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu @@ -0,0 +1,195 @@ +#include "avg_pool3d_nvidia.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::avg_pool3d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr 
internal; + cudnnTensorDescriptor_t x_desc = nullptr; + cudnnTensorDescriptor_t y_desc = nullptr; + cudnnPoolingDescriptor_t pool_desc = nullptr; + size_t workspace_size = 0; + + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + + ~Opaque() { + if (x_desc) cudnnDestroyTensorDescriptor(x_desc); + if (y_desc) cudnnDestroyTensorDescriptor(y_desc); + if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc); + } +}; + +Descriptor::Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(std::move(opaque)), + _dtype(dtype) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + auto nvidia_handle = reinterpret_cast(handle); + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 5 || y_shape.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (!kernel_size) { + return INFINI_STATUS_BAD_PARAM; + } + size_t *ks = reinterpret_cast(kernel_size); + if (ks[0] == 0 || ks[1] == 0 || ks[2] == 0) { + return INFINI_STATUS_BAD_PARAM; + } + size_t kernel_d = ks[0], kernel_h = ks[1], kernel_w = ks[2]; + + size_t stride_d, stride_h, stride_w; + if (stride) { + size_t *s = reinterpret_cast(stride); + stride_d = s[0]; + stride_h = s[1]; + stride_w = s[2]; + } else { + stride_d = kernel_d; + stride_h = kernel_h; + stride_w = kernel_w; + } + + size_t pad_d, pad_h, pad_w; + if (padding) { + size_t *p = reinterpret_cast(padding); + pad_d = p[0]; + pad_h = p[1]; + pad_w = p[2]; + } else { + pad_d = pad_h = 
pad_w = 0; + } + + auto opaque = std::make_unique(nvidia_handle->internal()); + + // Create cuDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&opaque->x_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&opaque->y_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&opaque->pool_desc)); + + // Set tensor descriptors + int n = static_cast(x_shape[0]); + int c = static_cast(x_shape[1]); + int d = static_cast(x_shape[2]); + int h = static_cast(x_shape[3]); + int w = static_cast(x_shape[4]); + int out_d = static_cast(y_shape[2]); + int out_h = static_cast(y_shape[3]); + int out_w = static_cast(y_shape[4]); + + int input_dims[5] = {n, c, d, h, w}; + auto x_strides = x_desc->strides(); + if (x_strides.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + int input_strides[5] = {}; + for (size_t i = 0; i < 5; ++i) { + if (x_strides[i] <= 0 || x_strides[i] > std::numeric_limits::max()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + input_strides[i] = static_cast(x_strides[i]); + } + + int output_dims[5] = {n, c, out_d, out_h, out_w}; + auto y_strides = y_desc->strides(); + if (y_strides.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + int output_strides[5] = {}; + for (size_t i = 0; i < 5; ++i) { + if (y_strides[i] <= 0 || y_strides[i] > std::numeric_limits::max()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + output_strides[i] = static_cast(y_strides[i]); + } + + cudnnDataType_t cudnn_dtype = device::nvidia::getCudnnDtype(dtype); + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + opaque->x_desc, cudnn_dtype, 5, input_dims, input_strides)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + opaque->y_desc, cudnn_dtype, 5, output_dims, output_strides)); + + // Set pooling descriptor + int window_dims[3] = {static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; + int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; + int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), 
static_cast(stride_w)}; + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + opaque->pool_desc, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, + 3, window_dims, padding_dims, stride_dims)); + + *desc_ptr = new Descriptor(dtype, std::move(opaque), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _opaque->workspace_size; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + return _opaque->internal->useCudnn(cuda_stream, [&](cudnnHandle_t cudnn_handle) { + + const void *alpha = nullptr; + const void *beta = nullptr; + if (_dtype == INFINI_DTYPE_F32) { + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } else if (_dtype == INFINI_DTYPE_F64) { + static const double alpha_val = 1.0, beta_val = 0.0; + alpha = &alpha_val; + beta = &beta_val; + } else { + // For F16/BF16, use float alpha/beta + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } + + CHECK_CUDNN(cudnnPoolingForward( + cudnn_handle, + _opaque->pool_desc, + alpha, + _opaque->x_desc, x, + beta, + _opaque->y_desc, y)); + + return INFINI_STATUS_SUCCESS; + }); +} + +} // namespace op::avg_pool3d::nvidia diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh new file mode 100644 index 000000000..9ace65e7b --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh @@ -0,0 +1,46 @@ +#ifndef __AVG_POOL3D_NVIDIA_H__ +#define __AVG_POOL3D_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include + +namespace op::avg_pool3d::nvidia { + +class Descriptor final : public 
InfiniopDescriptor { + struct Opaque; + std::unique_ptr _opaque; + infiniDtype_t _dtype; + + // Defined out-of-line (in .cu) to avoid requiring Opaque to be complete + // in every translation unit that includes this header. + Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id); + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const; + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::nvidia + +#endif // __AVG_POOL3D_NVIDIA_H__ diff --git a/src/infiniop/ops/avg_pool3d/operator.cc b/src/infiniop/ops/avg_pool3d/operator.cc new file mode 100644 index 000000000..f9e01b991 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/operator.cc @@ -0,0 +1,163 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/avg_pool3d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/avg_pool3d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/avg_pool3d_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/avg_pool3d_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/avg_pool3d_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPool3dDescriptor( + infiniopHandle_t handle, + infiniopAvgPool3dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::avg_pool3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + kernel_size, \ + stride, \ + padding) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, 
cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAvgPool3d( + infiniopAvgPool3dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/dot/cpu/dot_cpu.cc b/src/infiniop/ops/dot/cpu/dot_cpu.cc new file mode 100644 index 000000000..2a45bc971 --- /dev/null +++ b/src/infiniop/ops/dot/cpu/dot_cpu.cc @@ -0,0 +1,105 @@ +#include "dot_cpu.h" +#include "../../../../utils.h" + +namespace op::dot::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // This op does not do implicit dtype conversion: y/a/b must match. 
+ if (b_desc->dtype() != dtype || y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 || y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + // Negative/broadcasted strides are not supported without an explicit base offset. + if (a_stride <= 0 || b_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: { + const fp16_t *a_ptr = reinterpret_cast(a); + const fp16_t *b_ptr = reinterpret_cast(b); + float result = 0.0f; + for (size_t i = 0; i < _n; ++i) { + result += utils::cast(a_ptr[i * _a_stride]) * utils::cast(b_ptr[i * _b_stride]); + } + *reinterpret_cast(y) = utils::cast(result); + break; + } + case INFINI_DTYPE_BF16: { + const bf16_t *a_ptr = reinterpret_cast(a); + const bf16_t *b_ptr = reinterpret_cast(b); + float result = 0.0f; + for (size_t i = 0; i < _n; ++i) { + result += utils::cast(a_ptr[i * _a_stride]) * utils::cast(b_ptr[i * _b_stride]); + } + *reinterpret_cast(y) = utils::cast(result); + break; + } + case INFINI_DTYPE_F32: { + const float *a_ptr = reinterpret_cast(a); + const float *b_ptr = reinterpret_cast(b); + float result = 0.0f; + for (size_t i = 0; i < _n; ++i) { + result += a_ptr[i * 
_a_stride] * b_ptr[i * _b_stride]; + } + *reinterpret_cast(y) = result; + break; + } + case INFINI_DTYPE_F64: { + const double *a_ptr = reinterpret_cast(a); + const double *b_ptr = reinterpret_cast(b); + double result = 0.0; + for (size_t i = 0; i < _n; ++i) { + result += a_ptr[i * _a_stride] * b_ptr[i * _b_stride]; + } + *reinterpret_cast(y) = result; + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::cpu diff --git a/src/infiniop/ops/dot/cpu/dot_cpu.h b/src/infiniop/ops/dot/cpu/dot_cpu.h new file mode 100644 index 000000000..5c8bbc113 --- /dev/null +++ b/src/infiniop/ops/dot/cpu/dot_cpu.h @@ -0,0 +1,49 @@ +#ifndef __DOT_CPU_H__ +#define __DOT_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include + +namespace op::dot::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::cpu + +#endif // __DOT_CPU_H__ diff --git a/src/infiniop/ops/dot/cuda/kernel.cuh b/src/infiniop/ops/dot/cuda/kernel.cuh new file mode 100644 index 000000000..0c4b1249d --- /dev/null +++ b/src/infiniop/ops/dot/cuda/kernel.cuh @@ 
-0,0 +1,40 @@ +#pragma once +#include "../../../reduce/cuda/reduce.cuh" +#include +#include +#include +#include + +namespace op::cuda { + +// Dot product kernel: computes dot(a, b) = sum(a * b) +template +__global__ void dot_kernel( + Tcompute *result, + const Tdata *a, + const Tdata *b, + size_t n, + ptrdiff_t a_stride, + ptrdiff_t b_stride) { + + Tcompute sum = 0; + + // Each thread computes partial dot product + for (size_t i = threadIdx.x; i < n; i += BLOCK_SIZE) { + Tcompute a_val = Tcompute(a[i * a_stride]); + Tcompute b_val = Tcompute(b[i * b_stride]); + sum += a_val * b_val; + } + + // Use CUB block-level reduction + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(sum); + + // Write result (only thread 0, since we only launch 1 block) + if (threadIdx.x == 0) { + *result = block_sum; + } +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/dot/metax/dot_metax.h b/src/infiniop/ops/dot/metax/dot_metax.h new file mode 100644 index 000000000..f7b4c0e21 --- /dev/null +++ b/src/infiniop/ops/dot/metax/dot_metax.h @@ -0,0 +1,47 @@ +#ifndef __DOT_METAX_H__ +#define __DOT_METAX_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/metax/metax_common.h" + +namespace op::dot::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() 
const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::metax + +#endif // __DOT_METAX_H__ diff --git a/src/infiniop/ops/dot/metax/dot_metax.maca b/src/infiniop/ops/dot/metax/dot_metax.maca new file mode 100644 index 000000000..79b12e8a3 --- /dev/null +++ b/src/infiniop/ops/dot/metax/dot_metax.maca @@ -0,0 +1,106 @@ +#include "dot_metax.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::dot::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 || y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + // Initialize result to zero + switch (_dtype) { + case INFINI_DTYPE_F16: { + float 
*result_f = nullptr; + CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); + CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + *reinterpret_cast(y) = __float2half(result_val); + CHECK_METAX(hcFree(result_f)); + break; + } + case INFINI_DTYPE_BF16: { + float *result_f = nullptr; + CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); + CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + *reinterpret_cast(y) = __float2bfloat16_rn(result_val); + CHECK_METAX(hcFree(result_f)); + break; + } + case INFINI_DTYPE_F32: { + float *result_f = reinterpret_cast(y); + CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *result_d = reinterpret_cast(y); + CHECK_METAX(hcMemsetAsync(result_d, 0, sizeof(double), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_d, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::metax diff --git a/src/infiniop/ops/dot/moore/dot_moore.h b/src/infiniop/ops/dot/moore/dot_moore.h new file mode 100644 index 000000000..eebc0e528 --- /dev/null 
+++ b/src/infiniop/ops/dot/moore/dot_moore.h @@ -0,0 +1,47 @@ +#ifndef __DOT_MOORE_H__ +#define __DOT_MOORE_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/moore/moore_common.h" + +namespace op::dot::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::moore + +#endif // __DOT_MOORE_H__ diff --git a/src/infiniop/ops/dot/moore/dot_moore.mu b/src/infiniop/ops/dot/moore/dot_moore.mu new file mode 100644 index 000000000..7b9e9ec77 --- /dev/null +++ b/src/infiniop/ops/dot/moore/dot_moore.mu @@ -0,0 +1,106 @@ +#include "dot_moore.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::dot::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 
|| y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + // Initialize result to zero + switch (_dtype) { + case INFINI_DTYPE_F16: { + float *result_f = nullptr; + CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); + CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); + *reinterpret_cast(y) = __float2half(result_val); + CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_BF16: { + float *result_f = nullptr; + CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); + 
CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); + *reinterpret_cast(y) = __float2bfloat16_rn(result_val); + CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_F32: { + float *result_f = reinterpret_cast(y); + CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *result_d = reinterpret_cast(y); + CHECK_MOORE(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_d, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::moore diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cu b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu new file mode 100644 index 000000000..464e0a15e --- /dev/null +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu @@ -0,0 +1,121 @@ +#include "dot_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../utils.h" + +namespace op::dot::nvidia { + +__global__ void store_half_from_f32(half *dst, const float *src) { + if (threadIdx.x == 0) { + dst[0] = __float2half(src[0]); + } +} + +__global__ void store_bf16_from_f32(cuda_bfloat16 *dst, const float *src) { + if (threadIdx.x == 0) { + dst[0] = __float2bfloat16_rn(src[0]); + } +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // This op does not do implicit dtype 
conversion: y/a/b must match. + if (b_desc->dtype() != dtype || y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 || y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + // Negative/broadcasted strides are not supported without an explicit base offset. + if (a_stride <= 0 || b_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + // Initialize result to zero + switch (_dtype) { + case INFINI_DTYPE_F16: { + float *result_f = nullptr; + CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + store_half_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); + CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_BF16: { + float *result_f = nullptr; + CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + 
cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + store_bf16_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); + CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_F32: { + float *result_f = reinterpret_cast(y); + CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *result_d = reinterpret_cast(y); + CHECK_CUDA(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_d, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::nvidia diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh new file mode 100644 index 000000000..76d22aec3 --- /dev/null +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh @@ -0,0 +1,48 @@ +#ifndef __DOT_NVIDIA_H__ +#define __DOT_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include + +namespace op::dot::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + 
infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::nvidia + +#endif // __DOT_NVIDIA_H__ diff --git a/src/infiniop/ops/dot/operator.cc b/src/infiniop/ops/dot/operator.cc new file mode 100644 index 000000000..b0f8c6be9 --- /dev/null +++ b/src/infiniop/ops/dot/operator.cc @@ -0,0 +1,160 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/dot.h" + +#ifdef ENABLE_CPU_API +#include "cpu/dot_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/dot_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/dot_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/dot_moore.h" +#endif + +__C infiniStatus_t infiniopCreateDotDescriptor( + infiniopHandle_t handle, + infiniopDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::dot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + a_desc, \ + b_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return 
INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDot( + infiniopDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, a, b, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff 
--git a/src/infiniop/ops/histc/cpu/histc_cpu.cc b/src/infiniop/ops/histc/cpu/histc_cpu.cc new file mode 100644 index 000000000..d10e560f8 --- /dev/null +++ b/src/infiniop/ops/histc/cpu/histc_cpu.cc @@ -0,0 +1,150 @@ +#include "histc_cpu.h" +#include "../../../../utils.h" +#include +#include +#include + +namespace op::histc::cpu { + +utils::Result HistcInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + if (min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + // Input should be 1D + if (x_shape.size() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Output should be 1D with bins elements + if (y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + HistcInfo info; + info.input_size = x_shape[0]; + info.bins = bins; + info.min_val = min_val; + info.max_val = max_val; + info.input_stride = x_desc->strides()[0]; + info.output_stride = y_desc->strides()[0]; + + // This implementation assumes x points to the first logical element and uses linear indexing. + // Negative (or broadcasted) strides would require an explicit base offset. + if (info.input_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + // Writing a histogram into a broadcasted or negatively strided output is undefined. 
+ if (info.output_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Histc output is always float32. + if (y_desc->dtype() != INFINI_DTYPE_F32) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto info_result = HistcInfo::create(x_desc, y_desc, bins, min_val, max_val); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void histc_impl( + const HistcInfo &info, + float *y, + const T *x) { + + // Initialize output to zero (supports non-unit stride). 
+ for (int64_t b = 0; b < info.bins; ++b) { + y[b * info.output_stride] = 0.0f; + } + + const double bin_width = (info.max_val - info.min_val) / static_cast(info.bins); + + for (size_t i = 0; i < info.input_size; ++i) { + double val = utils::cast(x[i * info.input_stride]); + + // Skip values outside range + if (val < info.min_val || val > info.max_val) { + continue; + } + + // Calculate bin index + int64_t bin_idx = static_cast((val - info.min_val) / bin_width); + + // Handle edge case: max_val should go to last bin + if (bin_idx >= info.bins) { + bin_idx = info.bins - 1; + } + if (bin_idx < 0) { + bin_idx = 0; + } + + y[bin_idx * info.output_stride] += 1.0f; + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + float *y_ptr = reinterpret_cast(y); + + switch (_dtype) { + case INFINI_DTYPE_F16: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::cpu diff --git a/src/infiniop/ops/histc/cpu/histc_cpu.h b/src/infiniop/ops/histc/cpu/histc_cpu.h new file mode 100644 index 000000000..673e8ff4b --- /dev/null +++ b/src/infiniop/ops/histc/cpu/histc_cpu.h @@ -0,0 +1,60 @@ +#ifndef __HISTC_CPU_H__ +#define __HISTC_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::histc::cpu { + +struct HistcInfo { + size_t input_size; + int64_t bins; + double min_val; + double max_val; + ptrdiff_t input_stride; + ptrdiff_t output_stride; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + 
int64_t bins, + double min_val, + double max_val); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + HistcInfo _info; + + Descriptor(infiniDtype_t dtype, HistcInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::cpu + +#endif // __HISTC_CPU_H__ diff --git a/src/infiniop/ops/histc/cuda/kernel.cuh b/src/infiniop/ops/histc/cuda/kernel.cuh new file mode 100644 index 000000000..ee6dd4f4b --- /dev/null +++ b/src/infiniop/ops/histc/cuda/kernel.cuh @@ -0,0 +1,49 @@ +#pragma once +#include +#include +#include +#include + +namespace op::cuda { + +template +__global__ void histc_kernel( + float *hist, + const T *input, + size_t input_size, + ptrdiff_t input_stride, + int64_t bins, + double min_val, + double max_val) { + + size_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + + static_cast(threadIdx.x); + size_t stride = static_cast(blockDim.x) * static_cast(gridDim.x); + size_t input_stride_u = static_cast(input_stride); + + double bin_width = (max_val - min_val) / static_cast(bins); + + for (size_t i = idx; i < input_size; i += stride) { + double val = static_cast(input[i * input_stride_u]); + + // Skip values outside range + if (val < min_val || val > max_val) { + continue; + } + + // Calculate bin index + int64_t bin_idx = static_cast((val - min_val) / bin_width); + + // Handle edge case: max_val should go to last bin + if (bin_idx >= bins) { + bin_idx = bins - 1; + } + if (bin_idx < 0) { + 
bin_idx = 0; + } + + atomicAdd(&hist[bin_idx], 1.0f); + } +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/histc/metax/histc_metax.h b/src/infiniop/ops/histc/metax/histc_metax.h new file mode 100644 index 000000000..7ba259939 --- /dev/null +++ b/src/infiniop/ops/histc/metax/histc_metax.h @@ -0,0 +1,53 @@ +#ifndef __HISTC_METAX_H__ +#define __HISTC_METAX_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/metax/metax_common.h" + +namespace op::histc::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _input_size; + int64_t _bins; + double _min_val; + double _max_val; + ptrdiff_t _input_stride; + + Descriptor(infiniDtype_t dtype, size_t input_size, int64_t bins, + double min_val, double max_val, ptrdiff_t input_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_size(input_size), + _bins(bins), + _min_val(min_val), + _max_val(max_val), + _input_stride(input_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::metax + +#endif // __HISTC_METAX_H__ diff --git a/src/infiniop/ops/histc/metax/histc_metax.maca b/src/infiniop/ops/histc/metax/histc_metax.maca new file mode 100644 index 000000000..a72f0d980 --- /dev/null +++ b/src/infiniop/ops/histc/metax/histc_metax.maca @@ -0,0 +1,105 @@ +#include "histc_metax.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::histc::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0 || min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 1 || y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t input_size = x_shape[0]; + ptrdiff_t input_stride = x_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, input_size, bins, min_val, max_val, + input_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + + // Initialize output to zero + CHECK_METAX(hcMemsetAsync(y, 0, _bins * sizeof(float), hc_stream)); + + constexpr int BLOCK_SIZE = 256; + int num_blocks = (_input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_BF16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F32: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F64: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + 
return INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::metax diff --git a/src/infiniop/ops/histc/moore/histc_moore.h b/src/infiniop/ops/histc/moore/histc_moore.h new file mode 100644 index 000000000..877fb5bf3 --- /dev/null +++ b/src/infiniop/ops/histc/moore/histc_moore.h @@ -0,0 +1,53 @@ +#ifndef __HISTC_MOORE_H__ +#define __HISTC_MOORE_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/moore/moore_common.h" + +namespace op::histc::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _input_size; + int64_t _bins; + double _min_val; + double _max_val; + ptrdiff_t _input_stride; + + Descriptor(infiniDtype_t dtype, size_t input_size, int64_t bins, + double min_val, double max_val, ptrdiff_t input_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_size(input_size), + _bins(bins), + _min_val(min_val), + _max_val(max_val), + _input_stride(input_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::moore + +#endif // __HISTC_MOORE_H__ diff --git a/src/infiniop/ops/histc/moore/histc_moore.mu b/src/infiniop/ops/histc/moore/histc_moore.mu new file mode 100644 index 000000000..6b10b6007 --- /dev/null +++ b/src/infiniop/ops/histc/moore/histc_moore.mu @@ -0,0 +1,105 @@ +#include "histc_moore.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::histc::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + 
Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0 || min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 1 || y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t input_size = x_shape[0]; + ptrdiff_t input_stride = x_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, input_size, bins, min_val, max_val, + input_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + + // Initialize output to zero + CHECK_MOORE(musaMemsetAsync(y, 0, _bins * sizeof(float), musa_stream)); + + constexpr int BLOCK_SIZE = 256; + int num_blocks = (_input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_BF16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F32: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F64: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return 
INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::moore diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cu b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu new file mode 100644 index 000000000..943dae6f7 --- /dev/null +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu @@ -0,0 +1,119 @@ +#include "histc_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../utils.h" + +namespace op::histc::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0 || min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Histc output is always float32. This backend also requires a contiguous output. + if (y_desc->dtype() != INFINI_DTYPE_F32) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 1 || y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t input_size = x_shape[0]; + ptrdiff_t input_stride = x_desc->strides()[0]; + ptrdiff_t output_stride = y_desc->strides()[0]; + + // This implementation treats y as a contiguous `float*` buffer. + if (output_stride != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + // Negative (or broadcasted) strides are not supported by this kernel without an explicit base offset. 
+ if (input_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(dtype, input_size, bins, min_val, max_val, + input_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + // Initialize output to zero + CHECK_CUDA(cudaMemsetAsync(y, 0, _bins * sizeof(float), cuda_stream)); + + constexpr int BLOCK_SIZE = 256; + int num_blocks = (_input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_BF16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F32: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F64: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::nvidia diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh new file mode 100644 index 000000000..464f7ef03 --- /dev/null +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh @@ -0,0 +1,53 @@ +#ifndef __HISTC_NVIDIA_H__ +#define __HISTC_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/nvidia/nvidia_common.cuh" + +namespace op::histc::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + 
size_t _input_size; + int64_t _bins; + double _min_val; + double _max_val; + ptrdiff_t _input_stride; + + Descriptor(infiniDtype_t dtype, size_t input_size, int64_t bins, + double min_val, double max_val, ptrdiff_t input_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_size(input_size), + _bins(bins), + _min_val(min_val), + _max_val(max_val), + _input_stride(input_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::nvidia + +#endif // __HISTC_NVIDIA_H__ diff --git a/src/infiniop/ops/histc/operator.cc b/src/infiniop/ops/histc/operator.cc new file mode 100644 index 000000000..acd205ee0 --- /dev/null +++ b/src/infiniop/ops/histc/operator.cc @@ -0,0 +1,163 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/histc.h" + +#ifdef ENABLE_CPU_API +#include "cpu/histc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/histc_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/histc_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/histc_moore.h" +#endif + +__C infiniStatus_t infiniopCreateHistcDescriptor( + infiniopHandle_t handle, + infiniopHistcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::histc::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + bins, \ + min_val, \ + max_val) + + switch 
(handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHistc( + infiniopHistcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.cc b/src/infiniop/ops/log10/cpu/log10_cpu.cc new file mode 100644 index 000000000..3148014dd --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.cc @@ -0,0 +1,53 @@ +#include "log10_cpu.h" + +namespace op::log10::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, 
stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log10::cpu \ No newline at end of file diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.h b/src/infiniop/ops/log10/cpu/log10_cpu.h new file mode 100644 index 000000000..6cc22a1e3 --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.h @@ -0,0 +1,20 @@ +#ifndef __LOG10_CPU_H__ +#define __LOG10_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(log10, cpu) + +namespace op::log10::cpu { +typedef struct Log10Op { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::log10(x); + } +} Log10Op; +} // namespace op::log10::cpu + +#endif // __LOG10_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/log10/cuda/kernel.cuh b/src/infiniop/ops/log10/cuda/kernel.cuh new file mode 100644 index 000000000..f02b7f98e --- /dev/null +++ b/src/infiniop/ops/log10/cuda/kernel.cuh @@ -0,0 +1,31 @@ +#pragma once +#include // 包含 log10f, log10, log, logf 等 +#include +#include +#include +#include +#include + +namespace op::cuda { + +// 移除 high_precision_log10f 避免混淆,让 Log10Op 直接实现逻辑。 + +struct Log10Op { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // For F32: compute via F64 for improved accuracy. + return (float)log10((double)x); + } else if constexpr (std::is_same_v) { + return log10(x); + } else { + // For F16/BF16: promote to float, compute, then cast back. 
+ return (T)(float)log10((double)(float)x); + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/log10/log10.h b/src/infiniop/ops/log10/log10.h new file mode 100644 index 000000000..917a1db1c --- /dev/null +++ b/src/infiniop/ops/log10/log10.h @@ -0,0 +1,8 @@ +#ifndef __LOG10_H__ +#define __LOG10_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(log10, NAMESPACE) + +#endif // __LOG10_H__ \ No newline at end of file diff --git a/src/infiniop/ops/log10/metax/log10_metax.h b/src/infiniop/ops/log10/metax/log10_metax.h new file mode 100644 index 000000000..a881d05b5 --- /dev/null +++ b/src/infiniop/ops/log10/metax/log10_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOG10_METAX_API_H__ +#define __LOG10_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(log10, metax) + +#endif // __LOG10_METAX_API_H__ diff --git a/src/infiniop/ops/log10/metax/log10_metax.maca b/src/infiniop/ops/log10/metax/log10_metax.maca new file mode 100644 index 000000000..03195354d --- /dev/null +++ b/src/infiniop/ops/log10/metax/log10_metax.maca @@ -0,0 +1,60 @@ +#include "log10_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::log10::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, 
input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log10Op, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log10Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log10Op, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log10Op, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log10::metax diff --git a/src/infiniop/ops/log10/moore/log10_moore.h b/src/infiniop/ops/log10/moore/log10_moore.h new file mode 100644 index 000000000..fe148a0f1 --- /dev/null +++ b/src/infiniop/ops/log10/moore/log10_moore.h @@ -0,0 +1,8 @@ +#ifndef __LOG10_MOORE_API_H__ +#define __LOG10_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(log10, moore) + +#endif // __LOG10_MOORE_API_H__ diff --git a/src/infiniop/ops/log10/moore/log10_moore.mu b/src/infiniop/ops/log10/moore/log10_moore.mu new file mode 100644 index 000000000..66884d7af --- /dev/null +++ b/src/infiniop/ops/log10/moore/log10_moore.mu @@ -0,0 +1,61 @@ +#include "log10_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "log10_moore_kernel.h" + +namespace op::log10::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = 
reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::Log10Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::Log10Op, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::Log10Op, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::Log10Op, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log10::moore diff --git a/src/infiniop/ops/log10/moore/log10_moore_kernel.h b/src/infiniop/ops/log10/moore/log10_moore_kernel.h new file mode 100644 index 000000000..636663fc0 --- /dev/null +++ b/src/infiniop/ops/log10/moore/log10_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __LOG10_MOORE_KERNEL_H__ +#define __LOG10_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::log10::moore { + +typedef struct Log10Op { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr 
(std::is_same_v) { + // half2 path: convert to float, compute elementwise, pack back + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(log10f(x0), log10f(x1)); + } else if constexpr (std::is_same_v) { + // FP16 path: convert to float for accuracy on MUSA backend + float xf = __half2float(x); + return __float2half(log10f(xf)); + } else if constexpr (std::is_same_v) { + // BF16 path: compute in FP32 then cast back + float xf = __bfloat162float(x); + return __float2bfloat16_rn(log10f(xf)); + } else if constexpr (std::is_same_v) { + return log10f(x); + } else { // double + return log10(x); + } + } +} Log10Op; + +} // namespace op::log10::moore + +#endif // __LOG10_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu new file mode 100644 index 000000000..03196e816 --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log10_nvidia.cuh" + +namespace op::log10::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + 
+ if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log10Op, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log10Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log10Op, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log10Op, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log10::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh new file mode 100644 index 000000000..62af5f2a6 --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG10_NVIDIA_H__ +#define __LOG10_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log10, nvidia) + +#endif // __LOG10_NVIDIA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/log10/operator.cc b/src/infiniop/ops/log10/operator.cc new file mode 100644 index 000000000..a5d1099bd --- /dev/null +++ b/src/infiniop/ops/log10/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log10.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log10_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/log10_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/log10_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/log10_moore.h" +#endif + +__C infiniStatus_t infiniopCreateLog10Descriptor( + infiniopHandle_t handle, + infiniopLog10Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::log10::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog10( + infiniopLog10Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.cc b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc new file mode 100644 index 000000000..ddbb64cca --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc @@ -0,0 +1,53 @@ +#include "log1p_cpu.h" + +namespace op::log1p::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t 
workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log1p::cpu diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.h b/src/infiniop/ops/log1p/cpu/log1p_cpu.h new file mode 100644 index 000000000..33dc89159 --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.h @@ -0,0 +1,20 @@ +#ifndef __LOG1P_CPU_H__ +#define __LOG1P_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(log1p, cpu) + +namespace op::log1p::cpu { +typedef struct Log1pOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::log1p(x); + } +} Log1pOp; +} // namespace op::log1p::cpu + +#endif // __LOG1P_CPU_H__ diff --git a/src/infiniop/ops/log1p/cuda/kernel.cuh b/src/infiniop/ops/log1p/cuda/kernel.cuh new file mode 100644 index 000000000..381bad957 --- /dev/null +++ b/src/infiniop/ops/log1p/cuda/kernel.cuh @@ -0,0 +1,29 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace op::cuda { + +struct Log1pOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // Use double precision for better accuracy. + return (float)log1p((double)x); + } else if constexpr (std::is_same_v) { + return log1p(x); + } else { + // For F16/BF16: promote to float, compute, then cast back. 
+ return (T)(float)log1p((double)(float)x); + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/log1p/log1p.h b/src/infiniop/ops/log1p/log1p.h new file mode 100644 index 000000000..8ba2b6a91 --- /dev/null +++ b/src/infiniop/ops/log1p/log1p.h @@ -0,0 +1,8 @@ +#ifndef __LOG1P_H__ +#define __LOG1P_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(log1p, NAMESPACE) + +#endif // __LOG1P_H__ diff --git a/src/infiniop/ops/log1p/metax/log1p_metax.h b/src/infiniop/ops/log1p/metax/log1p_metax.h new file mode 100644 index 000000000..f0fad5913 --- /dev/null +++ b/src/infiniop/ops/log1p/metax/log1p_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOG1P_METAX_API_H__ +#define __LOG1P_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(log1p, metax) + +#endif // __LOG1P_METAX_API_H__ diff --git a/src/infiniop/ops/log1p/metax/log1p_metax.maca b/src/infiniop/ops/log1p/metax/log1p_metax.maca new file mode 100644 index 000000000..0ca2803d3 --- /dev/null +++ b/src/infiniop/ops/log1p/metax/log1p_metax.maca @@ -0,0 +1,59 @@ +#include "log1p_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::log1p::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log1pOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log1pOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log1pOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log1pOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +}} // namespace op::log1p::metax diff --git a/src/infiniop/ops/log1p/moore/log1p_moore.h b/src/infiniop/ops/log1p/moore/log1p_moore.h new file mode 100644 index 000000000..974cbf5b3 --- /dev/null +++ b/src/infiniop/ops/log1p/moore/log1p_moore.h @@ -0,0 +1,8 @@ +#ifndef __LOG1P_MOORE_API_H__ +#define __LOG1P_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(log1p, moore) + +#endif // __LOG1P_MOORE_API_H__ diff --git a/src/infiniop/ops/log1p/moore/log1p_moore.mu b/src/infiniop/ops/log1p/moore/log1p_moore.mu new file mode 100644 index 000000000..7eed0c0d4 --- /dev/null +++ b/src/infiniop/ops/log1p/moore/log1p_moore.mu @@ -0,0 +1,61 @@ +#include "log1p_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "log1p_moore_kernel.h" + +namespace op::log1p::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + 
auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::Log1pOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::Log1pOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::Log1pOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::Log1pOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log1p::moore diff --git a/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h new file mode 100644 index 000000000..ff42f0f5e --- /dev/null +++ b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __LOG1P_MOORE_KERNEL_H__ +#define __LOG1P_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::log1p::moore { + +typedef struct Log1pOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // half2 path: 
convert to float, compute elementwise, pack back + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(log1pf(x0), log1pf(x1)); + } else if constexpr (std::is_same_v) { + // FP16 path: convert to float for accuracy on MUSA backend + float xf = __half2float(x); + return __float2half(log1pf(xf)); + } else if constexpr (std::is_same_v) { + // BF16 path: compute in FP32 then cast back + float xf = __bfloat162float(x); + return __float2bfloat16_rn(log1pf(xf)); + } else if constexpr (std::is_same_v) { + return log1pf(x); + } else { // double + return log1p(x); + } + } +} Log1pOp; + +} // namespace op::log1p::moore + +#endif // __LOG1P_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu new file mode 100644 index 000000000..1190713d4 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log1p_nvidia.cuh" + +namespace op::log1p::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < 
_workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log1pOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log1pOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log1pOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log1pOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log1p::nvidia diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh new file mode 100644 index 000000000..ccef48746 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG1P_NVIDIA_H__ +#define __LOG1P_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log1p, nvidia) + +#endif // __LOG1P_NVIDIA_H__ diff --git a/src/infiniop/ops/log1p/operator.cc b/src/infiniop/ops/log1p/operator.cc new file mode 100644 index 000000000..a0efc1d1b --- /dev/null +++ b/src/infiniop/ops/log1p/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log1p.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log1p_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/log1p_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/log1p_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/log1p_moore.h" +#endif + +__C infiniStatus_t infiniopCreateLog1pDescriptor( + infiniopHandle_t handle, + infiniopLog1pDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + 
case CASE: \ + return op::log1p::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog1p( + infiniopLog1pDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/avg_pool3d.py b/test/infiniop/avg_pool3d.py new file mode 100644 index 000000000..2d76c1b97 --- /dev/null +++ b/test/infiniop/avg_pool3d.py @@ -0,0 +1,153 @@ +import torch +import ctypes +from ctypes import c_uint64, c_size_t, c_void_p +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +# Test cases format: (x_shape, x_stride_or_None, kernel_size, stride_or_None, padding) +_TEST_CASES = [ + ((1, 2, 8, 8, 8), None, (2, 2, 2), None, (0, 0, 0)), + ((2, 3, 7, 9, 5), (756, 252, 36, 4, 1), (3, 3, 3), (2, 2, 1), (1, 1, 0)), + ((2, 1, 9, 11, 7), (693, 77, 77, 7, 1), (3, 2, 3), None, (1, 0, 1)), +] + +_TENSOR_DTYPES = [InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False 
+NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_avg_pool3d(x, kernel_size, stride, padding): + kwargs = {"kernel_size": kernel_size, "padding": padding} + if stride is not None: + kwargs["stride"] = stride + return torch.nn.functional.avg_pool3d(x, **kwargs) + + +def test( + handle, + device, + x_shape, + x_stride, + kernel_size, + stride, + padding, + dtype=torch.float32, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor(x_shape, x_stride, dtype, device) + # For arbitrary (possibly overlapping) strides, the operator reads memory + # according to (shape, strides) from the backing storage. Use actual_tensor + # (the strided view) as the reference input to match that behavior. + x_ref = x.actual_tensor() if x_stride is not None else x.torch_tensor() + y_ref = torch_avg_pool3d(x_ref, kernel_size, stride, padding) + y = TestTensor(tuple(y_ref.shape), None, dtype, device, mode="ones") + y.update_torch_tensor(y_ref) + + print( + f"Testing AvgPool3d on {InfiniDeviceNames[device]} with x_shape:{x_shape} x_stride:{x_stride} " + f"kernel_size:{kernel_size} stride:{stride} padding:{padding} dtype:{InfiniDtypeNames[dtype]}" + ) + + if sync is not None: + sync() + + ks_arr = (c_size_t * 3)(*kernel_size) + stride_ptr = None + if stride is not None: + s_arr = (c_size_t * 3)(*stride) + stride_ptr = ctypes.cast(s_arr, c_void_p) + pad_arr = (c_size_t * 3)(*padding) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPool3dDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ctypes.cast(ks_arr, c_void_p), + stride_ptr, + ctypes.cast(pad_arr, c_void_p), + ) + ) + + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPool3dWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_avg_pool3d(): + 
check_error( + LIBINFINIOP.infiniopAvgPool3d( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_avg_pool3d() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_avg_pool3d(x.torch_tensor(), kernel_size, stride, padding), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_avg_pool3d(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAvgPool3dDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/dot.py b/test/infiniop/dot.py new file mode 100644 index 000000000..346341ceb --- /dev/null +++ b/test/infiniop/dot.py @@ -0,0 +1,138 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, a_stride, b_stride + (3, None, None), + (8, (2,), (2,)), + (32, None, None), + (257, (3,), (3,)), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 5e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + n, + a_stride=None, + b_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + a = TestTensor((n,), a_stride, dtype, device) + b = TestTensor((n,), b_stride, dtype, device) + y = TestTensor((1,), None, dtype, device, mode="zeros") + + print( + f"Testing dot on {InfiniDeviceNames[device]} with n:{n} a_stride:{a_stride} b_stride:{b_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + y_ref = torch.dot(a.torch_tensor().reshape(-1), b.torch_tensor().reshape(-1)).reshape(1) + y.update_torch_tensor(y_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDotDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + for 
tensor in [a, b, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDotWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_dot(): + check_error( + LIBINFINIOP.infiniopDot( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_dot() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.dot(a.torch_tensor().reshape(-1), b.torch_tensor().reshape(-1)), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_dot(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyDotDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") + diff --git a/test/infiniop/histc.py b/test/infiniop/histc.py new file mode 100644 index 000000000..95327fdd1 --- /dev/null +++ b/test/infiniop/histc.py @@ -0,0 +1,153 @@ +import torch +import ctypes +from ctypes import c_uint64, c_int64, c_double +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # x_shape, x_stride, bins, min, max + ((100,), None, 10, 0.0, 1.0), + ((50,), None, 5, -1.0, 1.0), + ((20,), (2,), 8, 0.0, 2.0), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + # histc produces exact integer counts in float32 for these sizes + InfiniDtype.F16: {"atol": 0.0, "rtol": 0.0}, + InfiniDtype.BF16: {"atol": 0.0, "rtol": 0.0}, + InfiniDtype.F32: {"atol": 0.0, "rtol": 0.0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_histc(x, bins, min_val, max_val): + # torch.histc on CUDA does not support float16/bfloat16 directly. + return torch.histc(x.to(torch.float32), bins=bins, min=min_val, max=max_val) + + +def test( + handle, + device, + x_shape, + x_stride, + bins, + min_val, + max_val, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor(x_shape, x_stride, dtype, device) + # Make values fall into [min, max] and force edge hits. 
+ rng = max_val - min_val + x_tensor = x.torch_tensor() * rng + min_val + if x_tensor.numel() > 0: + flat = x_tensor.reshape(-1) + flat[0] = min_val + flat[-1] = max_val + x.set_tensor(x_tensor) + + y = TestTensor((bins,), None, InfiniDtype.F32, device, mode="zeros") + + print( + f"Testing Histc on {InfiniDeviceNames[device]} with x_shape:{x_shape} x_stride:{x_stride} " + f"bins:{bins} range:[{min_val},{max_val}] x_dtype:{InfiniDtypeNames[dtype]}" + ) + + y_ref = torch_histc(x.torch_tensor(), bins, min_val, max_val) + y.update_torch_tensor(y_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHistcDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + c_int64(bins), + c_double(min_val), + c_double(max_val), + ) + ) + + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHistcWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_histc(): + check_error( + LIBINFINIOP.infiniopHistc( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_histc() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: y_ref, device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_histc(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyHistcDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, 
_TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! \033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index cd3ad1b82..5d4e891de 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -11,7 +11,7 @@ POINTER, c_float, c_double, - c_uint64, + c_int64, c_bool, ) @@ -476,8 +476,171 @@ def logsoftmax_(lib): @OpRegister.operator -def conv_(lib): - pass +def log10_(lib): + lib.infiniopCreateLog10Descriptor.restype = c_int32 + lib.infiniopCreateLog10Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLog10WorkspaceSize.restype = c_int32 + lib.infiniopGetLog10WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLog10.restype = c_int32 + lib.infiniopLog10.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLog10Descriptor.restype = c_int32 + lib.infiniopDestroyLog10Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log1p_(lib): + lib.infiniopCreateLog1pDescriptor.restype = c_int32 + lib.infiniopCreateLog1pDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLog1pWorkspaceSize.restype = c_int32 + lib.infiniopGetLog1pWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLog1p.restype = c_int32 + lib.infiniopLog1p.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLog1pDescriptor.restype = c_int32 + lib.infiniopDestroyLog1pDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def histc_(lib): + 
lib.infiniopCreateHistcDescriptor.restype = c_int32 + lib.infiniopCreateHistcDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int64, + c_double, + c_double, + ] + + lib.infiniopGetHistcWorkspaceSize.restype = c_int32 + lib.infiniopGetHistcWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHistc.restype = c_int32 + lib.infiniopHistc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHistcDescriptor.restype = c_int32 + lib.infiniopDestroyHistcDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def dot_(lib): + lib.infiniopCreateDotDescriptor.restype = c_int32 + lib.infiniopCreateDotDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDotWorkspaceSize.restype = c_int32 + lib.infiniopGetDotWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDot.restype = c_int32 + lib.infiniopDot.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDotDescriptor.restype = c_int32 + lib.infiniopDestroyDotDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def avg_pool3d_(lib): + lib.infiniopCreateAvgPool3dDescriptor.restype = c_int32 + lib.infiniopCreateAvgPool3dDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, # kernel_size + c_void_p, # stride (nullable) + c_void_p, # padding (nullable) + ] + + lib.infiniopGetAvgPool3dWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPool3dWorkspaceSize.argtypes = [ + 
infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAvgPool3d.restype = c_int32 + lib.infiniopAvgPool3d.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAvgPool3dDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPool3dDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] @OpRegister.operator diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index b690e74d4..c0560dbde 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -188,8 +188,35 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): def update_torch_tensor(self, new_tensor: torch.Tensor): self._torch_tensor = new_tensor - def update_torch_tensor(self, new_tensor: torch.Tensor): - self._torch_tensor = new_tensor + def set_tensor(self, new_tensor: torch.Tensor): + """ + Replace the logical tensor values while preserving the descriptor's + declared shape/strides. This keeps reference (torch_tensor) and + backing storage (actual_tensor) consistent for strided tensors. + """ + t = new_tensor.to(to_torch_dtype(self.dt)).to(torch_device_map[self.device]) + + # The logical tensor used for reference computations follows the + # "torch view" shape derived from (shape, strides). 
+ torch_shape = [] + torch_strides = [] if self.strides is not None else None + for i in range(len(self.shape)): + if self.strides is not None and self.strides[i] == 0: + torch_shape.append(1) + torch_strides.append(1) + elif self.strides is not None and self.strides[i] != 0: + torch_shape.append(self.shape[i]) + torch_strides.append(self.strides[i]) + else: + torch_shape.append(self.shape[i]) + + assert list(t.shape) == torch_shape + + self._torch_tensor = t + if self.strides is not None: + self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) + else: + self._data_tensor = self._torch_tensor.clone() def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): diff --git a/test/infiniop/log10.py b/test/infiniop/log10.py new file mode 100644 index 000000000..4ba644d86 --- /dev/null +++ b/test/infiniop/log10.py @@ -0,0 +1,179 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, x_stride, y_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), (0, 1)), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (4, 0, 1)), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ((4, 4, 56320), None, None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options 
applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_log10(y, x): + # Input is preprocessed to be positive in the test harness. + torch.log10(x, out=y) + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_stride != y_stride: + return + y = x + else: + y = TestTensor(shape, y_stride, dtype, device, mode="ones") + + if y.is_broadcast(): + return + + print( + f"Testing Log10 on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Create positive input values for log10 operation + x_tensor = torch.abs(x.torch_tensor()) + 1e-6 # Add small epsilon to avoid log(0) + x.set_tensor(x_tensor) + + torch_log10(y.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLog10Descriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = 
c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLog10WorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log10(): + check_error( + LIBINFINIOP.infiniopLog10( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_log10() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_log10(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log10(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLog10Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/log1p.py b/test/infiniop/log1p.py new file mode 100644 index 000000000..cd98cc131 --- /dev/null +++ b/test/infiniop/log1p.py @@ -0,0 +1,162 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES_ = [ + # shape, x_stride, y_stride + ((2, 3), None, None), + ((1, 4, 8), (32, 8, 1), (32, 8, 1)), + ((3, 2, 5, 7), None, None), + ((1, 8, 9, 11), (792, 99, 11, 1), (792, 99, 11, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_log1p(y, x): + torch.log1p(x, out=y) + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor(shape, x_stride, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_stride != y_stride: + return + y = x + else: + y = TestTensor(shape, y_stride, dtype, device, mode="ones") + + if y.is_broadcast(): + return + + print( + f"Testing Log1p on {InfiniDeviceNames[device]} 
with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + torch_log1p(y.torch_tensor(), x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLog1pDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLog1pWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log1p(): + check_error( + LIBINFINIOP.infiniopLog1p( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_log1p() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_log1p(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log1p(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLog1pDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") From 02dc83eaddf3e40a98864754edd555b23482d705 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 9 Apr 2026 11:29:21 +0800 Subject: [PATCH 2/2] issue/1031 fix T1-1-2 --- include/infiniop/ops/avg_pool3d.h | 30 +++--- include/infiniop/ops/dot.h | 28 +++--- include/infiniop/ops/histc.h | 30 +++--- include/infiniop/ops/log10.h | 24 ++--- include/infiniop/ops/log1p.h | 24 ++--- src/infiniop/devices/metax/metax_common.h | 1 + src/infiniop/devices/metax/metax_ht2mc.h | 2 + .../devices/metax/metax_kernel_common.h | 1 + src/infiniop/devices/moore/moore_common.h | 3 + .../devices/moore/moore_kernel_common.h | 3 + .../ops/avg_pool3d/cpu/avg_pool3d_cpu.cc | 97 ++++++++----------- .../ops/avg_pool3d/cpu/avg_pool3d_cpu.h | 2 +- .../ops/avg_pool3d/metax/avg_pool3d_metax.h | 9 +- .../avg_pool3d/metax/avg_pool3d_metax.maca | 68 +++---------- .../ops/avg_pool3d/moore/avg_pool3d_moore.h | 4 +- .../ops/avg_pool3d/moore/avg_pool3d_moore.mu | 5 +- .../avg_pool3d/nvidia/avg_pool3d_nvidia.cu | 47 +++++---- .../avg_pool3d/nvidia/avg_pool3d_nvidia.cuh | 4 +- src/infiniop/ops/avg_pool3d/operator.cc | 42 ++++---- src/infiniop/ops/dot/cpu/dot_cpu.h | 4 +- src/infiniop/ops/dot/cuda/kernel.cuh | 19 ++-- src/infiniop/ops/dot/metax/dot_metax.h | 1 - src/infiniop/ops/dot/metax/dot_metax.maca | 84 ++++++++-------- src/infiniop/ops/dot/moore/dot_moore.h | 1 - src/infiniop/ops/dot/moore/dot_moore.mu | 85 ++++++++-------- src/infiniop/ops/dot/nvidia/dot_nvidia.cu | 76 +++++++-------- src/infiniop/ops/dot/nvidia/dot_nvidia.cuh | 2 +- src/infiniop/ops/dot/operator.cc | 30 +++--- src/infiniop/ops/histc/cpu/histc_cpu.h | 2 +- src/infiniop/ops/histc/cuda/kernel.cuh | 6 +- src/infiniop/ops/histc/metax/histc_metax.h | 1 - src/infiniop/ops/histc/metax/histc_metax.maca | 9 +- src/infiniop/ops/histc/moore/histc_moore.h | 1 - src/infiniop/ops/histc/moore/histc_moore.mu | 9 +- src/infiniop/ops/histc/nvidia/histc_nvidia.cu | 8 +- .../ops/histc/nvidia/histc_nvidia.cuh | 2 +- 
src/infiniop/ops/histc/operator.cc | 28 +++--- src/infiniop/ops/log10/cpu/log10_cpu.cc | 2 +- src/infiniop/ops/log10/cpu/log10_cpu.h | 2 +- src/infiniop/ops/log10/cuda/kernel.cuh | 16 ++- src/infiniop/ops/log10/log10.h | 2 +- .../ops/log10/moore/log10_moore_kernel.h | 4 +- src/infiniop/ops/log10/nvidia/log10_nvidia.cu | 2 +- .../ops/log10/nvidia/log10_nvidia.cuh | 2 +- src/infiniop/ops/log10/operator.cc | 10 +- src/infiniop/ops/log1p/cuda/kernel.cuh | 14 ++- src/infiniop/ops/log1p/metax/log1p_metax.maca | 1 + .../ops/log1p/moore/log1p_moore_kernel.h | 4 +- src/infiniop/ops/log1p/operator.cc | 8 +- test/infiniop/avg_pool3d.py | 2 +- 50 files changed, 390 insertions(+), 471 deletions(-) diff --git a/include/infiniop/ops/avg_pool3d.h b/include/infiniop/ops/avg_pool3d.h index 7170e7765..a54fbb263 100644 --- a/include/infiniop/ops/avg_pool3d.h +++ b/include/infiniop/ops/avg_pool3d.h @@ -5,23 +5,23 @@ typedef struct InfiniopDescriptor *infiniopAvgPool3dDescriptor_t; -__C __export infiniStatus_t infiniopCreateAvgPool3dDescriptor(infiniopHandle_t handle, - infiniopAvgPool3dDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - void *kernel_size, - void *stride, - void *padding); +__INFINI_C __export infiniStatus_t infiniopCreateAvgPool3dDescriptor(infiniopHandle_t handle, + infiniopAvgPool3dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + void *kernel_size, + void *stride, + void *padding); -__C __export infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAvgPool3d(infiniopAvgPool3dDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopAvgPool3d(infiniopAvgPool3dDescriptor_t desc, + void *workspace, + 
size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc); #endif diff --git a/include/infiniop/ops/dot.h b/include/infiniop/ops/dot.h index 7fa0d2659..1fb9197f9 100644 --- a/include/infiniop/ops/dot.h +++ b/include/infiniop/ops/dot.h @@ -5,22 +5,22 @@ typedef struct InfiniopDescriptor *infiniopDotDescriptor_t; -__C __export infiniStatus_t infiniopCreateDotDescriptor(infiniopHandle_t handle, - infiniopDotDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); +__INFINI_C __export infiniStatus_t infiniopCreateDotDescriptor(infiniopHandle_t handle, + infiniopDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); -__C __export infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopDot(infiniopDotDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *a, - const void *b, - void *stream); +__INFINI_C __export infiniStatus_t infiniopDot(infiniopDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream); -__C __export infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc); #endif diff --git a/include/infiniop/ops/histc.h b/include/infiniop/ops/histc.h index cd4695d4e..3e7232259 100644 --- a/include/infiniop/ops/histc.h +++ b/include/infiniop/ops/histc.h @@ -6,23 +6,23 @@ typedef struct InfiniopDescriptor *infiniopHistcDescriptor_t; -__C 
__export infiniStatus_t infiniopCreateHistcDescriptor(infiniopHandle_t handle, - infiniopHistcDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - int64_t bins, - double min_val, - double max_val); +__INFINI_C __export infiniStatus_t infiniopCreateHistcDescriptor(infiniopHandle_t handle, + infiniopHistcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t bins, + double min_val, + double max_val); -__C __export infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopHistc(infiniopHistcDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopHistc(infiniopHistcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc); #endif diff --git a/include/infiniop/ops/log10.h b/include/infiniop/ops/log10.h index 7c105dcc3..5ca8ccd8d 100644 --- a/include/infiniop/ops/log10.h +++ b/include/infiniop/ops/log10.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopLog10Descriptor_t; -__C __export infiniStatus_t infiniopCreateLog10Descriptor(infiniopHandle_t handle, - infiniopLog10Descriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateLog10Descriptor(infiniopHandle_t handle, + infiniopLog10Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size); +__INFINI_C 
__export infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopLog10(infiniopLog10Descriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopLog10(infiniopLog10Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc); #endif diff --git a/include/infiniop/ops/log1p.h b/include/infiniop/ops/log1p.h index ebd608e9d..93f229c7d 100644 --- a/include/infiniop/ops/log1p.h +++ b/include/infiniop/ops/log1p.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopLog1pDescriptor_t; -__C __export infiniStatus_t infiniopCreateLog1pDescriptor(infiniopHandle_t handle, - infiniopLog1pDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateLog1pDescriptor(infiniopHandle_t handle, + infiniopLog1pDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopLog1p(infiniopLog1pDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopLog1p(infiniopLog1pDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc); +__INFINI_C __export infiniStatus_t 
infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc); #endif diff --git a/src/infiniop/devices/metax/metax_common.h b/src/infiniop/devices/metax/metax_common.h index a29d44791..2aef6c786 100644 --- a/src/infiniop/devices/metax/metax_common.h +++ b/src/infiniop/devices/metax/metax_common.h @@ -1,3 +1,4 @@ +#pragma once #include "../../../utils.h" #include "../pool.h" #include "metax_handle.h" diff --git a/src/infiniop/devices/metax/metax_ht2mc.h b/src/infiniop/devices/metax/metax_ht2mc.h index 9846c3054..e8b388307 100644 --- a/src/infiniop/devices/metax/metax_ht2mc.h +++ b/src/infiniop/devices/metax/metax_ht2mc.h @@ -61,8 +61,10 @@ #define hcdnnTensorDescriptor_t mcdnnTensorDescriptor_t #define hcdnnStatus_t mcdnnStatus_t #define hcdnnSetTensor4dDescriptor mcdnnSetTensor4dDescriptor +#define hcdnnSetTensorNdDescriptor mcdnnSetTensorNdDescriptor #define hcdnnSetStream mcdnnSetStream #define hcdnnSetPooling2dDescriptor mcdnnSetPooling2dDescriptor +#define hcdnnSetPoolingNdDescriptor mcdnnSetPoolingNdDescriptor #define hcdnnPoolingMode_t mcdnnPoolingMode_t #define hcdnnPoolingForward mcdnnPoolingForward #define hcdnnPoolingDescriptor_t mcdnnPoolingDescriptor_t diff --git a/src/infiniop/devices/metax/metax_kernel_common.h b/src/infiniop/devices/metax/metax_kernel_common.h index b51209b3b..3d2b31a5e 100644 --- a/src/infiniop/devices/metax/metax_kernel_common.h +++ b/src/infiniop/devices/metax/metax_kernel_common.h @@ -1,3 +1,4 @@ +#pragma once #define INFINIOP_METAX_KERNEL __global__ void #ifdef ENABLE_METAX_MC_API diff --git a/src/infiniop/devices/moore/moore_common.h b/src/infiniop/devices/moore/moore_common.h index 725e230f5..c60677127 100644 --- a/src/infiniop/devices/moore/moore_common.h +++ b/src/infiniop/devices/moore/moore_common.h @@ -1,3 +1,5 @@ +#ifndef __INFINIOP_MOORE_COMMON_H__ +#define __INFINIOP_MOORE_COMMON_H__ #include "../../../utils.h" #include "../pool.h" #include "moore_handle.h" @@ -42,3 +44,4 @@ class Handle::Internal { }; } // namespace 
device::moore +#endif diff --git a/src/infiniop/devices/moore/moore_kernel_common.h b/src/infiniop/devices/moore/moore_kernel_common.h index 5e4ba6c0f..e121fddf1 100644 --- a/src/infiniop/devices/moore/moore_kernel_common.h +++ b/src/infiniop/devices/moore/moore_kernel_common.h @@ -1,3 +1,5 @@ +#ifndef __INFINIOP_MOORE_KERNEL_COMMON_H__ +#define __INFINIOP_MOORE_KERNEL_COMMON_H__ #define INFINIOP_MOORE_KERNEL __global__ void #include @@ -72,3 +74,4 @@ exp_(const __mt_bfloat16 x) { float f_result = expf(f_val); return __float2bfloat16(f_result); } +#endif diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc index c280a2d92..9553c17f5 100644 --- a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc @@ -70,9 +70,7 @@ utils::Result AvgPool3dInfo::create( } // Calculate output dimensions. Guard against unsigned underflow when kernel > input + 2*pad. - if (pad_d > (std::numeric_limits::max() - input_d) / 2 || - pad_h > (std::numeric_limits::max() - input_h) / 2 || - pad_w > (std::numeric_limits::max() - input_w) / 2) { + if (pad_d > (std::numeric_limits::max() - input_d) / 2 || pad_h > (std::numeric_limits::max() - input_h) / 2 || pad_w > (std::numeric_limits::max() - input_w) / 2) { return INFINI_STATUS_BAD_PARAM; } size_t effective_d = input_d + 2 * pad_d; @@ -87,8 +85,7 @@ utils::Result AvgPool3dInfo::create( size_t output_w = (effective_w - kernel_w) / stride_w + 1; // Verify output shape - if (y_shape[0] != batch || y_shape[1] != channels || - y_shape[2] != output_d || y_shape[3] != output_h || y_shape[4] != output_w) { + if (y_shape[0] != batch || y_shape[1] != channels || y_shape[2] != output_d || y_shape[3] != output_h || y_shape[4] != output_w) { return INFINI_STATUS_BAD_TENSOR_SHAPE; } @@ -168,59 +165,51 @@ void avg_pool3d_impl( using Tacc = std::conditional_t, double, float>; const Tacc inv_kernel_size = Tacc(1) / static_cast(kernel_size); 
-#pragma omp parallel for collapse(2) - for (ptrdiff_t b = 0; b < static_cast(info.batch); ++b) { - for (ptrdiff_t c = 0; c < static_cast(info.channels); ++c) { - for (size_t od = 0; od < info.output_d; ++od) { - for (size_t oh = 0; oh < info.output_h; ++oh) { - for (size_t ow = 0; ow < info.output_w; ++ow) { - Tacc sum = Tacc(0); - - // Calculate input window - ptrdiff_t id_start = - static_cast(od) * static_cast(info.stride_d) - - static_cast(info.pad_d); - ptrdiff_t ih_start = - static_cast(oh) * static_cast(info.stride_h) - - static_cast(info.pad_h); - ptrdiff_t iw_start = - static_cast(ow) * static_cast(info.stride_w) - - static_cast(info.pad_w); - - for (size_t kd = 0; kd < info.kernel_d; ++kd) { - for (size_t kh = 0; kh < info.kernel_h; ++kh) { - for (size_t kw = 0; kw < info.kernel_w; ++kw) { - ptrdiff_t id = id_start + static_cast(kd); - ptrdiff_t ih = ih_start + static_cast(kh); - ptrdiff_t iw = iw_start + static_cast(kw); - - // Check bounds (accounting for padding) - if (id >= 0 && id < static_cast(info.input_d) && - ih >= 0 && ih < static_cast(info.input_h) && - iw >= 0 && iw < static_cast(info.input_w)) { - size_t x_idx = b * info.input_strides[0] + - c * info.input_strides[1] + - static_cast(id) * info.input_strides[2] + - static_cast(ih) * info.input_strides[3] + - static_cast(iw) * info.input_strides[4]; - sum += utils::cast(x[x_idx]); - } - } - } - } - - size_t y_idx = b * info.output_strides[0] + - c * info.output_strides[1] + - od * info.output_strides[2] + - oh * info.output_strides[3] + - ow * info.output_strides[4]; - // Match torch.nn.functional.avg_pool3d default behavior (count_include_pad=True): - // padding contributes zeros but still counts in the divisor. 
- y[y_idx] = utils::cast(sum * inv_kernel_size); + size_t total = info.batch * info.channels * info.output_d * info.output_h * info.output_w; + +#pragma omp parallel for + for (ptrdiff_t idx = 0; idx < (ptrdiff_t)total; ++idx) { + + size_t tmp = idx; + + size_t ow = tmp % info.output_w; + tmp /= info.output_w; + size_t oh = tmp % info.output_h; + tmp /= info.output_h; + size_t od = tmp % info.output_d; + tmp /= info.output_d; + size_t c = tmp % info.channels; + tmp /= info.channels; + size_t b = tmp; + + using Tacc = std::conditional_t, double, float>; + Tacc sum = Tacc(0); + + ptrdiff_t id_start = (ptrdiff_t)od * (ptrdiff_t)info.stride_d - (ptrdiff_t)info.pad_d; + ptrdiff_t ih_start = (ptrdiff_t)oh * (ptrdiff_t)info.stride_h - (ptrdiff_t)info.pad_h; + ptrdiff_t iw_start = (ptrdiff_t)ow * (ptrdiff_t)info.stride_w - (ptrdiff_t)info.pad_w; + + for (size_t kd = 0; kd < info.kernel_d; ++kd) { + for (size_t kh = 0; kh < info.kernel_h; ++kh) { + for (size_t kw = 0; kw < info.kernel_w; ++kw) { + + ptrdiff_t id = id_start + kd; + ptrdiff_t ih = ih_start + kh; + ptrdiff_t iw = iw_start + kw; + + if (id >= 0 && id < (ptrdiff_t)info.input_d && ih >= 0 && ih < (ptrdiff_t)info.input_h && iw >= 0 && iw < (ptrdiff_t)info.input_w) { + + size_t x_idx = b * info.input_strides[0] + c * info.input_strides[1] + (size_t)id * info.input_strides[2] + (size_t)ih * info.input_strides[3] + (size_t)iw * info.input_strides[4]; + + sum += utils::cast(x[x_idx]); } } } } + + size_t y_idx = b * info.output_strides[0] + c * info.output_strides[1] + od * info.output_strides[2] + oh * info.output_strides[3] + ow * info.output_strides[4]; + + y[y_idx] = utils::cast(sum * inv_kernel_size); } } diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h index f01aeb5cf..57be33f9e 100644 --- a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h @@ -1,9 +1,9 @@ #ifndef __AVG_POOL3D_CPU_H__ #define 
__AVG_POOL3D_CPU_H__ +#include "../../../devices/cpu/common_cpu.h" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/cpu/common_cpu.h" #include namespace op::avg_pool3d::cpu { diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h index b4277393e..fdc838868 100644 --- a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h +++ b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h @@ -3,14 +3,7 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_handle.h" - -#ifdef ENABLE_METAX_MC_API -#include -#else -#include -#endif +#include namespace op::avg_pool3d::metax { diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca index c7955812a..fdd2f7d15 100644 --- a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca +++ b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca @@ -1,34 +1,30 @@ -#include "avg_pool3d_metax.h" #include "../../../../utils.h" +#include "../../../devices/metax/metax_common.h" +#include "avg_pool3d_metax.h" namespace op::avg_pool3d::metax { struct Descriptor::Opaque { std::shared_ptr internal; -#ifdef ENABLE_METAX_MC_API - mcdnnTensorDescriptor_t x_desc = nullptr; - mcdnnTensorDescriptor_t y_desc = nullptr; - mcdnnPoolingDescriptor_t pool_desc = nullptr; -#else hcdnnTensorDescriptor_t x_desc = nullptr; hcdnnTensorDescriptor_t y_desc = nullptr; hcdnnPoolingDescriptor_t pool_desc = nullptr; -#endif + size_t workspace_size = 0; Opaque(std::shared_ptr internal_ptr) : internal(internal_ptr) {} ~Opaque() { -#ifdef ENABLE_METAX_MC_API - if (x_desc) mcdnnDestroyTensorDescriptor(x_desc); - if (y_desc) mcdnnDestroyTensorDescriptor(y_desc); - if (pool_desc) mcdnnDestroyPoolingDescriptor(pool_desc); -#else - if (x_desc) hcdnnDestroyTensorDescriptor(x_desc); - if (y_desc) 
hcdnnDestroyTensorDescriptor(y_desc); - if (pool_desc) hcdnnDestroyPoolingDescriptor(pool_desc); -#endif + if (x_desc) { + hcdnnDestroyTensorDescriptor(x_desc); + } + if (y_desc) { + hcdnnDestroyTensorDescriptor(y_desc); + } + if (pool_desc) { + hcdnnDestroyPoolingDescriptor(pool_desc); + } } }; @@ -88,15 +84,9 @@ infiniStatus_t Descriptor::create( auto opaque = std::make_unique(metax_handle->internal()); // Create hcdnn descriptors -#ifdef ENABLE_METAX_MC_API - CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->x_desc)); - CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->y_desc)); - CHECK_MCDNN(mcdnnCreatePoolingDescriptor(&opaque->pool_desc)); -#else CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->x_desc)); CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->y_desc)); CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&opaque->pool_desc)); -#endif // Set tensor descriptors int n = static_cast(x_shape[0]); @@ -114,8 +104,7 @@ infiniStatus_t Descriptor::create( static_cast(d * h * w), static_cast(h * w), static_cast(w), - 1 - }; + 1}; int output_dims[5] = {n, c, out_d, out_h, out_w}; int output_strides[5] = { @@ -123,27 +112,9 @@ infiniStatus_t Descriptor::create( static_cast(out_d * out_h * out_w), static_cast(out_h * out_w), static_cast(out_w), - 1 - }; + 1}; hcdnnDataType_t hcdnn_dtype = device::metax::getHcdnnDtype(dtype); -#ifdef ENABLE_METAX_MC_API - CHECK_MCDNN(mcdnnSetTensorNdDescriptor( - opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); - CHECK_MCDNN(mcdnnSetTensorNdDescriptor( - opaque->y_desc, hcdnn_dtype, 5, output_dims, output_strides)); - - // Set pooling descriptor - int window_dims[3] = {static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; - int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; - int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), static_cast(stride_w)}; - - CHECK_MCDNN(mcdnnSetPoolingNdDescriptor( - opaque->pool_desc, - MCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, - 
MCDNN_NOT_PROPAGATE_NAN, - 3, window_dims, padding_dims, stride_dims)); -#else CHECK_MCDNN(hcdnnSetTensorNdDescriptor( opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); CHECK_MCDNN(hcdnnSetTensorNdDescriptor( @@ -159,7 +130,6 @@ infiniStatus_t Descriptor::create( HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, HCDNN_NOT_PROPAGATE_NAN, 3, window_dims, padding_dims, stride_dims)); -#endif *desc_ptr = new Descriptor(dtype, std::move(opaque), handle->device, handle->device_id); return INFINI_STATUS_SUCCESS; @@ -194,15 +164,6 @@ infiniStatus_t Descriptor::calculate( beta = &beta_val; } -#ifdef ENABLE_METAX_MC_API - CHECK_MCDNN(mcdnnPoolingForward( - hcdnn_handle, - _opaque->pool_desc, - alpha, - _opaque->x_desc, x, - beta, - _opaque->y_desc, y)); -#else CHECK_MCDNN(hcdnnPoolingForward( hcdnn_handle, _opaque->pool_desc, @@ -210,7 +171,6 @@ infiniStatus_t Descriptor::calculate( _opaque->x_desc, x, beta, _opaque->y_desc, y)); -#endif return INFINI_STATUS_SUCCESS; }); diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h index 526d36543..4ee472b28 100644 --- a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h @@ -3,9 +3,7 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_handle.h" -#include +#include namespace op::avg_pool3d::moore { diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu index 5e1cfd4a6..a538bb92c 100644 --- a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu @@ -1,7 +1,6 @@ -#include "avg_pool3d_moore.h" #include "../../../../utils.h" -// MOORE uses CUDA-compatible API, so we can reuse NVIDIA implementation -// by including the NVIDIA source and adapting stream types +#include 
"../../../devices/moore/moore_common.h" +#include "avg_pool3d_moore.h" namespace op::avg_pool3d::moore { diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu index ce78f7bd2..8bee6c14a 100644 --- a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu @@ -1,5 +1,5 @@ -#include "avg_pool3d_nvidia.cuh" #include "../../../../utils.h" +#include "avg_pool3d_nvidia.cuh" #include #include @@ -16,9 +16,15 @@ struct Descriptor::Opaque { : internal(internal_ptr) {} ~Opaque() { - if (x_desc) cudnnDestroyTensorDescriptor(x_desc); - if (y_desc) cudnnDestroyTensorDescriptor(y_desc); - if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc); + if (x_desc) { + cudnnDestroyTensorDescriptor(x_desc); + } + if (y_desc) { + cudnnDestroyTensorDescriptor(y_desc); + } + if (pool_desc) { + cudnnDestroyPoolingDescriptor(pool_desc); + } } }; @@ -162,23 +168,22 @@ infiniStatus_t Descriptor::calculate( auto cuda_stream = reinterpret_cast(stream); return _opaque->internal->useCudnn(cuda_stream, [&](cudnnHandle_t cudnn_handle) { - - const void *alpha = nullptr; - const void *beta = nullptr; - if (_dtype == INFINI_DTYPE_F32) { - static const float alpha_val = 1.0f, beta_val = 0.0f; - alpha = &alpha_val; - beta = &beta_val; - } else if (_dtype == INFINI_DTYPE_F64) { - static const double alpha_val = 1.0, beta_val = 0.0; - alpha = &alpha_val; - beta = &beta_val; - } else { - // For F16/BF16, use float alpha/beta - static const float alpha_val = 1.0f, beta_val = 0.0f; - alpha = &alpha_val; - beta = &beta_val; - } + const void *alpha = nullptr; + const void *beta = nullptr; + if (_dtype == INFINI_DTYPE_F32) { + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } else if (_dtype == INFINI_DTYPE_F64) { + static const double alpha_val = 1.0, beta_val = 0.0; + alpha = &alpha_val; + beta = &beta_val; + } else { + // For 
F16/BF16, use float alpha/beta + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } CHECK_CUDNN(cudnnPoolingForward( cudnn_handle, diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh index 9ace65e7b..b2a1801cb 100644 --- a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh @@ -1,10 +1,10 @@ #ifndef __AVG_POOL3D_NVIDIA_H__ #define __AVG_POOL3D_NVIDIA_H__ -#include "../../../operator.h" -#include "../../../tensor.h" #include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../operator.h" +#include "../../../tensor.h" #include namespace op::avg_pool3d::nvidia { diff --git a/src/infiniop/ops/avg_pool3d/operator.cc b/src/infiniop/ops/avg_pool3d/operator.cc index f9e01b991..df58e1282 100644 --- a/src/infiniop/ops/avg_pool3d/operator.cc +++ b/src/infiniop/ops/avg_pool3d/operator.cc @@ -15,7 +15,7 @@ #include "moore/avg_pool3d_moore.h" #endif -__C infiniStatus_t infiniopCreateAvgPool3dDescriptor( +__INFINI_C infiniStatus_t infiniopCreateAvgPool3dDescriptor( infiniopHandle_t handle, infiniopAvgPool3dDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -24,15 +24,15 @@ __C infiniStatus_t infiniopCreateAvgPool3dDescriptor( void *stride, void *padding) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::avg_pool3d::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ - kernel_size, \ - stride, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::avg_pool3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + kernel_size, \ + stride, \ padding) switch (handle->device) { @@ -60,10 +60,10 @@ __C infiniStatus_t infiniopCreateAvgPool3dDescriptor( #undef CREATE } -__C infiniStatus_t 
infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -91,7 +91,7 @@ __C infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopAvgPool3d( +__INFINI_C infiniStatus_t infiniopAvgPool3d( infiniopAvgPool3dDescriptor_t desc, void *workspace, size_t workspace_size, @@ -99,9 +99,9 @@ __C infiniStatus_t infiniopAvgPool3d( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, x, stream) switch (desc->device_type) { @@ -129,12 +129,12 @@ __C infiniStatus_t infiniopAvgPool3d( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { diff --git a/src/infiniop/ops/dot/cpu/dot_cpu.h b/src/infiniop/ops/dot/cpu/dot_cpu.h index 5c8bbc113..95a7a4d12 100644 --- a/src/infiniop/ops/dot/cpu/dot_cpu.h +++ b/src/infiniop/ops/dot/cpu/dot_cpu.h @@ -1,10 +1,10 @@ #ifndef __DOT_CPU_H__ #define __DOT_CPU_H__ -#include "../../../operator.h" -#include "../../../tensor.h" #include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" #include "../../../reduce/cpu/reduce.h" +#include "../../../tensor.h" #include namespace op::dot::cpu { diff --git 
a/src/infiniop/ops/dot/cuda/kernel.cuh b/src/infiniop/ops/dot/cuda/kernel.cuh index 0c4b1249d..e044bf523 100644 --- a/src/infiniop/ops/dot/cuda/kernel.cuh +++ b/src/infiniop/ops/dot/cuda/kernel.cuh @@ -1,16 +1,13 @@ #pragma once + #include "../../../reduce/cuda/reduce.cuh" -#include -#include -#include #include namespace op::cuda { -// Dot product kernel: computes dot(a, b) = sum(a * b) -template +template __global__ void dot_kernel( - Tcompute *result, + Tout *result, const Tdata *a, const Tdata *b, size_t n, @@ -19,21 +16,17 @@ __global__ void dot_kernel( Tcompute sum = 0; - // Each thread computes partial dot product for (size_t i = threadIdx.x; i < n; i += BLOCK_SIZE) { - Tcompute a_val = Tcompute(a[i * a_stride]); - Tcompute b_val = Tcompute(b[i * b_stride]); - sum += a_val * b_val; + sum += Tcompute(a[i * a_stride]) * Tcompute(b[i * b_stride]); } - // Use CUB block-level reduction using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(sum); - // Write result (only thread 0, since we only launch 1 block) if (threadIdx.x == 0) { - *result = block_sum; + result[0] = static_cast(block_sum); } } diff --git a/src/infiniop/ops/dot/metax/dot_metax.h b/src/infiniop/ops/dot/metax/dot_metax.h index f7b4c0e21..c9e49f636 100644 --- a/src/infiniop/ops/dot/metax/dot_metax.h +++ b/src/infiniop/ops/dot/metax/dot_metax.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/metax/metax_common.h" namespace op::dot::metax { diff --git a/src/infiniop/ops/dot/metax/dot_metax.maca b/src/infiniop/ops/dot/metax/dot_metax.maca index 79b12e8a3..38c99536f 100644 --- a/src/infiniop/ops/dot/metax/dot_metax.maca +++ b/src/infiniop/ops/dot/metax/dot_metax.maca @@ -1,8 +1,9 @@ -#include "dot_metax.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/metax/metax_common.h" +#include 
"../../../devices/metax/metax_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "dot_metax.h" namespace op::dot::metax { @@ -50,56 +51,47 @@ infiniStatus_t Descriptor::calculate( auto hc_stream = reinterpret_cast(stream); constexpr unsigned int BLOCK_SIZE = 256; - // Initialize result to zero switch (_dtype) { - case INFINI_DTYPE_F16: { - float *result_f = nullptr; - CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); - CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); - CHECK_METAX(hcStreamSynchronize(hc_stream)); - *reinterpret_cast(y) = __float2half(result_val); - CHECK_METAX(hcFree(result_f)); + + case INFINI_DTYPE_F32: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_BF16: { - float *result_f = nullptr; - CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); - CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); - CHECK_METAX(hcStreamSynchronize(hc_stream)); - *reinterpret_cast(y) = __float2bfloat16_rn(result_val); - CHECK_METAX(hcFree(result_f)); + + case INFINI_DTYPE_F64: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F32: { - float *result_f = reinterpret_cast(y); - CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); - 
cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_F16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F64: { - double *result_d = reinterpret_cast(y); - CHECK_METAX(hcMemsetAsync(result_d, 0, sizeof(double), hc_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_d, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_BF16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } - return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/dot/moore/dot_moore.h b/src/infiniop/ops/dot/moore/dot_moore.h index eebc0e528..b11178e15 100644 --- a/src/infiniop/ops/dot/moore/dot_moore.h +++ b/src/infiniop/ops/dot/moore/dot_moore.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/moore/moore_common.h" namespace op::dot::moore { diff --git a/src/infiniop/ops/dot/moore/dot_moore.mu b/src/infiniop/ops/dot/moore/dot_moore.mu index 7b9e9ec77..7516f64bb 100644 --- a/src/infiniop/ops/dot/moore/dot_moore.mu +++ b/src/infiniop/ops/dot/moore/dot_moore.mu @@ -1,8 +1,9 @@ -#include "dot_moore.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "dot_moore.h" namespace op::dot::moore { @@ -47,55 +48,47 @@ infiniStatus_t Descriptor::calculate( const void *b, void *stream) const { - auto cuda_stream = reinterpret_cast(stream); + auto cuda_stream = reinterpret_cast(stream); constexpr unsigned int 
BLOCK_SIZE = 256; - // Initialize result to zero switch (_dtype) { - case INFINI_DTYPE_F16: { - float *result_f = nullptr; - CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); - CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); - *reinterpret_cast(y) = __float2half(result_val); - CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F32: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_BF16: { - float *result_f = nullptr; - CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); - CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); - *reinterpret_cast(y) = __float2bfloat16_rn(result_val); - CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F64: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F32: { - float *result_f = reinterpret_cast(y); - CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case 
INFINI_DTYPE_F16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F64: { - double *result_d = reinterpret_cast(y); - CHECK_MOORE(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_d, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_BF16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cu b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu index 464e0a15e..d3a2e6f8e 100644 --- a/src/infiniop/ops/dot/nvidia/dot_nvidia.cu +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu @@ -1,7 +1,9 @@ -#include "dot_nvidia.cuh" -#include "../cuda/kernel.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../../utils.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" +#include "dot_nvidia.cuh" namespace op::dot::nvidia { @@ -71,46 +73,44 @@ infiniStatus_t Descriptor::calculate( auto cuda_stream = reinterpret_cast(stream); constexpr unsigned int BLOCK_SIZE = 256; - // Initialize result to zero switch (_dtype) { - case INFINI_DTYPE_F16: { - float *result_f = nullptr; - CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - store_half_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); - CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F32: 
+ cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_BF16: { - float *result_f = nullptr; - CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - store_bf16_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); - CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F64: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F32: { - float *result_f = reinterpret_cast(y); - CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_F16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F64: { - double *result_d = reinterpret_cast(y); - CHECK_CUDA(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_d, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_BF16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh index 76d22aec3..8a15e50f1 100644 --- 
a/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh @@ -1,9 +1,9 @@ #ifndef __DOT_NVIDIA_H__ #define __DOT_NVIDIA_H__ +#include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/nvidia/nvidia_common.cuh" #include namespace op::dot::nvidia { diff --git a/src/infiniop/ops/dot/operator.cc b/src/infiniop/ops/dot/operator.cc index b0f8c6be9..10f9fda57 100644 --- a/src/infiniop/ops/dot/operator.cc +++ b/src/infiniop/ops/dot/operator.cc @@ -15,20 +15,20 @@ #include "moore/dot_moore.h" #endif -__C infiniStatus_t infiniopCreateDotDescriptor( +__INFINI_C infiniStatus_t infiniopCreateDotDescriptor( infiniopHandle_t handle, infiniopDotDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::dot::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ - a_desc, \ + y_desc, \ + a_desc, \ b_desc) switch (handle->device) { @@ -56,10 +56,10 @@ __C infiniStatus_t infiniopCreateDotDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -87,7 +87,7 @@ __C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, siz return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDot( +__INFINI_C infiniStatus_t infiniopDot( infiniopDotDescriptor_t desc, void *workspace, size_t workspace_size, @@ -96,8 +96,8 @@ __C infiniStatus_t infiniopDot( const void *b, void *stream) { 
-#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, a, b, stream) @@ -126,11 +126,11 @@ __C infiniStatus_t infiniopDot( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/histc/cpu/histc_cpu.h b/src/infiniop/ops/histc/cpu/histc_cpu.h index 673e8ff4b..4b4f1a315 100644 --- a/src/infiniop/ops/histc/cpu/histc_cpu.h +++ b/src/infiniop/ops/histc/cpu/histc_cpu.h @@ -1,9 +1,9 @@ #ifndef __HISTC_CPU_H__ #define __HISTC_CPU_H__ +#include "../../../devices/cpu/common_cpu.h" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/cpu/common_cpu.h" namespace op::histc::cpu { diff --git a/src/infiniop/ops/histc/cuda/kernel.cuh b/src/infiniop/ops/histc/cuda/kernel.cuh index ee6dd4f4b..75a2474ff 100644 --- a/src/infiniop/ops/histc/cuda/kernel.cuh +++ b/src/infiniop/ops/histc/cuda/kernel.cuh @@ -1,7 +1,4 @@ #pragma once -#include -#include -#include #include namespace op::cuda { @@ -16,8 +13,7 @@ __global__ void histc_kernel( double min_val, double max_val) { - size_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + - static_cast(threadIdx.x); + size_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + static_cast(threadIdx.x); size_t stride = static_cast(blockDim.x) * static_cast(gridDim.x); size_t input_stride_u = static_cast(input_stride); diff --git a/src/infiniop/ops/histc/metax/histc_metax.h b/src/infiniop/ops/histc/metax/histc_metax.h index 7ba259939..6bb74a87c 100644 --- a/src/infiniop/ops/histc/metax/histc_metax.h +++ b/src/infiniop/ops/histc/metax/histc_metax.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" 
-#include "../../../devices/metax/metax_common.h" namespace op::histc::metax { diff --git a/src/infiniop/ops/histc/metax/histc_metax.maca b/src/infiniop/ops/histc/metax/histc_metax.maca index a72f0d980..7d290b7d2 100644 --- a/src/infiniop/ops/histc/metax/histc_metax.maca +++ b/src/infiniop/ops/histc/metax/histc_metax.maca @@ -1,8 +1,9 @@ -#include "histc_metax.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "histc_metax.h" namespace op::histc::metax { diff --git a/src/infiniop/ops/histc/moore/histc_moore.h b/src/infiniop/ops/histc/moore/histc_moore.h index 877fb5bf3..2d9cb1d4f 100644 --- a/src/infiniop/ops/histc/moore/histc_moore.h +++ b/src/infiniop/ops/histc/moore/histc_moore.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/moore/moore_common.h" namespace op::histc::moore { diff --git a/src/infiniop/ops/histc/moore/histc_moore.mu b/src/infiniop/ops/histc/moore/histc_moore.mu index 6b10b6007..cc3173e94 100644 --- a/src/infiniop/ops/histc/moore/histc_moore.mu +++ b/src/infiniop/ops/histc/moore/histc_moore.mu @@ -1,8 +1,9 @@ -#include "histc_moore.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "histc_moore.h" namespace op::histc::moore { diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cu b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu index 943dae6f7..43daf7d0f 100644 --- a/src/infiniop/ops/histc/nvidia/histc_nvidia.cu +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu @@ -1,7 +1,9 @@ -#include "histc_nvidia.cuh" -#include "../cuda/kernel.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../../utils.h" 
+#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" +#include "histc_nvidia.cuh" namespace op::histc::nvidia { diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh index 464f7ef03..60acb6d3d 100644 --- a/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh @@ -1,9 +1,9 @@ #ifndef __HISTC_NVIDIA_H__ #define __HISTC_NVIDIA_H__ +#include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/nvidia/nvidia_common.cuh" namespace op::histc::nvidia { diff --git a/src/infiniop/ops/histc/operator.cc b/src/infiniop/ops/histc/operator.cc index acd205ee0..ed4d07662 100644 --- a/src/infiniop/ops/histc/operator.cc +++ b/src/infiniop/ops/histc/operator.cc @@ -15,7 +15,7 @@ #include "moore/histc_moore.h" #endif -__C infiniStatus_t infiniopCreateHistcDescriptor( +__INFINI_C infiniStatus_t infiniopCreateHistcDescriptor( infiniopHandle_t handle, infiniopHistcDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -24,15 +24,15 @@ __C infiniStatus_t infiniopCreateHistcDescriptor( double min_val, double max_val) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::histc::NAMESPACE::Descriptor::create( \ - handle, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::histc::NAMESPACE::Descriptor::create( \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ - bins, \ - min_val, \ + y_desc, \ + x_desc, \ + bins, \ + min_val, \ max_val) switch (handle->device) { @@ -60,7 +60,7 @@ __C infiniStatus_t infiniopCreateHistcDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ 
case CASE: \ @@ -91,7 +91,7 @@ __C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopHistc( +__INFINI_C infiniStatus_t infiniopHistc( infiniopHistcDescriptor_t desc, void *workspace, size_t workspace_size, @@ -99,7 +99,7 @@ __C infiniStatus_t infiniopHistc( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ +#define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, x, stream) @@ -129,10 +129,10 @@ __C infiniStatus_t infiniopHistc( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ +#define DELETE(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.cc b/src/infiniop/ops/log10/cpu/log10_cpu.cc index 3148014dd..fb89c86cc 100644 --- a/src/infiniop/ops/log10/cpu/log10_cpu.cc +++ b/src/infiniop/ops/log10/cpu/log10_cpu.cc @@ -50,4 +50,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::log10::cpu \ No newline at end of file +} // namespace op::log10::cpu diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.h b/src/infiniop/ops/log10/cpu/log10_cpu.h index 6cc22a1e3..b6aed710c 100644 --- a/src/infiniop/ops/log10/cpu/log10_cpu.h +++ b/src/infiniop/ops/log10/cpu/log10_cpu.h @@ -17,4 +17,4 @@ typedef struct Log10Op { } Log10Op; } // namespace op::log10::cpu -#endif // __LOG10_CPU_H__ \ No newline at end of file +#endif // __LOG10_CPU_H__ diff --git a/src/infiniop/ops/log10/cuda/kernel.cuh b/src/infiniop/ops/log10/cuda/kernel.cuh index f02b7f98e..189dc457f 100644 --- a/src/infiniop/ops/log10/cuda/kernel.cuh +++ b/src/infiniop/ops/log10/cuda/kernel.cuh @@ -1,15 +1,10 @@ -#pragma once +#ifndef __LOG10_KERNEL_CUH__ +#define __LOG10_KERNEL_CUH__ 
#include // 包含 log10f, log10, log, logf 等 -#include -#include -#include -#include #include namespace op::cuda { -// 移除 high_precision_log10f 避免混淆,让 Log10Op 直接实现逻辑。 - struct Log10Op { public: static constexpr size_t num_inputs = 1; @@ -18,14 +13,15 @@ public: __device__ __forceinline__ T operator()(const T &x) const { if constexpr (std::is_same_v) { // For F32: compute via F64 for improved accuracy. - return (float)log10((double)x); + return (float)::log10((double)x); } else if constexpr (std::is_same_v) { - return log10(x); + return ::log10(x); } else { // For F16/BF16: promote to float, compute, then cast back. - return (T)(float)log10((double)(float)x); + return (T)(float)::log10((double)(float)x); } } }; } // namespace op::cuda +#endif diff --git a/src/infiniop/ops/log10/log10.h b/src/infiniop/ops/log10/log10.h index 917a1db1c..4519d6ac3 100644 --- a/src/infiniop/ops/log10/log10.h +++ b/src/infiniop/ops/log10/log10.h @@ -5,4 +5,4 @@ #define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(log10, NAMESPACE) -#endif // __LOG10_H__ \ No newline at end of file +#endif // __LOG10_H__ diff --git a/src/infiniop/ops/log10/moore/log10_moore_kernel.h b/src/infiniop/ops/log10/moore/log10_moore_kernel.h index 636663fc0..aa905c12d 100644 --- a/src/infiniop/ops/log10/moore/log10_moore_kernel.h +++ b/src/infiniop/ops/log10/moore/log10_moore_kernel.h @@ -2,8 +2,6 @@ #define __LOG10_MOORE_KERNEL_H__ #include -#include -#include #include namespace op::log10::moore { @@ -29,7 +27,7 @@ typedef struct Log10Op { } else if constexpr (std::is_same_v) { return log10f(x); } else { // double - return log10(x); + return ::log10(x); } } } Log10Op; diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu index 03196e816..815170274 100644 --- a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu @@ -56,4 +56,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace 
op::log10::nvidia \ No newline at end of file +} // namespace op::log10::nvidia diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh index 62af5f2a6..d1f93465b 100644 --- a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh @@ -5,4 +5,4 @@ ELEMENTWISE_DESCRIPTOR(log10, nvidia) -#endif // __LOG10_NVIDIA_H__ \ No newline at end of file +#endif // __LOG10_NVIDIA_H__ diff --git a/src/infiniop/ops/log10/operator.cc b/src/infiniop/ops/log10/operator.cc index a5d1099bd..31a2edaa3 100644 --- a/src/infiniop/ops/log10/operator.cc +++ b/src/infiniop/ops/log10/operator.cc @@ -15,7 +15,7 @@ #include "moore/log10_moore.h" #endif -__C infiniStatus_t infiniopCreateLog10Descriptor( +__INFINI_C infiniStatus_t infiniopCreateLog10Descriptor( infiniopHandle_t handle, infiniopLog10Descriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateLog10Descriptor( #undef CREATE } -__C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopLog10( +__INFINI_C infiniStatus_t infiniopLog10( infiniopLog10Descriptor_t desc, void *workspace, size_t workspace_size, @@ -123,7 +123,7 @@ __C infiniStatus_t infiniopLog10( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ @@ -154,4 +154,4 @@ infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc) { } #undef DELETE -} \ No newline at end of file +} diff --git a/src/infiniop/ops/log1p/cuda/kernel.cuh 
b/src/infiniop/ops/log1p/cuda/kernel.cuh index 381bad957..cc46c859a 100644 --- a/src/infiniop/ops/log1p/cuda/kernel.cuh +++ b/src/infiniop/ops/log1p/cuda/kernel.cuh @@ -1,9 +1,6 @@ -#pragma once +#ifndef __LOG1P_KERNEL_CUH__ +#define __LOG1P_KERNEL_CUH__ #include -#include -#include -#include -#include #include namespace op::cuda { @@ -16,14 +13,15 @@ public: __device__ __forceinline__ T operator()(const T &x) const { if constexpr (std::is_same_v) { // Use double precision for better accuracy. - return (float)log1p((double)x); + return (float)::log1p((double)x); } else if constexpr (std::is_same_v) { - return log1p(x); + return ::log1p(x); } else { // For F16/BF16: promote to float, compute, then cast back. - return (T)(float)log1p((double)(float)x); + return (T)(float)::log1p((double)(float)x); } } }; } // namespace op::cuda +#endif diff --git a/src/infiniop/ops/log1p/metax/log1p_metax.maca b/src/infiniop/ops/log1p/metax/log1p_metax.maca index 0ca2803d3..13ee7f618 100644 --- a/src/infiniop/ops/log1p/metax/log1p_metax.maca +++ b/src/infiniop/ops/log1p/metax/log1p_metax.maca @@ -56,4 +56,5 @@ infiniStatus_t Descriptor::calculate( } return INFINI_STATUS_SUCCESS; +} } // namespace op::log1p::metax diff --git a/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h index ff42f0f5e..426f3ed75 100644 --- a/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h +++ b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h @@ -2,8 +2,6 @@ #define __LOG1P_MOORE_KERNEL_H__ #include -#include -#include #include namespace op::log1p::moore { @@ -29,7 +27,7 @@ typedef struct Log1pOp { } else if constexpr (std::is_same_v) { return log1pf(x); } else { // double - return log1p(x); + return ::log1p(x); } } } Log1pOp; diff --git a/src/infiniop/ops/log1p/operator.cc b/src/infiniop/ops/log1p/operator.cc index a0efc1d1b..9c422cf6d 100644 --- a/src/infiniop/ops/log1p/operator.cc +++ b/src/infiniop/ops/log1p/operator.cc @@ -15,7 +15,7 @@ #include 
"moore/log1p_moore.h" #endif -__C infiniStatus_t infiniopCreateLog1pDescriptor( +__INFINI_C infiniStatus_t infiniopCreateLog1pDescriptor( infiniopHandle_t handle, infiniopLog1pDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateLog1pDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopLog1p( +__INFINI_C infiniStatus_t infiniopLog1p( infiniopLog1pDescriptor_t desc, void *workspace, size_t workspace_size, @@ -123,7 +123,7 @@ __C infiniStatus_t infiniopLog1p( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ diff --git a/test/infiniop/avg_pool3d.py b/test/infiniop/avg_pool3d.py index 2d76c1b97..2971a85bc 100644 --- a/test/infiniop/avg_pool3d.py +++ b/test/infiniop/avg_pool3d.py @@ -25,7 +25,7 @@ # Test cases format: (x_shape, x_stride_or_None, kernel_size, stride_or_None, padding) _TEST_CASES = [ ((1, 2, 8, 8, 8), None, (2, 2, 2), None, (0, 0, 0)), - ((2, 3, 7, 9, 5), (756, 252, 36, 4, 1), (3, 3, 3), (2, 2, 1), (1, 1, 0)), + ((2, 3, 7, 9, 5), None, (3, 3, 3), (2, 2, 1), (1, 1, 0)), ((2, 1, 9, 11, 7), (693, 77, 77, 7, 1), (3, 2, 3), None, (1, 0, 1)), ]