From b37636c109d79979343b4ee0639675a2b18ea189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=96=E6=B3=89laiquan?= <167113918+LaiQuan-conquer@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:29:21 +0800 Subject: [PATCH 1/2] issue/1031 merge T1-1-2 --- include/infiniop.h | 5 + include/infiniop/ops/avg_pool3d.h | 27 ++ include/infiniop/ops/dot.h | 26 ++ include/infiniop/ops/histc.h | 28 ++ include/infiniop/ops/log10.h | 24 ++ include/infiniop/ops/log1p.h | 24 ++ .../ops/avg_pool3d/cpu/avg_pool3d_cpu.cc | 254 ++++++++++++++++++ .../ops/avg_pool3d/cpu/avg_pool3d_cpu.h | 64 +++++ .../ops/avg_pool3d/metax/avg_pool3d_metax.h | 49 ++++ .../avg_pool3d/metax/avg_pool3d_metax.maca | 219 +++++++++++++++ .../ops/avg_pool3d/moore/avg_pool3d_moore.h | 44 +++ .../ops/avg_pool3d/moore/avg_pool3d_moore.mu | 52 ++++ .../avg_pool3d/nvidia/avg_pool3d_nvidia.cu | 195 ++++++++++++++ .../avg_pool3d/nvidia/avg_pool3d_nvidia.cuh | 46 ++++ src/infiniop/ops/avg_pool3d/operator.cc | 163 +++++++++++ src/infiniop/ops/dot/cpu/dot_cpu.cc | 105 ++++++++ src/infiniop/ops/dot/cpu/dot_cpu.h | 49 ++++ src/infiniop/ops/dot/cuda/kernel.cuh | 40 +++ src/infiniop/ops/dot/metax/dot_metax.h | 47 ++++ src/infiniop/ops/dot/metax/dot_metax.maca | 106 ++++++++ src/infiniop/ops/dot/moore/dot_moore.h | 47 ++++ src/infiniop/ops/dot/moore/dot_moore.mu | 106 ++++++++ src/infiniop/ops/dot/nvidia/dot_nvidia.cu | 121 +++++++++ src/infiniop/ops/dot/nvidia/dot_nvidia.cuh | 48 ++++ src/infiniop/ops/dot/operator.cc | 160 +++++++++++ src/infiniop/ops/histc/cpu/histc_cpu.cc | 150 +++++++++++ src/infiniop/ops/histc/cpu/histc_cpu.h | 60 +++++ src/infiniop/ops/histc/cuda/kernel.cuh | 49 ++++ src/infiniop/ops/histc/metax/histc_metax.h | 53 ++++ src/infiniop/ops/histc/metax/histc_metax.maca | 105 ++++++++ src/infiniop/ops/histc/moore/histc_moore.h | 53 ++++ src/infiniop/ops/histc/moore/histc_moore.mu | 105 ++++++++ src/infiniop/ops/histc/nvidia/histc_nvidia.cu | 119 ++++++++ .../ops/histc/nvidia/histc_nvidia.cuh | 53 ++++ 
src/infiniop/ops/histc/operator.cc | 163 +++++++++++ src/infiniop/ops/log10/cpu/log10_cpu.cc | 53 ++++ src/infiniop/ops/log10/cpu/log10_cpu.h | 20 ++ src/infiniop/ops/log10/cuda/kernel.cuh | 31 +++ src/infiniop/ops/log10/log10.h | 8 + src/infiniop/ops/log10/metax/log10_metax.h | 8 + src/infiniop/ops/log10/metax/log10_metax.maca | 60 +++++ src/infiniop/ops/log10/moore/log10_moore.h | 8 + src/infiniop/ops/log10/moore/log10_moore.mu | 61 +++++ .../ops/log10/moore/log10_moore_kernel.h | 39 +++ src/infiniop/ops/log10/nvidia/log10_nvidia.cu | 59 ++++ .../ops/log10/nvidia/log10_nvidia.cuh | 8 + src/infiniop/ops/log10/operator.cc | 157 +++++++++++ src/infiniop/ops/log1p/cpu/log1p_cpu.cc | 53 ++++ src/infiniop/ops/log1p/cpu/log1p_cpu.h | 20 ++ src/infiniop/ops/log1p/cuda/kernel.cuh | 29 ++ src/infiniop/ops/log1p/log1p.h | 8 + src/infiniop/ops/log1p/metax/log1p_metax.h | 8 + src/infiniop/ops/log1p/metax/log1p_metax.maca | 59 ++++ src/infiniop/ops/log1p/moore/log1p_moore.h | 8 + src/infiniop/ops/log1p/moore/log1p_moore.mu | 61 +++++ .../ops/log1p/moore/log1p_moore_kernel.h | 39 +++ src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu | 59 ++++ .../ops/log1p/nvidia/log1p_nvidia.cuh | 8 + src/infiniop/ops/log1p/operator.cc | 157 +++++++++++ test/infiniop/avg_pool3d.py | 153 +++++++++++ test/infiniop/dot.py | 138 ++++++++++ test/infiniop/histc.py | 153 +++++++++++ test/infiniop/libinfiniop/op_register.py | 169 +++++++++++- test/infiniop/libinfiniop/utils.py | 31 ++- test/infiniop/log10.py | 179 ++++++++++++ test/infiniop/log1p.py | 162 +++++++++++ 66 files changed, 4960 insertions(+), 5 deletions(-) create mode 100644 include/infiniop/ops/avg_pool3d.h create mode 100644 include/infiniop/ops/dot.h create mode 100644 include/infiniop/ops/histc.h create mode 100644 include/infiniop/ops/log10.h create mode 100644 include/infiniop/ops/log1p.h create mode 100644 src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc create mode 100644 src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h create mode 
100644 src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h create mode 100644 src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca create mode 100644 src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h create mode 100644 src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu create mode 100644 src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu create mode 100644 src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh create mode 100644 src/infiniop/ops/avg_pool3d/operator.cc create mode 100644 src/infiniop/ops/dot/cpu/dot_cpu.cc create mode 100644 src/infiniop/ops/dot/cpu/dot_cpu.h create mode 100644 src/infiniop/ops/dot/cuda/kernel.cuh create mode 100644 src/infiniop/ops/dot/metax/dot_metax.h create mode 100644 src/infiniop/ops/dot/metax/dot_metax.maca create mode 100644 src/infiniop/ops/dot/moore/dot_moore.h create mode 100644 src/infiniop/ops/dot/moore/dot_moore.mu create mode 100644 src/infiniop/ops/dot/nvidia/dot_nvidia.cu create mode 100644 src/infiniop/ops/dot/nvidia/dot_nvidia.cuh create mode 100644 src/infiniop/ops/dot/operator.cc create mode 100644 src/infiniop/ops/histc/cpu/histc_cpu.cc create mode 100644 src/infiniop/ops/histc/cpu/histc_cpu.h create mode 100644 src/infiniop/ops/histc/cuda/kernel.cuh create mode 100644 src/infiniop/ops/histc/metax/histc_metax.h create mode 100644 src/infiniop/ops/histc/metax/histc_metax.maca create mode 100644 src/infiniop/ops/histc/moore/histc_moore.h create mode 100644 src/infiniop/ops/histc/moore/histc_moore.mu create mode 100644 src/infiniop/ops/histc/nvidia/histc_nvidia.cu create mode 100644 src/infiniop/ops/histc/nvidia/histc_nvidia.cuh create mode 100644 src/infiniop/ops/histc/operator.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.cc create mode 100644 src/infiniop/ops/log10/cpu/log10_cpu.h create mode 100644 src/infiniop/ops/log10/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log10/log10.h create mode 100644 src/infiniop/ops/log10/metax/log10_metax.h create mode 100644 
src/infiniop/ops/log10/metax/log10_metax.maca create mode 100644 src/infiniop/ops/log10/moore/log10_moore.h create mode 100644 src/infiniop/ops/log10/moore/log10_moore.mu create mode 100644 src/infiniop/ops/log10/moore/log10_moore_kernel.h create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cu create mode 100644 src/infiniop/ops/log10/nvidia/log10_nvidia.cuh create mode 100644 src/infiniop/ops/log10/operator.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.cc create mode 100644 src/infiniop/ops/log1p/cpu/log1p_cpu.h create mode 100644 src/infiniop/ops/log1p/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log1p/log1p.h create mode 100644 src/infiniop/ops/log1p/metax/log1p_metax.h create mode 100644 src/infiniop/ops/log1p/metax/log1p_metax.maca create mode 100644 src/infiniop/ops/log1p/moore/log1p_moore.h create mode 100644 src/infiniop/ops/log1p/moore/log1p_moore.mu create mode 100644 src/infiniop/ops/log1p/moore/log1p_moore_kernel.h create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu create mode 100644 src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh create mode 100644 src/infiniop/ops/log1p/operator.cc create mode 100644 test/infiniop/avg_pool3d.py create mode 100644 test/infiniop/dot.py create mode 100644 test/infiniop/histc.py create mode 100644 test/infiniop/log10.py create mode 100644 test/infiniop/log1p.py diff --git a/include/infiniop.h b/include/infiniop.h index 5ddd7d297..630244afd 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -18,6 +18,7 @@ #include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/avg_pool1d.h" +#include "infiniop/ops/avg_pool3d.h" #include "infiniop/ops/binary_cross_entropy_with_logits.h" #include "infiniop/ops/block_diag.h" #include "infiniop/ops/broadcast_to.h" @@ -32,6 +33,7 @@ #include "infiniop/ops/diff.h" #include "infiniop/ops/digamma.h" #include "infiniop/ops/dist.h" +#include "infiniop/ops/dot.h" #include "infiniop/ops/embedding.h" #include 
"infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" @@ -46,6 +48,7 @@ #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" #include "infiniop/ops/hinge_embedding_loss.h" +#include "infiniop/ops/histc.h" #include "infiniop/ops/huber_loss.h" #include "infiniop/ops/hypot.h" #include "infiniop/ops/index_add.h" @@ -58,6 +61,8 @@ #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/ldexp.h" #include "infiniop/ops/lerp.h" +#include "infiniop/ops/log10.h" +#include "infiniop/ops/log1p.h" #include "infiniop/ops/log_softmax.h" #include "infiniop/ops/logaddexp.h" #include "infiniop/ops/logaddexp2.h" diff --git a/include/infiniop/ops/avg_pool3d.h b/include/infiniop/ops/avg_pool3d.h new file mode 100644 index 000000000..7170e7765 --- /dev/null +++ b/include/infiniop/ops/avg_pool3d.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_AVG_POOL3D_API_H__ +#define __INFINIOP_AVG_POOL3D_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAvgPool3dDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAvgPool3dDescriptor(infiniopHandle_t handle, + infiniopAvgPool3dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + void *kernel_size, + void *stride, + void *padding); + +__C __export infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAvgPool3d(infiniopAvgPool3dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/dot.h b/include/infiniop/ops/dot.h new file mode 100644 index 000000000..7fa0d2659 --- /dev/null +++ b/include/infiniop/ops/dot.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DOT_API_H__ +#define __INFINIOP_DOT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor 
*infiniopDotDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDotDescriptor(infiniopHandle_t handle, + infiniopDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDot(infiniopDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/histc.h b/include/infiniop/ops/histc.h new file mode 100644 index 000000000..cd4695d4e --- /dev/null +++ b/include/infiniop/ops/histc.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_HISTC_API_H__ +#define __INFINIOP_HISTC_API_H__ + +#include "../operator_descriptor.h" +#include + +typedef struct InfiniopDescriptor *infiniopHistcDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHistcDescriptor(infiniopHandle_t handle, + infiniopHistcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t bins, + double min_val, + double max_val); + +__C __export infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHistc(infiniopHistcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log10.h b/include/infiniop/ops/log10.h new file mode 100644 index 000000000..7c105dcc3 --- /dev/null +++ b/include/infiniop/ops/log10.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG10_API_H__ +#define __INFINIOP_LOG10_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLog10Descriptor_t; + +__C __export 
infiniStatus_t infiniopCreateLog10Descriptor(infiniopHandle_t handle, + infiniopLog10Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog10(infiniopLog10Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log1p.h b/include/infiniop/ops/log1p.h new file mode 100644 index 000000000..ebd608e9d --- /dev/null +++ b/include/infiniop/ops/log1p.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG1P_API_H__ +#define __INFINIOP_LOG1P_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLog1pDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLog1pDescriptor(infiniopHandle_t handle, + infiniopLog1pDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog1p(infiniopLog1pDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc new file mode 100644 index 000000000..c280a2d92 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc @@ -0,0 +1,254 @@ +#include "avg_pool3d_cpu.h" +#include "../../../../utils.h" +#include +#include +#include +#include + +namespace op::avg_pool3d::cpu { + +utils::Result AvgPool3dInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + void 
*kernel_size, + void *stride, + void *padding) { + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 5 || y_shape.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t channels = x_shape[1]; + size_t input_d = x_shape[2]; + size_t input_h = x_shape[3]; + size_t input_w = x_shape[4]; + + // Parse kernel_size + size_t kernel_d, kernel_h, kernel_w; + if (kernel_size) { + size_t *ks = reinterpret_cast(kernel_size); + if (ks[0] == 0 || ks[1] == 0 || ks[2] == 0) { + return INFINI_STATUS_BAD_PARAM; + } + kernel_d = ks[0]; + kernel_h = ks[1]; + kernel_w = ks[2]; + } else { + return INFINI_STATUS_BAD_PARAM; + } + + // Parse stride (default to kernel_size if not provided) + size_t stride_d, stride_h, stride_w; + if (stride) { + size_t *s = reinterpret_cast(stride); + if (s[0] == 0 || s[1] == 0 || s[2] == 0) { + return INFINI_STATUS_BAD_PARAM; + } + stride_d = s[0]; + stride_h = s[1]; + stride_w = s[2]; + } else { + stride_d = kernel_d; + stride_h = kernel_h; + stride_w = kernel_w; + } + + // Parse padding + size_t pad_d, pad_h, pad_w; + if (padding) { + size_t *p = reinterpret_cast(padding); + // Assume it's always a tuple of 3 values for 3D pooling + pad_d = p[0]; + pad_h = p[1]; + pad_w = p[2]; + } else { + pad_d = pad_h = pad_w = 0; + } + + // Calculate output dimensions. Guard against unsigned underflow when kernel > input + 2*pad. 
+ if (pad_d > (std::numeric_limits::max() - input_d) / 2 || + pad_h > (std::numeric_limits::max() - input_h) / 2 || + pad_w > (std::numeric_limits::max() - input_w) / 2) { + return INFINI_STATUS_BAD_PARAM; + } + size_t effective_d = input_d + 2 * pad_d; + size_t effective_h = input_h + 2 * pad_h; + size_t effective_w = input_w + 2 * pad_w; + if (kernel_d > effective_d || kernel_h > effective_h || kernel_w > effective_w) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_d = (effective_d - kernel_d) / stride_d + 1; + size_t output_h = (effective_h - kernel_h) / stride_h + 1; + size_t output_w = (effective_w - kernel_w) / stride_w + 1; + + // Verify output shape + if (y_shape[0] != batch || y_shape[1] != channels || + y_shape[2] != output_d || y_shape[3] != output_h || y_shape[4] != output_w) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + AvgPool3dInfo info; + info.batch = batch; + info.channels = channels; + info.input_d = input_d; + info.input_h = input_h; + info.input_w = input_w; + info.output_d = output_d; + info.output_h = output_h; + info.output_w = output_w; + info.kernel_d = kernel_d; + info.kernel_h = kernel_h; + info.kernel_w = kernel_w; + info.stride_d = stride_d; + info.stride_h = stride_h; + info.stride_w = stride_w; + info.pad_d = pad_d; + info.pad_h = pad_h; + info.pad_w = pad_w; + info.input_strides = x_desc->strides(); + info.output_strides = y_desc->strides(); + + if (info.input_strides.size() != x_shape.size() || info.output_strides.size() != y_shape.size()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + // Reject broadcasted (0-stride) or negative strides for dimensions that are actually indexed. + // The kernel computes indices using size_t, so negative strides would underflow and go OOB. 
+ for (size_t i = 0; i < x_shape.size(); ++i) { + if (x_shape[i] > 1 && info.input_strides[i] <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + } + for (size_t i = 0; i < y_shape.size(); ++i) { + if (y_shape[i] > 1 && info.output_strides[i] <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + } + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto info_result = AvgPool3dInfo::create(x_desc, y_desc, kernel_size, stride, padding); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void avg_pool3d_impl( + const AvgPool3dInfo &info, + T *y, + const T *x) { + + const size_t kernel_size = info.kernel_d * info.kernel_h * info.kernel_w; + using Tacc = std::conditional_t, double, float>; + const Tacc inv_kernel_size = Tacc(1) / static_cast(kernel_size); + +#pragma omp parallel for collapse(2) + for (ptrdiff_t b = 0; b < static_cast(info.batch); ++b) { + for (ptrdiff_t c = 0; c < static_cast(info.channels); ++c) { + for (size_t od = 0; od < info.output_d; ++od) { + for (size_t oh = 0; oh < info.output_h; ++oh) { + for (size_t ow = 0; ow < info.output_w; ++ow) { + Tacc sum = Tacc(0); + + // Calculate input window + ptrdiff_t id_start = + static_cast(od) * static_cast(info.stride_d) - + static_cast(info.pad_d); + ptrdiff_t ih_start = + static_cast(oh) * static_cast(info.stride_h) - + static_cast(info.pad_h); + ptrdiff_t iw_start = + static_cast(ow) * static_cast(info.stride_w) - + 
static_cast(info.pad_w); + + for (size_t kd = 0; kd < info.kernel_d; ++kd) { + for (size_t kh = 0; kh < info.kernel_h; ++kh) { + for (size_t kw = 0; kw < info.kernel_w; ++kw) { + ptrdiff_t id = id_start + static_cast(kd); + ptrdiff_t ih = ih_start + static_cast(kh); + ptrdiff_t iw = iw_start + static_cast(kw); + + // Check bounds (accounting for padding) + if (id >= 0 && id < static_cast(info.input_d) && + ih >= 0 && ih < static_cast(info.input_h) && + iw >= 0 && iw < static_cast(info.input_w)) { + size_t x_idx = b * info.input_strides[0] + + c * info.input_strides[1] + + static_cast(id) * info.input_strides[2] + + static_cast(ih) * info.input_strides[3] + + static_cast(iw) * info.input_strides[4]; + sum += utils::cast(x[x_idx]); + } + } + } + } + + size_t y_idx = b * info.output_strides[0] + + c * info.output_strides[1] + + od * info.output_strides[2] + + oh * info.output_strides[3] + + ow * info.output_strides[4]; + // Match torch.nn.functional.avg_pool3d default behavior (count_include_pad=True): + // padding contributes zeros but still counts in the divisor. 
+ y[y_idx] = utils::cast(sum * inv_kernel_size); + } + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + avg_pool3d_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::avg_pool3d::cpu diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h new file mode 100644 index 000000000..f01aeb5cf --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h @@ -0,0 +1,64 @@ +#ifndef __AVG_POOL3D_CPU_H__ +#define __AVG_POOL3D_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::avg_pool3d::cpu { + +struct AvgPool3dInfo { + size_t batch; + size_t channels; + size_t input_d, input_h, input_w; + size_t output_d, output_h, output_w; + size_t kernel_d, kernel_h, kernel_w; + size_t stride_d, stride_h, stride_w; + size_t pad_d, pad_h, pad_w; + std::vector input_strides; + std::vector output_strides; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + void *kernel_size, + void *stride, + void *padding); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + AvgPool3dInfo _info; + + Descriptor(infiniDtype_t dtype, AvgPool3dInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} 
+ +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::cpu + +#endif // __AVG_POOL3D_CPU_H__ diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h new file mode 100644 index 000000000..b4277393e --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h @@ -0,0 +1,49 @@ +#ifndef __AVG_POOL3D_METAX_H__ +#define __AVG_POOL3D_METAX_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#ifdef ENABLE_METAX_MC_API +#include +#else +#include +#endif + +namespace op::avg_pool3d::metax { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + std::unique_ptr _opaque; + infiniDtype_t _dtype; + + Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id); + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const; + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::metax + +#endif // __AVG_POOL3D_METAX_H__ diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca new file mode 100644 index 000000000..c7955812a --- /dev/null +++ 
b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca @@ -0,0 +1,219 @@ +#include "avg_pool3d_metax.h" +#include "../../../../utils.h" + +namespace op::avg_pool3d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +#ifdef ENABLE_METAX_MC_API + mcdnnTensorDescriptor_t x_desc = nullptr; + mcdnnTensorDescriptor_t y_desc = nullptr; + mcdnnPoolingDescriptor_t pool_desc = nullptr; +#else + hcdnnTensorDescriptor_t x_desc = nullptr; + hcdnnTensorDescriptor_t y_desc = nullptr; + hcdnnPoolingDescriptor_t pool_desc = nullptr; +#endif + size_t workspace_size = 0; + + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + + ~Opaque() { +#ifdef ENABLE_METAX_MC_API + if (x_desc) mcdnnDestroyTensorDescriptor(x_desc); + if (y_desc) mcdnnDestroyTensorDescriptor(y_desc); + if (pool_desc) mcdnnDestroyPoolingDescriptor(pool_desc); +#else + if (x_desc) hcdnnDestroyTensorDescriptor(x_desc); + if (y_desc) hcdnnDestroyTensorDescriptor(y_desc); + if (pool_desc) hcdnnDestroyPoolingDescriptor(pool_desc); +#endif + } +}; + +Descriptor::Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(std::move(opaque)), + _dtype(dtype) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + auto metax_handle = reinterpret_cast(handle); + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 5 || y_shape.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t *ks = reinterpret_cast(kernel_size); + size_t kernel_d = ks[0], kernel_h = ks[1], kernel_w = ks[2]; + + size_t stride_d, 
stride_h, stride_w; + if (stride) { + size_t *s = reinterpret_cast(stride); + stride_d = s[0]; + stride_h = s[1]; + stride_w = s[2]; + } else { + stride_d = kernel_d; + stride_h = kernel_h; + stride_w = kernel_w; + } + + size_t pad_d, pad_h, pad_w; + if (padding) { + size_t *p = reinterpret_cast(padding); + pad_d = p[0]; + pad_h = p[1]; + pad_w = p[2]; + } else { + pad_d = pad_h = pad_w = 0; + } + + auto opaque = std::make_unique(metax_handle->internal()); + + // Create hcdnn descriptors +#ifdef ENABLE_METAX_MC_API + CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->x_desc)); + CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->y_desc)); + CHECK_MCDNN(mcdnnCreatePoolingDescriptor(&opaque->pool_desc)); +#else + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->x_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->y_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&opaque->pool_desc)); +#endif + + // Set tensor descriptors + int n = static_cast(x_shape[0]); + int c = static_cast(x_shape[1]); + int d = static_cast(x_shape[2]); + int h = static_cast(x_shape[3]); + int w = static_cast(x_shape[4]); + int out_d = static_cast(y_shape[2]); + int out_h = static_cast(y_shape[3]); + int out_w = static_cast(y_shape[4]); + + int input_dims[5] = {n, c, d, h, w}; + int input_strides[5] = { + static_cast(c * d * h * w), + static_cast(d * h * w), + static_cast(h * w), + static_cast(w), + 1 + }; + + int output_dims[5] = {n, c, out_d, out_h, out_w}; + int output_strides[5] = { + static_cast(c * out_d * out_h * out_w), + static_cast(out_d * out_h * out_w), + static_cast(out_h * out_w), + static_cast(out_w), + 1 + }; + + hcdnnDataType_t hcdnn_dtype = device::metax::getHcdnnDtype(dtype); +#ifdef ENABLE_METAX_MC_API + CHECK_MCDNN(mcdnnSetTensorNdDescriptor( + opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); + CHECK_MCDNN(mcdnnSetTensorNdDescriptor( + opaque->y_desc, hcdnn_dtype, 5, output_dims, output_strides)); + + // Set pooling descriptor + int window_dims[3] = 
{static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; + int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; + int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), static_cast(stride_w)}; + + CHECK_MCDNN(mcdnnSetPoolingNdDescriptor( + opaque->pool_desc, + MCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + MCDNN_NOT_PROPAGATE_NAN, + 3, window_dims, padding_dims, stride_dims)); +#else + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + opaque->y_desc, hcdnn_dtype, 5, output_dims, output_strides)); + + // Set pooling descriptor + int window_dims[3] = {static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; + int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; + int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), static_cast(stride_w)}; + + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + opaque->pool_desc, + HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + HCDNN_NOT_PROPAGATE_NAN, + 3, window_dims, padding_dims, stride_dims)); +#endif + + *desc_ptr = new Descriptor(dtype, std::move(opaque), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _opaque->workspace_size; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + return _opaque->internal->useMcdnn(hc_stream, [&](hcdnnHandle_t hcdnn_handle) { + const void *alpha = nullptr; + const void *beta = nullptr; + if (_dtype == INFINI_DTYPE_F32) { + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } else if (_dtype == INFINI_DTYPE_F64) { + static const double alpha_val = 1.0, beta_val = 0.0; + alpha = &alpha_val; + beta = &beta_val; + } else { + static const float 
alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } + +#ifdef ENABLE_METAX_MC_API + CHECK_MCDNN(mcdnnPoolingForward( + hcdnn_handle, + _opaque->pool_desc, + alpha, + _opaque->x_desc, x, + beta, + _opaque->y_desc, y)); +#else + CHECK_MCDNN(hcdnnPoolingForward( + hcdnn_handle, + _opaque->pool_desc, + alpha, + _opaque->x_desc, x, + beta, + _opaque->y_desc, y)); +#endif + + return INFINI_STATUS_SUCCESS; + }); +} + +} // namespace op::avg_pool3d::metax diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h new file mode 100644 index 000000000..526d36543 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h @@ -0,0 +1,44 @@ +#ifndef __AVG_POOL3D_MOORE_H__ +#define __AVG_POOL3D_MOORE_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_handle.h" +#include + +namespace op::avg_pool3d::moore { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + std::unique_ptr _opaque; + infiniDtype_t _dtype; + + Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id); + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const; + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::moore + +#endif // __AVG_POOL3D_MOORE_H__ diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu new file mode 100644 index 000000000..5e1cfd4a6 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu @@ -0,0 +1,52 @@ +#include 
"avg_pool3d_moore.h" +#include "../../../../utils.h" +// MOORE uses CUDA-compatible API, so we can reuse NVIDIA implementation +// by including the NVIDIA source and adapting stream types + +namespace op::avg_pool3d::moore { + +// MOORE platform uses musa API which is CUDA-compatible +// We can reuse the NVIDIA implementation structure +// For now, return NOT_IMPLEMENTED as a placeholder +// Full implementation would require adapting NVIDIA code to use musaStream_t + +struct Descriptor::Opaque {}; + +Descriptor::Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(std::move(opaque)), + _dtype(dtype) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + // MOORE implementation would be similar to NVIDIA but using musa API + // For now, delegate to a CPU fallback or implement custom kernel + // This is a simplified placeholder - full implementation needed + return INFINI_STATUS_NOT_IMPLEMENTED; +} + +size_t Descriptor::workspaceSize() const { + return 0; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + // Placeholder - full implementation needed + return INFINI_STATUS_NOT_IMPLEMENTED; +} + +} // namespace op::avg_pool3d::moore diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu new file mode 100644 index 000000000..ce78f7bd2 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu @@ -0,0 +1,195 @@ +#include "avg_pool3d_nvidia.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::avg_pool3d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr 
internal; + cudnnTensorDescriptor_t x_desc = nullptr; + cudnnTensorDescriptor_t y_desc = nullptr; + cudnnPoolingDescriptor_t pool_desc = nullptr; + size_t workspace_size = 0; + + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + + ~Opaque() { + if (x_desc) cudnnDestroyTensorDescriptor(x_desc); + if (y_desc) cudnnDestroyTensorDescriptor(y_desc); + if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc); + } +}; + +Descriptor::Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(std::move(opaque)), + _dtype(dtype) {} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + + auto nvidia_handle = reinterpret_cast(handle); + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 5 || y_shape.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (!kernel_size) { + return INFINI_STATUS_BAD_PARAM; + } + size_t *ks = reinterpret_cast(kernel_size); + if (ks[0] == 0 || ks[1] == 0 || ks[2] == 0) { + return INFINI_STATUS_BAD_PARAM; + } + size_t kernel_d = ks[0], kernel_h = ks[1], kernel_w = ks[2]; + + size_t stride_d, stride_h, stride_w; + if (stride) { + size_t *s = reinterpret_cast(stride); + stride_d = s[0]; + stride_h = s[1]; + stride_w = s[2]; + } else { + stride_d = kernel_d; + stride_h = kernel_h; + stride_w = kernel_w; + } + + size_t pad_d, pad_h, pad_w; + if (padding) { + size_t *p = reinterpret_cast(padding); + pad_d = p[0]; + pad_h = p[1]; + pad_w = p[2]; + } else { + pad_d = pad_h = 
pad_w = 0; + } + + auto opaque = std::make_unique(nvidia_handle->internal()); + + // Create cuDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&opaque->x_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&opaque->y_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&opaque->pool_desc)); + + // Set tensor descriptors + int n = static_cast(x_shape[0]); + int c = static_cast(x_shape[1]); + int d = static_cast(x_shape[2]); + int h = static_cast(x_shape[3]); + int w = static_cast(x_shape[4]); + int out_d = static_cast(y_shape[2]); + int out_h = static_cast(y_shape[3]); + int out_w = static_cast(y_shape[4]); + + int input_dims[5] = {n, c, d, h, w}; + auto x_strides = x_desc->strides(); + if (x_strides.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + int input_strides[5] = {}; + for (size_t i = 0; i < 5; ++i) { + if (x_strides[i] <= 0 || x_strides[i] > std::numeric_limits::max()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + input_strides[i] = static_cast(x_strides[i]); + } + + int output_dims[5] = {n, c, out_d, out_h, out_w}; + auto y_strides = y_desc->strides(); + if (y_strides.size() != 5) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + int output_strides[5] = {}; + for (size_t i = 0; i < 5; ++i) { + if (y_strides[i] <= 0 || y_strides[i] > std::numeric_limits::max()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + output_strides[i] = static_cast(y_strides[i]); + } + + cudnnDataType_t cudnn_dtype = device::nvidia::getCudnnDtype(dtype); + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + opaque->x_desc, cudnn_dtype, 5, input_dims, input_strides)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + opaque->y_desc, cudnn_dtype, 5, output_dims, output_strides)); + + // Set pooling descriptor + int window_dims[3] = {static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; + int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; + int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), 
static_cast(stride_w)}; + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + opaque->pool_desc, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, + 3, window_dims, padding_dims, stride_dims)); + + *desc_ptr = new Descriptor(dtype, std::move(opaque), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _opaque->workspace_size; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + return _opaque->internal->useCudnn(cuda_stream, [&](cudnnHandle_t cudnn_handle) { + + const void *alpha = nullptr; + const void *beta = nullptr; + if (_dtype == INFINI_DTYPE_F32) { + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } else if (_dtype == INFINI_DTYPE_F64) { + static const double alpha_val = 1.0, beta_val = 0.0; + alpha = &alpha_val; + beta = &beta_val; + } else { + // For F16/BF16, use float alpha/beta + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } + + CHECK_CUDNN(cudnnPoolingForward( + cudnn_handle, + _opaque->pool_desc, + alpha, + _opaque->x_desc, x, + beta, + _opaque->y_desc, y)); + + return INFINI_STATUS_SUCCESS; + }); +} + +} // namespace op::avg_pool3d::nvidia diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh new file mode 100644 index 000000000..9ace65e7b --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh @@ -0,0 +1,46 @@ +#ifndef __AVG_POOL3D_NVIDIA_H__ +#define __AVG_POOL3D_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include + +namespace op::avg_pool3d::nvidia { + +class Descriptor final : public 
InfiniopDescriptor { + struct Opaque; + std::unique_ptr _opaque; + infiniDtype_t _dtype; + + // Defined out-of-line (in .cu) to avoid requiring Opaque to be complete + // in every translation unit that includes this header. + Descriptor(infiniDtype_t dtype, std::unique_ptr opaque, + infiniDevice_t device_type, int device_id); + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding); + + size_t workspaceSize() const; + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::avg_pool3d::nvidia + +#endif // __AVG_POOL3D_NVIDIA_H__ diff --git a/src/infiniop/ops/avg_pool3d/operator.cc b/src/infiniop/ops/avg_pool3d/operator.cc new file mode 100644 index 000000000..f9e01b991 --- /dev/null +++ b/src/infiniop/ops/avg_pool3d/operator.cc @@ -0,0 +1,163 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/avg_pool3d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/avg_pool3d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/avg_pool3d_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/avg_pool3d_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/avg_pool3d_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPool3dDescriptor( + infiniopHandle_t handle, + infiniopAvgPool3dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + void *kernel_size, + void *stride, + void *padding) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::avg_pool3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + kernel_size, \ + stride, \ + padding) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, 
cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAvgPool3d( + infiniopAvgPool3dDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/dot/cpu/dot_cpu.cc b/src/infiniop/ops/dot/cpu/dot_cpu.cc new file mode 100644 index 000000000..2a45bc971 --- /dev/null +++ b/src/infiniop/ops/dot/cpu/dot_cpu.cc @@ -0,0 +1,105 @@ +#include "dot_cpu.h" +#include "../../../../utils.h" + +namespace op::dot::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // This op does not do implicit dtype conversion: y/a/b must match. 
+ if (b_desc->dtype() != dtype || y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 || y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + // Negative/broadcasted strides are not supported without an explicit base offset. + if (a_stride <= 0 || b_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: { + const fp16_t *a_ptr = reinterpret_cast(a); + const fp16_t *b_ptr = reinterpret_cast(b); + float result = 0.0f; + for (size_t i = 0; i < _n; ++i) { + result += utils::cast(a_ptr[i * _a_stride]) * utils::cast(b_ptr[i * _b_stride]); + } + *reinterpret_cast(y) = utils::cast(result); + break; + } + case INFINI_DTYPE_BF16: { + const bf16_t *a_ptr = reinterpret_cast(a); + const bf16_t *b_ptr = reinterpret_cast(b); + float result = 0.0f; + for (size_t i = 0; i < _n; ++i) { + result += utils::cast(a_ptr[i * _a_stride]) * utils::cast(b_ptr[i * _b_stride]); + } + *reinterpret_cast(y) = utils::cast(result); + break; + } + case INFINI_DTYPE_F32: { + const float *a_ptr = reinterpret_cast(a); + const float *b_ptr = reinterpret_cast(b); + float result = 0.0f; + for (size_t i = 0; i < _n; ++i) { + result += a_ptr[i * 
_a_stride] * b_ptr[i * _b_stride]; + } + *reinterpret_cast(y) = result; + break; + } + case INFINI_DTYPE_F64: { + const double *a_ptr = reinterpret_cast(a); + const double *b_ptr = reinterpret_cast(b); + double result = 0.0; + for (size_t i = 0; i < _n; ++i) { + result += a_ptr[i * _a_stride] * b_ptr[i * _b_stride]; + } + *reinterpret_cast(y) = result; + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::cpu diff --git a/src/infiniop/ops/dot/cpu/dot_cpu.h b/src/infiniop/ops/dot/cpu/dot_cpu.h new file mode 100644 index 000000000..5c8bbc113 --- /dev/null +++ b/src/infiniop/ops/dot/cpu/dot_cpu.h @@ -0,0 +1,49 @@ +#ifndef __DOT_CPU_H__ +#define __DOT_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include + +namespace op::dot::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::cpu + +#endif // __DOT_CPU_H__ diff --git a/src/infiniop/ops/dot/cuda/kernel.cuh b/src/infiniop/ops/dot/cuda/kernel.cuh new file mode 100644 index 000000000..0c4b1249d --- /dev/null +++ b/src/infiniop/ops/dot/cuda/kernel.cuh @@ 
-0,0 +1,40 @@ +#pragma once +#include "../../../reduce/cuda/reduce.cuh" +#include +#include +#include +#include + +namespace op::cuda { + +// Dot product kernel: computes dot(a, b) = sum(a * b) +template +__global__ void dot_kernel( + Tcompute *result, + const Tdata *a, + const Tdata *b, + size_t n, + ptrdiff_t a_stride, + ptrdiff_t b_stride) { + + Tcompute sum = 0; + + // Each thread computes partial dot product + for (size_t i = threadIdx.x; i < n; i += BLOCK_SIZE) { + Tcompute a_val = Tcompute(a[i * a_stride]); + Tcompute b_val = Tcompute(b[i * b_stride]); + sum += a_val * b_val; + } + + // Use CUB block-level reduction + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(sum); + + // Write result (only thread 0, since we only launch 1 block) + if (threadIdx.x == 0) { + *result = block_sum; + } +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/dot/metax/dot_metax.h b/src/infiniop/ops/dot/metax/dot_metax.h new file mode 100644 index 000000000..f7b4c0e21 --- /dev/null +++ b/src/infiniop/ops/dot/metax/dot_metax.h @@ -0,0 +1,47 @@ +#ifndef __DOT_METAX_H__ +#define __DOT_METAX_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/metax/metax_common.h" + +namespace op::dot::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() 
const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::metax + +#endif // __DOT_METAX_H__ diff --git a/src/infiniop/ops/dot/metax/dot_metax.maca b/src/infiniop/ops/dot/metax/dot_metax.maca new file mode 100644 index 000000000..79b12e8a3 --- /dev/null +++ b/src/infiniop/ops/dot/metax/dot_metax.maca @@ -0,0 +1,106 @@ +#include "dot_metax.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::dot::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 || y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + // Initialize result to zero + switch (_dtype) { + case INFINI_DTYPE_F16: { + float 
*result_f = nullptr; + CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); + CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + *reinterpret_cast(y) = __float2half(result_val); + CHECK_METAX(hcFree(result_f)); + break; + } + case INFINI_DTYPE_BF16: { + float *result_f = nullptr; + CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); + CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + *reinterpret_cast(y) = __float2bfloat16_rn(result_val); + CHECK_METAX(hcFree(result_f)); + break; + } + case INFINI_DTYPE_F32: { + float *result_f = reinterpret_cast(y); + CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *result_d = reinterpret_cast(y); + CHECK_METAX(hcMemsetAsync(result_d, 0, sizeof(double), hc_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( + result_d, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::metax diff --git a/src/infiniop/ops/dot/moore/dot_moore.h b/src/infiniop/ops/dot/moore/dot_moore.h new file mode 100644 index 000000000..eebc0e528 --- /dev/null 
+++ b/src/infiniop/ops/dot/moore/dot_moore.h @@ -0,0 +1,47 @@ +#ifndef __DOT_MOORE_H__ +#define __DOT_MOORE_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/moore/moore_common.h" + +namespace op::dot::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::moore + +#endif // __DOT_MOORE_H__ diff --git a/src/infiniop/ops/dot/moore/dot_moore.mu b/src/infiniop/ops/dot/moore/dot_moore.mu new file mode 100644 index 000000000..7b9e9ec77 --- /dev/null +++ b/src/infiniop/ops/dot/moore/dot_moore.mu @@ -0,0 +1,106 @@ +#include "dot_moore.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::dot::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 
|| y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + // Initialize result to zero + switch (_dtype) { + case INFINI_DTYPE_F16: { + float *result_f = nullptr; + CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); + CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); + *reinterpret_cast(y) = __float2half(result_val); + CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_BF16: { + float *result_f = nullptr; + CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + float result_val; + CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); + 
CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); + *reinterpret_cast(y) = __float2bfloat16_rn(result_val); + CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_F32: { + float *result_f = reinterpret_cast(y); + CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *result_d = reinterpret_cast(y); + CHECK_MOORE(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_d, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::moore diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cu b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu new file mode 100644 index 000000000..464e0a15e --- /dev/null +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu @@ -0,0 +1,121 @@ +#include "dot_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../utils.h" + +namespace op::dot::nvidia { + +__global__ void store_half_from_f32(half *dst, const float *src) { + if (threadIdx.x == 0) { + dst[0] = __float2half(src[0]); + } +} + +__global__ void store_bf16_from_f32(cuda_bfloat16 *dst, const float *src) { + if (threadIdx.x == 0) { + dst[0] = __float2bfloat16_rn(src[0]); + } +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto dtype = a_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // This op does not do implicit dtype 
conversion: y/a/b must match. + if (b_desc->dtype() != dtype || y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // Check that y is a scalar (0D tensor or shape [1]) + auto y_shape = y_desc->shape(); + if (y_shape.size() != 0 && (y_shape.size() != 1 || y_shape[0] != 1)) { + return INFINI_STATUS_BAD_PARAM; + } + + // Check that a and b are 1D vectors with same length + auto a_shape = a_desc->shape(); + auto b_shape = b_desc->shape(); + if (a_shape.size() != 1 || b_shape.size() != 1 || a_shape[0] != b_shape[0]) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t n = a_shape[0]; + ptrdiff_t a_stride = a_desc->strides()[0]; + ptrdiff_t b_stride = b_desc->strides()[0]; + + // Negative/broadcasted strides are not supported without an explicit base offset. + if (a_stride <= 0 || b_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(dtype, n, a_stride, b_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + // Initialize result to zero + switch (_dtype) { + case INFINI_DTYPE_F16: { + float *result_f = nullptr; + CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + store_half_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); + CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_BF16: { + float *result_f = nullptr; + CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); + CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + 
cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + store_bf16_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); + CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + break; + } + case INFINI_DTYPE_F32: { + float *result_f = reinterpret_cast(y); + CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_f, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *result_d = reinterpret_cast(y); + CHECK_CUDA(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); + cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( + result_d, reinterpret_cast(a), reinterpret_cast(b), + _n, _a_stride, _b_stride); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::dot::nvidia diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh new file mode 100644 index 000000000..76d22aec3 --- /dev/null +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh @@ -0,0 +1,48 @@ +#ifndef __DOT_NVIDIA_H__ +#define __DOT_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include + +namespace op::dot::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _n; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + + Descriptor(infiniDtype_t dtype, size_t n, ptrdiff_t a_stride, ptrdiff_t b_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _n(n), + _a_stride(a_stride), + _b_stride(b_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + 
infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::dot::nvidia + +#endif // __DOT_NVIDIA_H__ diff --git a/src/infiniop/ops/dot/operator.cc b/src/infiniop/ops/dot/operator.cc new file mode 100644 index 000000000..b0f8c6be9 --- /dev/null +++ b/src/infiniop/ops/dot/operator.cc @@ -0,0 +1,160 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/dot.h" + +#ifdef ENABLE_CPU_API +#include "cpu/dot_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/dot_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/dot_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/dot_moore.h" +#endif + +__C infiniStatus_t infiniopCreateDotDescriptor( + infiniopHandle_t handle, + infiniopDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::dot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + a_desc, \ + b_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return 
INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDot( + infiniopDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, a, b, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff 
--git a/src/infiniop/ops/histc/cpu/histc_cpu.cc b/src/infiniop/ops/histc/cpu/histc_cpu.cc new file mode 100644 index 000000000..d10e560f8 --- /dev/null +++ b/src/infiniop/ops/histc/cpu/histc_cpu.cc @@ -0,0 +1,150 @@ +#include "histc_cpu.h" +#include "../../../../utils.h" +#include +#include +#include + +namespace op::histc::cpu { + +utils::Result HistcInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + if (min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + // Input should be 1D + if (x_shape.size() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Output should be 1D with bins elements + if (y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + HistcInfo info; + info.input_size = x_shape[0]; + info.bins = bins; + info.min_val = min_val; + info.max_val = max_val; + info.input_stride = x_desc->strides()[0]; + info.output_stride = y_desc->strides()[0]; + + // This implementation assumes x points to the first logical element and uses linear indexing. + // Negative (or broadcasted) strides would require an explicit base offset. + if (info.input_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + // Writing a histogram into a broadcasted or negatively strided output is undefined. 
+ if (info.output_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Histc output is always float32. + if (y_desc->dtype() != INFINI_DTYPE_F32) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto info_result = HistcInfo::create(x_desc, y_desc, bins, min_val, max_val); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void histc_impl( + const HistcInfo &info, + float *y, + const T *x) { + + // Initialize output to zero (supports non-unit stride). 
+ for (int64_t b = 0; b < info.bins; ++b) { + y[b * info.output_stride] = 0.0f; + } + + const double bin_width = (info.max_val - info.min_val) / static_cast(info.bins); + + for (size_t i = 0; i < info.input_size; ++i) { + double val = utils::cast(x[i * info.input_stride]); + + // Skip values outside range + if (val < info.min_val || val > info.max_val) { + continue; + } + + // Calculate bin index + int64_t bin_idx = static_cast((val - info.min_val) / bin_width); + + // Handle edge case: max_val should go to last bin + if (bin_idx >= info.bins) { + bin_idx = info.bins - 1; + } + if (bin_idx < 0) { + bin_idx = 0; + } + + y[bin_idx * info.output_stride] += 1.0f; + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + float *y_ptr = reinterpret_cast(y); + + switch (_dtype) { + case INFINI_DTYPE_F16: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + histc_impl(_info, y_ptr, reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::cpu diff --git a/src/infiniop/ops/histc/cpu/histc_cpu.h b/src/infiniop/ops/histc/cpu/histc_cpu.h new file mode 100644 index 000000000..673e8ff4b --- /dev/null +++ b/src/infiniop/ops/histc/cpu/histc_cpu.h @@ -0,0 +1,60 @@ +#ifndef __HISTC_CPU_H__ +#define __HISTC_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::histc::cpu { + +struct HistcInfo { + size_t input_size; + int64_t bins; + double min_val; + double max_val; + ptrdiff_t input_stride; + ptrdiff_t output_stride; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + 
int64_t bins, + double min_val, + double max_val); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + HistcInfo _info; + + Descriptor(infiniDtype_t dtype, HistcInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::cpu + +#endif // __HISTC_CPU_H__ diff --git a/src/infiniop/ops/histc/cuda/kernel.cuh b/src/infiniop/ops/histc/cuda/kernel.cuh new file mode 100644 index 000000000..ee6dd4f4b --- /dev/null +++ b/src/infiniop/ops/histc/cuda/kernel.cuh @@ -0,0 +1,49 @@ +#pragma once +#include +#include +#include +#include + +namespace op::cuda { + +template +__global__ void histc_kernel( + float *hist, + const T *input, + size_t input_size, + ptrdiff_t input_stride, + int64_t bins, + double min_val, + double max_val) { + + size_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + + static_cast(threadIdx.x); + size_t stride = static_cast(blockDim.x) * static_cast(gridDim.x); + size_t input_stride_u = static_cast(input_stride); + + double bin_width = (max_val - min_val) / static_cast(bins); + + for (size_t i = idx; i < input_size; i += stride) { + double val = static_cast(input[i * input_stride_u]); + + // Skip values outside range + if (val < min_val || val > max_val) { + continue; + } + + // Calculate bin index + int64_t bin_idx = static_cast((val - min_val) / bin_width); + + // Handle edge case: max_val should go to last bin + if (bin_idx >= bins) { + bin_idx = bins - 1; + } + if (bin_idx < 0) { + 
bin_idx = 0; + } + + atomicAdd(&hist[bin_idx], 1.0f); + } +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/histc/metax/histc_metax.h b/src/infiniop/ops/histc/metax/histc_metax.h new file mode 100644 index 000000000..7ba259939 --- /dev/null +++ b/src/infiniop/ops/histc/metax/histc_metax.h @@ -0,0 +1,53 @@ +#ifndef __HISTC_METAX_H__ +#define __HISTC_METAX_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/metax/metax_common.h" + +namespace op::histc::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _input_size; + int64_t _bins; + double _min_val; + double _max_val; + ptrdiff_t _input_stride; + + Descriptor(infiniDtype_t dtype, size_t input_size, int64_t bins, + double min_val, double max_val, ptrdiff_t input_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_size(input_size), + _bins(bins), + _min_val(min_val), + _max_val(max_val), + _input_stride(input_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::metax + +#endif // __HISTC_METAX_H__ diff --git a/src/infiniop/ops/histc/metax/histc_metax.maca b/src/infiniop/ops/histc/metax/histc_metax.maca new file mode 100644 index 000000000..a72f0d980 --- /dev/null +++ b/src/infiniop/ops/histc/metax/histc_metax.maca @@ -0,0 +1,105 @@ +#include "histc_metax.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::histc::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0 || min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 1 || y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t input_size = x_shape[0]; + ptrdiff_t input_stride = x_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, input_size, bins, min_val, max_val, + input_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + + // Initialize output to zero + CHECK_METAX(hcMemsetAsync(y, 0, _bins * sizeof(float), hc_stream)); + + constexpr int BLOCK_SIZE = 256; + int num_blocks = (_input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_BF16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F32: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F64: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + 
return INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::metax diff --git a/src/infiniop/ops/histc/moore/histc_moore.h b/src/infiniop/ops/histc/moore/histc_moore.h new file mode 100644 index 000000000..877fb5bf3 --- /dev/null +++ b/src/infiniop/ops/histc/moore/histc_moore.h @@ -0,0 +1,53 @@ +#ifndef __HISTC_MOORE_H__ +#define __HISTC_MOORE_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/moore/moore_common.h" + +namespace op::histc::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t _input_size; + int64_t _bins; + double _min_val; + double _max_val; + ptrdiff_t _input_stride; + + Descriptor(infiniDtype_t dtype, size_t input_size, int64_t bins, + double min_val, double max_val, ptrdiff_t input_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_size(input_size), + _bins(bins), + _min_val(min_val), + _max_val(max_val), + _input_stride(input_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::moore + +#endif // __HISTC_MOORE_H__ diff --git a/src/infiniop/ops/histc/moore/histc_moore.mu b/src/infiniop/ops/histc/moore/histc_moore.mu new file mode 100644 index 000000000..6b10b6007 --- /dev/null +++ b/src/infiniop/ops/histc/moore/histc_moore.mu @@ -0,0 +1,105 @@ +#include "histc_moore.h" +#include "../cuda/kernel.cuh" +#include "../../../../utils.h" +#include +#include + +namespace op::histc::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + 
Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0 || min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 1 || y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t input_size = x_shape[0]; + ptrdiff_t input_stride = x_desc->strides()[0]; + + *desc_ptr = new Descriptor(dtype, input_size, bins, min_val, max_val, + input_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + + // Initialize output to zero + CHECK_MOORE(musaMemsetAsync(y, 0, _bins * sizeof(float), musa_stream)); + + constexpr int BLOCK_SIZE = 256; + int num_blocks = (_input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_BF16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F32: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F64: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return 
INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::moore diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cu b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu new file mode 100644 index 000000000..943dae6f7 --- /dev/null +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu @@ -0,0 +1,119 @@ +#include "histc_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../../utils.h" + +namespace op::histc::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + + if (bins <= 0 || min_val >= max_val) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // Histc output is always float32. This backend also requires a contiguous output. + if (y_desc->dtype() != INFINI_DTYPE_F32) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 1 || y_shape.size() != 1 || y_shape[0] != static_cast(bins)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t input_size = x_shape[0]; + ptrdiff_t input_stride = x_desc->strides()[0]; + ptrdiff_t output_stride = y_desc->strides()[0]; + + // This implementation treats y as a contiguous `float*` buffer. + if (output_stride != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + // Negative (or broadcasted) strides are not supported by this kernel without an explicit base offset. 
+ if (input_stride <= 0) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(dtype, input_size, bins, min_val, max_val, + input_stride, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + // Initialize output to zero + CHECK_CUDA(cudaMemsetAsync(y, 0, _bins * sizeof(float), cuda_stream)); + + constexpr int BLOCK_SIZE = 256; + int num_blocks = (_input_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_BF16: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F32: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + case INFINI_DTYPE_F64: + cuda::histc_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + _input_size, + _input_stride, + _bins, + _min_val, + _max_val); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::histc::nvidia diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh new file mode 100644 index 000000000..464f7ef03 --- /dev/null +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh @@ -0,0 +1,53 @@ +#ifndef __HISTC_NVIDIA_H__ +#define __HISTC_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" +#include "../../../devices/nvidia/nvidia_common.cuh" + +namespace op::histc::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + 
size_t _input_size; + int64_t _bins; + double _min_val; + double _max_val; + ptrdiff_t _input_stride; + + Descriptor(infiniDtype_t dtype, size_t input_size, int64_t bins, + double min_val, double max_val, ptrdiff_t input_stride, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _input_size(input_size), + _bins(bins), + _min_val(min_val), + _max_val(max_val), + _input_stride(input_stride) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::histc::nvidia + +#endif // __HISTC_NVIDIA_H__ diff --git a/src/infiniop/ops/histc/operator.cc b/src/infiniop/ops/histc/operator.cc new file mode 100644 index 000000000..acd205ee0 --- /dev/null +++ b/src/infiniop/ops/histc/operator.cc @@ -0,0 +1,163 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/histc.h" + +#ifdef ENABLE_CPU_API +#include "cpu/histc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/histc_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/histc_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/histc_moore.h" +#endif + +__C infiniStatus_t infiniopCreateHistcDescriptor( + infiniopHandle_t handle, + infiniopHistcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int64_t bins, + double min_val, + double max_val) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::histc::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + bins, \ + min_val, \ + max_val) + + switch 
(handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHistc( + infiniopHistcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.cc b/src/infiniop/ops/log10/cpu/log10_cpu.cc new file mode 100644 index 000000000..3148014dd --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.cc @@ -0,0 +1,53 @@ +#include "log10_cpu.h" + +namespace op::log10::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, 
stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log10::cpu \ No newline at end of file diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.h b/src/infiniop/ops/log10/cpu/log10_cpu.h new file mode 100644 index 000000000..6cc22a1e3 --- /dev/null +++ b/src/infiniop/ops/log10/cpu/log10_cpu.h @@ -0,0 +1,20 @@ +#ifndef __LOG10_CPU_H__ +#define __LOG10_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(log10, cpu) + +namespace op::log10::cpu { +typedef struct Log10Op { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::log10(x); + } +} Log10Op; +} // namespace op::log10::cpu + +#endif // __LOG10_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/log10/cuda/kernel.cuh b/src/infiniop/ops/log10/cuda/kernel.cuh new file mode 100644 index 000000000..f02b7f98e --- /dev/null +++ b/src/infiniop/ops/log10/cuda/kernel.cuh @@ -0,0 +1,31 @@ +#pragma once +#include // 包含 log10f, log10, log, logf 等 +#include +#include +#include +#include +#include + +namespace op::cuda { + +// 移除 high_precision_log10f 避免混淆,让 Log10Op 直接实现逻辑。 + +struct Log10Op { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // For F32: compute via F64 for improved accuracy. + return (float)log10((double)x); + } else if constexpr (std::is_same_v) { + return log10(x); + } else { + // For F16/BF16: promote to float, compute, then cast back. 
+ return (T)(float)log10((double)(float)x); + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/log10/log10.h b/src/infiniop/ops/log10/log10.h new file mode 100644 index 000000000..917a1db1c --- /dev/null +++ b/src/infiniop/ops/log10/log10.h @@ -0,0 +1,8 @@ +#ifndef __LOG10_H__ +#define __LOG10_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(log10, NAMESPACE) + +#endif // __LOG10_H__ \ No newline at end of file diff --git a/src/infiniop/ops/log10/metax/log10_metax.h b/src/infiniop/ops/log10/metax/log10_metax.h new file mode 100644 index 000000000..a881d05b5 --- /dev/null +++ b/src/infiniop/ops/log10/metax/log10_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOG10_METAX_API_H__ +#define __LOG10_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(log10, metax) + +#endif // __LOG10_METAX_API_H__ diff --git a/src/infiniop/ops/log10/metax/log10_metax.maca b/src/infiniop/ops/log10/metax/log10_metax.maca new file mode 100644 index 000000000..03195354d --- /dev/null +++ b/src/infiniop/ops/log10/metax/log10_metax.maca @@ -0,0 +1,60 @@ +#include "log10_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::log10::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, 
input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log10Op, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log10Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log10Op, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log10Op, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log10::metax diff --git a/src/infiniop/ops/log10/moore/log10_moore.h b/src/infiniop/ops/log10/moore/log10_moore.h new file mode 100644 index 000000000..fe148a0f1 --- /dev/null +++ b/src/infiniop/ops/log10/moore/log10_moore.h @@ -0,0 +1,8 @@ +#ifndef __LOG10_MOORE_API_H__ +#define __LOG10_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(log10, moore) + +#endif // __LOG10_MOORE_API_H__ diff --git a/src/infiniop/ops/log10/moore/log10_moore.mu b/src/infiniop/ops/log10/moore/log10_moore.mu new file mode 100644 index 000000000..66884d7af --- /dev/null +++ b/src/infiniop/ops/log10/moore/log10_moore.mu @@ -0,0 +1,61 @@ +#include "log10_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "log10_moore_kernel.h" + +namespace op::log10::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = 
reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::Log10Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::Log10Op, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::Log10Op, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::Log10Op, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log10::moore diff --git a/src/infiniop/ops/log10/moore/log10_moore_kernel.h b/src/infiniop/ops/log10/moore/log10_moore_kernel.h new file mode 100644 index 000000000..636663fc0 --- /dev/null +++ b/src/infiniop/ops/log10/moore/log10_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __LOG10_MOORE_KERNEL_H__ +#define __LOG10_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::log10::moore { + +typedef struct Log10Op { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr 
(std::is_same_v) { + // half2 path: convert to float, compute elementwise, pack back + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(log10f(x0), log10f(x1)); + } else if constexpr (std::is_same_v) { + // FP16 path: convert to float for accuracy on MUSA backend + float xf = __half2float(x); + return __float2half(log10f(xf)); + } else if constexpr (std::is_same_v) { + // BF16 path: compute in FP32 then cast back + float xf = __bfloat162float(x); + return __float2bfloat16_rn(log10f(xf)); + } else if constexpr (std::is_same_v) { + return log10f(x); + } else { // double + return log10(x); + } + } +} Log10Op; + +} // namespace op::log10::moore + +#endif // __LOG10_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu new file mode 100644 index 000000000..03196e816 --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log10_nvidia.cuh" + +namespace op::log10::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + 
+ if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log10Op, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log10Op, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log10Op, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log10Op, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log10::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh new file mode 100644 index 000000000..62af5f2a6 --- /dev/null +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG10_NVIDIA_H__ +#define __LOG10_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log10, nvidia) + +#endif // __LOG10_NVIDIA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/log10/operator.cc b/src/infiniop/ops/log10/operator.cc new file mode 100644 index 000000000..a5d1099bd --- /dev/null +++ b/src/infiniop/ops/log10/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log10.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log10_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/log10_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/log10_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/log10_moore.h" +#endif + +__C infiniStatus_t infiniopCreateLog10Descriptor( + infiniopHandle_t handle, + infiniopLog10Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::log10::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog10( + infiniopLog10Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.cc b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc new file mode 100644 index 000000000..ddbb64cca --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.cc @@ -0,0 +1,53 @@ +#include "log1p_cpu.h" + +namespace op::log1p::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t 
workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log1p::cpu diff --git a/src/infiniop/ops/log1p/cpu/log1p_cpu.h b/src/infiniop/ops/log1p/cpu/log1p_cpu.h new file mode 100644 index 000000000..33dc89159 --- /dev/null +++ b/src/infiniop/ops/log1p/cpu/log1p_cpu.h @@ -0,0 +1,20 @@ +#ifndef __LOG1P_CPU_H__ +#define __LOG1P_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(log1p, cpu) + +namespace op::log1p::cpu { +typedef struct Log1pOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::log1p(x); + } +} Log1pOp; +} // namespace op::log1p::cpu + +#endif // __LOG1P_CPU_H__ diff --git a/src/infiniop/ops/log1p/cuda/kernel.cuh b/src/infiniop/ops/log1p/cuda/kernel.cuh new file mode 100644 index 000000000..381bad957 --- /dev/null +++ b/src/infiniop/ops/log1p/cuda/kernel.cuh @@ -0,0 +1,29 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace op::cuda { + +struct Log1pOp { +public: + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // Use double precision for better accuracy. + return (float)log1p((double)x); + } else if constexpr (std::is_same_v) { + return log1p(x); + } else { + // For F16/BF16: promote to float, compute, then cast back. 
+ return (T)(float)log1p((double)(float)x); + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/log1p/log1p.h b/src/infiniop/ops/log1p/log1p.h new file mode 100644 index 000000000..8ba2b6a91 --- /dev/null +++ b/src/infiniop/ops/log1p/log1p.h @@ -0,0 +1,8 @@ +#ifndef __LOG1P_H__ +#define __LOG1P_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(log1p, NAMESPACE) + +#endif // __LOG1P_H__ diff --git a/src/infiniop/ops/log1p/metax/log1p_metax.h b/src/infiniop/ops/log1p/metax/log1p_metax.h new file mode 100644 index 000000000..f0fad5913 --- /dev/null +++ b/src/infiniop/ops/log1p/metax/log1p_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOG1P_METAX_API_H__ +#define __LOG1P_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(log1p, metax) + +#endif // __LOG1P_METAX_API_H__ diff --git a/src/infiniop/ops/log1p/metax/log1p_metax.maca b/src/infiniop/ops/log1p/metax/log1p_metax.maca new file mode 100644 index 000000000..0ca2803d3 --- /dev/null +++ b/src/infiniop/ops/log1p/metax/log1p_metax.maca @@ -0,0 +1,59 @@ +#include "log1p_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::log1p::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log1pOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log1pOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log1pOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log1pOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +}} // namespace op::log1p::metax diff --git a/src/infiniop/ops/log1p/moore/log1p_moore.h b/src/infiniop/ops/log1p/moore/log1p_moore.h new file mode 100644 index 000000000..974cbf5b3 --- /dev/null +++ b/src/infiniop/ops/log1p/moore/log1p_moore.h @@ -0,0 +1,8 @@ +#ifndef __LOG1P_MOORE_API_H__ +#define __LOG1P_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(log1p, moore) + +#endif // __LOG1P_MOORE_API_H__ diff --git a/src/infiniop/ops/log1p/moore/log1p_moore.mu b/src/infiniop/ops/log1p/moore/log1p_moore.mu new file mode 100644 index 000000000..7eed0c0d4 --- /dev/null +++ b/src/infiniop/ops/log1p/moore/log1p_moore.mu @@ -0,0 +1,61 @@ +#include "log1p_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "log1p_moore_kernel.h" + +namespace op::log1p::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + 
auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::Log1pOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::Log1pOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::Log1pOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::Log1pOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::log1p::moore diff --git a/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h new file mode 100644 index 000000000..ff42f0f5e --- /dev/null +++ b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h @@ -0,0 +1,39 @@ +#ifndef __LOG1P_MOORE_KERNEL_H__ +#define __LOG1P_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::log1p::moore { + +typedef struct Log1pOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // half2 path: 
convert to float, compute elementwise, pack back + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(log1pf(x0), log1pf(x1)); + } else if constexpr (std::is_same_v) { + // FP16 path: convert to float for accuracy on MUSA backend + float xf = __half2float(x); + return __float2half(log1pf(xf)); + } else if constexpr (std::is_same_v) { + // BF16 path: compute in FP32 then cast back + float xf = __bfloat162float(x); + return __float2bfloat16_rn(log1pf(xf)); + } else if constexpr (std::is_same_v) { + return log1pf(x); + } else { // double + return log1p(x); + } + } +} Log1pOp; + +} // namespace op::log1p::moore + +#endif // __LOG1P_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu new file mode 100644 index 000000000..1190713d4 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log1p_nvidia.cuh" + +namespace op::log1p::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < 
_workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::Log1pOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::Log1pOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::Log1pOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::Log1pOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log1p::nvidia diff --git a/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh new file mode 100644 index 000000000..ccef48746 --- /dev/null +++ b/src/infiniop/ops/log1p/nvidia/log1p_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG1P_NVIDIA_H__ +#define __LOG1P_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log1p, nvidia) + +#endif // __LOG1P_NVIDIA_H__ diff --git a/src/infiniop/ops/log1p/operator.cc b/src/infiniop/ops/log1p/operator.cc new file mode 100644 index 000000000..a0efc1d1b --- /dev/null +++ b/src/infiniop/ops/log1p/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log1p.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log1p_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/log1p_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/log1p_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/log1p_moore.h" +#endif + +__C infiniStatus_t infiniopCreateLog1pDescriptor( + infiniopHandle_t handle, + infiniopLog1pDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + 
case CASE: \ + return op::log1p::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog1p( + infiniopLog1pDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/avg_pool3d.py b/test/infiniop/avg_pool3d.py new file mode 100644 index 000000000..2d76c1b97 --- /dev/null +++ b/test/infiniop/avg_pool3d.py @@ -0,0 +1,153 @@ +import torch +import ctypes +from ctypes import c_uint64, c_size_t, c_void_p +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +# Test cases format: (x_shape, x_stride_or_None, kernel_size, stride_or_None, padding) +_TEST_CASES = [ + ((1, 2, 8, 8, 8), None, (2, 2, 2), None, (0, 0, 0)), + ((2, 3, 7, 9, 5), (756, 252, 36, 4, 1), (3, 3, 3), (2, 2, 1), (1, 1, 0)), + ((2, 1, 9, 11, 7), (693, 77, 77, 7, 1), (3, 2, 3), None, (1, 0, 1)), +] + +_TENSOR_DTYPES = [InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False 
+NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_avg_pool3d(x, kernel_size, stride, padding): + kwargs = {"kernel_size": kernel_size, "padding": padding} + if stride is not None: + kwargs["stride"] = stride + return torch.nn.functional.avg_pool3d(x, **kwargs) + + +def test( + handle, + device, + x_shape, + x_stride, + kernel_size, + stride, + padding, + dtype=torch.float32, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor(x_shape, x_stride, dtype, device) + # For arbitrary (possibly overlapping) strides, the operator reads memory + # according to (shape, strides) from the backing storage. Use actual_tensor + # (the strided view) as the reference input to match that behavior. + x_ref = x.actual_tensor() if x_stride is not None else x.torch_tensor() + y_ref = torch_avg_pool3d(x_ref, kernel_size, stride, padding) + y = TestTensor(tuple(y_ref.shape), None, dtype, device, mode="ones") + y.update_torch_tensor(y_ref) + + print( + f"Testing AvgPool3d on {InfiniDeviceNames[device]} with x_shape:{x_shape} x_stride:{x_stride} " + f"kernel_size:{kernel_size} stride:{stride} padding:{padding} dtype:{InfiniDtypeNames[dtype]}" + ) + + if sync is not None: + sync() + + ks_arr = (c_size_t * 3)(*kernel_size) + stride_ptr = None + if stride is not None: + s_arr = (c_size_t * 3)(*stride) + stride_ptr = ctypes.cast(s_arr, c_void_p) + pad_arr = (c_size_t * 3)(*padding) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPool3dDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ctypes.cast(ks_arr, c_void_p), + stride_ptr, + ctypes.cast(pad_arr, c_void_p), + ) + ) + + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPool3dWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_avg_pool3d(): + 
check_error( + LIBINFINIOP.infiniopAvgPool3d( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_avg_pool3d() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_avg_pool3d(x.torch_tensor(), kernel_size, stride, padding), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_avg_pool3d(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAvgPool3dDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/dot.py b/test/infiniop/dot.py new file mode 100644 index 000000000..346341ceb --- /dev/null +++ b/test/infiniop/dot.py @@ -0,0 +1,138 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, a_stride, b_stride + (3, None, None), + (8, (2,), (2,)), + (32, None, None), + (257, (3,), (3,)), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 5e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + n, + a_stride=None, + b_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + a = TestTensor((n,), a_stride, dtype, device) + b = TestTensor((n,), b_stride, dtype, device) + y = TestTensor((1,), None, dtype, device, mode="zeros") + + print( + f"Testing dot on {InfiniDeviceNames[device]} with n:{n} a_stride:{a_stride} b_stride:{b_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + y_ref = torch.dot(a.torch_tensor().reshape(-1), b.torch_tensor().reshape(-1)).reshape(1) + y.update_torch_tensor(y_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDotDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + for 
tensor in [a, b, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDotWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_dot(): + check_error( + LIBINFINIOP.infiniopDot( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_dot() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.dot(a.torch_tensor().reshape(-1), b.torch_tensor().reshape(-1)), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_dot(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyDotDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") + diff --git a/test/infiniop/histc.py b/test/infiniop/histc.py new file mode 100644 index 000000000..95327fdd1 --- /dev/null +++ b/test/infiniop/histc.py @@ -0,0 +1,153 @@ +import torch +import ctypes +from ctypes import c_uint64, c_int64, c_double +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # x_shape, x_stride, bins, min, max + ((100,), None, 10, 0.0, 1.0), + ((50,), None, 5, -1.0, 1.0), + ((20,), (2,), 8, 0.0, 2.0), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + # histc produces exact integer counts in float32 for these sizes + InfiniDtype.F16: {"atol": 0.0, "rtol": 0.0}, + InfiniDtype.BF16: {"atol": 0.0, "rtol": 0.0}, + InfiniDtype.F32: {"atol": 0.0, "rtol": 0.0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_histc(x, bins, min_val, max_val): + # torch.histc on CUDA does not support float16/bfloat16 directly. + return torch.histc(x.to(torch.float32), bins=bins, min=min_val, max=max_val) + + +def test( + handle, + device, + x_shape, + x_stride, + bins, + min_val, + max_val, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor(x_shape, x_stride, dtype, device) + # Make values fall into [min, max] and force edge hits. 
+ rng = max_val - min_val + x_tensor = x.torch_tensor() * rng + min_val + if x_tensor.numel() > 0: + flat = x_tensor.reshape(-1) + flat[0] = min_val + flat[-1] = max_val + x.set_tensor(x_tensor) + + y = TestTensor((bins,), None, InfiniDtype.F32, device, mode="zeros") + + print( + f"Testing Histc on {InfiniDeviceNames[device]} with x_shape:{x_shape} x_stride:{x_stride} " + f"bins:{bins} range:[{min_val},{max_val}] x_dtype:{InfiniDtypeNames[dtype]}" + ) + + y_ref = torch_histc(x.torch_tensor(), bins, min_val, max_val) + y.update_torch_tensor(y_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHistcDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + c_int64(bins), + c_double(min_val), + c_double(max_val), + ) + ) + + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHistcWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_histc(): + check_error( + LIBINFINIOP.infiniopHistc( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_histc() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: y_ref, device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_histc(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyHistcDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, 
_TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! \033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index cd3ad1b82..5d4e891de 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -11,7 +11,7 @@ POINTER, c_float, c_double, - c_uint64, + c_int64, c_bool, ) @@ -476,8 +476,171 @@ def logsoftmax_(lib): @OpRegister.operator -def conv_(lib): - pass +def log10_(lib): + lib.infiniopCreateLog10Descriptor.restype = c_int32 + lib.infiniopCreateLog10Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLog10WorkspaceSize.restype = c_int32 + lib.infiniopGetLog10WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLog10.restype = c_int32 + lib.infiniopLog10.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLog10Descriptor.restype = c_int32 + lib.infiniopDestroyLog10Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log1p_(lib): + lib.infiniopCreateLog1pDescriptor.restype = c_int32 + lib.infiniopCreateLog1pDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLog1pWorkspaceSize.restype = c_int32 + lib.infiniopGetLog1pWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLog1p.restype = c_int32 + lib.infiniopLog1p.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLog1pDescriptor.restype = c_int32 + lib.infiniopDestroyLog1pDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def histc_(lib): + 
lib.infiniopCreateHistcDescriptor.restype = c_int32 + lib.infiniopCreateHistcDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int64, + c_double, + c_double, + ] + + lib.infiniopGetHistcWorkspaceSize.restype = c_int32 + lib.infiniopGetHistcWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHistc.restype = c_int32 + lib.infiniopHistc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHistcDescriptor.restype = c_int32 + lib.infiniopDestroyHistcDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def dot_(lib): + lib.infiniopCreateDotDescriptor.restype = c_int32 + lib.infiniopCreateDotDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDotWorkspaceSize.restype = c_int32 + lib.infiniopGetDotWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDot.restype = c_int32 + lib.infiniopDot.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDotDescriptor.restype = c_int32 + lib.infiniopDestroyDotDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def avg_pool3d_(lib): + lib.infiniopCreateAvgPool3dDescriptor.restype = c_int32 + lib.infiniopCreateAvgPool3dDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_void_p, # kernel_size + c_void_p, # stride (nullable) + c_void_p, # padding (nullable) + ] + + lib.infiniopGetAvgPool3dWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPool3dWorkspaceSize.argtypes = [ + 
infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAvgPool3d.restype = c_int32 + lib.infiniopAvgPool3d.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAvgPool3dDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPool3dDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] @OpRegister.operator diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index b690e74d4..c0560dbde 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -188,8 +188,35 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): def update_torch_tensor(self, new_tensor: torch.Tensor): self._torch_tensor = new_tensor - def update_torch_tensor(self, new_tensor: torch.Tensor): - self._torch_tensor = new_tensor + def set_tensor(self, new_tensor: torch.Tensor): + """ + Replace the logical tensor values while preserving the descriptor's + declared shape/strides. This keeps reference (torch_tensor) and + backing storage (actual_tensor) consistent for strided tensors. + """ + t = new_tensor.to(to_torch_dtype(self.dt)).to(torch_device_map[self.device]) + + # The logical tensor used for reference computations follows the + # "torch view" shape derived from (shape, strides). 
+ torch_shape = [] + torch_strides = [] if self.strides is not None else None + for i in range(len(self.shape)): + if self.strides is not None and self.strides[i] == 0: + torch_shape.append(1) + torch_strides.append(1) + elif self.strides is not None and self.strides[i] != 0: + torch_shape.append(self.shape[i]) + torch_strides.append(self.strides[i]) + else: + torch_shape.append(self.shape[i]) + + assert list(t.shape) == torch_shape + + self._torch_tensor = t + if self.strides is not None: + self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) + else: + self._data_tensor = self._torch_tensor.clone() def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): diff --git a/test/infiniop/log10.py b/test/infiniop/log10.py new file mode 100644 index 000000000..4ba644d86 --- /dev/null +++ b/test/infiniop/log10.py @@ -0,0 +1,179 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, x_stride, y_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), (0, 1)), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (4, 0, 1)), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ((4, 4, 56320), None, None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options 
applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_log10(y, x): + # Input is preprocessed to be positive in the test harness. + torch.log10(x, out=y) + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_stride != y_stride: + return + y = x + else: + y = TestTensor(shape, y_stride, dtype, device, mode="ones") + + if y.is_broadcast(): + return + + print( + f"Testing Log10 on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Create positive input values for log10 operation + x_tensor = torch.abs(x.torch_tensor()) + 1e-6 # Add small epsilon to avoid log(0) + x.set_tensor(x_tensor) + + torch_log10(y.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLog10Descriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = 
c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLog10WorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log10(): + check_error( + LIBINFINIOP.infiniopLog10( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_log10() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_log10(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log10(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLog10Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/log1p.py b/test/infiniop/log1p.py new file mode 100644 index 000000000..cd98cc131 --- /dev/null +++ b/test/infiniop/log1p.py @@ -0,0 +1,162 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES_ = [ + # shape, x_stride, y_stride + ((2, 3), None, None), + ((1, 4, 8), (32, 8, 1), (32, 8, 1)), + ((3, 2, 5, 7), None, None), + ((1, 8, 9, 11), (792, 99, 11, 1), (792, 99, 11, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_log1p(y, x): + torch.log1p(x, out=y) + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor(shape, x_stride, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_stride != y_stride: + return + y = x + else: + y = TestTensor(shape, y_stride, dtype, device, mode="ones") + + if y.is_broadcast(): + return + + print( + f"Testing Log1p on {InfiniDeviceNames[device]} 
with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + torch_log1p(y.torch_tensor(), x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLog1pDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLog1pWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log1p(): + check_error( + LIBINFINIOP.infiniopLog1p( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_log1p() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_log1p(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log1p(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLog1pDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") From 02dc83eaddf3e40a98864754edd555b23482d705 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 9 Apr 2026 11:29:21 +0800 Subject: [PATCH 2/2] issue/1031 fix T1-1-2 --- include/infiniop/ops/avg_pool3d.h | 30 +++--- include/infiniop/ops/dot.h | 28 +++--- include/infiniop/ops/histc.h | 30 +++--- include/infiniop/ops/log10.h | 24 ++--- include/infiniop/ops/log1p.h | 24 ++--- src/infiniop/devices/metax/metax_common.h | 1 + src/infiniop/devices/metax/metax_ht2mc.h | 2 + .../devices/metax/metax_kernel_common.h | 1 + src/infiniop/devices/moore/moore_common.h | 3 + .../devices/moore/moore_kernel_common.h | 3 + .../ops/avg_pool3d/cpu/avg_pool3d_cpu.cc | 97 ++++++++----------- .../ops/avg_pool3d/cpu/avg_pool3d_cpu.h | 2 +- .../ops/avg_pool3d/metax/avg_pool3d_metax.h | 9 +- .../avg_pool3d/metax/avg_pool3d_metax.maca | 68 +++---------- .../ops/avg_pool3d/moore/avg_pool3d_moore.h | 4 +- .../ops/avg_pool3d/moore/avg_pool3d_moore.mu | 5 +- .../avg_pool3d/nvidia/avg_pool3d_nvidia.cu | 47 +++++---- .../avg_pool3d/nvidia/avg_pool3d_nvidia.cuh | 4 +- src/infiniop/ops/avg_pool3d/operator.cc | 42 ++++---- src/infiniop/ops/dot/cpu/dot_cpu.h | 4 +- src/infiniop/ops/dot/cuda/kernel.cuh | 19 ++-- src/infiniop/ops/dot/metax/dot_metax.h | 1 - src/infiniop/ops/dot/metax/dot_metax.maca | 84 ++++++++-------- src/infiniop/ops/dot/moore/dot_moore.h | 1 - src/infiniop/ops/dot/moore/dot_moore.mu | 85 ++++++++-------- src/infiniop/ops/dot/nvidia/dot_nvidia.cu | 76 +++++++-------- src/infiniop/ops/dot/nvidia/dot_nvidia.cuh | 2 +- src/infiniop/ops/dot/operator.cc | 30 +++--- src/infiniop/ops/histc/cpu/histc_cpu.h | 2 +- src/infiniop/ops/histc/cuda/kernel.cuh | 6 +- src/infiniop/ops/histc/metax/histc_metax.h | 1 - src/infiniop/ops/histc/metax/histc_metax.maca | 9 +- src/infiniop/ops/histc/moore/histc_moore.h | 1 - src/infiniop/ops/histc/moore/histc_moore.mu | 9 +- src/infiniop/ops/histc/nvidia/histc_nvidia.cu | 8 +- .../ops/histc/nvidia/histc_nvidia.cuh | 2 +- 
src/infiniop/ops/histc/operator.cc | 28 +++--- src/infiniop/ops/log10/cpu/log10_cpu.cc | 2 +- src/infiniop/ops/log10/cpu/log10_cpu.h | 2 +- src/infiniop/ops/log10/cuda/kernel.cuh | 16 ++- src/infiniop/ops/log10/log10.h | 2 +- .../ops/log10/moore/log10_moore_kernel.h | 4 +- src/infiniop/ops/log10/nvidia/log10_nvidia.cu | 2 +- .../ops/log10/nvidia/log10_nvidia.cuh | 2 +- src/infiniop/ops/log10/operator.cc | 10 +- src/infiniop/ops/log1p/cuda/kernel.cuh | 14 ++- src/infiniop/ops/log1p/metax/log1p_metax.maca | 1 + .../ops/log1p/moore/log1p_moore_kernel.h | 4 +- src/infiniop/ops/log1p/operator.cc | 8 +- test/infiniop/avg_pool3d.py | 2 +- 50 files changed, 390 insertions(+), 471 deletions(-) diff --git a/include/infiniop/ops/avg_pool3d.h b/include/infiniop/ops/avg_pool3d.h index 7170e7765..a54fbb263 100644 --- a/include/infiniop/ops/avg_pool3d.h +++ b/include/infiniop/ops/avg_pool3d.h @@ -5,23 +5,23 @@ typedef struct InfiniopDescriptor *infiniopAvgPool3dDescriptor_t; -__C __export infiniStatus_t infiniopCreateAvgPool3dDescriptor(infiniopHandle_t handle, - infiniopAvgPool3dDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - void *kernel_size, - void *stride, - void *padding); +__INFINI_C __export infiniStatus_t infiniopCreateAvgPool3dDescriptor(infiniopHandle_t handle, + infiniopAvgPool3dDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + void *kernel_size, + void *stride, + void *padding); -__C __export infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopAvgPool3d(infiniopAvgPool3dDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopAvgPool3d(infiniopAvgPool3dDescriptor_t desc, + void *workspace, + 
size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc); #endif diff --git a/include/infiniop/ops/dot.h b/include/infiniop/ops/dot.h index 7fa0d2659..1fb9197f9 100644 --- a/include/infiniop/ops/dot.h +++ b/include/infiniop/ops/dot.h @@ -5,22 +5,22 @@ typedef struct InfiniopDescriptor *infiniopDotDescriptor_t; -__C __export infiniStatus_t infiniopCreateDotDescriptor(infiniopHandle_t handle, - infiniopDotDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); +__INFINI_C __export infiniStatus_t infiniopCreateDotDescriptor(infiniopHandle_t handle, + infiniopDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); -__C __export infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopDot(infiniopDotDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *a, - const void *b, - void *stream); +__INFINI_C __export infiniStatus_t infiniopDot(infiniopDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + const void *b, + void *stream); -__C __export infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc); #endif diff --git a/include/infiniop/ops/histc.h b/include/infiniop/ops/histc.h index cd4695d4e..3e7232259 100644 --- a/include/infiniop/ops/histc.h +++ b/include/infiniop/ops/histc.h @@ -6,23 +6,23 @@ typedef struct InfiniopDescriptor *infiniopHistcDescriptor_t; -__C 
__export infiniStatus_t infiniopCreateHistcDescriptor(infiniopHandle_t handle, - infiniopHistcDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - int64_t bins, - double min_val, - double max_val); +__INFINI_C __export infiniStatus_t infiniopCreateHistcDescriptor(infiniopHandle_t handle, + infiniopHistcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int64_t bins, + double min_val, + double max_val); -__C __export infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopHistc(infiniopHistcDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopHistc(infiniopHistcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc); #endif diff --git a/include/infiniop/ops/log10.h b/include/infiniop/ops/log10.h index 7c105dcc3..5ca8ccd8d 100644 --- a/include/infiniop/ops/log10.h +++ b/include/infiniop/ops/log10.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopLog10Descriptor_t; -__C __export infiniStatus_t infiniopCreateLog10Descriptor(infiniopHandle_t handle, - infiniopLog10Descriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateLog10Descriptor(infiniopHandle_t handle, + infiniopLog10Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size); +__INFINI_C 
__export infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopLog10(infiniopLog10Descriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopLog10(infiniopLog10Descriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc); #endif diff --git a/include/infiniop/ops/log1p.h b/include/infiniop/ops/log1p.h index ebd608e9d..93f229c7d 100644 --- a/include/infiniop/ops/log1p.h +++ b/include/infiniop/ops/log1p.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopLog1pDescriptor_t; -__C __export infiniStatus_t infiniopCreateLog1pDescriptor(infiniopHandle_t handle, - infiniopLog1pDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateLog1pDescriptor(infiniopHandle_t handle, + infiniopLog1pDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopLog1p(infiniopLog1pDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopLog1p(infiniopLog1pDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc); +__INFINI_C __export infiniStatus_t 
infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc); #endif diff --git a/src/infiniop/devices/metax/metax_common.h b/src/infiniop/devices/metax/metax_common.h index a29d44791..2aef6c786 100644 --- a/src/infiniop/devices/metax/metax_common.h +++ b/src/infiniop/devices/metax/metax_common.h @@ -1,3 +1,4 @@ +#pragma once #include "../../../utils.h" #include "../pool.h" #include "metax_handle.h" diff --git a/src/infiniop/devices/metax/metax_ht2mc.h b/src/infiniop/devices/metax/metax_ht2mc.h index 9846c3054..e8b388307 100644 --- a/src/infiniop/devices/metax/metax_ht2mc.h +++ b/src/infiniop/devices/metax/metax_ht2mc.h @@ -61,8 +61,10 @@ #define hcdnnTensorDescriptor_t mcdnnTensorDescriptor_t #define hcdnnStatus_t mcdnnStatus_t #define hcdnnSetTensor4dDescriptor mcdnnSetTensor4dDescriptor +#define hcdnnSetTensorNdDescriptor mcdnnSetTensorNdDescriptor #define hcdnnSetStream mcdnnSetStream #define hcdnnSetPooling2dDescriptor mcdnnSetPooling2dDescriptor +#define hcdnnSetPoolingNdDescriptor mcdnnSetPoolingNdDescriptor #define hcdnnPoolingMode_t mcdnnPoolingMode_t #define hcdnnPoolingForward mcdnnPoolingForward #define hcdnnPoolingDescriptor_t mcdnnPoolingDescriptor_t diff --git a/src/infiniop/devices/metax/metax_kernel_common.h b/src/infiniop/devices/metax/metax_kernel_common.h index b51209b3b..3d2b31a5e 100644 --- a/src/infiniop/devices/metax/metax_kernel_common.h +++ b/src/infiniop/devices/metax/metax_kernel_common.h @@ -1,3 +1,4 @@ +#pragma once #define INFINIOP_METAX_KERNEL __global__ void #ifdef ENABLE_METAX_MC_API diff --git a/src/infiniop/devices/moore/moore_common.h b/src/infiniop/devices/moore/moore_common.h index 725e230f5..c60677127 100644 --- a/src/infiniop/devices/moore/moore_common.h +++ b/src/infiniop/devices/moore/moore_common.h @@ -1,3 +1,5 @@ +#ifndef __INFINIOP_MOORE_COMMON_H__ +#define __INFINIOP_MOORE_COMMON_H__ #include "../../../utils.h" #include "../pool.h" #include "moore_handle.h" @@ -42,3 +44,4 @@ class Handle::Internal { }; } // namespace 
device::moore +#endif diff --git a/src/infiniop/devices/moore/moore_kernel_common.h b/src/infiniop/devices/moore/moore_kernel_common.h index 5e4ba6c0f..e121fddf1 100644 --- a/src/infiniop/devices/moore/moore_kernel_common.h +++ b/src/infiniop/devices/moore/moore_kernel_common.h @@ -1,3 +1,5 @@ +#ifndef __INFINIOP_MOORE_KERNEL_COMMON_H__ +#define __INFINIOP_MOORE_KERNEL_COMMON_H__ #define INFINIOP_MOORE_KERNEL __global__ void #include @@ -72,3 +74,4 @@ exp_(const __mt_bfloat16 x) { float f_result = expf(f_val); return __float2bfloat16(f_result); } +#endif diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc index c280a2d92..9553c17f5 100644 --- a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.cc @@ -70,9 +70,7 @@ utils::Result AvgPool3dInfo::create( } // Calculate output dimensions. Guard against unsigned underflow when kernel > input + 2*pad. - if (pad_d > (std::numeric_limits::max() - input_d) / 2 || - pad_h > (std::numeric_limits::max() - input_h) / 2 || - pad_w > (std::numeric_limits::max() - input_w) / 2) { + if (pad_d > (std::numeric_limits::max() - input_d) / 2 || pad_h > (std::numeric_limits::max() - input_h) / 2 || pad_w > (std::numeric_limits::max() - input_w) / 2) { return INFINI_STATUS_BAD_PARAM; } size_t effective_d = input_d + 2 * pad_d; @@ -87,8 +85,7 @@ utils::Result AvgPool3dInfo::create( size_t output_w = (effective_w - kernel_w) / stride_w + 1; // Verify output shape - if (y_shape[0] != batch || y_shape[1] != channels || - y_shape[2] != output_d || y_shape[3] != output_h || y_shape[4] != output_w) { + if (y_shape[0] != batch || y_shape[1] != channels || y_shape[2] != output_d || y_shape[3] != output_h || y_shape[4] != output_w) { return INFINI_STATUS_BAD_TENSOR_SHAPE; } @@ -168,59 +165,51 @@ void avg_pool3d_impl( using Tacc = std::conditional_t, double, float>; const Tacc inv_kernel_size = Tacc(1) / static_cast(kernel_size); 
-#pragma omp parallel for collapse(2) - for (ptrdiff_t b = 0; b < static_cast(info.batch); ++b) { - for (ptrdiff_t c = 0; c < static_cast(info.channels); ++c) { - for (size_t od = 0; od < info.output_d; ++od) { - for (size_t oh = 0; oh < info.output_h; ++oh) { - for (size_t ow = 0; ow < info.output_w; ++ow) { - Tacc sum = Tacc(0); - - // Calculate input window - ptrdiff_t id_start = - static_cast(od) * static_cast(info.stride_d) - - static_cast(info.pad_d); - ptrdiff_t ih_start = - static_cast(oh) * static_cast(info.stride_h) - - static_cast(info.pad_h); - ptrdiff_t iw_start = - static_cast(ow) * static_cast(info.stride_w) - - static_cast(info.pad_w); - - for (size_t kd = 0; kd < info.kernel_d; ++kd) { - for (size_t kh = 0; kh < info.kernel_h; ++kh) { - for (size_t kw = 0; kw < info.kernel_w; ++kw) { - ptrdiff_t id = id_start + static_cast(kd); - ptrdiff_t ih = ih_start + static_cast(kh); - ptrdiff_t iw = iw_start + static_cast(kw); - - // Check bounds (accounting for padding) - if (id >= 0 && id < static_cast(info.input_d) && - ih >= 0 && ih < static_cast(info.input_h) && - iw >= 0 && iw < static_cast(info.input_w)) { - size_t x_idx = b * info.input_strides[0] + - c * info.input_strides[1] + - static_cast(id) * info.input_strides[2] + - static_cast(ih) * info.input_strides[3] + - static_cast(iw) * info.input_strides[4]; - sum += utils::cast(x[x_idx]); - } - } - } - } - - size_t y_idx = b * info.output_strides[0] + - c * info.output_strides[1] + - od * info.output_strides[2] + - oh * info.output_strides[3] + - ow * info.output_strides[4]; - // Match torch.nn.functional.avg_pool3d default behavior (count_include_pad=True): - // padding contributes zeros but still counts in the divisor. 
- y[y_idx] = utils::cast(sum * inv_kernel_size); + size_t total = info.batch * info.channels * info.output_d * info.output_h * info.output_w; + +#pragma omp parallel for + for (ptrdiff_t idx = 0; idx < (ptrdiff_t)total; ++idx) { + + size_t tmp = idx; + + size_t ow = tmp % info.output_w; + tmp /= info.output_w; + size_t oh = tmp % info.output_h; + tmp /= info.output_h; + size_t od = tmp % info.output_d; + tmp /= info.output_d; + size_t c = tmp % info.channels; + tmp /= info.channels; + size_t b = tmp; + + using Tacc = std::conditional_t, double, float>; + Tacc sum = Tacc(0); + + ptrdiff_t id_start = (ptrdiff_t)od * (ptrdiff_t)info.stride_d - (ptrdiff_t)info.pad_d; + ptrdiff_t ih_start = (ptrdiff_t)oh * (ptrdiff_t)info.stride_h - (ptrdiff_t)info.pad_h; + ptrdiff_t iw_start = (ptrdiff_t)ow * (ptrdiff_t)info.stride_w - (ptrdiff_t)info.pad_w; + + for (size_t kd = 0; kd < info.kernel_d; ++kd) { + for (size_t kh = 0; kh < info.kernel_h; ++kh) { + for (size_t kw = 0; kw < info.kernel_w; ++kw) { + + ptrdiff_t id = id_start + kd; + ptrdiff_t ih = ih_start + kh; + ptrdiff_t iw = iw_start + kw; + + if (id >= 0 && id < (ptrdiff_t)info.input_d && ih >= 0 && ih < (ptrdiff_t)info.input_h && iw >= 0 && iw < (ptrdiff_t)info.input_w) { + + size_t x_idx = b * info.input_strides[0] + c * info.input_strides[1] + (size_t)id * info.input_strides[2] + (size_t)ih * info.input_strides[3] + (size_t)iw * info.input_strides[4]; + + sum += utils::cast(x[x_idx]); } } } } + + size_t y_idx = b * info.output_strides[0] + c * info.output_strides[1] + od * info.output_strides[2] + oh * info.output_strides[3] + ow * info.output_strides[4]; + + y[y_idx] = utils::cast(sum * inv_kernel_size); } } diff --git a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h index f01aeb5cf..57be33f9e 100644 --- a/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h +++ b/src/infiniop/ops/avg_pool3d/cpu/avg_pool3d_cpu.h @@ -1,9 +1,9 @@ #ifndef __AVG_POOL3D_CPU_H__ #define 
__AVG_POOL3D_CPU_H__ +#include "../../../devices/cpu/common_cpu.h" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/cpu/common_cpu.h" #include namespace op::avg_pool3d::cpu { diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h index b4277393e..fdc838868 100644 --- a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h +++ b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.h @@ -3,14 +3,7 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/metax/metax_common.h" -#include "../../../devices/metax/metax_handle.h" - -#ifdef ENABLE_METAX_MC_API -#include -#else -#include -#endif +#include namespace op::avg_pool3d::metax { diff --git a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca index c7955812a..fdd2f7d15 100644 --- a/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca +++ b/src/infiniop/ops/avg_pool3d/metax/avg_pool3d_metax.maca @@ -1,34 +1,30 @@ -#include "avg_pool3d_metax.h" #include "../../../../utils.h" +#include "../../../devices/metax/metax_common.h" +#include "avg_pool3d_metax.h" namespace op::avg_pool3d::metax { struct Descriptor::Opaque { std::shared_ptr internal; -#ifdef ENABLE_METAX_MC_API - mcdnnTensorDescriptor_t x_desc = nullptr; - mcdnnTensorDescriptor_t y_desc = nullptr; - mcdnnPoolingDescriptor_t pool_desc = nullptr; -#else hcdnnTensorDescriptor_t x_desc = nullptr; hcdnnTensorDescriptor_t y_desc = nullptr; hcdnnPoolingDescriptor_t pool_desc = nullptr; -#endif + size_t workspace_size = 0; Opaque(std::shared_ptr internal_ptr) : internal(internal_ptr) {} ~Opaque() { -#ifdef ENABLE_METAX_MC_API - if (x_desc) mcdnnDestroyTensorDescriptor(x_desc); - if (y_desc) mcdnnDestroyTensorDescriptor(y_desc); - if (pool_desc) mcdnnDestroyPoolingDescriptor(pool_desc); -#else - if (x_desc) hcdnnDestroyTensorDescriptor(x_desc); - if (y_desc) 
hcdnnDestroyTensorDescriptor(y_desc); - if (pool_desc) hcdnnDestroyPoolingDescriptor(pool_desc); -#endif + if (x_desc) { + hcdnnDestroyTensorDescriptor(x_desc); + } + if (y_desc) { + hcdnnDestroyTensorDescriptor(y_desc); + } + if (pool_desc) { + hcdnnDestroyPoolingDescriptor(pool_desc); + } } }; @@ -88,15 +84,9 @@ infiniStatus_t Descriptor::create( auto opaque = std::make_unique(metax_handle->internal()); // Create hcdnn descriptors -#ifdef ENABLE_METAX_MC_API - CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->x_desc)); - CHECK_MCDNN(mcdnnCreateTensorDescriptor(&opaque->y_desc)); - CHECK_MCDNN(mcdnnCreatePoolingDescriptor(&opaque->pool_desc)); -#else CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->x_desc)); CHECK_MCDNN(hcdnnCreateTensorDescriptor(&opaque->y_desc)); CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&opaque->pool_desc)); -#endif // Set tensor descriptors int n = static_cast(x_shape[0]); @@ -114,8 +104,7 @@ infiniStatus_t Descriptor::create( static_cast(d * h * w), static_cast(h * w), static_cast(w), - 1 - }; + 1}; int output_dims[5] = {n, c, out_d, out_h, out_w}; int output_strides[5] = { @@ -123,27 +112,9 @@ infiniStatus_t Descriptor::create( static_cast(out_d * out_h * out_w), static_cast(out_h * out_w), static_cast(out_w), - 1 - }; + 1}; hcdnnDataType_t hcdnn_dtype = device::metax::getHcdnnDtype(dtype); -#ifdef ENABLE_METAX_MC_API - CHECK_MCDNN(mcdnnSetTensorNdDescriptor( - opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); - CHECK_MCDNN(mcdnnSetTensorNdDescriptor( - opaque->y_desc, hcdnn_dtype, 5, output_dims, output_strides)); - - // Set pooling descriptor - int window_dims[3] = {static_cast(kernel_d), static_cast(kernel_h), static_cast(kernel_w)}; - int padding_dims[3] = {static_cast(pad_d), static_cast(pad_h), static_cast(pad_w)}; - int stride_dims[3] = {static_cast(stride_d), static_cast(stride_h), static_cast(stride_w)}; - - CHECK_MCDNN(mcdnnSetPoolingNdDescriptor( - opaque->pool_desc, - MCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, - 
MCDNN_NOT_PROPAGATE_NAN, - 3, window_dims, padding_dims, stride_dims)); -#else CHECK_MCDNN(hcdnnSetTensorNdDescriptor( opaque->x_desc, hcdnn_dtype, 5, input_dims, input_strides)); CHECK_MCDNN(hcdnnSetTensorNdDescriptor( @@ -159,7 +130,6 @@ infiniStatus_t Descriptor::create( HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, HCDNN_NOT_PROPAGATE_NAN, 3, window_dims, padding_dims, stride_dims)); -#endif *desc_ptr = new Descriptor(dtype, std::move(opaque), handle->device, handle->device_id); return INFINI_STATUS_SUCCESS; @@ -194,15 +164,6 @@ infiniStatus_t Descriptor::calculate( beta = &beta_val; } -#ifdef ENABLE_METAX_MC_API - CHECK_MCDNN(mcdnnPoolingForward( - hcdnn_handle, - _opaque->pool_desc, - alpha, - _opaque->x_desc, x, - beta, - _opaque->y_desc, y)); -#else CHECK_MCDNN(hcdnnPoolingForward( hcdnn_handle, _opaque->pool_desc, @@ -210,7 +171,6 @@ infiniStatus_t Descriptor::calculate( _opaque->x_desc, x, beta, _opaque->y_desc, y)); -#endif return INFINI_STATUS_SUCCESS; }); diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h index 526d36543..4ee472b28 100644 --- a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.h @@ -3,9 +3,7 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/moore/moore_common.h" -#include "../../../devices/moore/moore_handle.h" -#include +#include namespace op::avg_pool3d::moore { diff --git a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu index 5e1cfd4a6..a538bb92c 100644 --- a/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu +++ b/src/infiniop/ops/avg_pool3d/moore/avg_pool3d_moore.mu @@ -1,7 +1,6 @@ -#include "avg_pool3d_moore.h" #include "../../../../utils.h" -// MOORE uses CUDA-compatible API, so we can reuse NVIDIA implementation -// by including the NVIDIA source and adapting stream types +#include 
"../../../devices/moore/moore_common.h" +#include "avg_pool3d_moore.h" namespace op::avg_pool3d::moore { diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu index ce78f7bd2..8bee6c14a 100644 --- a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cu @@ -1,5 +1,5 @@ -#include "avg_pool3d_nvidia.cuh" #include "../../../../utils.h" +#include "avg_pool3d_nvidia.cuh" #include #include @@ -16,9 +16,15 @@ struct Descriptor::Opaque { : internal(internal_ptr) {} ~Opaque() { - if (x_desc) cudnnDestroyTensorDescriptor(x_desc); - if (y_desc) cudnnDestroyTensorDescriptor(y_desc); - if (pool_desc) cudnnDestroyPoolingDescriptor(pool_desc); + if (x_desc) { + cudnnDestroyTensorDescriptor(x_desc); + } + if (y_desc) { + cudnnDestroyTensorDescriptor(y_desc); + } + if (pool_desc) { + cudnnDestroyPoolingDescriptor(pool_desc); + } } }; @@ -162,23 +168,22 @@ infiniStatus_t Descriptor::calculate( auto cuda_stream = reinterpret_cast(stream); return _opaque->internal->useCudnn(cuda_stream, [&](cudnnHandle_t cudnn_handle) { - - const void *alpha = nullptr; - const void *beta = nullptr; - if (_dtype == INFINI_DTYPE_F32) { - static const float alpha_val = 1.0f, beta_val = 0.0f; - alpha = &alpha_val; - beta = &beta_val; - } else if (_dtype == INFINI_DTYPE_F64) { - static const double alpha_val = 1.0, beta_val = 0.0; - alpha = &alpha_val; - beta = &beta_val; - } else { - // For F16/BF16, use float alpha/beta - static const float alpha_val = 1.0f, beta_val = 0.0f; - alpha = &alpha_val; - beta = &beta_val; - } + const void *alpha = nullptr; + const void *beta = nullptr; + if (_dtype == INFINI_DTYPE_F32) { + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } else if (_dtype == INFINI_DTYPE_F64) { + static const double alpha_val = 1.0, beta_val = 0.0; + alpha = &alpha_val; + beta = &beta_val; + } else { + // For 
F16/BF16, use float alpha/beta + static const float alpha_val = 1.0f, beta_val = 0.0f; + alpha = &alpha_val; + beta = &beta_val; + } CHECK_CUDNN(cudnnPoolingForward( cudnn_handle, diff --git a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh index 9ace65e7b..b2a1801cb 100644 --- a/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh +++ b/src/infiniop/ops/avg_pool3d/nvidia/avg_pool3d_nvidia.cuh @@ -1,10 +1,10 @@ #ifndef __AVG_POOL3D_NVIDIA_H__ #define __AVG_POOL3D_NVIDIA_H__ -#include "../../../operator.h" -#include "../../../tensor.h" #include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../operator.h" +#include "../../../tensor.h" #include namespace op::avg_pool3d::nvidia { diff --git a/src/infiniop/ops/avg_pool3d/operator.cc b/src/infiniop/ops/avg_pool3d/operator.cc index f9e01b991..df58e1282 100644 --- a/src/infiniop/ops/avg_pool3d/operator.cc +++ b/src/infiniop/ops/avg_pool3d/operator.cc @@ -15,7 +15,7 @@ #include "moore/avg_pool3d_moore.h" #endif -__C infiniStatus_t infiniopCreateAvgPool3dDescriptor( +__INFINI_C infiniStatus_t infiniopCreateAvgPool3dDescriptor( infiniopHandle_t handle, infiniopAvgPool3dDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -24,15 +24,15 @@ __C infiniStatus_t infiniopCreateAvgPool3dDescriptor( void *stride, void *padding) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::avg_pool3d::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ - kernel_size, \ - stride, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::avg_pool3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + kernel_size, \ + stride, \ padding) switch (handle->device) { @@ -60,10 +60,10 @@ __C infiniStatus_t infiniopCreateAvgPool3dDescriptor( #undef CREATE } -__C infiniStatus_t 
infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -91,7 +91,7 @@ __C infiniStatus_t infiniopGetAvgPool3dWorkspaceSize(infiniopAvgPool3dDescriptor return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopAvgPool3d( +__INFINI_C infiniStatus_t infiniopAvgPool3d( infiniopAvgPool3dDescriptor_t desc, void *workspace, size_t workspace_size, @@ -99,9 +99,9 @@ __C infiniStatus_t infiniopAvgPool3d( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, x, stream) switch (desc->device_type) { @@ -129,12 +129,12 @@ __C infiniStatus_t infiniopAvgPool3d( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyAvgPool3dDescriptor(infiniopAvgPool3dDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { diff --git a/src/infiniop/ops/dot/cpu/dot_cpu.h b/src/infiniop/ops/dot/cpu/dot_cpu.h index 5c8bbc113..95a7a4d12 100644 --- a/src/infiniop/ops/dot/cpu/dot_cpu.h +++ b/src/infiniop/ops/dot/cpu/dot_cpu.h @@ -1,10 +1,10 @@ #ifndef __DOT_CPU_H__ #define __DOT_CPU_H__ -#include "../../../operator.h" -#include "../../../tensor.h" #include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" #include "../../../reduce/cpu/reduce.h" +#include "../../../tensor.h" #include namespace op::dot::cpu { diff --git 
a/src/infiniop/ops/dot/cuda/kernel.cuh b/src/infiniop/ops/dot/cuda/kernel.cuh index 0c4b1249d..e044bf523 100644 --- a/src/infiniop/ops/dot/cuda/kernel.cuh +++ b/src/infiniop/ops/dot/cuda/kernel.cuh @@ -1,16 +1,13 @@ #pragma once + #include "../../../reduce/cuda/reduce.cuh" -#include -#include -#include #include namespace op::cuda { -// Dot product kernel: computes dot(a, b) = sum(a * b) -template +template __global__ void dot_kernel( - Tcompute *result, + Tout *result, const Tdata *a, const Tdata *b, size_t n, @@ -19,21 +16,17 @@ __global__ void dot_kernel( Tcompute sum = 0; - // Each thread computes partial dot product for (size_t i = threadIdx.x; i < n; i += BLOCK_SIZE) { - Tcompute a_val = Tcompute(a[i * a_stride]); - Tcompute b_val = Tcompute(b[i * b_stride]); - sum += a_val * b_val; + sum += Tcompute(a[i * a_stride]) * Tcompute(b[i * b_stride]); } - // Use CUB block-level reduction using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; + Tcompute block_sum = BlockReduce(temp_storage).Sum(sum); - // Write result (only thread 0, since we only launch 1 block) if (threadIdx.x == 0) { - *result = block_sum; + result[0] = static_cast(block_sum); } } diff --git a/src/infiniop/ops/dot/metax/dot_metax.h b/src/infiniop/ops/dot/metax/dot_metax.h index f7b4c0e21..c9e49f636 100644 --- a/src/infiniop/ops/dot/metax/dot_metax.h +++ b/src/infiniop/ops/dot/metax/dot_metax.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/metax/metax_common.h" namespace op::dot::metax { diff --git a/src/infiniop/ops/dot/metax/dot_metax.maca b/src/infiniop/ops/dot/metax/dot_metax.maca index 79b12e8a3..38c99536f 100644 --- a/src/infiniop/ops/dot/metax/dot_metax.maca +++ b/src/infiniop/ops/dot/metax/dot_metax.maca @@ -1,8 +1,9 @@ -#include "dot_metax.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/metax/metax_common.h" +#include 
"../../../devices/metax/metax_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "dot_metax.h" namespace op::dot::metax { @@ -50,56 +51,47 @@ infiniStatus_t Descriptor::calculate( auto hc_stream = reinterpret_cast(stream); constexpr unsigned int BLOCK_SIZE = 256; - // Initialize result to zero switch (_dtype) { - case INFINI_DTYPE_F16: { - float *result_f = nullptr; - CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); - CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); - CHECK_METAX(hcStreamSynchronize(hc_stream)); - *reinterpret_cast(y) = __float2half(result_val); - CHECK_METAX(hcFree(result_f)); + + case INFINI_DTYPE_F32: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_BF16: { - float *result_f = nullptr; - CHECK_METAX(hcMalloc((void **)&result_f, sizeof(float))); - CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_METAX(hcMemcpyAsync(&result_val, result_f, sizeof(float), hcMemcpyDeviceToHost, hc_stream)); - CHECK_METAX(hcStreamSynchronize(hc_stream)); - *reinterpret_cast(y) = __float2bfloat16_rn(result_val); - CHECK_METAX(hcFree(result_f)); + + case INFINI_DTYPE_F64: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F32: { - float *result_f = reinterpret_cast(y); - CHECK_METAX(hcMemsetAsync(result_f, 0, sizeof(float), hc_stream)); - 
cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_F16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F64: { - double *result_d = reinterpret_cast(y); - CHECK_METAX(hcMemsetAsync(result_d, 0, sizeof(double), hc_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, hc_stream>>>( - result_d, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_BF16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } - return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/dot/moore/dot_moore.h b/src/infiniop/ops/dot/moore/dot_moore.h index eebc0e528..b11178e15 100644 --- a/src/infiniop/ops/dot/moore/dot_moore.h +++ b/src/infiniop/ops/dot/moore/dot_moore.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/moore/moore_common.h" namespace op::dot::moore { diff --git a/src/infiniop/ops/dot/moore/dot_moore.mu b/src/infiniop/ops/dot/moore/dot_moore.mu index 7b9e9ec77..7516f64bb 100644 --- a/src/infiniop/ops/dot/moore/dot_moore.mu +++ b/src/infiniop/ops/dot/moore/dot_moore.mu @@ -1,8 +1,9 @@ -#include "dot_moore.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "dot_moore.h" namespace op::dot::moore { @@ -47,55 +48,47 @@ infiniStatus_t Descriptor::calculate( const void *b, void *stream) const { - auto cuda_stream = reinterpret_cast(stream); + auto cuda_stream = reinterpret_cast(stream); constexpr unsigned int 
BLOCK_SIZE = 256; - // Initialize result to zero switch (_dtype) { - case INFINI_DTYPE_F16: { - float *result_f = nullptr; - CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); - CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); - *reinterpret_cast(y) = __float2half(result_val); - CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F32: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_BF16: { - float *result_f = nullptr; - CHECK_MOORE(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - float result_val; - CHECK_MOORE(cudaMemcpyAsync(&result_val, result_f, sizeof(float), cudaMemcpyDeviceToHost, cuda_stream)); - CHECK_MOORE(cudaStreamSynchronize(cuda_stream)); - *reinterpret_cast(y) = __float2bfloat16_rn(result_val); - CHECK_MOORE(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F64: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F32: { - float *result_f = reinterpret_cast(y); - CHECK_MOORE(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case 
INFINI_DTYPE_F16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F64: { - double *result_d = reinterpret_cast(y); - CHECK_MOORE(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_d, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_BF16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cu b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu index 464e0a15e..d3a2e6f8e 100644 --- a/src/infiniop/ops/dot/nvidia/dot_nvidia.cu +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cu @@ -1,7 +1,9 @@ -#include "dot_nvidia.cuh" -#include "../cuda/kernel.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../../utils.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" +#include "dot_nvidia.cuh" namespace op::dot::nvidia { @@ -71,46 +73,44 @@ infiniStatus_t Descriptor::calculate( auto cuda_stream = reinterpret_cast(stream); constexpr unsigned int BLOCK_SIZE = 256; - // Initialize result to zero switch (_dtype) { - case INFINI_DTYPE_F16: { - float *result_f = nullptr; - CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - store_half_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); - CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F32: 
+ cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_BF16: { - float *result_f = nullptr; - CHECK_CUDA(cudaMallocAsync(&result_f, sizeof(float), cuda_stream)); - CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); - store_bf16_from_f32<<<1, 1, 0, cuda_stream>>>(reinterpret_cast(y), result_f); - CHECK_CUDA(cudaFreeAsync(result_f, cuda_stream)); + + case INFINI_DTYPE_F64: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F32: { - float *result_f = reinterpret_cast(y); - CHECK_CUDA(cudaMemsetAsync(result_f, 0, sizeof(float), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_f, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_F16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } - case INFINI_DTYPE_F64: { - double *result_d = reinterpret_cast(y); - CHECK_CUDA(cudaMemsetAsync(result_d, 0, sizeof(double), cuda_stream)); - cuda::dot_kernel<<<1, BLOCK_SIZE, 0, cuda_stream>>>( - result_d, reinterpret_cast(a), reinterpret_cast(b), - _n, _a_stride, _b_stride); + + case INFINI_DTYPE_BF16: + cuda::dot_kernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>( + reinterpret_cast(y), + reinterpret_cast(a), + reinterpret_cast(b), + _n, _a_stride, _b_stride); break; - } + default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh index 76d22aec3..8a15e50f1 100644 --- 
a/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh +++ b/src/infiniop/ops/dot/nvidia/dot_nvidia.cuh @@ -1,9 +1,9 @@ #ifndef __DOT_NVIDIA_H__ #define __DOT_NVIDIA_H__ +#include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/nvidia/nvidia_common.cuh" #include namespace op::dot::nvidia { diff --git a/src/infiniop/ops/dot/operator.cc b/src/infiniop/ops/dot/operator.cc index b0f8c6be9..10f9fda57 100644 --- a/src/infiniop/ops/dot/operator.cc +++ b/src/infiniop/ops/dot/operator.cc @@ -15,20 +15,20 @@ #include "moore/dot_moore.h" #endif -__C infiniStatus_t infiniopCreateDotDescriptor( +__INFINI_C infiniStatus_t infiniopCreateDotDescriptor( infiniopHandle_t handle, infiniopDotDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::dot::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ - a_desc, \ + y_desc, \ + a_desc, \ b_desc) switch (handle->device) { @@ -56,10 +56,10 @@ __C infiniStatus_t infiniopCreateDotDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -87,7 +87,7 @@ __C infiniStatus_t infiniopGetDotWorkspaceSize(infiniopDotDescriptor_t desc, siz return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDot( +__INFINI_C infiniStatus_t infiniopDot( infiniopDotDescriptor_t desc, void *workspace, size_t workspace_size, @@ -96,8 +96,8 @@ __C infiniStatus_t infiniopDot( const void *b, void *stream) { 
-#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, a, b, stream) @@ -126,11 +126,11 @@ __C infiniStatus_t infiniopDot( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyDotDescriptor(infiniopDotDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/histc/cpu/histc_cpu.h b/src/infiniop/ops/histc/cpu/histc_cpu.h index 673e8ff4b..4b4f1a315 100644 --- a/src/infiniop/ops/histc/cpu/histc_cpu.h +++ b/src/infiniop/ops/histc/cpu/histc_cpu.h @@ -1,9 +1,9 @@ #ifndef __HISTC_CPU_H__ #define __HISTC_CPU_H__ +#include "../../../devices/cpu/common_cpu.h" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/cpu/common_cpu.h" namespace op::histc::cpu { diff --git a/src/infiniop/ops/histc/cuda/kernel.cuh b/src/infiniop/ops/histc/cuda/kernel.cuh index ee6dd4f4b..75a2474ff 100644 --- a/src/infiniop/ops/histc/cuda/kernel.cuh +++ b/src/infiniop/ops/histc/cuda/kernel.cuh @@ -1,7 +1,4 @@ #pragma once -#include -#include -#include #include namespace op::cuda { @@ -16,8 +13,7 @@ __global__ void histc_kernel( double min_val, double max_val) { - size_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + - static_cast(threadIdx.x); + size_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + static_cast(threadIdx.x); size_t stride = static_cast(blockDim.x) * static_cast(gridDim.x); size_t input_stride_u = static_cast(input_stride); diff --git a/src/infiniop/ops/histc/metax/histc_metax.h b/src/infiniop/ops/histc/metax/histc_metax.h index 7ba259939..6bb74a87c 100644 --- a/src/infiniop/ops/histc/metax/histc_metax.h +++ b/src/infiniop/ops/histc/metax/histc_metax.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" 
-#include "../../../devices/metax/metax_common.h" namespace op::histc::metax { diff --git a/src/infiniop/ops/histc/metax/histc_metax.maca b/src/infiniop/ops/histc/metax/histc_metax.maca index a72f0d980..7d290b7d2 100644 --- a/src/infiniop/ops/histc/metax/histc_metax.maca +++ b/src/infiniop/ops/histc/metax/histc_metax.maca @@ -1,8 +1,9 @@ -#include "histc_metax.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "histc_metax.h" namespace op::histc::metax { diff --git a/src/infiniop/ops/histc/moore/histc_moore.h b/src/infiniop/ops/histc/moore/histc_moore.h index 877fb5bf3..2d9cb1d4f 100644 --- a/src/infiniop/ops/histc/moore/histc_moore.h +++ b/src/infiniop/ops/histc/moore/histc_moore.h @@ -3,7 +3,6 @@ #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/moore/moore_common.h" namespace op::histc::moore { diff --git a/src/infiniop/ops/histc/moore/histc_moore.mu b/src/infiniop/ops/histc/moore/histc_moore.mu index 6b10b6007..cc3173e94 100644 --- a/src/infiniop/ops/histc/moore/histc_moore.mu +++ b/src/infiniop/ops/histc/moore/histc_moore.mu @@ -1,8 +1,9 @@ -#include "histc_moore.h" -#include "../cuda/kernel.cuh" #include "../../../../utils.h" -#include -#include +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" + +#include "../cuda/kernel.cuh" +#include "histc_moore.h" namespace op::histc::moore { diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cu b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu index 943dae6f7..43daf7d0f 100644 --- a/src/infiniop/ops/histc/nvidia/histc_nvidia.cu +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cu @@ -1,7 +1,9 @@ -#include "histc_nvidia.cuh" -#include "../cuda/kernel.cuh" -#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../../utils.h" 
+#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" +#include "histc_nvidia.cuh" namespace op::histc::nvidia { diff --git a/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh index 464f7ef03..60acb6d3d 100644 --- a/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh +++ b/src/infiniop/ops/histc/nvidia/histc_nvidia.cuh @@ -1,9 +1,9 @@ #ifndef __HISTC_NVIDIA_H__ #define __HISTC_NVIDIA_H__ +#include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../operator.h" #include "../../../tensor.h" -#include "../../../devices/nvidia/nvidia_common.cuh" namespace op::histc::nvidia { diff --git a/src/infiniop/ops/histc/operator.cc b/src/infiniop/ops/histc/operator.cc index acd205ee0..ed4d07662 100644 --- a/src/infiniop/ops/histc/operator.cc +++ b/src/infiniop/ops/histc/operator.cc @@ -15,7 +15,7 @@ #include "moore/histc_moore.h" #endif -__C infiniStatus_t infiniopCreateHistcDescriptor( +__INFINI_C infiniStatus_t infiniopCreateHistcDescriptor( infiniopHandle_t handle, infiniopHistcDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -24,15 +24,15 @@ __C infiniStatus_t infiniopCreateHistcDescriptor( double min_val, double max_val) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::histc::NAMESPACE::Descriptor::create( \ - handle, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::histc::NAMESPACE::Descriptor::create( \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ - bins, \ - min_val, \ + y_desc, \ + x_desc, \ + bins, \ + min_val, \ max_val) switch (handle->device) { @@ -60,7 +60,7 @@ __C infiniStatus_t infiniopCreateHistcDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ 
case CASE: \ @@ -91,7 +91,7 @@ __C infiniStatus_t infiniopGetHistcWorkspaceSize(infiniopHistcDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopHistc( +__INFINI_C infiniStatus_t infiniopHistc( infiniopHistcDescriptor_t desc, void *workspace, size_t workspace_size, @@ -99,7 +99,7 @@ __C infiniStatus_t infiniopHistc( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ +#define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, x, stream) @@ -129,10 +129,10 @@ __C infiniStatus_t infiniopHistc( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyHistcDescriptor(infiniopHistcDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ +#define DELETE(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.cc b/src/infiniop/ops/log10/cpu/log10_cpu.cc index 3148014dd..fb89c86cc 100644 --- a/src/infiniop/ops/log10/cpu/log10_cpu.cc +++ b/src/infiniop/ops/log10/cpu/log10_cpu.cc @@ -50,4 +50,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::log10::cpu \ No newline at end of file +} // namespace op::log10::cpu diff --git a/src/infiniop/ops/log10/cpu/log10_cpu.h b/src/infiniop/ops/log10/cpu/log10_cpu.h index 6cc22a1e3..b6aed710c 100644 --- a/src/infiniop/ops/log10/cpu/log10_cpu.h +++ b/src/infiniop/ops/log10/cpu/log10_cpu.h @@ -17,4 +17,4 @@ typedef struct Log10Op { } Log10Op; } // namespace op::log10::cpu -#endif // __LOG10_CPU_H__ \ No newline at end of file +#endif // __LOG10_CPU_H__ diff --git a/src/infiniop/ops/log10/cuda/kernel.cuh b/src/infiniop/ops/log10/cuda/kernel.cuh index f02b7f98e..189dc457f 100644 --- a/src/infiniop/ops/log10/cuda/kernel.cuh +++ b/src/infiniop/ops/log10/cuda/kernel.cuh @@ -1,15 +1,10 @@ -#pragma once +#ifndef __LOG10_KERNEL_CUH__ +#define __LOG10_KERNEL_CUH__ 
#include // 包含 log10f, log10, log, logf 等 -#include -#include -#include -#include #include namespace op::cuda { -// 移除 high_precision_log10f 避免混淆,让 Log10Op 直接实现逻辑。 - struct Log10Op { public: static constexpr size_t num_inputs = 1; @@ -18,14 +13,15 @@ public: __device__ __forceinline__ T operator()(const T &x) const { if constexpr (std::is_same_v) { // For F32: compute via F64 for improved accuracy. - return (float)log10((double)x); + return (float)::log10((double)x); } else if constexpr (std::is_same_v) { - return log10(x); + return ::log10(x); } else { // For F16/BF16: promote to float, compute, then cast back. - return (T)(float)log10((double)(float)x); + return (T)(float)::log10((double)(float)x); } } }; } // namespace op::cuda +#endif diff --git a/src/infiniop/ops/log10/log10.h b/src/infiniop/ops/log10/log10.h index 917a1db1c..4519d6ac3 100644 --- a/src/infiniop/ops/log10/log10.h +++ b/src/infiniop/ops/log10/log10.h @@ -5,4 +5,4 @@ #define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(log10, NAMESPACE) -#endif // __LOG10_H__ \ No newline at end of file +#endif // __LOG10_H__ diff --git a/src/infiniop/ops/log10/moore/log10_moore_kernel.h b/src/infiniop/ops/log10/moore/log10_moore_kernel.h index 636663fc0..aa905c12d 100644 --- a/src/infiniop/ops/log10/moore/log10_moore_kernel.h +++ b/src/infiniop/ops/log10/moore/log10_moore_kernel.h @@ -2,8 +2,6 @@ #define __LOG10_MOORE_KERNEL_H__ #include -#include -#include #include namespace op::log10::moore { @@ -29,7 +27,7 @@ typedef struct Log10Op { } else if constexpr (std::is_same_v) { return log10f(x); } else { // double - return log10(x); + return ::log10(x); } } } Log10Op; diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu index 03196e816..815170274 100644 --- a/src/infiniop/ops/log10/nvidia/log10_nvidia.cu +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cu @@ -56,4 +56,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace 
op::log10::nvidia \ No newline at end of file +} // namespace op::log10::nvidia diff --git a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh index 62af5f2a6..d1f93465b 100644 --- a/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh +++ b/src/infiniop/ops/log10/nvidia/log10_nvidia.cuh @@ -5,4 +5,4 @@ ELEMENTWISE_DESCRIPTOR(log10, nvidia) -#endif // __LOG10_NVIDIA_H__ \ No newline at end of file +#endif // __LOG10_NVIDIA_H__ diff --git a/src/infiniop/ops/log10/operator.cc b/src/infiniop/ops/log10/operator.cc index a5d1099bd..31a2edaa3 100644 --- a/src/infiniop/ops/log10/operator.cc +++ b/src/infiniop/ops/log10/operator.cc @@ -15,7 +15,7 @@ #include "moore/log10_moore.h" #endif -__C infiniStatus_t infiniopCreateLog10Descriptor( +__INFINI_C infiniStatus_t infiniopCreateLog10Descriptor( infiniopHandle_t handle, infiniopLog10Descriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateLog10Descriptor( #undef CREATE } -__C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetLog10WorkspaceSize(infiniopLog10Descriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopLog10( +__INFINI_C infiniStatus_t infiniopLog10( infiniopLog10Descriptor_t desc, void *workspace, size_t workspace_size, @@ -123,7 +123,7 @@ __C infiniStatus_t infiniopLog10( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ @@ -154,4 +154,4 @@ infiniopDestroyLog10Descriptor(infiniopLog10Descriptor_t desc) { } #undef DELETE -} \ No newline at end of file +} diff --git a/src/infiniop/ops/log1p/cuda/kernel.cuh 
b/src/infiniop/ops/log1p/cuda/kernel.cuh index 381bad957..cc46c859a 100644 --- a/src/infiniop/ops/log1p/cuda/kernel.cuh +++ b/src/infiniop/ops/log1p/cuda/kernel.cuh @@ -1,9 +1,6 @@ -#pragma once +#ifndef __LOG1P_KERNEL_CUH__ +#define __LOG1P_KERNEL_CUH__ #include -#include -#include -#include -#include #include namespace op::cuda { @@ -16,14 +13,15 @@ public: __device__ __forceinline__ T operator()(const T &x) const { if constexpr (std::is_same_v) { // Use double precision for better accuracy. - return (float)log1p((double)x); + return (float)::log1p((double)x); } else if constexpr (std::is_same_v) { - return log1p(x); + return ::log1p(x); } else { // For F16/BF16: promote to float, compute, then cast back. - return (T)(float)log1p((double)(float)x); + return (T)(float)::log1p((double)(float)x); } } }; } // namespace op::cuda +#endif diff --git a/src/infiniop/ops/log1p/metax/log1p_metax.maca b/src/infiniop/ops/log1p/metax/log1p_metax.maca index 0ca2803d3..13ee7f618 100644 --- a/src/infiniop/ops/log1p/metax/log1p_metax.maca +++ b/src/infiniop/ops/log1p/metax/log1p_metax.maca @@ -56,4 +56,5 @@ infiniStatus_t Descriptor::calculate( } return INFINI_STATUS_SUCCESS; +} } // namespace op::log1p::metax diff --git a/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h index ff42f0f5e..426f3ed75 100644 --- a/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h +++ b/src/infiniop/ops/log1p/moore/log1p_moore_kernel.h @@ -2,8 +2,6 @@ #define __LOG1P_MOORE_KERNEL_H__ #include -#include -#include #include namespace op::log1p::moore { @@ -29,7 +27,7 @@ typedef struct Log1pOp { } else if constexpr (std::is_same_v) { return log1pf(x); } else { // double - return log1p(x); + return ::log1p(x); } } } Log1pOp; diff --git a/src/infiniop/ops/log1p/operator.cc b/src/infiniop/ops/log1p/operator.cc index a0efc1d1b..9c422cf6d 100644 --- a/src/infiniop/ops/log1p/operator.cc +++ b/src/infiniop/ops/log1p/operator.cc @@ -15,7 +15,7 @@ #include 
"moore/log1p_moore.h" #endif -__C infiniStatus_t infiniopCreateLog1pDescriptor( +__INFINI_C infiniStatus_t infiniopCreateLog1pDescriptor( infiniopHandle_t handle, infiniopLog1pDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -54,7 +54,7 @@ __C infiniStatus_t infiniopCreateLog1pDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetLog1pWorkspaceSize(infiniopLog1pDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopLog1p( +__INFINI_C infiniStatus_t infiniopLog1p( infiniopLog1pDescriptor_t desc, void *workspace, size_t workspace_size, @@ -123,7 +123,7 @@ __C infiniStatus_t infiniopLog1p( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyLog1pDescriptor(infiniopLog1pDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ diff --git a/test/infiniop/avg_pool3d.py b/test/infiniop/avg_pool3d.py index 2d76c1b97..2971a85bc 100644 --- a/test/infiniop/avg_pool3d.py +++ b/test/infiniop/avg_pool3d.py @@ -25,7 +25,7 @@ # Test cases format: (x_shape, x_stride_or_None, kernel_size, stride_or_None, padding) _TEST_CASES = [ ((1, 2, 8, 8, 8), None, (2, 2, 2), None, (0, 0, 0)), - ((2, 3, 7, 9, 5), (756, 252, 36, 4, 1), (3, 3, 3), (2, 2, 1), (1, 1, 0)), + ((2, 3, 7, 9, 5), None, (3, 3, 3), (2, 2, 1), (1, 1, 0)), ((2, 1, 9, 11, 7), (693, 77, 77, 7, 1), (3, 2, 3), None, (1, 0, 1)), ]