4 changes: 2 additions & 2 deletions include/infinicore/adaptor/aten_adaptor.hpp
@@ -6,9 +6,9 @@
 #include <ATen/ATen.h>
 
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
-#include <c10/cuda/CUDAStream.h>
-#include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
 #endif
 
 namespace infinicore::adaptor {
1 change: 1 addition & 0 deletions include/infinicore/nn.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
 #include "nn/embedding.hpp"
+#include "nn/layernorm.hpp"
 #include "nn/linear.hpp"
 #include "nn/rmsnorm.hpp"
10 changes: 5 additions & 5 deletions include/infinicore/nn/embedding.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "module.hpp"
 #include "../ops.hpp"
+#include "module.hpp"
 #include <optional>
 
 namespace infinicore::nn {
@@ -78,10 +78,10 @@ class Embedding : public Module {
     INFINICORE_NN_PARAMETER(weight);
 
 private:
-    size_t num_embeddings_; // Vocabulary size
-    size_t embedding_dim_; // Embedding dimension
-    std::optional<int64_t> padding_idx_; // Optional padding index
-    DataType dtype_; // Data type for embedding weights
+    size_t num_embeddings_;              // Vocabulary size
+    size_t embedding_dim_;               // Embedding dimension
+    std::optional<int64_t> padding_idx_; // Optional padding index
+    DataType dtype_;                     // Data type for embedding weights
 };
 
 } // namespace infinicore::nn
60 changes: 60 additions & 0 deletions include/infinicore/nn/layernorm.hpp
@@ -0,0 +1,60 @@
#pragma once

#include "../ops.hpp"
#include "module.hpp"

namespace infinicore::nn {

/**
 * @brief Layer Normalization
 *
 * Applies LayerNorm over the last dimension.
 *
 * Formula: y = (x - mean) / sqrt(var + eps) * weight + bias
 */
class LayerNorm : public Module {
public:
    /**
     * @brief Construct a LayerNorm layer
     *
     * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size)
     * @param eps Small constant for numerical stability (default: 1e-5)
     * @param dtype Data type for the weight/bias (default: DataType::F32)
     * @param device Device to create the parameters on
     */
    LayerNorm(size_t normalized_shape,
              double eps = 1e-5,
              const DataType &dtype = DataType::F32,
              const Device &device = Device());

    /**
     * @brief Forward pass: apply LayerNorm
     *
     * @param x Input tensor of shape (*, normalized_shape)
     * @return Normalized tensor with same shape as input
     */
    Tensor forward(const Tensor &x) const;

    // Module information
    size_t normalized_shape() const { return normalized_shape_; }
    double eps() const { return eps_; }
    DataType dtype() const { return dtype_; }

    // String representation
    std::string extra_repr() const;

    // Accessors for parameters
    Tensor weight() const { return weight_; }
    Tensor bias() const { return bias_; }

protected:
    INFINICORE_NN_PARAMETER(weight);
    INFINICORE_NN_PARAMETER(bias);

private:
    size_t normalized_shape_;
    double eps_;
    DataType dtype_;
};

} // namespace infinicore::nn
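
For reference, the formula in the class comment, y = (x - mean) / sqrt(var + eps) * weight + bias, can be sanity-checked against a plain C++ sketch over a single feature row. This is a minimal illustration of the math under the usual biased-variance convention, independent of the infinicore Tensor API, not the kernel the module dispatches to:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Minimal sketch: y = (x - mean) / sqrt(var + eps) * weight + bias
// over one row of normalized_shape floats.
std::vector<float> layer_norm_ref(const std::vector<float> &x,
                                  const std::vector<float> &weight,
                                  const std::vector<float> &bias,
                                  float eps = 1e-5f) {
    const std::size_t n = x.size();
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= static_cast<float>(n);
    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= static_cast<float>(n); // biased variance, as in standard LayerNorm
    std::vector<float> y(n);
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = (x[i] - mean) / std::sqrt(var + eps) * weight[i] + bias[i];
    }
    return y;
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> w(4, 1.0f), b(4, 0.0f);
    for (float v : layer_norm_ref(x, w, b)) std::printf("%f\n", v); // ~ -1.34, -0.45, 0.45, 1.34
}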
2 changes: 1 addition & 1 deletion include/infinicore/nn/module.hpp
@@ -3,10 +3,10 @@
 #include "../tensor.hpp"
 #include "parameter.hpp"
 
+#include <spdlog/spdlog.h>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>
-#include <spdlog/spdlog.h>
 
 namespace infinicore::nn {
 class Module {
8 changes: 8 additions & 0 deletions include/infinicore/ops.hpp
@@ -14,26 +14,34 @@
#include "ops/binary_cross_entropy_with_logits.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cdist.hpp"
#include "ops/conv2d.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmin.hpp"
#include "ops/fmod.hpp"
#include "ops/gelu.hpp"
#include "ops/gelutanh.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
#include "ops/layer_norm.hpp"
#include "ops/linear.hpp"
#include "ops/matmul.hpp"
#include "ops/ones.hpp"
#include "ops/paged_attention.hpp"
#include "ops/paged_attention_prefill.hpp"
#include "ops/paged_caching.hpp"
#include "ops/per_tensor_dequant_i8.hpp"
#include "ops/per_tensor_quant_i8.hpp"
#include "ops/quickgelu.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/reciprocal.hpp"
#include "ops/relu.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/softmax.hpp"
#include "ops/swiglu.hpp"
38 changes: 38 additions & 0 deletions include/infinicore/ops/conv2d.hpp
@@ -0,0 +1,38 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

#include <cstddef>
#include <vector>

namespace infinicore::op {
class Conv2d {
public:
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor,
                            const size_t *, const size_t *, const size_t *, size_t);
    static void execute(Tensor output,
                        Tensor input,
                        Tensor weight,
                        Tensor bias,
                        const size_t *pads,
                        const size_t *strides,
                        const size_t *dilations,
                        size_t n);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor conv2d(Tensor input,
              Tensor weight,
              Tensor bias,
              const std::vector<size_t> &pads,
              const std::vector<size_t> &strides,
              const std::vector<size_t> &dilations);
void conv2d_(Tensor output,
             Tensor input,
             Tensor weight,
             Tensor bias,
             const std::vector<size_t> &pads,
             const std::vector<size_t> &strides,
             const std::vector<size_t> &dilations);
} // namespace infinicore::op
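
Each of pads, strides, and dilations is given per spatial dimension (n in the raw execute schema is presumably the number of such dimensions). A naive single-channel sketch illustrates the intended semantics and the usual output-size arithmetic, out = (in + 2*pad - dilation*(k - 1) - 1) / stride + 1; this is an illustration under standard convolution conventions, not infinicore's implementation:

#include <cstddef>
#include <vector>

// Naive single-channel 2D convolution showing how pads, strides, and
// dilations determine the output size.
std::vector<float> conv2d_ref(const std::vector<float> &input, std::size_t ih, std::size_t iw,
                              const std::vector<float> &kernel, std::size_t kh, std::size_t kw,
                              std::size_t pad, std::size_t stride, std::size_t dilation) {
    const std::size_t oh = (ih + 2 * pad - dilation * (kh - 1) - 1) / stride + 1;
    const std::size_t ow = (iw + 2 * pad - dilation * (kw - 1) - 1) / stride + 1;
    std::vector<float> out(oh * ow, 0.0f);
    for (std::size_t oy = 0; oy < oh; ++oy) {
        for (std::size_t ox = 0; ox < ow; ++ox) {
            float acc = 0.0f;
            for (std::size_t ky = 0; ky < kh; ++ky) {
                for (std::size_t kx = 0; kx < kw; ++kx) {
                    // Input coordinate for this kernel tap, shifted by padding.
                    const long iy = static_cast<long>(oy * stride + ky * dilation) - static_cast<long>(pad);
                    const long ix = static_cast<long>(ox * stride + kx * dilation) - static_cast<long>(pad);
                    if (iy >= 0 && iy < static_cast<long>(ih) && ix >= 0 && ix < static_cast<long>(iw)) {
                        acc += input[iy * iw + ix] * kernel[ky * kw + kx];
                    }
                }
            }
            out[oy * ow + ox] = acc;
        }
    }
    return out;
}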
16 changes: 16 additions & 0 deletions include/infinicore/ops/gelu.hpp
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class Gelu {
public:
    using schema = void (*)(Tensor, Tensor);
    static void execute(Tensor output, Tensor input);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor gelu(Tensor input);
void gelu_(Tensor output, Tensor input);
} // namespace infinicore::op
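
The header does not spell out the formula. Assuming Gelu is the exact erf-based variant, GELU(x) = x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2))), with the tanh approximation split out into gelutanh.hpp below, a scalar reference is:

#include <cmath>

// Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2))).
// Assumes op::gelu implements the erf-based variant.
float gelu_ref(float x) {
    return 0.5f * x * (1.0f + std::erf(x * 0.7071067811865475f)); // 1/sqrt(2)
}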
16 changes: 16 additions & 0 deletions include/infinicore/ops/gelutanh.hpp
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class GeluTanh {
public:
    using schema = void (*)(Tensor, Tensor);
    static void execute(Tensor output, Tensor input);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor gelu_tanh(Tensor input);
void gelu_tanh_(Tensor output, Tensor input);
} // namespace infinicore::op
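
Assuming GeluTanh is the standard tanh approximation of GELU, a matching scalar sketch:

#include <cmath>

// Tanh-approximated GELU:
// 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
float gelu_tanh_ref(float x) {
    const float c = 0.7978845608f; // sqrt(2 / pi)
    return 0.5f * x * (1.0f + std::tanh(c * (x + 0.044715f * x * x * x)));
}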
28 changes: 28 additions & 0 deletions include/infinicore/ops/layer_norm.hpp
@@ -0,0 +1,28 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class LayerNorm {
public:
    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, float);
    static void execute(Tensor output,
                        Tensor input_standardization,
                        Tensor input_std_deviation,
                        Tensor input,
                        Tensor weight,
                        Tensor bias,
                        float epsilon);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor layer_norm(Tensor input, Tensor weight, Tensor bias, float epsilon = 1e-5f);
void layer_norm_(Tensor output,
                 Tensor input_standardization,
                 Tensor input_std_deviation,
                 Tensor input,
                 Tensor weight,
                 Tensor bias,
                 float epsilon = 1e-5f);
} // namespace infinicore::op
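
A hedged usage sketch of the out-of-place overload declared above. The tensors here are placeholders, since their construction API is not part of this diff, and it is an assumption (suggested by the narrower signature) that layer_norm allocates the auxiliary input_standardization and input_std_deviation buffers internally:

#include <infinicore/ops.hpp>

using namespace infinicore;

// Placeholders only: how Tensors are created is not shown in this PR.
extern Tensor activations; // shape (*, hidden_size)
extern Tensor gamma;       // shape (hidden_size)
extern Tensor beta;        // shape (hidden_size)

Tensor normalize() {
    // Out-of-place form; epsilon defaults to 1e-5f per the declaration.
    return op::layer_norm(activations, gamma, beta);
}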
16 changes: 8 additions & 8 deletions include/infinicore/ops/mha_kvcache.hpp
@@ -22,14 +22,14 @@ namespace infinicore::op {
 
 INFINICORE_GRAPH_OP_CLASS(
     MhaKVCache,
-    Tensor, // out
-    const Tensor &, // q
-    const Tensor &, // k_cache
-    const Tensor &, // v_cache
-    const Tensor &, // seqlens_k
-    const Tensor &, // block_table
-    std::optional<Tensor>, // alibi_slopes
-    float); // scale
+    Tensor,                // out
+    const Tensor &,        // q
+    const Tensor &,        // k_cache
+    const Tensor &,        // v_cache
+    const Tensor &,        // seqlens_k
+    const Tensor &,        // block_table
+    std::optional<Tensor>, // alibi_slopes
+    float);                // scale
 
 Tensor mha_kvcache(const Tensor &q,
                    const Tensor &k_cache,
16 changes: 16 additions & 0 deletions include/infinicore/ops/quickgelu.hpp
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class QuickGelu {
public:
    using schema = void (*)(Tensor, Tensor);
    static void execute(Tensor output, Tensor input);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor quick_gelu(Tensor input);
void quick_gelu_(Tensor output, Tensor input);
} // namespace infinicore::op
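
Assuming QuickGelu is the sigmoid shortcut used in CLIP-style models, x * sigmoid(1.702 * x), a scalar sketch:

#include <cmath>

// QuickGELU: x * sigmoid(1.702 * x), a cheap approximation of GELU.
float quick_gelu_ref(float x) {
    return x / (1.0f + std::exp(-1.702f * x));
}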
16 changes: 16 additions & 0 deletions include/infinicore/ops/relu.hpp
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class Relu {
public:
    using schema = void (*)(Tensor, Tensor);
    static void execute(Tensor output, Tensor input);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor relu(Tensor input);
void relu_(Tensor output, Tensor input);
} // namespace infinicore::op
16 changes: 16 additions & 0 deletions include/infinicore/ops/softmax.hpp
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class Softmax {
public:
    using schema = void (*)(Tensor, Tensor, int);
    static void execute(Tensor output, Tensor input, int axis);
    static common::OpDispatcher<schema> &dispatcher();
};

Tensor softmax(Tensor input, int axis = -1);
void softmax_(Tensor output, Tensor input, int axis = -1);
} // namespace infinicore::op
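
A numerically stable reference for a single row (the default axis = -1 case): subtracting the row max before exponentiating keeps exp from overflowing, and the shift cancels in the normalization. A sketch of the semantics only, not the dispatched kernel:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Stable softmax over one contiguous row.
std::vector<float> softmax_ref(const std::vector<float> &x) {
    const float max_v = *std::max_element(x.begin(), x.end());
    std::vector<float> y(x.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = std::exp(x[i] - max_v); // shifted for numerical stability
        sum += y[i];
    }
    for (float &v : y) v /= sum;
    return y;
}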
2 changes: 1 addition & 1 deletion include/infinicore/quantization/compressed_tensors.hpp
@@ -9,7 +9,7 @@ class CompressedTensors : public BaseQuantization {
     // information and support multiple quantization schemes.
 public:
     explicit CompressedTensors(const nlohmann::json &quant_config)
-        : BaseQuantization(quant_config) {};
+        : BaseQuantization(quant_config){};
 
     infinicore::quantization::QuantScheme
     get_quant_scheme() const override {
2 changes: 1 addition & 1 deletion include/infinicore/quantization/none_quantizaiton.hpp
@@ -9,7 +9,7 @@ class NoneQuantization : public BaseQuantization {
     // information and support multiple quantization schemes.
 public:
     explicit NoneQuantization(const nlohmann::json &quant_config)
-        : BaseQuantization(quant_config) {};
+        : BaseQuantization(quant_config){};
 
     infinicore::quantization::QuantScheme
     get_quant_scheme() const override {
2 changes: 2 additions & 0 deletions include/infiniop.h
@@ -38,6 +38,7 @@
 #include "infiniop/ops/fmin.h"
 #include "infiniop/ops/fmod.h"
 #include "infiniop/ops/gelu.h"
+#include "infiniop/ops/gelutanh.h"
 #include "infiniop/ops/gemm.h"
 #include "infiniop/ops/hardswish.h"
 #include "infiniop/ops/hardtanh.h"
@@ -66,6 +67,7 @@
 #include "infiniop/ops/paged_caching.h"
 #include "infiniop/ops/quant/per_channel_quant_int8.h"
 #include "infiniop/ops/quant/per_tensor_quant_int8.h"
+#include "infiniop/ops/quickgelu.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/reciprocal.h"
20 changes: 10 additions & 10 deletions include/infiniop/ops/add.h
@@ -6,20 +6,20 @@
 typedef struct InfiniopDescriptor *infiniopAddDescriptor_t;
 
 __INFINI_C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
-    infiniopAddDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t c,
-    infiniopTensorDescriptor_t a,
-    infiniopTensorDescriptor_t b);
+                                                               infiniopAddDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t c,
+                                                               infiniopTensorDescriptor_t a,
+                                                               infiniopTensorDescriptor_t b);
 
 __INFINI_C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
 
 __INFINI_C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *c,
-    const void *a,
-    const void *b,
-    void *stream);
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *c,
+                                               const void *a,
+                                               const void *b,
+                                               void *stream);
 
 __INFINI_C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc);
 
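
The add.h declarations imply the usual descriptor lifecycle: create, query workspace, execute, destroy. A hedged sketch of that call pattern follows; the handle, tensor descriptors, and buffers are assumed to come from elsewhere, and the INFINI_STATUS_SUCCESS name and host malloc are assumptions (a real caller would use a device-appropriate allocation):

#include <infiniop.h>

#include <cstdlib>

// Sketch of the descriptor lifecycle implied by add.h:
// create -> query workspace -> execute -> destroy.
infiniStatus_t run_add(infiniopHandle_t handle,
                       infiniopTensorDescriptor_t c_desc,
                       infiniopTensorDescriptor_t a_desc,
                       infiniopTensorDescriptor_t b_desc,
                       void *c, const void *a, const void *b,
                       void *stream) {
    infiniopAddDescriptor_t desc;
    infiniStatus_t status = infiniopCreateAddDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status != INFINI_STATUS_SUCCESS) { // assumed success-code name
        return status;
    }
    size_t workspace_size = 0;
    status = infiniopGetAddWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        void *workspace = std::malloc(workspace_size); // device allocation in real code
        status = infiniopAdd(desc, workspace, workspace_size, c, a, b, stream);
        std::free(workspace);
    }
    infiniopDestroyAddDescriptor(desc);
    return status;
}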