diff --git a/CMakeLists.txt b/CMakeLists.txt index df636b27..d22a2583 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,3 +204,38 @@ link_infini_train_exe(test_precision_check) add_executable(test_lora test/lora/test_lora.cc) link_infini_train_exe(test_lora) +add_executable(test_scalar test/scalar/test_scalar.cc) +link_infini_train_exe(test_scalar) + +add_executable(test_dtype_dispatch test/dispatch/test_dtype_dispatch.cc) +link_infini_train_exe(test_dtype_dispatch) + +# Negative compile test: missing dtype registration must fail at compile time. +set(DTYPE_DISPATCH_COMPILE_FAIL_SOURCE + ${PROJECT_SOURCE_DIR}/test/dispatch/test_dtype_dispatch_compile_fail.cc) + +try_compile(DTYPE_DISPATCH_COMPILE_UNEXPECTEDLY_SUCCEEDED + ${CMAKE_BINARY_DIR}/CMakeFiles/try_compile_dtype_dispatch_missing_map + SOURCES ${DTYPE_DISPATCH_COMPILE_FAIL_SOURCE} + CMAKE_FLAGS + "-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}" + "-DCMAKE_CXX_STANDARD_REQUIRED=ON" + "-DCMAKE_CXX_EXTENSIONS=OFF" + "-DCMAKE_CXX_FLAGS=-I${PROJECT_SOURCE_DIR}" + OUTPUT_VARIABLE DTYPE_DISPATCH_TRY_COMPILE_OUTPUT +) + +if(DTYPE_DISPATCH_COMPILE_UNEXPECTEDLY_SUCCEEDED) + message(FATAL_ERROR + "dtype dispatch compile-fail test unexpectedly succeeded.\n" + "Source: ${DTYPE_DISPATCH_COMPILE_FAIL_SOURCE}\n" + "Output:\n${DTYPE_DISPATCH_TRY_COMPILE_OUTPUT}") +endif() + +add_custom_target(test_dtype_dispatch_compile_fail + COMMAND ${CMAKE_COMMAND} -E echo + "dtype dispatch compile-fail check passed (missing dtype registration correctly fails to compile)." + VERBATIM +) + +add_dependencies(test_dtype_dispatch test_dtype_dispatch_compile_fail) diff --git a/docs/device_guard_design.md b/docs/device_guard_design.md new file mode 100644 index 00000000..64e59892 --- /dev/null +++ b/docs/device_guard_design.md @@ -0,0 +1,210 @@ +# Device Guard Design +device 注册初版基建 pr:https://github.com/InfiniTensor/InfiniTrain/pull/103 + +## 1. 
设计背景与目标 + +### 1.1 背景 + +InfiniTrain 需要长期支持: + +- 多种设备类型(CPU/CUDA/国产芯片) +- 多种运行时能力(stream、memory、blas、通信等) +- 在不侵入上层逻辑的前提下进行后端扩展与替换 + +在实际工程中,如果设备相关逻辑散落在框架各个模块,会导致: + +- `#ifdef USE_CUDA/USE_MUSA/...` 泛滥 +- 新硬件接入需要修改大量框架核心代码 +- 设备切换与资源管理缺乏统一语义 + +### 1.2 设计目标 + +InfiniTrain 的 device 注册机制设计目标是: + +1. 统一抽象:将所有与设备相关的运行时行为抽象到一个统一接口中。 +2. 后端可插拔:新设备后端可通过注册机制接入,无需修改框架核心逻辑。 +3. RAII 语义清晰:设备切换、资源恢复具备严格的作用域。 +4. 最小上层侵入:上层模块(Tensor/Autograd/Module)只感知 DeviceGuard/DeviceGuardImpl,不感知具体后端实现。 + +## 2. 核心组件 + +InfiniTrain 的 device 机制由三类核心组件构成: + +```C++ ++-------------------+ +| DeviceGuard | ← 对外 RAII 接口(public) ++-------------------+ + | + v ++-------------------+ +| DeviceGuardImpl | ← 后端抽象接口(virtual) ++-------------------+ + ^ + | ++-------------------+ +| DeviceGuardImpl | +| Registry | ← 全局注册表(singleton) ++-------------------+ +``` + +其中 DeviceGuard 与 DeviceGuardImpl 的关系是: + +| 组件 | 职责 | +| --------------- | ------------------------------------------------------------ | +| DeviceGuard | 管理 “当前在哪个 device 上” 的上下文语义(RAII),语义与 device index 绑定;负责 device 的保存/切换/恢复,并将具体 runtime 操作转发给对应的 DeviceGuardImpl。 | +| DeviceGuardImpl | 管理 “在该类 device 上如何执行 runtime 操作”,语义与 device type 绑定;对外提供 设备管理查询、stream、blas、同步、内存 等运行时能力接口。 | + +### 2.1 DeviceGuardImpl:运行时能力抽象(对外暴露) + +DeviceGuardImpl 是 InfiniTrain 中 device runtime 能力的统一抽象接口,并且是框架内部对外暴露的能力接口,封装了所有与 device 相关的行为(待补充 event 相关接口): + +```C++ +// ---------------------------------------------------------------------- +// Device management +// ---------------------------------------------------------------------- + +virtual Device GetDevice() const = 0; + +virtual void SetDevice(Device device) const; + +virtual int8_t DeviceCount() const; + +virtual Device::DeviceType Type() const = 0; + +// ---------------------------------------------------------------------- +// Stream management +// ---------------------------------------------------------------------- + +virtual Stream *GetStream(Device) const; + +// 
---------------------------------------------------------------------- +// Synchronization +// ---------------------------------------------------------------------- + +virtual void SynchronizeDevice(Device) const; + +virtual void SynchronizeStream(Stream *) const; + +// ---------------------------------------------------------------------- +// BLAS handle +// ---------------------------------------------------------------------- + +virtual BlasHandle *GetBlasHandle(Device) const; + +// ---------------------------------------------------------------------- +// Memory operations +// ---------------------------------------------------------------------- + +virtual void Malloc(void **dev_ptr, size_t size) = 0; + +virtual void MallocAsync(void **dev_ptr, size_t size, Stream *stream); + +virtual void Free(void *dev_ptr) = 0; + +virtual void FreeAsync(void *dev_ptr, Stream *stream); + +virtual void Memcpy(void *dst, const void *src, size_t count, MemcpyKind kind) = 0; + +virtual void MemcpyAsync(void *dst, const void *src, size_t count, MemcpyKind kind, Stream *stream); + +virtual void ResetMemPoolHighWatermarks(Device device) const; + +virtual std::pair GetMemPoolPeakMB(Device device) const; +``` + +### 2.2 DeviceGuard:RAII 前端接口 + +DeviceGuard 是设备上下文的 RAII 管理器,其职责严格限定为: + +- 保存当前 device +- 切换到目标 device +- 在作用域结束时恢复原 device + +DeviceGuard 不直接提供任何运行时能力接口。 + +使用示例: + +```C++ +{ + DeviceGuard guard(Device(DeviceType::kCUDA, 1)); + // 当前线程的 device 上下文被切换到 CUDA:1 + // 所有 runtime 操作将发生在 CUDA:1 +} +// 离开作用域后,自动恢复进入前的 device +``` + +### 2.3 DeviceGuardImplRegistry:全局注册表 + +`DeviceGuardImplRegistry`是 InfiniTrain 中用于管理 device runtime 后端实现的全局注册表,采用 singleton 模式,生命周期覆盖整个进程。 + +其核心职责是维护`DeviceType -> DeviceGuardImpl`的一对一映射关系: + +```C++ +std::unordered_map> impls_; +``` + +## 3. 
Runtime Capability 获取与使用范式 + +### 3.1 获取入口 + +```C++ +DeviceGuardImpl* GetDeviceGuardImpl(Device::DeviceType type); +``` + +- 返回指定`DeviceType`的 DeviceGuardImpl +- 若未注册对应 backend,直接报错 + +### 3.2 推荐使用模式(标准范式) + +```C++ +auto device = tensor->GetDevice(); +const int64_t num_elements = tensor->NumElements(); +std::vector buffer(num_elements); + +{ + // 1. 切换 device 上下文(RAII scope) + core::DeviceGuard guard(device); + + // 2. 获取 runtime capability + auto* impl = core::GetDeviceGuardImpl(device.type()); + + // 3. 执行 runtime 操作 + const core::MemcpyKind kind = + device.type() == Device::DeviceType::kCPU + ? core::MemcpyKind::kD2D // CPU: host-host memcpy + : core::MemcpyKind::kH2D; // Device: host-device copy + + impl->MemcpyAsync( + tensor->DataPtr(), // dst + buffer.data(), // src + num_elements * sizeof(float), // count + kind, // kind(说明:在 CPU backend 中,kD2D 对应普通 memcpy) + impl->GetStream(device) // stream + ); +} // <-- DeviceGuard 在此处析构,device 上下文被恢复 +``` + +## 4. Backend 注册机制(静态注册) + +### 4.1 注册宏 + +```C++ +#define INFINI_TRAIN_REGISTER_DEVICE_GUARD_IMPL(device_type, class_impl) \ + static const bool __infini_train_device_guard_registered##__COUNTER__ = []() { \ + infini_train::core::DeviceGuardImplRegistry::Instance().Register(device_type, std::make_unique()); \ + return true; \ + }(); +``` + +采用静态变量 + lambda 在程序启动阶段完成注册。 + +### 4.2 使用示例(CUDA Backend) + +```C++ +class CudaGuardImpl : public DeviceGuardImpl { + ... +}; + +INFINI_TRAIN_REGISTER_DEVICE_GUARD_IMPL(Device::DeviceType::kCUDA, CudaGuardImpl) +``` + diff --git a/docs/dtype_registry_design.md b/docs/dtype_registry_design.md new file mode 100644 index 00000000..d667f76e --- /dev/null +++ b/docs/dtype_registry_design.md @@ -0,0 +1,96 @@ +# Low-Precision DType Abstraction & Backend Registration Design +统一低精度类型抽象与后端显式注册 pr:https://github.com/InfiniTensor/InfiniTrain/pull/114 + +## 1. 
背景与目标 + +InfiniTrain 在引入 BF16 / FP16 之前,框架层并没有低精度类型的统一抽象,所有 16-bit 浮点语义都直接绑定到后端原生类型:CUDA 侧使用 __half / __nv_bfloat16,CPU 侧则直接使用 uint16_t。这种设计带来了几个问题: + +1. **框架代码被 `#ifdef USE_CUDA` 污染。** + `infini_train/include/datatype.h`、`infini_train/src/nn/init.cc` 等通用模块都需要写出 `#ifdef USE_CUDA … #else …` 来在「有 CUDA」和「没有 CUDA」两个版本之间切换 16-bit 类型映射;非 CUDA 路径只能退化成 `uint16_t`,而 `uint16_t` 又会与 + `kUINT16` 的反向映射产生歧义。 +2. **`TypeMap` 是「全后端共享」的单点表。** + 旧 `TypeMap` 把所有标量类型直接映射到 C++ 类型。CPU 与 CUDA 共享同一个表,意味着不可能在不同后端把 `kFLOAT16` 映射到不同的本地标量;要扩展新硬件必须改框架头文件。 +3. **类型提升耦合具体后端类型。** + 旧的 `WidestType_t` 在 C++ 模板层面做提升,需要每个调用点先 dispatch 出一对具体的标量类型(例如 `nv_bfloat16` + `float`),再交给元函数做选择。这把「类型提升」这一纯 dtype 级别的逻辑跟「后端具体标量」捆死了。 +4. **静默 fallback 容易掩盖错误。** + 一旦某个后端忘记定义低精度类型,旧实现默认映射到 `uint16_t`,会得到一个语义错误的内核,而不是显式报错。 + +本工作的目标是: + +> **抽象出框架级通用低精度类型 FP16/BF16**,让框架代码不再直接依赖任何后端原生 16-bit 类型;同时把框架 [DataType -> 后端 C++ 类型] 的映射改为**显式注册**机制,未注册的类型如果被实例化,会在编译期被拦截报错。 + +## 2. Design In One Diagram + +``` +framework code ──► FP16 / BF16 (datatype.h, 纯软件实现,提供基本转换操作) + PromoteDataTypes(DataType, DataType) + +kernel code ──► DispatchCpuFunc / DispatchCudaFunc / DispatchXxxFunc + │ + ▼ + BackendTypeMap (主模板只声明不定义) + │ + ├─ kFLOAT16 / kBFLOAT16 → 后端在 *_dispatch.h 显式特化后注册 + │ └── CUDA: __half / __nv_bfloat16 + │ └── CPU : FP16 / BF16 + └─ 其它 10 个标量 dtype 使用默认注册 → INFINI_REGISTER_STANDARD_BACKEND_TYPES(DEV) +``` + +要点: + +- 框架层不提供任何「DataType → 后端 C++ 类型」映射路径;所有具体类型绑定均在后端通过 `BackendTypeMap` 完成。 +- `BackendTypeMap` 主模板**只声明不定义**,只有后端显式特化并完成注册的组合才允许参与 kernel dispatch;未注册组合会在模板实例化阶段被 `static_assert` 于编译期拦截。 + +## 3. 
Core API + +| API | 位置 | 说明 | +| --- | --- | --- | +| `struct FP16 / BF16` | [datatype.h](../infini_train/include/datatype.h) | 16-bit 软件包装(IEEE-754 half / truncated bf16),承担框架身份、存储布局、fallback 转换;不承担后端高性能算术语义。 | +| `PromoteDataTypes(DataType, DataType)` | [datatype.h](../infini_train/include/datatype.h) | 纯枚举到枚举的类型提升。规则:FP16+BF16→FP32;浮点优先于整数;同类按字节宽取大。 | +| `BackendTypeMap` | [core/backend_type_map.h](../infini_train/include/core/backend_type_map.h) | 主模板**只声明不定义**;后端通过显式特化提供 `::type`。 | +| `INFINI_REGISTER_STANDARD_BACKEND_TYPES(DEV)` | [core/backend_type_map.h](../infini_train/include/core/backend_type_map.h) | 一次性注册 10 个非低精度 dtype(`kUINT8…kFLOAT64`)到对应 C++ 标量。 | +| `DispatchCpuFunc / DispatchCudaFunc` | `src/core/runtime/{cpu,cuda}/{cpu,cuda}_dispatch.h` | 后端 dispatch 入口,底层转发到 `DispatchByTypeMap`。 | + +## 4. How To Add A New Backend + +按以下清单操作,**不需要**修改 `infini_train/include/` 下的任何框架头文件,也不需要 `#ifdef`: + +1. 在后端的 `*_dispatch.h` 里 include `core/backend_type_map.h` 与 `dtype_dispatch.h`。 +2. 调用 `INFINI_REGISTER_STANDARD_BACKEND_TYPES(Device::DeviceType::kXxx)` 注册 10 个标准 dtype。 +3. 若硬件支持低精度,显式特化 `BackendTypeMap` / `BackendTypeMap` 指向后端本地 16-bit 标量类型;不支持则直接跳过,调用方一旦 dispatch 到未注册的 dtype 会在编译期触发 `static_assert`。 +4. 定义 `XxxTypeMap` 转发/继承到 `BackendTypeMap`。 +5. 
提供 `DispatchXxxFunc` 入口,转发到 `DispatchByTypeMap`。 + +### 最小示例 + +```cpp +// xxx_dispatch.h +#include "infini_train/include/core/backend_type_map.h" +#include "infini_train/include/dtype_dispatch.h" + +namespace infini_train::core { +// 若硬件支持低精度,显式特化 FP16/BF16 +template <> struct BackendTypeMap { using type = xxx_half; }; +template <> struct BackendTypeMap { using type = xxx_bfloat; }; +} // namespace infini_train::core + +INFINI_REGISTER_STANDARD_BACKEND_TYPES(infini_train::Device::DeviceType::kXxx) + +namespace infini_train::core::xxx { +template +struct XxxTypeMap : BackendTypeMap {}; + +template +auto DispatchXxxFunc(DataType dtype, Functor &&f, std::string_view ctx = "", Args &&...a) { + return DispatchByTypeMap( + dtype, std::forward(f), ctx, std::forward(a)...); +} +} // namespace infini_train::core::xxx +``` + +## 5. Failure Modes + +| 情形 | 表现 | +| --- | --- | +| 后端未注册某个 dtype(`BackendTypeMap` 无特化),但被 dispatch 命中 | 编译期 `static_assert` 触发,错误信息指向 `BackendTypeMap` 的显式注册要求。 | +| dispatch 的 dtype 不在调用点 `AllowedDTypes...` 白名单内 | 运行期 `LOG_UNSUPPORTED_DTYPE` 报错。 | diff --git a/infini_train/include/common/common.h b/infini_train/include/common/common.h index b6a02543..80cba728 100644 --- a/infini_train/include/common/common.h +++ b/infini_train/include/common/common.h @@ -7,11 +7,21 @@ #include "infini_train/include/datatype.h" +/** + * General Utility Macros + */ +#define EXPAND(X) X +// This macro lets you pass an arbitrary expression that may contain internal +// commas to another macro without having the commas causing the expression +// to be interpreted as being multiple arguments +// Basically an alternative for __VA_OPTS__ before C++20 +// ref: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/Dispatch_v2.h +#define WRAP(...) 
__VA_ARGS__ +#define CAT(a, b) CAT_(a, b) +#define CAT_(a, b) a##b + #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) #define LOG_LOC(LEVEL, MSG) LOG(LEVEL) << MSG << " at " << __FILE__ << ":" << __LINE__ -#define LOG_UNSUPPORTED_DTYPE(DTYPE, CONTEXT_IDENTIFIER) \ - LOG_LOC(FATAL, WRAP(CONTEXT_IDENTIFIER << ": Unsupported data type: " \ - + kDataTypeToDesc.at(static_cast<DataType>(dtype)))) inline std::vector<int64_t> ComputeStrides(const std::vector<int64_t> &dims) { std::vector<int64_t> strides(dims.size(), 1); diff --git a/infini_train/include/common/cpu/common_cpu.h b/infini_train/include/common/cpu/common_cpu.h index d4c73e84..b8a01538 100644 --- a/infini_train/include/common/cpu/common_cpu.h +++ b/infini_train/include/common/cpu/common_cpu.h @@ -3,20 +3,41 @@ #include #include +#include "infini_train/include/datatype.h" + namespace infini_train::common::cpu { + +namespace detail { + +// FP16/BF16 don't support implicit conversion, so we route through float. +template <typename DST, typename SRC> DST CastImpl(SRC &&x) { + using SrcBase = std::remove_cvref_t<SRC>; + if constexpr (std::is_same_v<SrcBase, DST>) { + return x; + } else if constexpr (std::is_same_v<DST, FP16> || std::is_same_v<DST, BF16>) { + // Destination is a framework 16-bit type: convert via float + return DST(static_cast<float>(std::forward<SRC>(x))); + } else if constexpr (std::is_same_v<SrcBase, FP16> || std::is_same_v<SrcBase, BF16>) { + // Source is a framework 16-bit type: widen to float first + return static_cast<DST>(static_cast<float>(x)); + } else { + return static_cast<DST>(std::forward<SRC>(x)); + } +} + +} // namespace detail + /** - * Converts a value between arbitrary types. This offers perfect - * forwarding which preserves value categories (lvalues/rvalues) 
* - * @tparam DST Destination type (deduced) + * @tparam DST Destination type * @tparam SRC Source type (deduced) - * @param x Input value (preserves const/volatile and value category) + * @param x Input value * @return Value converted to DST type */ template DST Cast(SRC &&x) { static_assert(!std::is_reference_v, "Cast cannot return reference types"); - - // TODO(lzm): add cpu-version fp16 and bf16 - return (DST)(std::forward(x)); + return detail::CastImpl(std::forward(x)); } + } // namespace infini_train::common::cpu diff --git a/infini_train/include/core/backend_type_map.h b/infini_train/include/core/backend_type_map.h new file mode 100644 index 00000000..f67b8da7 --- /dev/null +++ b/infini_train/include/core/backend_type_map.h @@ -0,0 +1,81 @@ +#pragma once + +#include "infini_train/include/datatype.h" +#include "infini_train/include/device.h" + +namespace infini_train::core { + +/** + * Backend type mapping: DataType -> backend-native dispatch type + * + * BackendTypeMap — maps DataType to the C++ type used by kernels/dispatch. + * Primary template intentionally undefined — there is NO + * default fallback to the framework TypeMap. + * + * Backends must register dtypes explicitly: + * - Standard types (int, float, double, ...): + * call INFINI_REGISTER_STANDARD_BACKEND_TYPES(Dev) + * once at file scope in the backend's dispatch header. + * - Low-precision types (FP16, BF16): + * directly specialize BackendTypeMap + * in the backend's dispatch header (the native scalar type + * differs per backend, e.g. __half on CUDA). + * + * If a backend does not register a dtype, HasMappedType_v returns false and + * DispatchByTypeMap fires a clear static_assert at compile time. + */ + +// ----------------------------------------------------------------------------- +// BackendTypeMap: DataType -> backend dispatch type +// Primary template intentionally undefined — no TypeMap fallback. 
+// ----------------------------------------------------------------------------- +template <Device::DeviceType Dev, DataType DT> struct BackendTypeMap; + +} // namespace infini_train::core + +// ----------------------------------------------------------------------------- +// INFINI_REGISTER_STANDARD_BACKEND_TYPES(DEV) +// +// Explicitly registers the 10 standard (non-low-precision) dtypes for a backend +// device. Invoke once at file scope (outside any namespace) in the backend's +// dispatch header, e.g.: +// +// INFINI_REGISTER_STANDARD_BACKEND_TYPES(Device::DeviceType::kCUDA) +// +// FP16 and BF16 are NOT registered here — backends must specialize +// BackendTypeMap<Dev, kFLOAT16/kBFLOAT16> directly with their native scalar +// type (e.g. __half / __nv_bfloat16 on CUDA). +// ----------------------------------------------------------------------------- +#define INFINI_REGISTER_STANDARD_BACKEND_TYPES(DEV) \ + namespace infini_train::core { \ + template <> struct BackendTypeMap<DEV, DataType::kUINT8> { \ + using type = uint8_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kINT8> { \ + using type = int8_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kUINT16> { \ + using type = uint16_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kINT16> { \ + using type = int16_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kUINT32> { \ + using type = uint32_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kINT32> { \ + using type = int32_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kUINT64> { \ + using type = uint64_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kINT64> { \ + using type = int64_t; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kFLOAT32> { \ + using type = float; \ + }; \ + template <> struct BackendTypeMap<DEV, DataType::kFLOAT64> { \ + using type = double; \ + }; \ + } /* namespace infini_train::core */ diff --git a/infini_train/include/core/runtime/device_guard.h b/infini_train/include/core/runtime/device_guard.h index c9eeeb25..dc56fc6f 100644 --- a/infini_train/include/core/runtime/device_guard.h +++ b/infini_train/include/core/runtime/device_guard.h @@ -66,6 +66,7 @@ class DeviceGuardImpl { // Device management // 
---------------------------------------------------------------------- + // FIXME(dcj): impl should only bind with device type virtual Device GetDevice() const = 0; virtual void SetDevice(Device device) const; diff --git a/infini_train/include/datatype.h b/infini_train/include/datatype.h index 79f325db..e2f3e2f6 100644 --- a/infini_train/include/datatype.h +++ b/infini_train/include/datatype.h @@ -1,14 +1,88 @@ #pragma once +#include #include #include #include -#ifdef USE_CUDA -#include -#include -#endif namespace infini_train { + +// ----------------------------------------------------------------------------- +// Framework scalar types (16-bit storage + fallback scalar semantics) +// ----------------------------------------------------------------------------- +// FP16/BF16 are framework-level 16-bit scalar/storage types. +// They are used for: +// - framework type identity +// - baseline dtype mapping +// - metadata / storage layout +// - CPU/reference/fallback conversion paths +// +// They are NOT intended to define backend-native arithmetic semantics. +// Backend kernels should use backend-specific type maps, e.g.: +// - CUDA: __half / __nv_bfloat16 +// - CPU : FP16 / BF16 / widened compute types (as needed) +// ----------------------------------------------------------------------------- + +namespace detail { + +// --------------------------- +// BF16 helpers +// --------------------------- +uint16_t FloatToBf16Bits(float value); +float Bf16BitsToFloat(uint16_t bits); + +// --------------------------- +// FP16 helpers +// Pure software IEEE-754 half <-> float conversion for framework fallback use. 
+// --------------------------- +uint16_t FloatToFp16Bits(float value); +float Fp16BitsToFloat(uint16_t bits); + +} // namespace detail + +struct alignas(2) FP16 { + uint16_t x{0}; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits() { return {}; } + + constexpr FP16() = default; + constexpr FP16(uint16_t bits, from_bits_t) : x(bits) {} + + explicit FP16(float value); + explicit FP16(double value); + explicit FP16(int value); + explicit FP16(int64_t value); + + explicit operator float() const; + explicit operator double() const; + + FP16 &operator++(); +}; + +struct alignas(2) BF16 { + uint16_t x{0}; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits() { return {}; } + + constexpr BF16() = default; + constexpr BF16(uint16_t bits, from_bits_t) : x(bits) {} + + explicit BF16(float value); + explicit BF16(double value); + explicit BF16(int value); + explicit BF16(int64_t value); + + explicit operator float() const; + explicit operator double() const; + + BF16 &operator++(); +}; + +// ----------------------------------------------------------------------------- +// DataType enum and metadata tables +// ----------------------------------------------------------------------------- enum class DataType : int8_t { kUINT8, kINT8, @@ -37,164 +111,19 @@ inline const std::unordered_map kDataTypeToDesc = { {DataType::kFLOAT16, "fp16"}, {DataType::kFLOAT32, "fp32"}, {DataType::kFLOAT64, "fp64"}, }; -/** - * Compile-time type mapping from DataType enum to concrete C++ types. - * - * - Primary template: Declared but undefined to enforce specialization - * - Specializations: Explicit mappings (DataType::kFLOAT32 → float, etc) - * - TypeMap_t alias: Direct access to mapped type (TypeMap_t → int32_t) - * - * Enables type-safe generic code where operations dispatch based on DataType tokens, - * with zero runtime overhead. Extend by adding new specializations. 
- */ -template struct TypeMap; -template using TypeMap_t = typename TypeMap::type; - -/** - * Compile-time type mapping from C++ types to DataType enum. - * - * Example usage: DataTypeMap::value // Returns DataType::kINT32 - * DataTypeMap_v for convenient access to the mapped value (e.g., DataTypeMap_v). - */ -template struct DataTypeMap; -template inline constexpr DataType DataTypeMap_v = DataTypeMap::value; - -// Macro to define TypeMap specializations and reverse mappings -#define DEFINE_DATA_TYPE_MAPPING(ENUM_VALUE, CPP_TYPE) \ - template <> struct TypeMap { \ - using type = CPP_TYPE; \ - }; \ - template <> struct DataTypeMap { \ - static constexpr DataType value = DataType::ENUM_VALUE; \ - }; - -DEFINE_DATA_TYPE_MAPPING(kUINT8, uint8_t) -DEFINE_DATA_TYPE_MAPPING(kINT8, int8_t) -DEFINE_DATA_TYPE_MAPPING(kUINT16, uint16_t) -DEFINE_DATA_TYPE_MAPPING(kINT16, int16_t) -DEFINE_DATA_TYPE_MAPPING(kUINT32, uint32_t) -DEFINE_DATA_TYPE_MAPPING(kINT32, int32_t) -DEFINE_DATA_TYPE_MAPPING(kUINT64, uint64_t) -DEFINE_DATA_TYPE_MAPPING(kINT64, int64_t) -DEFINE_DATA_TYPE_MAPPING(kFLOAT32, float) -DEFINE_DATA_TYPE_MAPPING(kFLOAT64, double) - -#ifdef USE_CUDA -DEFINE_DATA_TYPE_MAPPING(kBFLOAT16, nv_bfloat16) -DEFINE_DATA_TYPE_MAPPING(kFLOAT16, half) -#else -// Non-CUDA fallbacks -template <> struct TypeMap { - using type = uint16_t; -}; -template <> struct TypeMap { - using type = uint16_t; -}; - -// TODO(lzm): currently for non-CUDA/CPU, there's an ambiguity of uint16_t mapping to both kUINT16 and -// kFLOAT16/kBFLOAT16. When CPU custom bfloat16/float16 types are defined, we should replace uint16_t with those types. -#endif -#undef DEFINE_DATA_TYPE_MAPPING - -// Extends std::is_floating_point to support CUDA floating-point types. -template struct is_floating_point_ext : std::is_floating_point {}; - -// Extends std::is_arithmetic to support CUDA floating-point types. 
-template struct is_arithmetic_ext : std::is_arithmetic {}; - -// Specializations for CUDA types -#ifdef USE_CUDA -template <> struct is_floating_point_ext<__nv_bfloat16> : std::true_type {}; -template <> struct is_arithmetic_ext<__nv_bfloat16> : std::true_type {}; -template <> struct is_floating_point_ext<__half> : std::true_type {}; -template <> struct is_arithmetic_ext<__half> : std::true_type {}; -#endif - -namespace { -template struct LargerType { - static constexpr size_t size1 = sizeof(T1); - static constexpr size_t size2 = sizeof(T2); - using type = std::conditional_t<(size1 >= size2), T1, T2>; -}; - -// Specializations of LargerType for the specific 16-bit FP combinations -#ifdef USE_CUDA -template <> struct LargerType<__nv_bfloat16, __half> { - using type = float; -}; +// ============================================================================= +// DataType-level promotion (pure enum → enum, no concrete/backend types) +// ============================================================================= +// Rules (priority order): +// 1. FP16 + BF16 → FLOAT32 (neither is a lossless superset of the other) +// 2. Any float dominates any integer → keep the float type +// 3. Same category (float-float or int-int) → wider byte size wins +// ============================================================================= -template <> struct LargerType<__half, __nv_bfloat16> { - using type = float; -}; -#endif - -/** - * @brief Finds the first type in a parameter pack that satisfies the given predicate. If no type matches, - * returns the last type in the pack (base case). - * - * @tparam Predicate Template template parameter that takes one type and provides a static `value` member - * @tparam Ts Parameter pack of types to check - */ -template