
feat: extract the common module of Transformer#115

Open
JYMiracle305 wants to merge 7 commits into master from feat/transformer

Conversation

JYMiracle305 (Contributor) commented Mar 13, 2026

Core changes

This PR abstracts a common construction architecture for Transformer-family models, unifying the GPT2 and LLaMA3 build processes into a single flow; future transformer-family models can reuse this unified flow. The enum fields in TransformerConfig directly control model behavior: the same TransformerModel code builds either GPT2 or LLaMA3 depending on the config passed in. Adding a new Transformer-family model later only requires extending the enum values and filling in the corresponding config, keeping overall architectural changes minimal.

Directory structure

infini_train/include/nn/modules/
├── transformer/
│   ├── transformer_config.h    # TransformerConfig + ModelType/AttentionType/MLPType/NormType enums
│   ├── transformer.h           # TransformerLayer / FirstStage / Chunk / LastStage / TransformerModel classes
│   ├── causal_self_attention.h # CausalSelfAttention (supports Standard/RoPE, includes GQA)
│   ├── mlp.h                   # MLP module (supports GELU/SwiGLU)
│   └── utils.h                 # RoPE helpers (PrecomputeFreqsCis)
├── activations.h               # activation function declarations (NewGELU / SwiGLU)
└── normalization.h             # normalization class declarations (LayerNorm / RMSNorm)

infini_train/src/nn/modules/
├── transformer/
│   ├── transformer.cc           # TransformerModel / FirstStage / Chunk / LastStage / Layer implementations
│   ├── causal_self_attention.cc # CausalSelfAttention implementation
│   ├── mlp.cc                   # MLP implementation (supports GELU / SwiGLU)
│   └── utils.cc                 # PrecomputeFreqsCis implementation
├── activations.cc               # NewGELU / SwiGLU definitions and Forward implementations
└── normalization.cc             # LayerNorm / RMSNorm definitions and Forward implementations

example/
├── gpt2/
│   ├── config.h                # GPT2Config preset (returns TransformerConfig)
│   └── checkpoint_loader.h/.cc # GPT2 weight loading (uses the unified TransformerModel)
└── llama3/
    ├── config.h                # LLaMA3Config preset (returns TransformerConfig)
    └── checkpoint_loader.h/.cc # LLaMA3 weight loading (uses the unified TransformerModel)


JYMiracle305 commented Mar 16, 2026

Single-node multi-GPU:
GPT2:
[screenshot]

LLaMA3:
[screenshot]

Multi-node training results:
GPT2:
[screenshot]

LLaMA3:
[screenshot]

@JYMiracle305 JYMiracle305 requested review from Chamberlain0w0, chen2021673 and kilinchange and removed request for Chamberlain0w0 March 16, 2026 05:42
@JYMiracle305 JYMiracle305 force-pushed the feat/transformer branch 3 times, most recently from dfdd913 to d833ec2 Compare March 16, 2026 08:10
first_stage.with_submodule(TransformerFirstStage::kWTELayerName, BuildVocabEmbeddingSpec(gpt2_config))
.with_submodule(TransformerFirstStage::kWPELayerName,
BuildPositionEmbeddingSpec(gpt2_config.block_size, gpt2_config.n_embd));
spec.with_submodule("first_stage", first_stage);



namespace infini_train::nn {

void ModuleRegistry::Register(std::type_index type, ModuleCreator creator) { registry_[type] = std::move(creator); }


auto tok_emb = (*modules_[kWTELayerName])({x1});

// Add position embedding only for models that use absolute position encoding
if (config_.attention_type == AttentionType::kStandard) {


// ManualSeed(42);

LLaMA3Config model_config = LLaMA3Config();
nn::TransformerConfig model_config;



// ========== GPT2 Model Definition ==========
// Uses LayerNorm, GELU activation, standard multi-head attention
class GPT2 : public nn::TransformerLayer {


Forward(const std::vector<std::shared_ptr<infini_train::Tensor>> &x) override;

private:
AttentionType attention_type_;



// Architecture choices
AttentionType attention_type = AttentionType::kStandard; // Attention mechanism type
MLPType mlp_type = MLPType::kGELU; // MLP activation type



namespace infini_train::nn {

class RMSNorm : public infini_train::nn::CloneableModule<RMSNorm> {


modules_[kCFcLayerName] = build_module(config, spec.submodules_.at(kCFcLayerName));

// For SwiGLU, add second projection
if (spec.submodules_.count(kCFc2LayerName) > 0) {



// ========== LLaMA3 Model Definition ==========
// Uses RMSNorm, SwiGLU activation, GQA attention, RoPE positional encoding
class LLaMA3 : public nn::TransformerLayer {



@JYMiracle305 JYMiracle305 force-pushed the feat/transformer branch 4 times, most recently from 6ba15c3 to 2ac0526 Compare March 26, 2026 03:30

static constexpr char kParamBiasName[] = "bias";

explicit CausalSelfAttention(const TransformerConfig &config, const ModuleSpec &spec = {});


class ModuleRegistry {
public:
static ModuleRegistry &Instance() {
static ModuleRegistry inst;


auto norm = x[0] * nn::function::Rsqrt(nn::function::Mean(nn::function::Pow(x[0], 2), -1, true) + eps_);
return {norm * parameters_[kParamWeightName]};
}
} // namespace infini_train::nn


return spec;
}

ModuleSpec BuildTransformerBlockSpec(const TransformerConfig &config) {


struct ModuleSpec {
ModuleSpec() = default;

explicit ModuleSpec(std::type_index m) : module_(m) {}
Collaborator

Introducing type_index here feels a bit odd. What is the point of having ModuleSpec bind a module?

JYMiracle305 (Contributor, Author) replied Mar 30, 2026

type_index serves as the key for looking up the concrete constructor in the registry. Megatron's ModuleSpec supports building from either a module or its submodules.


@JYMiracle305 JYMiracle305 force-pushed the feat/transformer branch 3 times, most recently from d97d661 to 5cec43f Compare April 1, 2026 08:35
@kilinchange kilinchange self-requested a review April 3, 2026 02:10
@JYMiracle305 JYMiracle305 force-pushed the feat/transformer branch 3 times, most recently from fab42f1 to 2e4611c Compare April 3, 2026 07:11


.use_scaled_rope = static_cast<bool>(use_scaled_rope),
.norm_eps = norm_eps,
.max_gen_batch_size = max_gen_bs});
nn::TransformerConfig llama3_config = nn::llama3::LLaMA3Config();
Collaborator

Why switch to a different style here? If it isn't necessary, please revert it.

JYMiracle305 (Contributor, Author) replied:

This adapts to the new config structure: first call each model's own initialization function so that the parameters belong to that model, then modify them according to the loaded data.


static std::shared_ptr<DecoderOnlyTransformer> FromLLMC_GPT2(const std::string &filepath);
static std::shared_ptr<DecoderOnlyTransformer> FromLLMC_LLaMA3(const std::string &filepath);
static void LoadWeightsFromLLMC(const std::string &filepath, DecoderOnlyTransformer *model,


static std::shared_ptr<DecoderOnlyTransformer> FromPretrained(ModelType model_type);

static std::shared_ptr<DecoderOnlyTransformer> FromLLMC_GPT2(const std::string &filepath);
static std::shared_ptr<DecoderOnlyTransformer> FromLLMC_LLaMA3(const std::string &filepath);


INFINI_TRAIN_REGISTER_MODULE(CausalSelfAttention);
INFINI_TRAIN_REGISTER_MODULE(MLP);

// NewGELU
Collaborator

In principle the basic modules below no longer need to be registered. To be clear: a spec should only describe things down to the mlp/attention level. For basic modules, the constructor of an upper-level module such as mlp/attention parses the parameters it needs from the spec and passes them directly to the basic module's constructor.


using ModuleCreator = std::function<std::shared_ptr<Module>(const TransformerConfig &, const ModuleSpec &)>;

class ModuleRegistry {
Collaborator

Since the creators of all spec-constructed modules are now explicit, the Registry seems unnecessary as well, and so ModuleSpec no longer needs to bind module information via type_index.

return *value;
}

std::shared_ptr<Module> BuildModule(const TransformerConfig &config, const ModuleSpec &spec);


@JYMiracle305 JYMiracle305 force-pushed the feat/transformer branch 6 times, most recently from 9d7cd4c to 9a8cce4 Compare April 9, 2026 07:06
@JYMiracle305 JYMiracle305 requested a review from kilinchange April 9, 2026 07:12
      - Remove ModuleRegistry and INFINI_TRAIN_REGISTER_MODULE macros
      - Replace BuildModule() with direct constructor calls
      - Simplify module instantiation in MLP, CausalSelfAttention, and TransformerLayer
