# model-config-sample.yaml
# Sample configuration: run name, data, model, training, logging, system.
---
# Run identity.
# NOTE(review): `name` presumably labels this run/model - confirm how the
# consumer uses it.
name: "Llama (2M)"
# NOTE(review): presumably lets an existing run with the same name be
# replaced - confirm against the config loader.
overwrite: true
# Dataset inputs, chunking, and tokenizer settings.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; placing `preprocessing` and `tokenizer` under `data` is an
# assumption - confirm against the config loader.
data:
  input_file: "train.jsonl"
  # Optional validation file
  validation_file: "val.jsonl"
  # Optional external tokenizer path
  tokenizer_path: "tokenizer"  # Path to a directory containing tokenizer.json
  preprocessing:
    max_context_size: 1024
    chunk_overlap: 0  # If > 0, chunks will overlap by this many tokens
  tokenizer:
    normal_vocab_size: 256
    special_tokens:
      pad: "<pad>"
      bos: "<bos>"
      eos: "<eos>"
      # Add custom tokens if needed:
      # ctrl1: "<ctrl1>"
      # ctrl2: "<ctrl2>"
# Model architecture and its size/attention/normalization parameters.
# NOTE(review): sub-mapping nesting was reconstructed from a copy whose
# indentation had been stripped - confirm against the config loader.
model:
  architecture: "llama"
  dimensions:
    hidden_size: 128
    intermediate_size: 256  # 2 * hidden_size (128 * 2)
    num_layers: 4
  attention:
    num_heads: 8
    num_kv_heads: null  # If null, equals num_heads
    head_dim: null  # If null, computed from hidden_size/num_heads
    max_position_embeddings: null
  normalization:
    # Written with a mantissa dot and a signed exponent so that YAML 1.1
    # loaders resolve it as a float rather than a string.
    rms_norm_eps: 1.0e-5
  rope:
    theta: 10000
    traditional: false
    scaling: null
  misc:
    attention_bias: false
    mlp_bias: false
    tie_word_embeddings: true
# Training schedule, hyperparameters, LR scheduler, and optimizer choice.
# NOTE(review): sub-mapping nesting was reconstructed from a copy whose
# indentation had been stripped - confirm against the config loader.
training:
  # Number of epochs to train for (optional)
  epochs: 1
  hyperparameters:
    batch_size: 16
    # Mantissa dot and signed exponent keep this a float under YAML 1.1.
    learning_rate: 2.0e-2
    weight_decay: 0.01
    # iters: 10000  # If epochs is provided, this is ignored
  scheduler:
    type: "cosine"  # Options: linear, cosine, cosine_with_warmup
    min_lr_ratio: 0.01  # Minimum LR as a ratio of initial LR
  optimization:
    optimizer: "muon"  # Options: adam, adamw, muon, sgd
# Output directories, step intervals, and which metrics are logged.
# NOTE(review): sub-mapping nesting was reconstructed from a copy whose
# indentation had been stripped - confirm against the config loader.
logging:
  log_dir: "logs"
  checkpoint_dir: "checkpoints"
  # Intervals are expressed in training steps.
  steps:
    logging_interval: 1
    checkpoint_interval: 10000
    validation_interval: 1000  # Run validation every 1000 steps (0 to disable)
  # One boolean toggle per metric.
  metrics:
    log_loss: true
    log_perplexity: true
    log_tokens_per_second: true
    log_learning_rate: true
    log_tokens_processed: true
# Runtime settings: random seed and compute device.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped - confirm against the config loader.
system:
  seed: 42
  device: "gpu"  # Options: cpu, gpu