Trouter-Library committed
Commit fdac17c · verified · 1 parent: 3d5c120

Create trainer_config.yaml

Files changed (1):
  trainer_config.yaml +260 -0
trainer_config.yaml ADDED
@@ -0,0 +1,260 @@
# Helion-V1.5-XL Training Configuration

# Model Architecture
model:
  model_type: helion
  vocab_size: 100000
  hidden_size: 6144
  intermediate_size: 24576
  num_hidden_layers: 48
  num_attention_heads: 32
  num_key_value_heads: 8
  max_position_embeddings: 16384
  rope_theta: 10000.0
  rope_scaling:
    type: linear
    factor: 2.0
  hidden_act: silu
  initializer_range: 0.02
  rms_norm_eps: 1.0e-6
  use_cache: true
  tie_word_embeddings: false
  attention_bias: false
  attention_dropout: 0.0
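  # Derived from the values above: head_dim = 6144 / 32 = 192; with
  # grouped-query attention, each of the 8 KV heads serves 32 / 8 = 4 query heads.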

# Training Configuration
training:
  # Optimization
  optimizer: adamw
  learning_rate: 3.0e-4
  weight_decay: 0.1
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning Rate Schedule
  lr_scheduler_type: cosine
  warmup_steps: 2000
  min_learning_rate: 3.0e-5

  # Batch Configuration
  per_device_train_batch_size: 32
  gradient_accumulation_steps: 8
  global_batch_size: 4194304  # in tokens
  max_sequence_length: 4096
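  # Derived: tokens per optimizer step = per_device_train_batch_size
  # × gradient_accumulation_steps × data_parallel_ranks × max_sequence_length;
  # see the sanity-check sketch after the file for how this compares to
  # global_batch_size above.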

  # Training Steps
  max_steps: 875000
  save_steps: 5000
  eval_steps: 1000
  logging_steps: 100

  # Mixed Precision
  fp16: false
  bf16: true
  tf32: true

  # Distributed Training
  distributed_strategy: fsdp
  fsdp_config:
    fsdp_transformer_layer_cls_to_wrap: HelionDecoderLayer
    fsdp_backward_prefetch: backward_pre
    fsdp_state_dict_type: FULL_STATE_DICT
    fsdp_cpu_offload: false
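    # Wrapping each HelionDecoderLayer as its own FSDP unit gathers parameters
    # one transformer block at a time during forward/backward (assuming
    # standard PyTorch FSDP auto-wrap semantics).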

  # Gradient Checkpointing
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false

  # Compilation
  torch_compile: true
  torch_compile_backend: inductor
  torch_compile_mode: max-autotune

# Data Configuration
data:
  # Dataset Mixing Ratios
  datasets:
    - name: web_text
      weight: 0.45
      sources:
        - common_crawl_filtered
        - c4
        - redpajama_web

    - name: books
      weight: 0.20
      sources:
        - books3
        - gutenberg
        - bookcorpus

    - name: code
      weight: 0.15
      sources:
        - github_code
        - stack_overflow
        - starcoder_data

    - name: scientific
      weight: 0.10
      sources:
        - arxiv
        - pubmed
        - semantic_scholar

    - name: instruction
      weight: 0.08
      sources:
        - openorca
        - ultrachat
        - wizardlm
        - alpaca

    - name: multilingual
      weight: 0.02
      sources:
        - mc4_multilingual
        - wikipedia_multilingual
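  # The mixture weights above sum to 1.00 (0.45 + 0.20 + 0.15 + 0.10 + 0.08 + 0.02).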

  # Data Processing
  preprocessing:
    tokenizer: helion_tokenizer
    max_length: 4096
    padding: false
    truncation: true

  # Data Quality
  quality_filters:
    - deduplication: true
      dedup_threshold: 0.85
    - min_token_length: 50
    - max_token_length: 8192
    - perplexity_filter: true
      perplexity_threshold: 1500
    - toxicity_filter: true
      toxicity_threshold: 0.5
    - pii_removal: true

# Infrastructure
infrastructure:
  # Compute
  num_gpus: 512
  gpu_type: A100-80GB
  num_nodes: 64
  gpus_per_node: 8
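  # Consistent: 64 nodes × 8 GPUs/node = 512 GPUs.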

  # Networking
  interconnect: infiniband
  bandwidth_per_gpu: 400  # Gbps
  communication_backend: nccl

  # Storage
  checkpoint_dir: /mnt/checkpoints/helion-v15-xl
  data_dir: /mnt/data/training_corpus
  tensorboard_dir: /mnt/logs/tensorboard

  # Monitoring
  wandb_project: helion-v15-xl
  wandb_entity: deepxr-research
  log_level: info

# Evaluation
evaluation:
  eval_datasets:
    - mmlu
    - hellaswag
    - arc_challenge
    - arc_easy
    - truthfulqa
    - gsm8k
    - humaneval
    - mbpp

  eval_batch_size: 16
  eval_accumulation_steps: 4

  # Few-shot Configuration
  few_shot_examples:
    mmlu: 5
    hellaswag: 10
    arc_challenge: 25
    arc_easy: 25
    gsm8k: 8
    humaneval: 0
    mbpp: 0

# Fine-tuning Stages
stages:
  # Stage 1: Pre-training
  - name: pretraining
    steps: 750000
    data_mix: [web_text, books, code, scientific]
    learning_rate: 3.0e-4

  # Stage 2: Domain Adaptation
  - name: domain_adaptation
    steps: 80000
    data_mix: [code, scientific]
    learning_rate: 1.0e-4

  # Stage 3: Instruction Tuning
  - name: instruction_tuning
    steps: 45000
    data_mix: [instruction]
    learning_rate: 5.0e-5
    lr_scheduler_type: linear
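# Stage budgets sum to training.max_steps: 750000 + 80000 + 45000 = 875000.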

# Checkpointing
checkpointing:
  save_total_limit: 10
  save_strategy: steps
  load_best_model_at_end: true
  metric_for_best_model: eval_loss
  greater_is_better: false

  # Resume Training
  resume_from_checkpoint: null
  auto_resume: true

# Hardware Optimization
optimization:
  # Memory Optimization
  activation_checkpointing: true
  cpu_offload: false
  zero_stage: 2
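  # Note: zero_stage is a DeepSpeed-style knob; with training.distributed_strategy
  # set to fsdp above, it is presumably ignored unless a DeepSpeed engine is also
  # wired in.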

  # Flash Attention
  use_flash_attention: true
  flash_attention_version: 2

  # Kernel Fusion
  fused_adam: true
  fused_lamb: false

  # Communication
  overlap_communication: true
  bucket_size_mb: 25

# Safety and Alignment
safety:
  # Content Filtering
  content_filters:
    - toxicity_classifier
    - bias_detector
    - pii_detector

  # Constitutional AI
  constitutional_principles:
    - harmlessness
    - helpfulness
    - honesty

  # RLHF Configuration
  rlhf:
    enabled: false
    reward_model: null
    ppo_epochs: 4
    kl_coefficient: 0.1
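
To catch drift between these sections, here is a minimal sanity-check sketch in Python. It assumes PyYAML is installed and the file is saved as trainer_config.yaml; the cross-field consistency rules are editorial assumptions about how a trainer would consume this config, not a documented Helion API.

import yaml  # assumes PyYAML (pip install pyyaml)

with open("trainer_config.yaml") as f:
    cfg = yaml.safe_load(f)

model = cfg["model"]
train = cfg["training"]
infra = cfg["infrastructure"]

# Attention geometry: head_dim must divide evenly, and GQA requires the
# query-head count to be a multiple of the KV-head count.
assert model["hidden_size"] % model["num_attention_heads"] == 0
assert model["num_attention_heads"] % model["num_key_value_heads"] == 0

# Topology: total GPU count should equal nodes * GPUs per node.
assert infra["num_gpus"] == infra["num_nodes"] * infra["gpus_per_node"]

# Dataset mixture weights should sum to 1.0.
weights = sum(d["weight"] for d in cfg["data"]["datasets"])
assert abs(weights - 1.0) < 1e-9, f"mixture weights sum to {weights}"

# Stage step budgets should sum to training.max_steps.
stage_steps = sum(s["steps"] for s in cfg["stages"])
assert stage_steps == train["max_steps"], (stage_steps, train["max_steps"])

# Tokens per optimizer step if all GPUs are data-parallel, printed for
# comparison against the declared global_batch_size (the two disagree in
# this file, so it is worth confirming which value the launcher enforces).
tokens = (train["per_device_train_batch_size"]
          * train["gradient_accumulation_steps"]
          * infra["num_gpus"]
          * train["max_sequence_length"])
print(f"derived tokens/step: {tokens:,}  declared: {train['global_batch_size']:,}")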