Create trainer_config.yaml
trainer_config.yaml (new file, +260 −0)
# Helion-V1.5-XL Training Configuration

# Model Architecture
model:
  model_type: helion
  vocab_size: 100000
  hidden_size: 6144
  intermediate_size: 24576
  num_hidden_layers: 48
  num_attention_heads: 32
  num_key_value_heads: 8
  max_position_embeddings: 16384
  rope_theta: 10000.0
  rope_scaling:
    type: linear
    factor: 2.0
  hidden_act: silu
  initializer_range: 0.02
  rms_norm_eps: 1.0e-6
  use_cache: true
  tie_word_embeddings: false
  attention_bias: false
  attention_dropout: 0.0
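
# Derived shapes, worked out from the values above: head_dim =
# hidden_size / num_attention_heads = 6144 / 32 = 192, and with
# num_key_value_heads: 8 each key/value head serves 32 / 8 = 4 query
# heads (grouped-query attention). A minimal loader-side sanity check
# (a sketch only; the loader is hypothetical and assumes PyYAML):
#
#   import yaml
#
#   with open("trainer_config.yaml") as f:
#       cfg = yaml.safe_load(f)
#   m = cfg["model"]
#   assert m["hidden_size"] % m["num_attention_heads"] == 0
#   assert m["num_attention_heads"] % m["num_key_value_heads"] == 0
#   head_dim = m["hidden_size"] // m["num_attention_heads"]  # 192
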
# Training Configuration
training:
  # Optimization
  optimizer: adamw
  learning_rate: 3.0e-4
  weight_decay: 0.1
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning Rate Schedule
  lr_scheduler_type: cosine
  warmup_steps: 2000
  min_learning_rate: 3.0e-5

  # Batch Configuration
  per_device_train_batch_size: 32
  gradient_accumulation_steps: 8
  global_batch_size: 4194304  # in tokens
  max_sequence_length: 4096

  # Training Steps
  max_steps: 875000
  save_steps: 5000
  eval_steps: 1000
  logging_steps: 100

  # Mixed Precision
  fp16: false
  bf16: true
  tf32: true

  # Distributed Training
  distributed_strategy: fsdp
  fsdp_config:
    fsdp_transformer_layer_cls_to_wrap: HelionDecoderLayer
    fsdp_backward_prefetch: backward_pre
    fsdp_state_dict_type: FULL_STATE_DICT
    fsdp_cpu_offload: false

  # Gradient Checkpointing
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false

  # Compilation
  torch_compile: true
  torch_compile_backend: inductor
  torch_compile_mode: max-autotune
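
# Batch accounting (a sketch; assumes the common convention that tokens
# per optimizer step = per_device_train_batch_size x
# gradient_accumulation_steps x world_size x max_sequence_length):
#
#   t = cfg["training"]
#   tokens_per_step = (t["per_device_train_batch_size"]
#                      * t["gradient_accumulation_steps"]
#                      * 512                      # infrastructure.num_gpus
#                      * t["max_sequence_length"])
#   # 32 * 8 * 512 * 4096 = 536,870,912, which does not equal the stated
#   # global_batch_size of 4,194,304 tokens; under this convention a
#   # launcher should treat one of the two as derived rather than trusting both.
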
# Data Configuration
data:
  # Dataset Mixing Ratios
  datasets:
    - name: web_text
      weight: 0.45
      sources:
        - common_crawl_filtered
        - c4
        - redpajama_web

    - name: books
      weight: 0.20
      sources:
        - books3
        - gutenberg
        - bookcorpus

    - name: code
      weight: 0.15
      sources:
        - github_code
        - stack_overflow
        - starcoder_data

    - name: scientific
      weight: 0.10
      sources:
        - arxiv
        - pubmed
        - semantic_scholar

    - name: instruction
      weight: 0.08
      sources:
        - openorca
        - ultrachat
        - wizardlm
        - alpaca

    - name: multilingual
      weight: 0.02
      sources:
        - mc4_multilingual
        - wikipedia_multilingual

  # Data Processing
  preprocessing:
    tokenizer: helion_tokenizer
    max_length: 4096
    padding: false
    truncation: true

  # Data Quality
  quality_filters:
    - deduplication: true
      dedup_threshold: 0.85
    - min_token_length: 50
    - max_token_length: 8192
    - perplexity_filter: true
      perplexity_threshold: 1500
    - toxicity_filter: true
      toxicity_threshold: 0.5
    - pii_removal: true
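
# The mixing weights above sum to 1.0 (0.45 + 0.20 + 0.15 + 0.10 + 0.08
# + 0.02), so each weight can be read directly as a sampling
# probability. A loader-side guard (sketch, reusing the cfg dict loaded
# above):
#
#   import math
#   total = sum(d["weight"] for d in cfg["data"]["datasets"])
#   assert math.isclose(total, 1.0), f"dataset weights sum to {total}"
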
# Infrastructure
infrastructure:
  # Compute
  num_gpus: 512
  gpu_type: A100-80GB
  num_nodes: 64
  gpus_per_node: 8

  # Networking
  interconnect: infiniband
  bandwidth_per_gpu: 400  # Gbps
  communication_backend: nccl

  # Storage
  checkpoint_dir: /mnt/checkpoints/helion-v15-xl
  data_dir: /mnt/data/training_corpus
  tensorboard_dir: /mnt/logs/tensorboard

  # Monitoring
  wandb_project: helion-v15-xl
  wandb_entity: deepxr-research
  log_level: info
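
# Topology check: num_nodes x gpus_per_node = 64 x 8 = 512, matching
# num_gpus. A launcher-side guard (sketch):
#
#   infra = cfg["infrastructure"]
#   assert infra["num_nodes"] * infra["gpus_per_node"] == infra["num_gpus"]
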
# Evaluation
evaluation:
  eval_datasets:
    - mmlu
    - hellaswag
    - arc_challenge
    - arc_easy
    - truthfulqa
    - gsm8k
    - humaneval
    - mbpp

  eval_batch_size: 16
  eval_accumulation_steps: 4

  # Few-shot Configuration
  few_shot_examples:
    mmlu: 5
    hellaswag: 10
    arc_challenge: 25
    arc_easy: 25
    gsm8k: 8
    humaneval: 0
    mbpp: 0
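
# The shot counts above mirror common leaderboard settings (5-shot MMLU,
# 25-shot ARC, 8-shot GSM8K, 0-shot code tasks); truthfulqa has no
# entry, so a sensible default is 0. Dispatch sketch (run_task is a
# hypothetical harness entry point, not defined in this repo):
#
#   ev = cfg["evaluation"]
#   for task in ev["eval_datasets"]:
#       run_task(task,
#                num_fewshot=ev["few_shot_examples"].get(task, 0),
#                batch_size=ev["eval_batch_size"])
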
# Fine-tuning Stages
stages:
  # Stage 1: Pre-training
  - name: pretraining
    steps: 750000
    data_mix: [web_text, books, code, scientific]
    learning_rate: 3.0e-4

  # Stage 2: Domain Adaptation
  - name: domain_adaptation
    steps: 80000
    data_mix: [code, scientific]
    learning_rate: 1.0e-4

  # Stage 3: Instruction Tuning
  - name: instruction_tuning
    steps: 45000
    data_mix: [instruction]
    learning_rate: 5.0e-5
    lr_scheduler_type: linear
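
# The stage budgets sum exactly to training.max_steps: 750,000 + 80,000
# + 45,000 = 875,000. Guard for that invariant (sketch):
#
#   assert sum(s["steps"] for s in cfg["stages"]) == cfg["training"]["max_steps"]
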
# Checkpointing
checkpointing:
  save_total_limit: 10
  save_strategy: steps
  load_best_model_at_end: true
  metric_for_best_model: eval_loss
  greater_is_better: false

  # Resume Training
  resume_from_checkpoint: null
  auto_resume: true
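
# Resume sketch (assumption: auto_resume means "use the newest
# checkpoint-<step> directory under checkpoint_dir when
# resume_from_checkpoint is null"; the naming scheme is hypothetical):
#
#   import os, re
#   ckpt_dir = cfg["infrastructure"]["checkpoint_dir"]
#   ckpts = sorted((d for d in os.listdir(ckpt_dir)
#                   if re.fullmatch(r"checkpoint-\d+", d)),
#                  key=lambda d: int(d.split("-")[1]))
#   resume = (cfg["checkpointing"]["resume_from_checkpoint"]
#             or (os.path.join(ckpt_dir, ckpts[-1]) if ckpts else None))
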
# Hardware Optimization
optimization:
  # Memory Optimization
  activation_checkpointing: true
  cpu_offload: false
  zero_stage: 2

  # Flash Attention
  use_flash_attention: true
  flash_attention_version: 2

  # Kernel Fusion
  fused_adam: true
  fused_lamb: false

  # Communication
  overlap_communication: true
  bucket_size_mb: 25
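
# Note: zero_stage: 2 (shard optimizer state and gradients) alongside
# training.distributed_strategy: fsdp maps most closely onto FSDP's
# SHARD_GRAD_OP sharding strategy; one plausible translation (sketch,
# assumes PyTorch FSDP):
#
#   from torch.distributed.fsdp import ShardingStrategy
#   strategy = {2: ShardingStrategy.SHARD_GRAD_OP,
#               3: ShardingStrategy.FULL_SHARD}[cfg["optimization"]["zero_stage"]]
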
# Safety and Alignment
safety:
  # Content Filtering
  content_filters:
    - toxicity_classifier
    - bias_detector
    - pii_detector

  # Constitutional AI
  constitutional_principles:
    - harmlessness
    - helpfulness
    - honesty

  # RLHF Configuration
  rlhf:
    enabled: false
    reward_model: null
    ppo_epochs: 4
    kl_coefficient: 0.1
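
# With enabled: false the rlhf block is inert; if switched on, these
# names suggest the standard PPO reward shaping (a sketch of the usual
# convention, not code from this repo):
#
#   shaped_reward = reward_model_score \
#       - kl_coefficient * (logprob_policy - logprob_reference)
#
# i.e. kl_coefficient: 0.1 weights the per-token KL penalty against the
# frozen reference policy, applied over ppo_epochs: 4 PPO epochs per batch.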