aditi184 committed on
Commit
4bd8611
·
verified ·
1 Parent(s): bbeb535

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ * filter=lfs diff=lfs merge=lfs -text
step200-unsharded/config.yaml ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: en_ru_hi_lr_5e5
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 1
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: moe
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: true
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: rms
29
+ layer_norm_with_affine: true
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: true
32
+ max_sequence_length: 4096
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 50280
37
+ embedding_size: 50304
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: meta
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ moe_num_experts: 64
47
+ moe_top_k: 8
48
+ moe_mlp_impl: sparse
49
+ moe_log_expert_assignment: true
50
+ moe_shared_expert: false
51
+ moe_lbl_in_fp32: false
52
+ moe_loss_weight: 0.01
53
+ moe_zloss_weight: 0.001
54
+ moe_dropless: true
55
+ moe_capacity_factor: 1.25
56
+ scale_emb_init: false
57
+ emb_init_std: null
58
+ norm_after: false
59
+ optimizer:
60
+ name: adamw
61
+ learning_rate: 5.0e-05
62
+ weight_decay: 0.1
63
+ betas:
64
+ - 0.9
65
+ - 0.95
66
+ eps: 1.0e-08
67
+ no_decay_norm_and_bias: null
68
+ selective_updates: false
69
+ decay_norm_and_bias: true
70
+ decay_embeddings: true
71
+ metrics_log_interval: 25
72
+ record_update_metrics: false
73
+ scheduler:
74
+ name: cosine_with_warmup
75
+ units: tokens
76
+ t_warmup: 10485760000
77
+ t_max: 5000000000000.0
78
+ alpha_f: 0.1
79
+ grad_clip_warmup_steps: null
80
+ grad_clip_warmup_factor: null
81
+ warmup_min_lr: null
82
+ data:
83
+ paths:
84
+ - /scratch/k/khandela/text/hi_ru_en/train_tokenized/part-0-00000.npy
85
+ memmap_dtype: uint16
86
+ datasets: null
87
+ label_mask_paths: null
88
+ pad_direction: right
89
+ generate_attention_mask: false
90
+ generate_doc_lengths: false
91
+ num_workers: 32
92
+ drop_last: true
93
+ pin_memory: true
94
+ prefetch_factor: 8
95
+ persistent_workers: true
96
+ timeout: 0
97
+ seed: null
98
+ instance_filter:
99
+ repetition_max_period: 13
100
+ repetition_min_period: 1
101
+ repetition_max_count: 32
102
+ restore_dataloader: true
103
+ fast_forward_batches: null
104
+ evaluators:
105
+ - label: validation-test
106
+ type: lm
107
+ data:
108
+ paths: null
109
+ memmap_dtype: uint16
110
+ datasets:
111
+ english-validation:
112
+ - /scratch/k/khandela/text/hi_ru_en/test_tokenized/en/part-0-00000.npy
113
+ hindi-validation:
114
+ - /scratch/k/khandela/text/hi_ru_en/test_tokenized/hi/part-0-00000.npy
115
+ russian-validation:
116
+ - /scratch/k/khandela/text/hi_ru_en/test_tokenized/ru/part-0-00000.npy
117
+ label_mask_paths: null
118
+ pad_direction: right
119
+ generate_attention_mask: false
120
+ generate_doc_lengths: false
121
+ num_workers: 0
122
+ drop_last: true
123
+ pin_memory: false
124
+ prefetch_factor: null
125
+ persistent_workers: false
126
+ timeout: 0
127
+ seed: null
128
+ instance_filter: null
129
+ device_eval_batch_size: null
130
+ subset_num_batches: null
131
+ eval_interval: 100
132
+ tokenizer:
133
+ identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
134
+ truncate_direction: right
135
+ save_folder: /scratch/k/khandela/runs/en_ru_hi_lr_5e5
136
+ remote_save_folder: null
137
+ canceled_check_interval: 50
138
+ save_interval: 100
139
+ save_interval_unsharded: 100
140
+ save_interval_ephemeral: null
141
+ save_num_checkpoints_to_keep: 2
142
+ save_num_unsharded_checkpoints_to_keep: 2
143
+ save_overwrite: true
144
+ force_save_unsharded: false
145
+ no_pre_train_checkpoint: true
146
+ load_path: /scratch/k/khandela/runs/test_real/step1
147
+ load_path_sharded_checkpointer: null
148
+ try_load_latest_save: false
149
+ reset_optimizer_state: false
150
+ reset_trainer_state: false
151
+ sharded_checkpointer: olmo_core
152
+ new_style_checkpoints: null
153
+ max_duration: 5ep
154
+ global_train_batch_size: 2048
155
+ device_train_batch_size: 128
156
+ device_train_microbatch_size: 4
157
+ device_eval_batch_size: 4
158
+ eval_subset_num_batches: -1
159
+ eval_on_load: true
160
+ device_train_grad_accum: 32
161
+ max_grad_norm: 1.0
162
+ max_grad_norm_ratio: null
163
+ precision: amp_bf16
164
+ wandb:
165
+ project: en_ru_hi_lr_5e5
166
+ entity: aditi-khandelwal-mcgill-university
167
+ group: aditi-khandelwal-mcgill-university
168
+ name: en_ru_hi_lr_5e5
169
+ tags:
170
+ - experiment1
171
+ log_artifacts: false
172
+ rank_zero_only: true
173
+ log_interval: 1
174
+ speed_monitor:
175
+ window_size: 1
176
+ gpu_flops_available: null
177
+ console_log_interval: 1
178
+ gen1_gc_interval: 1
179
+ compile: null
180
+ distributed_strategy: fsdp
181
+ fsdp:
182
+ use_orig_params: true
183
+ sharding_strategy: FULL_SHARD
184
+ wrapping_strategy: by_block
185
+ precision: mixed
186
+ hybrid_sharding_num_model_replicas: null
187
+ ddp: null
188
+ softmax_auxiliary_loss: false
189
+ auxiliary_loss_multiplier: 0.0001
190
+ time_limit: null
191
+ extra_steps_after_cancel: 10
192
+ early_stopping_factor: null
193
+ save_data_indices: true
194
+ python_profiling: false
195
+ torch_profiling: false
196
+ stop_at: 315
197
+ stop_after: null
198
+ activation_checkpointing: null
199
+ fused_loss: null
200
+ hf_datasets_cache_dir: null
201
+ module_outputs_save_steps: null
step200-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d16eee4b2eca5170363ddb53a56cbc9f6ca809a89198f19fa4d772eb5f79f26e
3
+ size 27676726178
step200-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a66d47c21a76c27312f9fc39c2de9dfd3644bd28353975259abc9f6812a0cbfd
3
+ size 55353460914
step200-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ef7c3b25c9d1d5c16bee0056844040ccf11750f75dd5b06c61eeb7dd486397b
3
+ size 14668