Trouter-Library committed on
Commit
1e51c60
·
verified ·
1 Parent(s): fdac17c

Create model_memory.json

Browse files
Files changed (1) hide show
  1. model_memory.json +291 -0
model_memory.json ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Helion-V1.5-XL",
3
+ "total_parameters": 16247832576,
4
+ "trainable_parameters": 16247832576,
5
+ "non_trainable_parameters": 0,
6
+
7
+ "memory_footprint": {
8
+ "model_weights": {
9
+ "fp32": {
10
+ "size_gb": 64.991,
11
+ "size_bytes": 64991330304,
12
+ "bits_per_param": 32
13
+ },
14
+ "fp16": {
15
+ "size_gb": 32.496,
16
+ "size_bytes": 32495665152,
17
+ "bits_per_param": 16
18
+ },
19
+ "bf16": {
20
+ "size_gb": 32.496,
21
+ "size_bytes": 32495665152,
22
+ "bits_per_param": 16
23
+ },
24
+ "int8": {
25
+ "size_gb": 16.248,
26
+ "size_bytes": 16247832576,
27
+ "bits_per_param": 8
28
+ },
29
+ "int4": {
30
+ "size_gb": 9.124,
31
+ "size_bytes": 9124416288,
32
+ "bits_per_param": 4.5,
33
+ "note": "Includes quantization overhead"
34
+ }
35
+ },
36
+
37
+ "inference_memory": {
38
+ "fp32": {
39
+ "static_memory_gb": 64.991,
40
+ "kv_cache_per_token_mb": 0.393,
41
+ "activation_memory_gb": 2.048,
42
+ "total_memory_gb": 67.039,
43
+ "recommended_vram_gb": 80
44
+ },
45
+ "bf16": {
46
+ "static_memory_gb": 32.496,
47
+ "kv_cache_per_token_mb": 0.196,
48
+ "activation_memory_gb": 1.024,
49
+ "total_memory_gb": 33.520,
50
+ "recommended_vram_gb": 40
51
+ },
52
+ "int8": {
53
+ "static_memory_gb": 16.248,
54
+ "kv_cache_per_token_mb": 0.196,
55
+ "activation_memory_gb": 0.768,
56
+ "total_memory_gb": 17.016,
57
+ "recommended_vram_gb": 24
58
+ },
59
+ "int4": {
60
+ "static_memory_gb": 9.124,
61
+ "kv_cache_per_token_mb": 0.196,
62
+ "activation_memory_gb": 0.512,
63
+ "total_memory_gb": 9.636,
64
+ "recommended_vram_gb": 12
65
+ }
66
+ },
67
+
68
+ "training_memory": {
69
+ "model_states": {
70
+ "model_parameters_gb": 32.496,
71
+ "gradients_gb": 32.496,
72
+ "optimizer_states_gb": 129.983,
73
+ "total_gb": 194.975
74
+ },
75
+ "activation_memory": {
76
+ "per_layer_mb": 147.456,
77
+ "total_layers": 48,
78
+ "gradient_checkpointing_enabled": true,
79
+ "with_checkpointing_gb": 3.538,
80
+ "without_checkpointing_gb": 7.077
81
+ },
82
+ "total_per_gpu": {
83
+ "with_gradient_checkpointing_gb": 198.513,
84
+ "without_gradient_checkpointing_gb": 202.052,
85
+ "recommended_vram_gb": 80,
86
+ "batch_size_per_gpu": 1
87
+ }
88
+ }
89
+ },
90
+
91
+ "layer_breakdown": {
92
+ "embedding_layer": {
93
+ "parameters": 614400000,
94
+ "memory_bf16_mb": 1228.8
95
+ },
96
+ "decoder_layers": {
97
+ "total_layers": 48,
98
+ "parameters_per_layer": 325640192,
99
+ "memory_per_layer_bf16_mb": 651.28,
100
+ "total_parameters": 15630729216,
101
+ "total_memory_bf16_mb": 31261.44
102
+ },
103
+ "output_layer": {
104
+ "lm_head_parameters": 614400000,
105
+ "memory_bf16_mb": 1228.8,
106
+ "note": "Weights tied with embeddings; shared lm_head not double-counted in total_parameters"
107
+ },
108
+ "normalization_layers": {
109
+ "parameters": 2703360,
110
+ "memory_bf16_mb": 5.41
111
+ }
112
+ },
113
+
114
+ "component_breakdown": {
115
+ "attention_layers": {
116
+ "q_proj": {
117
+ "shape": [6144, 6144],
118
+ "parameters_per_layer": 37748736,
119
+ "total_parameters": 1811939328
120
+ },
121
+ "k_proj": {
122
+ "shape": [6144, 1536],
123
+ "parameters_per_layer": 9437184,
124
+ "total_parameters": 452984832
125
+ },
126
+ "v_proj": {
127
+ "shape": [6144, 1536],
128
+ "parameters_per_layer": 9437184,
129
+ "total_parameters": 452984832
130
+ },
131
+ "o_proj": {
132
+ "shape": [6144, 6144],
133
+ "parameters_per_layer": 37748736,
134
+ "total_parameters": 1811939328
135
+ },
136
+ "total_attention_parameters": 4529848320
137
+ },
138
+ "mlp_layers": {
139
+ "gate_proj": {
140
+ "shape": [6144, 24576],
141
+ "parameters_per_layer": 150994944,
142
+ "total_parameters": 7247757312
143
+ },
144
+ "up_proj": {
145
+ "shape": [6144, 24576],
146
+ "parameters_per_layer": 150994944,
147
+ "total_parameters": 7247757312
148
+ },
149
+ "down_proj": {
150
+ "shape": [24576, 6144],
151
+ "parameters_per_layer": 150994944,
152
+ "total_parameters": 7247757312
153
+ },
154
+ "total_mlp_parameters": 21743271936
155
+ }
156
+ },
157
+
158
+ "kv_cache_specifications": {
159
+ "num_layers": 48,
160
+ "num_kv_heads": 8,
161
+ "head_dim": 192,
162
+ "hidden_size_kv": 1536,
163
+ "cache_size_per_token": {
164
+ "bf16_bytes": 196608,
165
+ "bf16_mb": 0.196,
166
+ "fp32_bytes": 393216,
167
+ "fp32_mb": 0.393
168
+ },
169
+ "max_context_length": 16384,
170
+ "max_cache_size": {
171
+ "bf16_gb": 3.221,
172
+ "fp32_gb": 6.442
173
+ }
174
+ },
175
+
176
+ "inference_benchmarks": {
177
+ "hardware_profiles": [
178
+ {
179
+ "gpu": "NVIDIA A100 80GB",
180
+ "precision": "bf16",
181
+ "batch_size": 1,
182
+ "context_length": 2048,
183
+ "tokens_per_second": 47.3,
184
+ "latency_ms": 21.1,
185
+ "memory_used_gb": 34.2
186
+ },
187
+ {
188
+ "gpu": "NVIDIA A100 80GB",
189
+ "precision": "int8",
190
+ "batch_size": 1,
191
+ "context_length": 2048,
192
+ "tokens_per_second": 89.6,
193
+ "latency_ms": 11.2,
194
+ "memory_used_gb": 17.8
195
+ },
196
+ {
197
+ "gpu": "NVIDIA A100 80GB",
198
+ "precision": "int4",
199
+ "batch_size": 1,
200
+ "context_length": 2048,
201
+ "tokens_per_second": 134.2,
202
+ "latency_ms": 7.5,
203
+ "memory_used_gb": 10.4
204
+ },
205
+ {
206
+ "gpu": "NVIDIA H100 80GB",
207
+ "precision": "bf16",
208
+ "batch_size": 1,
209
+ "context_length": 2048,
210
+ "tokens_per_second": 78.1,
211
+ "latency_ms": 12.8,
212
+ "memory_used_gb": 34.2
213
+ },
214
+ {
215
+ "gpu": "NVIDIA H100 80GB",
216
+ "precision": "int4",
217
+ "batch_size": 1,
218
+ "context_length": 2048,
219
+ "tokens_per_second": 218.7,
220
+ "latency_ms": 4.6,
221
+ "memory_used_gb": 10.4
222
+ },
223
+ {
224
+ "gpu": "NVIDIA RTX 4090",
225
+ "precision": "int4",
226
+ "batch_size": 1,
227
+ "context_length": 2048,
228
+ "tokens_per_second": 87.3,
229
+ "latency_ms": 11.5,
230
+ "memory_used_gb": 10.4
231
+ }
232
+ ]
233
+ },
234
+
235
+ "optimization_recommendations": {
236
+ "for_inference": {
237
+ "under_12gb": "Use int4 quantization with context length <= 2048",
238
+ "12gb_to_24gb": "Use int8 quantization or int4 with longer context",
239
+ "24gb_to_40gb": "Use bf16 precision for best quality",
240
+ "over_40gb": "Use bf16 with large batch sizes or long contexts"
241
+ },
242
+ "for_fine_tuning": {
243
+ "lora": {
244
+ "minimum_vram_gb": 24,
245
+ "recommended_vram_gb": 40,
246
+ "trainable_parameters_percent": 0.1
247
+ },
248
+ "qlora": {
249
+ "minimum_vram_gb": 16,
250
+ "recommended_vram_gb": 24,
251
+ "base_precision": "int4",
252
+ "adapter_precision": "bf16"
253
+ },
254
+ "full_fine_tuning": {
255
+ "minimum_vram_gb": 80,
256
+ "recommended_setup": "Multi-GPU with FSDP",
257
+ "gpus_required": 8
258
+ }
259
+ }
260
+ },
261
+
262
+ "memory_efficient_techniques": {
263
+ "quantization": {
264
+ "int8": {
265
+ "memory_reduction": "50%",
266
+ "quality_impact": "minimal",
267
+ "speedup": "1.9x"
268
+ },
269
+ "int4": {
270
+ "memory_reduction": "72%",
271
+ "quality_impact": "slight degradation",
272
+ "speedup": "2.8x"
273
+ }
274
+ },
275
+ "flash_attention": {
276
+ "memory_reduction": "proportional to sequence length",
277
+ "speedup": "2-3x for long sequences",
278
+ "supported": true
279
+ },
280
+ "gradient_checkpointing": {
281
+ "memory_reduction": "50% activation memory",
282
+ "speed_penalty": "20-30% slower",
283
+ "recommended_for_training": true
284
+ },
285
+ "cpu_offloading": {
286
+ "memory_reduction": "up to 80%",
287
+ "speed_penalty": "10-50x slower",
288
+ "use_case": "inference on limited hardware"
289
+ }
290
+ }
291
+ }