{ "model_name": "Helion-V1.5-XL", "total_parameters": 16247832576, "trainable_parameters": 16247832576, "non_trainable_parameters": 0, "memory_footprint": { "model_weights": { "fp32": { "size_gb": 64.991, "size_bytes": 64991330304, "bits_per_param": 32 }, "fp16": { "size_gb": 32.496, "size_bytes": 32495665152, "bits_per_param": 16 }, "bf16": { "size_gb": 32.496, "size_bytes": 32495665152, "bits_per_param": 16 }, "int8": { "size_gb": 16.248, "size_bytes": 16247832576, "bits_per_param": 8 }, "int4": { "size_gb": 9.124, "size_bytes": 9124416288, "bits_per_param": 4.5, "note": "Includes quantization overhead" } }, "inference_memory": { "fp32": { "static_memory_gb": 64.991, "kv_cache_per_token_mb": 0.590, "activation_memory_gb": 2.048, "total_memory_gb": 67.039, "recommended_vram_gb": 80 }, "bf16": { "static_memory_gb": 32.496, "kv_cache_per_token_mb": 0.295, "activation_memory_gb": 1.024, "total_memory_gb": 33.520, "recommended_vram_gb": 40 }, "int8": { "static_memory_gb": 16.248, "kv_cache_per_token_mb": 0.295, "activation_memory_gb": 0.768, "total_memory_gb": 17.016, "recommended_vram_gb": 24 }, "int4": { "static_memory_gb": 9.124, "kv_cache_per_token_mb": 0.295, "activation_memory_gb": 0.512, "total_memory_gb": 9.636, "recommended_vram_gb": 12 } }, "training_memory": { "model_states": { "model_parameters_gb": 32.496, "gradients_gb": 32.496, "optimizer_states_gb": 129.983, "total_gb": 194.975 }, "activation_memory": { "per_layer_mb": 147.456, "total_layers": 48, "gradient_checkpointing_enabled": true, "with_checkpointing_gb": 3.538, "without_checkpointing_gb": 7.077 }, "total_per_gpu": { "with_gradient_checkpointing_gb": 198.513, "without_gradient_checkpointing_gb": 202.052, "recommended_vram_gb": 80, "batch_size_per_gpu": 1 } } }, "layer_breakdown": { "embedding_layer": { "parameters": 614400000, "memory_bf16_mb": 1228.8 }, "decoder_layers": { "total_layers": 48, "parameters_per_layer": 325640192, "memory_per_layer_bf16_mb": 651.28, "total_parameters": 15630729216, 
"total_memory_bf16_mb": 31261.44 }, "output_layer": { "lm_head_parameters": 614400000, "memory_bf16_mb": 1228.8, "note": "Weights tied with embeddings; shared parameters counted once in total_parameters" }, "normalization_layers": { "parameters": 2703360, "memory_bf16_mb": 5.41 } }, "component_breakdown": { "attention_layers": { "q_proj": { "shape": [6144, 6144], "parameters_per_layer": 37748736, "total_parameters": 1811939328 }, "k_proj": { "shape": [6144, 1536], "parameters_per_layer": 9437184, "total_parameters": 452984832 }, "v_proj": { "shape": [6144, 1536], "parameters_per_layer": 9437184, "total_parameters": 452984832 }, "o_proj": { "shape": [6144, 6144], "parameters_per_layer": 37748736, "total_parameters": 1811939328 }, "total_attention_parameters": 4529848320 }, "mlp_layers": { "gate_proj": { "shape": [6144, 24576], "parameters_per_layer": 150994944, "total_parameters": 7247757312 }, "up_proj": { "shape": [6144, 24576], "parameters_per_layer": 150994944, "total_parameters": 7247757312 }, "down_proj": { "shape": [24576, 6144], "parameters_per_layer": 150994944, "total_parameters": 7247757312 }, "total_mlp_parameters": 21743271936 } }, "kv_cache_specifications": { "num_layers": 48, "num_kv_heads": 8, "head_dim": 192, "hidden_size_kv": 1536, "cache_size_per_token": { "bf16_bytes": 294912, "bf16_mb": 0.28125, "fp32_bytes": 589824, "fp32_mb": 0.5625 }, "max_context_length": 16384, "max_cache_size": { "bf16_gb": 4.832, "fp32_gb": 9.664 } }, "inference_benchmarks": { "hardware_profiles": [ { "gpu": "NVIDIA A100 80GB", "precision": "bf16", "batch_size": 1, "context_length": 2048, "tokens_per_second": 47.3, "latency_ms": 21.1, "memory_used_gb": 34.2 }, { "gpu": "NVIDIA A100 80GB", "precision": "int8", "batch_size": 1, "context_length": 2048, "tokens_per_second": 89.6, "latency_ms": 11.2, "memory_used_gb": 17.8 }, { "gpu": "NVIDIA A100 80GB", "precision": "int4", "batch_size": 1, "context_length": 2048, "tokens_per_second": 134.2, "latency_ms": 7.5, "memory_used_gb": 10.4 }, { "gpu": "NVIDIA H100 80GB", 
"precision": "bf16", "batch_size": 1, "context_length": 2048, "tokens_per_second": 78.1, "latency_ms": 12.8, "memory_used_gb": 34.2 }, { "gpu": "NVIDIA H100 80GB", "precision": "int4", "batch_size": 1, "context_length": 2048, "tokens_per_second": 218.7, "latency_ms": 4.6, "memory_used_gb": 10.4 }, { "gpu": "NVIDIA RTX 4090", "precision": "int4", "batch_size": 1, "context_length": 2048, "tokens_per_second": 87.3, "latency_ms": 11.5, "memory_used_gb": 10.4 } ] }, "optimization_recommendations": { "for_inference": { "under_12gb": "Use int4 quantization with context length <= 2048", "12gb_to_24gb": "Use int8 quantization or int4 with longer context", "24gb_to_40gb": "Use bf16 precision for best quality", "over_40gb": "Use bf16 with large batch sizes or long contexts" }, "for_fine_tuning": { "lora": { "minimum_vram_gb": 24, "recommended_vram_gb": 40, "trainable_parameters_percent": 0.1 }, "qlora": { "minimum_vram_gb": 16, "recommended_vram_gb": 24, "base_precision": "int4", "adapter_precision": "bf16" }, "full_fine_tuning": { "minimum_vram_gb": 80, "recommended_setup": "Multi-GPU with FSDP", "gpus_required": 8 } } }, "memory_efficient_techniques": { "quantization": { "int8": { "memory_reduction": "50%", "quality_impact": "minimal", "speedup": "1.9x" }, "int4": { "memory_reduction": "72%", "quality_impact": "slight degradation", "speedup": "2.8x" } }, "flash_attention": { "memory_reduction": "proportional to sequence length", "speedup": "2-3x for long sequences", "supported": true }, "gradient_checkpointing": { "memory_reduction": "50% activation memory", "speed_penalty": "20-30% slower", "recommended_for_training": true }, "cpu_offloading": { "memory_reduction": "up to 80%", "speed_penalty": "10-50x slower", "use_case": "inference on limited hardware" } } }