Trouter-Library committed on
Commit
e95d324
·
verified ·
1 Parent(s): cdfeef5

Create quantization_config.json

Browse files
Files changed (1) hide show
  1. quantization_config.json +228 -0
quantization_config.json ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_metadata": {
3
+ "description": "Quantization configuration for Helion-V2.0-Thinking model",
4
+ "version": "1.0",
5
+ "supported_backends": ["bitsandbytes", "gptq", "awq", "gguf"]
6
+ },
7
+
8
+ "bitsandbytes": {
9
+ "4bit": {
10
+ "load_in_4bit": true,
11
+ "bnb_4bit_quant_type": "nf4",
12
+ "bnb_4bit_compute_dtype": "bfloat16",
13
+ "bnb_4bit_use_double_quant": true,
14
+ "bnb_4bit_quant_storage": "uint8",
15
+ "llm_int8_threshold": 6.0,
16
+ "llm_int8_skip_modules": ["lm_head", "vision_tower"],
17
+ "estimated_vram_usage_gb": 8,
18
+ "performance_impact": "minimal",
19
+ "quality_retention": 0.98
20
+ },
21
+ "8bit": {
22
+ "load_in_8bit": true,
23
+ "llm_int8_threshold": 6.0,
24
+ "llm_int8_has_fp16_weight": false,
25
+ "llm_int8_enable_fp32_cpu_offload": true,
26
+ "llm_int8_skip_modules": ["lm_head"],
27
+ "estimated_vram_usage_gb": 14,
28
+ "performance_impact": "low",
29
+ "quality_retention": 0.99
30
+ }
31
+ },
32
+
33
+ "gptq": {
34
+ "bits": 4,
35
+ "group_size": 128,
36
+ "damp_percent": 0.01,
37
+ "desc_act": true,
38
+ "sym": true,
39
+ "true_sequential": true,
40
+ "model_name_or_path": null,
41
+ "model_file_base_name": "model",
42
+ "quant_method": "gptq",
43
+ "disable_exllama": false,
44
+ "exllama_config": {
45
+ "version": 2,
46
+ "max_input_len": 200000,
47
+ "max_batch_size": 1
48
+ },
49
+ "estimated_vram_usage_gb": 7,
50
+ "performance_boost": "high",
51
+ "quality_retention": 0.97
52
+ },
53
+
54
+ "awq": {
55
+ "version": "gemm",
56
+ "bits": 4,
57
+ "group_size": 128,
58
+ "zero_point": true,
59
+ "q_group_size": 128,
60
+ "w_bit": 4,
61
+ "modules_to_not_convert": ["lm_head"],
62
+ "fuse_max_seq_len": 200000,
63
+ "do_fuse": true,
64
+ "estimated_vram_usage_gb": 6,
65
+ "performance_boost": "very_high",
66
+ "quality_retention": 0.975
67
+ },
68
+
69
+ "gguf": {
70
+ "quantization_schemes": {
71
+ "Q4_K_M": {
72
+ "description": "Medium quality 4-bit quantization",
73
+ "bits_per_weight": 4.5,
74
+ "estimated_size_gb": 6.2,
75
+ "quality_retention": 0.97,
76
+ "speed": "fast"
77
+ },
78
+ "Q5_K_M": {
79
+ "description": "Medium quality 5-bit quantization",
80
+ "bits_per_weight": 5.5,
81
+ "estimated_size_gb": 7.8,
82
+ "quality_retention": 0.98,
83
+ "speed": "medium"
84
+ },
85
+ "Q6_K": {
86
+ "description": "High quality 6-bit quantization",
87
+ "bits_per_weight": 6.5,
88
+ "estimated_size_gb": 9.1,
89
+ "quality_retention": 0.99,
90
+ "speed": "medium"
91
+ },
92
+ "Q8_0": {
93
+ "description": "Very high quality 8-bit quantization",
94
+ "bits_per_weight": 8.5,
95
+ "estimated_size_gb": 11.4,
96
+ "quality_retention": 0.995,
97
+ "speed": "slower"
98
+ }
99
+ },
100
+ "recommended": "Q4_K_M",
101
+ "context_length": 200000,
102
+ "rope_freq_base": 500000.0,
103
+ "rope_scaling_type": "linear",
104
+ "rope_scaling_factor": 8.0
105
+ },
106
+
107
+ "dynamic_quantization": {
108
+ "enabled": false,
109
+ "target_dtype": "int8",
110
+ "qconfig_spec": {
111
+ "": {
112
+ "dtype": "qint8",
113
+ "qscheme": "per_tensor_symmetric"
114
+ }
115
+ },
116
+ "modules_to_quantize": [
117
+ "q_proj",
118
+ "k_proj",
119
+ "v_proj",
120
+ "o_proj",
121
+ "gate_proj",
122
+ "up_proj",
123
+ "down_proj"
124
+ ]
125
+ },
126
+
127
+ "mixed_precision": {
128
+ "fp16": {
129
+ "enabled": true,
130
+ "opt_level": "O2",
131
+ "keep_batchnorm_fp32": true,
132
+ "loss_scale": "dynamic"
133
+ },
134
+ "bf16": {
135
+ "enabled": true,
136
+ "full_bf16": false
137
+ }
138
+ },
139
+
140
+ "optimization_targets": {
141
+ "latency": {
142
+ "recommended_quantization": "awq",
143
+ "recommended_bits": 4,
144
+ "enable_flash_attention": true,
145
+ "enable_torch_compile": true
146
+ },
147
+ "memory": {
148
+ "recommended_quantization": "gptq",
149
+ "recommended_bits": 4,
150
+ "enable_cpu_offload": true,
151
+ "enable_disk_offload": false
152
+ },
153
+ "quality": {
154
+ "recommended_quantization": "bitsandbytes_8bit",
155
+ "recommended_bits": 8,
156
+ "use_double_quant": true
157
+ },
158
+ "balanced": {
159
+ "recommended_quantization": "bitsandbytes_4bit",
160
+ "recommended_bits": 4,
161
+ "use_double_quant": true,
162
+ "compute_dtype": "bfloat16"
163
+ }
164
+ },
165
+
166
+ "calibration": {
167
+ "dataset": "c4",
168
+ "num_samples": 128,
169
+ "seq_len": 2048,
170
+ "use_vision_calibration": true,
171
+ "vision_calibration_images": 256
172
+ },
173
+
174
+ "hardware_recommendations": {
175
+ "rtx_4090": {
176
+ "recommended_config": "bitsandbytes_4bit",
177
+ "max_batch_size": 2,
178
+ "expected_tokens_per_sec": 89
179
+ },
180
+ "rtx_4080": {
181
+ "recommended_config": "bitsandbytes_8bit",
182
+ "max_batch_size": 1,
183
+ "expected_tokens_per_sec": 67
184
+ },
185
+ "rtx_4070": {
186
+ "recommended_config": "bitsandbytes_4bit",
187
+ "max_batch_size": 1,
188
+ "expected_tokens_per_sec": 52
189
+ },
190
+ "a100_40gb": {
191
+ "recommended_config": "fp16",
192
+ "max_batch_size": 4,
193
+ "expected_tokens_per_sec": 156
194
+ },
195
+ "a100_80gb": {
196
+ "recommended_config": "bf16",
197
+ "max_batch_size": 8,
198
+ "expected_tokens_per_sec": 289
199
+ }
200
+ },
201
+
202
+ "deployment_scenarios": {
203
+ "production_server": {
204
+ "quantization": "awq",
205
+ "bits": 4,
206
+ "batch_size": 4,
207
+ "priority": "throughput"
208
+ },
209
+ "edge_device": {
210
+ "quantization": "gguf_q4_k_m",
211
+ "bits": 4,
212
+ "batch_size": 1,
213
+ "priority": "memory"
214
+ },
215
+ "research": {
216
+ "quantization": "bitsandbytes_8bit",
217
+ "bits": 8,
218
+ "batch_size": 2,
219
+ "priority": "quality"
220
+ },
221
+ "interactive_demo": {
222
+ "quantization": "bitsandbytes_4bit",
223
+ "bits": 4,
224
+ "batch_size": 1,
225
+ "priority": "latency"
226
+ }
227
+ }
228
+ }