Trouter-Library committed on
Commit
e95d324
·
verified ·
1 Parent(s): cdfeef5

Create quantization_config.json

Browse files
Files changed (1) hide show
  1. quantization_config.json +228 -0
quantization_config.json ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_metadata": {
3
+ "description": "Quantization configuration for Helion-V2.0-Thinking model",
4
+ "version": "1.0",
5
+ "supported_backends": ["bitsandbytes", "gptq", "awq", "gguf"]
6
+ },
7
+
8
+ "bitsandbytes": {
9
+ "4bit": {
10
+ "load_in_4bit": true,
11
+ "bnb_4bit_quant_type": "nf4",
12
+ "bnb_4bit_compute_dtype": "bfloat16",
13
+ "bnb_4bit_use_double_quant": true,
14
+ "bnb_4bit_quant_storage": "uint8",
15
+ "llm_int8_threshold": 6.0,
16
+ "llm_int8_skip_modules": ["lm_head", "vision_tower"],
17
+ "estimated_vram_usage_gb": 8,
18
+ "performance_impact": "minimal",
19
+ "quality_retention": 0.98
20
+ },
21
+ "8bit": {
22
+ "load_in_8bit": true,
23
+ "llm_int8_threshold": 6.0,
24
+ "llm_int8_has_fp16_weight": false,
25
+ "llm_int8_enable_fp32_cpu_offload": true,
26
+ "llm_int8_skip_modules": ["lm_head"],
27
+ "estimated_vram_usage_gb": 14,
28
+ "performance_impact": "low",
29
+ "quality_retention": 0.99
30
+ }
31
+ },
32
+
33
+ "gptq": {
34
+ "bits": 4,
35
+ "group_size": 128,
36
+ "damp_percent": 0.01,
37
+ "desc_act": true,
38
+ "sym": true,
39
+ "true_sequential": true,
40
+ "model_name_or_path": null,
41
+ "model_file_base_name": "model",
42
+ "quant_method": "gptq",
43
+ "disable_exllama": false,
44
+ "exllama_config": {
45
+ "version": 2,
46
+ "max_input_len": 200000,
47
+ "max_batch_size": 1
48
+ },
49
+ "estimated_vram_usage_gb": 7,
50
+ "performance_boost": "high",
51
+ "quality_retention": 0.97
52
+ },
53
+
54
+ "awq": {
55
+ "version": "gemm",
56
+ "bits": 4,
57
+ "group_size": 128,
58
+ "zero_point": true,
59
+ "q_group_size": 128,
60
+ "w_bit": 4,
61
+ "modules_to_not_convert": ["lm_head"],
62
+ "fuse_max_seq_len": 200000,
63
+ "do_fuse": true,
64
+ "estimated_vram_usage_gb": 6,
65
+ "performance_boost": "very_high",
66
+ "quality_retention": 0.975
67
+ },
68
+
69
+ "gguf": {
70
+ "quantization_schemes": {
71
+ "Q4_K_M": {
72
+ "description": "Medium quality 4-bit quantization",
73
+ "bits_per_weight": 4.5,
74
+ "estimated_size_gb": 6.2,
75
+ "quality_retention": 0.97,
76
+ "speed": "fast"
77
+ },
78
+ "Q5_K_M": {
79
+ "description": "Medium quality 5-bit quantization",
80
+ "bits_per_weight": 5.5,
81
+ "estimated_size_gb": 7.8,
82
+ "quality_retention": 0.98,
83
+ "speed": "medium"
84
+ },
85
+ "Q6_K": {
86
+ "description": "High quality 6-bit quantization",
87
+ "bits_per_weight": 6.5,
88
+ "estimated_size_gb": 9.1,
89
+ "quality_retention": 0.99,
90
+ "speed": "medium"
91
+ },
92
+ "Q8_0": {
93
+ "description": "Very high quality 8-bit quantization",
94
+ "bits_per_weight": 8.5,
95
+ "estimated_size_gb": 11.4,
96
+ "quality_retention": 0.995,
97
+ "speed": "slower"
98
+ }
99
+ },
100
+ "recommended": "Q4_K_M",
101
+ "context_length": 200000,
102
+ "rope_freq_base": 500000.0,
103
+ "rope_scaling_type": "linear",
104
+ "rope_scaling_factor": 8.0
105
+ },
106
+
107
+ "dynamic_quantization": {
108
+ "enabled": false,
109
+ "target_dtype": "int8",
110
+ "qconfig_spec": {
111
+ "": {
112
+ "dtype": "qint8",
113
+ "qscheme": "per_tensor_symmetric"
114
+ }
115
+ },
116
+ "modules_to_quantize": [
117
+ "q_proj",
118
+ "k_proj",
119
+ "v_proj",
120
+ "o_proj",
121
+ "gate_proj",
122
+ "up_proj",
123
+ "down_proj"
124
+ ]
125
+ },
126
+
127
+ "mixed_precision": {
128
+ "fp16": {
129
+ "enabled": true,
130
+ "opt_level": "O2",
131
+ "keep_batchnorm_fp32": true,
132
+ "loss_scale": "dynamic"
133
+ },
134
+ "bf16": {
135
+ "enabled": true,
136
+ "full_bf16": false
137
+ }
138
+ },
139
+
140
+ "optimization_targets": {
141
+ "latency": {
142
+ "recommended_quantization": "awq",
143
+ "recommended_bits": 4,
144
+ "enable_flash_attention": true,
145
+ "enable_torch_compile": true
146
+ },
147
+ "memory": {
148
+ "recommended_quantization": "gptq",
149
+ "recommended_bits": 4,
150
+ "enable_cpu_offload": true,
151
+ "enable_disk_offload": false
152
+ },
153
+ "quality": {
154
+ "recommended_quantization": "bitsandbytes_8bit",
155
+ "recommended_bits": 8,
156
+ "use_double_quant": true
157
+ },
158
+ "balanced": {
159
+ "recommended_quantization": "bitsandbytes_4bit",
160
+ "recommended_bits": 4,
161
+ "use_double_quant": true,
162
+ "compute_dtype": "bfloat16"
163
+ }
164
+ },
165
+
166
+ "calibration": {
167
+ "dataset": "c4",
168
+ "num_samples": 128,
169
+ "seq_len": 2048,
170
+ "use_vision_calibration": true,
171
+ "vision_calibration_images": 256
172
+ },
173
+
174
+ "hardware_recommendations": {
175
+ "rtx_4090": {
176
+ "recommended_config": "bitsandbytes_4bit",
177
+ "max_batch_size": 2,
178
+ "expected_tokens_per_sec": 89
179
+ },
180
+ "rtx_4080": {
181
+ "recommended_config": "bitsandbytes_8bit",
182
+ "max_batch_size": 1,
183
+ "expected_tokens_per_sec": 67
184
+ },
185
+ "rtx_4070": {
186
+ "recommended_config": "bitsandbytes_4bit",
187
+ "max_batch_size": 1,
188
+ "expected_tokens_per_sec": 52
189
+ },
190
+ "a100_40gb": {
191
+ "recommended_config": "fp16",
192
+ "max_batch_size": 4,
193
+ "expected_tokens_per_sec": 156
194
+ },
195
+ "a100_80gb": {
196
+ "recommended_config": "bf16",
197
+ "max_batch_size": 8,
198
+ "expected_tokens_per_sec": 289
199
+ }
200
+ },
201
+
202
+ "deployment_scenarios": {
203
+ "production_server": {
204
+ "quantization": "awq",
205
+ "bits": 4,
206
+ "batch_size": 4,
207
+ "priority": "throughput"
208
+ },
209
+ "edge_device": {
210
+ "quantization": "gguf_q4_k_m",
211
+ "bits": 4,
212
+ "batch_size": 1,
213
+ "priority": "memory"
214
+ },
215
+ "research": {
216
+ "quantization": "bitsandbytes_8bit",
217
+ "bits": 8,
218
+ "batch_size": 2,
219
+ "priority": "quality"
220
+ },
221
+ "interactive_demo": {
222
+ "quantization": "bitsandbytes_4bit",
223
+ "bits": 4,
224
+ "batch_size": 1,
225
+ "priority": "latency"
226
+ }
227
+ }
228
+ }