AlexGall committed on
Commit
c3aebf2
·
verified ·
1 Parent(s): e95d324

Create deployment_config.json

Browse files
Files changed (1) hide show
  1. deployment_config.json +327 -0
deployment_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "deployment_metadata": {
3
+ "model_name": "Helion-V2.0-Thinking",
4
+ "version": "2.0.0",
5
+ "deployment_date": "2024-11-27",
6
+ "supported_frameworks": ["transformers", "vllm", "text-generation-inference", "ollama"],
7
+ "minimum_transformers_version": "4.36.0"
8
+ },
9
+
10
+ "server_configurations": {
11
+ "development": {
12
+ "environment": "dev",
13
+ "host": "0.0.0.0",
14
+ "port": 8000,
15
+ "workers": 1,
16
+ "max_batch_size": 1,
17
+ "max_concurrent_requests": 4,
18
+ "timeout_seconds": 300,
19
+ "enable_cors": true,
20
+ "cors_origins": ["*"],
21
+ "log_level": "DEBUG",
22
+ "cache_enabled": true,
23
+ "metrics_enabled": true
24
+ },
25
+ "production": {
26
+ "environment": "prod",
27
+ "host": "0.0.0.0",
28
+ "port": 8000,
29
+ "workers": 4,
30
+ "max_batch_size": 8,
31
+ "max_concurrent_requests": 32,
32
+ "timeout_seconds": 180,
33
+ "enable_cors": true,
34
+ "cors_origins": ["https://yourdomain.com"],
35
+ "log_level": "INFO",
36
+ "cache_enabled": true,
37
+ "metrics_enabled": true,
38
+ "health_check_enabled": true,
39
+ "auto_scaling": true
40
+ }
41
+ },
42
+
43
+ "vllm_config": {
44
+ "gpu_memory_utilization": 0.9,
45
+ "max_num_seqs": 256,
46
+ "max_num_batched_tokens": 8192,
47
+ "max_model_len": 200000,
48
+ "trust_remote_code": true,
49
+ "tensor_parallel_size": 1,
50
+ "pipeline_parallel_size": 1,
51
+ "dtype": "bfloat16",
52
+ "quantization": null,
53
+ "enforce_eager": false,
54
+ "enable_chunked_prefill": true,
55
+ "max_num_on_the_fly": 8,
56
+ "enable_prefix_caching": true,
57
+ "disable_custom_all_reduce": false
58
+ },
59
+
60
+ "text_generation_inference": {
61
+ "max_concurrent_requests": 128,
62
+ "max_best_of": 4,
63
+ "max_stop_sequences": 4,
64
+ "max_input_length": 199000,
65
+ "max_total_tokens": 200000,
66
+ "waiting_served_ratio": 1.2,
67
+ "max_batch_prefill_tokens": 4096,
68
+ "max_batch_total_tokens": 200000,
69
+ "max_waiting_tokens": 20,
70
+ "hostname": "0.0.0.0",
71
+ "port": 8080,
72
+ "master_shard_uds_path": "/tmp/text-generation-server",
73
+ "tokenizer_name": "DeepXR/Helion-V2.0-Thinking",
74
+ "revision": "main",
75
+ "validation_workers": 2,
76
+ "json_output": false,
77
+ "otlp_endpoint": null,
78
+ "cors_allow_origin": "*",
79
+ "watermark_gamma": null,
80
+ "watermark_delta": null
81
+ },
82
+
83
+ "ollama_modelfile": {
84
+ "from": "DeepXR/Helion-V2.0-Thinking",
85
+ "template": "[INST] {{ .System }} {{ .Prompt }} [/INST]",
86
+ "parameter": {
87
+ "temperature": 0.7,
88
+ "top_p": 0.9,
89
+ "top_k": 50,
90
+ "num_ctx": 200000,
91
+ "num_predict": 2048,
92
+ "stop": ["</s>", "<|end|>"],
93
+ "repeat_penalty": 1.1,
94
+ "seed": -1
95
+ },
96
+ "system": "You are Helion, a helpful AI assistant with vision and tool use capabilities."
97
+ },
98
+
99
+ "api_endpoints": {
100
+ "generate": {
101
+ "path": "/v1/generate",
102
+ "method": "POST",
103
+ "rate_limit": "100/minute",
104
+ "request_schema": {
105
+ "prompt": "string (required)",
106
+ "max_tokens": "integer (optional, default: 1024)",
107
+ "temperature": "float (optional, default: 0.7)",
108
+ "top_p": "float (optional, default: 0.9)",
109
+ "stream": "boolean (optional, default: false)",
110
+ "images": "array<base64> (optional)"
111
+ }
112
+ },
113
+ "chat": {
114
+ "path": "/v1/chat/completions",
115
+ "method": "POST",
116
+ "rate_limit": "100/minute",
117
+ "openai_compatible": true,
118
+ "request_schema": {
119
+ "messages": "array (required)",
120
+ "model": "string (required)",
121
+ "temperature": "float (optional)",
122
+ "stream": "boolean (optional)"
123
+ }
124
+ },
125
+ "embeddings": {
126
+ "path": "/v1/embeddings",
127
+ "method": "POST",
128
+ "rate_limit": "200/minute",
129
+ "enabled": false
130
+ },
131
+ "health": {
132
+ "path": "/health",
133
+ "method": "GET",
134
+ "public": true
135
+ },
136
+ "metrics": {
137
+ "path": "/metrics",
138
+ "method": "GET",
139
+ "format": "prometheus",
140
+ "public": false
141
+ }
142
+ },
143
+
144
+ "load_balancing": {
145
+ "strategy": "round_robin",
146
+ "health_check_interval_seconds": 30,
147
+ "unhealthy_threshold": 3,
148
+ "healthy_threshold": 2,
149
+ "sticky_sessions": false,
150
+ "session_affinity_ttl_seconds": 3600
151
+ },
152
+
153
+ "caching": {
154
+ "enabled": true,
155
+ "backend": "redis",
156
+ "redis": {
157
+ "host": "localhost",
158
+ "port": 6379,
159
+ "db": 0,
160
+ "password": null,
161
+ "ssl": false,
162
+ "ttl_seconds": 3600,
163
+ "max_connections": 50
164
+ },
165
+ "cache_keys": {
166
+ "prompt_prefix": "helion:prompt:",
167
+ "result_prefix": "helion:result:",
168
+ "metrics_prefix": "helion:metrics:"
169
+ },
170
+ "cache_policies": {
171
+ "identical_prompts": true,
172
+ "similar_prompts": false,
173
+ "max_cache_size_mb": 1024
174
+ }
175
+ },
176
+
177
+ "monitoring": {
178
+ "prometheus": {
179
+ "enabled": true,
180
+ "port": 9090,
181
+ "metrics": [
182
+ "request_count",
183
+ "request_duration_seconds",
184
+ "token_generation_rate",
185
+ "gpu_memory_usage",
186
+ "active_requests",
187
+ "queue_size"
188
+ ]
189
+ },
190
+ "logging": {
191
+ "format": "json",
192
+ "output": "stdout",
193
+ "level": "INFO",
194
+ "include_request_body": false,
195
+ "include_response_body": false,
196
+ "log_rotation": {
197
+ "enabled": true,
198
+ "max_size_mb": 100,
199
+ "max_files": 10
200
+ }
201
+ },
202
+ "tracing": {
203
+ "enabled": false,
204
+ "backend": "jaeger",
205
+ "sampling_rate": 0.1
206
+ }
207
+ },
208
+
209
+ "security": {
210
+ "authentication": {
211
+ "enabled": true,
212
+ "type": "api_key",
213
+ "api_key_header": "X-API-Key",
214
+ "rate_limiting": true
215
+ },
216
+ "rate_limiting": {
217
+ "enabled": true,
218
+ "requests_per_minute": 100,
219
+ "requests_per_hour": 5000,
220
+ "burst_size": 10,
221
+ "strategy": "sliding_window"
222
+ },
223
+ "input_validation": {
224
+ "max_prompt_length": 199000,
225
+ "max_image_size_mb": 20,
226
+ "max_images_per_request": 10,
227
+ "allowed_image_formats": ["jpg", "jpeg", "png", "webp"],
228
+ "sanitize_inputs": true
229
+ },
230
+ "output_filtering": {
231
+ "enabled": true,
232
+ "pii_detection": true,
233
+ "toxicity_filtering": true,
234
+ "content_policy_enforcement": true
235
+ }
236
+ },
237
+
238
+ "resource_management": {
239
+ "gpu": {
240
+ "memory_fraction": 0.95,
241
+ "allow_growth": true,
242
+ "per_process_gpu_memory_fraction": 0.9,
243
+ "visible_devices": "0",
244
+ "multi_gpu_strategy": "model_parallel"
245
+ },
246
+ "cpu": {
247
+ "num_threads": 8,
248
+ "num_workers": 4,
249
+ "affinity_enabled": false
250
+ },
251
+ "memory": {
252
+ "max_memory_gb": 64,
253
+ "swap_enabled": false,
254
+ "oom_handling": "graceful_degradation"
255
+ }
256
+ },
257
+
258
+ "auto_scaling": {
259
+ "enabled": false,
260
+ "min_replicas": 1,
261
+ "max_replicas": 10,
262
+ "target_gpu_utilization": 0.7,
263
+ "target_request_rate": 50,
264
+ "scale_up_threshold": 0.8,
265
+ "scale_down_threshold": 0.3,
266
+ "cooldown_period_seconds": 300
267
+ },
268
+
269
+ "backup_and_recovery": {
270
+ "checkpoint_enabled": false,
271
+ "checkpoint_interval_hours": 24,
272
+ "checkpoint_path": "/data/checkpoints",
273
+ "max_checkpoints": 5,
274
+ "recovery_strategy": "latest_checkpoint"
275
+ },
276
+
277
+ "experimental_features": {
278
+ "speculative_decoding": false,
279
+ "continuous_batching": true,
280
+ "dynamic_batching": true,
281
+ "model_compilation": false,
282
+ "mixed_precision": true,
283
+ "gradient_checkpointing": false
284
+ },
285
+
286
+ "model_serving_options": {
287
+ "triton_inference_server": {
288
+ "enabled": false,
289
+ "model_repository": "/models",
290
+ "backend": "python",
291
+ "max_batch_size": 8,
292
+ "dynamic_batching": true
293
+ },
294
+ "torchserve": {
295
+ "enabled": false,
296
+ "model_store": "/model_store",
297
+ "batch_size": 4,
298
+ "workers": 2
299
+ },
300
+ "ray_serve": {
301
+ "enabled": false,
302
+ "num_replicas": 2,
303
+ "max_concurrent_queries": 16
304
+ }
305
+ },
306
+
307
+ "cloud_deployment": {
308
+ "aws": {
309
+ "instance_type": "p4d.24xlarge",
310
+ "region": "us-east-1",
311
+ "use_spot_instances": false,
312
+ "s3_model_path": "s3://your-bucket/models/helion-v2-thinking"
313
+ },
314
+ "gcp": {
315
+ "machine_type": "a2-highgpu-8g",
316
+ "region": "us-central1",
317
+ "preemptible": false,
318
+ "gcs_model_path": "gs://your-bucket/models/helion-v2-thinking"
319
+ },
320
+ "azure": {
321
+ "vm_size": "Standard_NC96ads_A100_v4",
322
+ "region": "eastus",
323
+ "spot_instance": false,
324
+ "blob_model_path": "https://your-storage.blob.core.windows.net/models/helion-v2-thinking"
325
+ }
326
+ }
327
+ }