# models_config.py
# Optimized for NVIDIA T4 Medium (16GB VRAM) with 4-bit quantization.
# Note: the ":cerebras" suffix on model IDs selects the Cerebras provider via
# Hugging Face's hosted inference routing, where the quantization flags below
# do not apply; they matter only when a model is loaded locally on the T4
# (see the example consumers sketched at the bottom of this file).
LLM_CONFIG = {
"primary_provider": "huggingface",
"models": {
"reasoning_primary": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras", # Cerebras deployment
"task": "general_reasoning",
"max_tokens": 10000,
"temperature": 0.7,
"cost_per_token": 0.000015,
"fallback": "Qwen/Qwen2.5-7B-Instruct", # Fallback to Qwen if Llama unavailable
"is_chat_model": True,
"use_4bit_quantization": True, # Enable 4-bit quantization for 16GB T4
"use_8bit_quantization": False
},
"embedding_specialist": {
"model_id": "intfloat/e5-large-v2", # Upgraded: 1024-dim embeddings (vs 384), much better semantic understanding
"task": "embeddings",
"vector_dimensions": 1024,
"purpose": "semantic_similarity",
"cost_advantage": "90%_cheaper_than_primary",
"is_chat_model": False
},
"classification_specialist": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras", # Cerebras deployment for classification
"task": "intent_classification",
"max_length": 512,
"specialization": "fast_inference",
"latency_target": "<100ms",
"is_chat_model": True,
"use_4bit_quantization": True
},
"safety_checker": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras", # Cerebras deployment for safety
"task": "content_moderation",
"confidence_threshold": 0.85,
"purpose": "bias_detection",
"is_chat_model": True,
"use_4bit_quantization": True
}
},
"routing_logic": {
"strategy": "task_based_routing",
"fallback_chain": ["primary", "fallback", "degraded_mode"],
"load_balancing": "round_robin_with_health_check"
},
"quantization_settings": {
"default_4bit": True, # Enable 4-bit quantization by default for T4 16GB
"default_8bit": False,
"bnb_4bit_compute_dtype": "float16",
"bnb_4bit_use_double_quant": True,
"bnb_4bit_quant_type": "nf4"
}
}
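

# ---------------------------------------------------------------------------
# Example consumers (illustrative sketch only; nothing below is referenced
# elsewhere, and the helper names build_quantization_config, load_local_model,
# and embed_texts are hypothetical). They show one way the config above could
# be consumed: quantization_settings maps onto a transformers
# BitsAndBytesConfig, and each model's "fallback" field drives a simple retry
# when the primary repository is gated or otherwise unavailable. Assumes
# torch, transformers, bitsandbytes, accelerate, and sentence-transformers
# are installed.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def build_quantization_config(settings: dict) -> BitsAndBytesConfig | None:
    """Translate quantization_settings into a BitsAndBytesConfig (local loads only)."""
    if settings.get("default_4bit"):
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=getattr(torch, settings["bnb_4bit_compute_dtype"]),
            bnb_4bit_use_double_quant=settings["bnb_4bit_use_double_quant"],
            bnb_4bit_quant_type=settings["bnb_4bit_quant_type"],
        )
    if settings.get("default_8bit"):
        return BitsAndBytesConfig(load_in_8bit=True)
    return None


def load_local_model(role: str):
    """Load the model configured for `role` locally, walking the fallback
    chain when a repository is gated (e.g. meta-llama requires approval)."""
    entry = LLM_CONFIG["models"][role]
    quant = build_quantization_config(LLM_CONFIG["quantization_settings"])
    # The ":provider" suffix is hosted-routing syntax; strip it for local loads.
    candidates = [entry["model_id"].split(":")[0], entry.get("fallback")]
    last_err = None
    for repo in filter(None, candidates):
        try:
            tokenizer = AutoTokenizer.from_pretrained(repo)
            model = AutoModelForCausalLM.from_pretrained(
                repo, quantization_config=quant, device_map="auto"
            )
            return tokenizer, model
        except OSError as err:  # transformers surfaces gated/missing repos as OSError
            last_err = err
    raise RuntimeError(f"No loadable model for role {role!r}") from last_err


def embed_texts(texts: list[str]):
    """Embed passages with the 1024-dim e5 model configured above."""
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(LLM_CONFIG["models"]["embedding_specialist"]["model_id"])
    # e5 models expect "query: " / "passage: " input prefixes (see model card).
    return model.encode([f"passage: {t}" for t in texts], normalize_embeddings=True)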