# models_config.py
# Optimized for NVIDIA T4 Medium (16GB VRAM) with 4-bit quantization
# UPDATED: Local models only - no API fallback

LLM_CONFIG = {
    "primary_provider": "local",
    "models": {
        "reasoning_primary": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Single primary model for all text tasks
            "task": "general_reasoning",
            "max_tokens": 8000,  # Reduced from 10000
            "temperature": 0.7,
            "fallback": None,  # Fallback is handled in code if needed
            "is_chat_model": True,
            "use_4bit_quantization": True,  # Enable 4-bit quantization for the 16GB T4
            "use_8bit_quantization": False,
        },
        "embedding_specialist": {
            "model_id": "intfloat/e5-large-v2",  # 1024-dim embeddings for semantic similarity
            "task": "embeddings",
            "vector_dimensions": 1024,
            "purpose": "semantic_similarity",
            "is_chat_model": False,
        },
        "classification_specialist": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Reuses the primary model
            "task": "intent_classification",
            "max_length": 512,
            "specialization": "fast_inference",
            "latency_target": "<100ms",
            "is_chat_model": True,
            "use_4bit_quantization": True,
        },
        "safety_checker": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Reuses the primary model
            "task": "content_moderation",
            "confidence_threshold": 0.85,
            "purpose": "bias_detection",
            "is_chat_model": True,
            "use_4bit_quantization": True,
        },
    },
    "routing_logic": {
        "strategy": "task_based_routing",
        "fallback_chain": ["primary"],  # No API fallback
        "load_balancing": "single_model_reuse",
    },
    "quantization_settings": {
        "default_4bit": True,  # Enable 4-bit quantization by default for the T4's 16GB
        "default_8bit": False,
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
    },
}
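

# --- Usage sketch (illustrative, not part of the original config) ---
# A minimal example of how the quantization_settings above could be mapped
# onto a transformers/bitsandbytes load, assuming `torch`, `transformers`,
# and `bitsandbytes` are installed and a CUDA device is available. The
# helper name `load_model_for_task` is hypothetical; only LLM_CONFIG above
# comes from this file.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def load_model_for_task(task_key: str):
    """Load the chat model configured under LLM_CONFIG['models'][task_key]."""
    model_cfg = LLM_CONFIG["models"][task_key]
    quant_cfg = LLM_CONFIG["quantization_settings"]

    if not model_cfg.get("is_chat_model"):
        # The embedding_specialist entry is not a causal LM; it would need
        # a different loader (see the embedding sketch below).
        raise ValueError(f"{task_key} is not a chat model")

    # Translate the config's quantization_settings into a BitsAndBytesConfig.
    bnb_config = None
    if model_cfg.get("use_4bit_quantization"):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=getattr(torch, quant_cfg["bnb_4bit_compute_dtype"]),
            bnb_4bit_use_double_quant=quant_cfg["bnb_4bit_use_double_quant"],
            bnb_4bit_quant_type=quant_cfg["bnb_4bit_quant_type"],
        )

    tokenizer = AutoTokenizer.from_pretrained(model_cfg["model_id"])
    model = AutoModelForCausalLM.from_pretrained(
        model_cfg["model_id"],
        quantization_config=bnb_config,
        device_map="auto",  # place the quantized model on the T4
    )
    return tokenizer, model


# Note: reasoning_primary, classification_specialist, and safety_checker all
# share one model_id, so a cache keyed by model_id would load
# Qwen2.5-7B-Instruct once and reuse it, consistent with
# "load_balancing": "single_model_reuse".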
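

# --- Embedding sketch (illustrative, not part of the original config) ---
# e5 models expect "query: " / "passage: " prefixes on their inputs. The
# sentence-transformers dependency and the `embed_texts` helper name are
# assumptions of this sketch, not requirements of the config above.

from sentence_transformers import SentenceTransformer


def embed_texts(queries: list[str], passages: list[str]):
    """Embed texts with the configured 1024-dim e5-large-v2 encoder."""
    cfg = LLM_CONFIG["models"]["embedding_specialist"]
    encoder = SentenceTransformer(cfg["model_id"])
    query_vecs = encoder.encode([f"query: {q}" for q in queries])
    passage_vecs = encoder.encode([f"passage: {p}" for p in passages])
    assert query_vecs.shape[1] == cfg["vector_dimensions"]  # 1024
    return query_vecs, passage_vecs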