# models_config.py
# Optimized for NVIDIA T4 Medium (16GB VRAM) with 4-bit quantization
# UPDATED: Local models only - no API fallback

LLM_CONFIG = {
    "primary_provider": "local",
    "models": {
        "reasoning_primary": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Single primary model for all text tasks
            "task": "general_reasoning",
            "max_tokens": 8000,  # Reduced from 10000
            "temperature": 0.7,
            "fallback": None,  # Fallback is handled in code if needed
            "is_chat_model": True,
            "use_4bit_quantization": True,  # Enable 4-bit quantization for the 16GB T4
            "use_8bit_quantization": False,
        },
        "embedding_specialist": {
            "model_id": "intfloat/e5-large-v2",  # 1024-dim embeddings for semantic similarity
            "task": "embeddings",
            "vector_dimensions": 1024,
            "purpose": "semantic_similarity",
            "is_chat_model": False,
        },
        "classification_specialist": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Reuses the primary model
            "task": "intent_classification",
            "max_length": 512,
            "specialization": "fast_inference",
            "latency_target": "<100ms",
            "is_chat_model": True,
            "use_4bit_quantization": True,
        },
        "safety_checker": {
            "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Reuses the primary model
            "task": "content_moderation",
            "confidence_threshold": 0.85,
            "purpose": "bias_detection",
            "is_chat_model": True,
            "use_4bit_quantization": True,
        },
    },
    "routing_logic": {
        "strategy": "task_based_routing",
        "fallback_chain": ["primary"],  # No API fallback
        "load_balancing": "single_model_reuse",
    },
    "quantization_settings": {
        "default_4bit": True,  # Enable 4-bit quantization by default for the T4's 16GB
        "default_8bit": False,
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
    },
}
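

# --- Usage sketch (illustrative, not part of the original config) ---
# A minimal example of how the quantization_settings above could be mapped
# onto a transformers/bitsandbytes load, assuming `torch`, `transformers`,
# and `bitsandbytes` are installed and a CUDA device is available. The
# helper name `load_model_for_task` is hypothetical; only LLM_CONFIG above
# comes from this file.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def load_model_for_task(task_key: str):
    """Load the chat model configured under LLM_CONFIG['models'][task_key]."""
    model_cfg = LLM_CONFIG["models"][task_key]
    quant_cfg = LLM_CONFIG["quantization_settings"]

    if not model_cfg.get("is_chat_model"):
        # The embedding_specialist entry is not a causal LM; it would need
        # a different loader (see the embedding sketch below).
        raise ValueError(f"{task_key} is not a chat model")

    # Translate the config's quantization_settings into a BitsAndBytesConfig.
    bnb_config = None
    if model_cfg.get("use_4bit_quantization"):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=getattr(torch, quant_cfg["bnb_4bit_compute_dtype"]),
            bnb_4bit_use_double_quant=quant_cfg["bnb_4bit_use_double_quant"],
            bnb_4bit_quant_type=quant_cfg["bnb_4bit_quant_type"],
        )

    tokenizer = AutoTokenizer.from_pretrained(model_cfg["model_id"])
    model = AutoModelForCausalLM.from_pretrained(
        model_cfg["model_id"],
        quantization_config=bnb_config,
        device_map="auto",  # place the quantized model on the T4
    )
    return tokenizer, model


# Note: reasoning_primary, classification_specialist, and safety_checker all
# share one model_id, so a cache keyed by model_id would load
# Qwen2.5-7B-Instruct once and reuse it, consistent with
# "load_balancing": "single_model_reuse".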
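

# --- Embedding sketch (illustrative, not part of the original config) ---
# e5 models expect "query: " / "passage: " prefixes on their inputs. The
# sentence-transformers dependency and the `embed_texts` helper name are
# assumptions of this sketch, not requirements of the config above.

from sentence_transformers import SentenceTransformer


def embed_texts(queries: list[str], passages: list[str]):
    """Embed texts with the configured 1024-dim e5-large-v2 encoder."""
    cfg = LLM_CONFIG["models"]["embedding_specialist"]
    encoder = SentenceTransformer(cfg["model_id"])
    query_vecs = encoder.encode([f"query: {q}" for q in queries])
    passage_vecs = encoder.encode([f"passage: {p}" for p in passages])
    assert query_vecs.shape[1] == cfg["vector_dimensions"]  # 1024
    return query_vecs, passage_vecs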