# models_config.py
# Optimized for NVIDIA T4 Medium (16GB VRAM) with 4-bit quantization.
# Note: the ":cerebras" suffix on model IDs selects the Cerebras provider via
# Hugging Face's hosted inference routing, where the quantization flags below
# do not apply; they matter only when a model is loaded locally on the T4
# (see the example consumers sketched at the bottom of this file).
LLM_CONFIG = {
"primary_provider": "huggingface",
"models": {
"reasoning_primary": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras", # Cerebras deployment
"task": "general_reasoning",
"max_tokens": 10000,
"temperature": 0.7,
"cost_per_token": 0.000015,
"fallback": "Qwen/Qwen2.5-7B-Instruct", # Fallback to Qwen if Llama unavailable
"is_chat_model": True,
"use_4bit_quantization": True, # Enable 4-bit quantization for 16GB T4
"use_8bit_quantization": False
},
"embedding_specialist": {
"model_id": "intfloat/e5-large-v2", # Upgraded: 1024-dim embeddings (vs 384), much better semantic understanding
"task": "embeddings",
"vector_dimensions": 1024,
"purpose": "semantic_similarity",
"cost_advantage": "90%_cheaper_than_primary",
"is_chat_model": False
},
"classification_specialist": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras", # Cerebras deployment for classification
"task": "intent_classification",
"max_length": 512,
"specialization": "fast_inference",
"latency_target": "<100ms",
"is_chat_model": True,
"use_4bit_quantization": True
},
"safety_checker": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras", # Cerebras deployment for safety
"task": "content_moderation",
"confidence_threshold": 0.85,
"purpose": "bias_detection",
"is_chat_model": True,
"use_4bit_quantization": True
}
},
"routing_logic": {
"strategy": "task_based_routing",
"fallback_chain": ["primary", "fallback", "degraded_mode"],
"load_balancing": "round_robin_with_health_check"
},
"quantization_settings": {
"default_4bit": True, # Enable 4-bit quantization by default for T4 16GB
"default_8bit": False,
"bnb_4bit_compute_dtype": "float16",
"bnb_4bit_use_double_quant": True,
"bnb_4bit_quant_type": "nf4"
}
}
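

# ---------------------------------------------------------------------------
# Example consumers (illustrative sketch only; nothing below is referenced
# elsewhere, and the helper names build_quantization_config, load_local_model,
# and embed_texts are hypothetical). They show one way the config above could
# be consumed: quantization_settings maps onto a transformers
# BitsAndBytesConfig, and each model's "fallback" field drives a simple retry
# when the primary repository is gated or otherwise unavailable. Assumes
# torch, transformers, bitsandbytes, accelerate, and sentence-transformers
# are installed.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def build_quantization_config(settings: dict) -> BitsAndBytesConfig | None:
    """Translate quantization_settings into a BitsAndBytesConfig (local loads only)."""
    if settings.get("default_4bit"):
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=getattr(torch, settings["bnb_4bit_compute_dtype"]),
            bnb_4bit_use_double_quant=settings["bnb_4bit_use_double_quant"],
            bnb_4bit_quant_type=settings["bnb_4bit_quant_type"],
        )
    if settings.get("default_8bit"):
        return BitsAndBytesConfig(load_in_8bit=True)
    return None


def load_local_model(role: str):
    """Load the model configured for `role` locally, walking the fallback
    chain when a repository is gated (e.g. meta-llama requires approval)."""
    entry = LLM_CONFIG["models"][role]
    quant = build_quantization_config(LLM_CONFIG["quantization_settings"])
    # The ":provider" suffix is hosted-routing syntax; strip it for local loads.
    candidates = [entry["model_id"].split(":")[0], entry.get("fallback")]
    last_err = None
    for repo in filter(None, candidates):
        try:
            tokenizer = AutoTokenizer.from_pretrained(repo)
            model = AutoModelForCausalLM.from_pretrained(
                repo, quantization_config=quant, device_map="auto"
            )
            return tokenizer, model
        except OSError as err:  # transformers surfaces gated/missing repos as OSError
            last_err = err
    raise RuntimeError(f"No loadable model for role {role!r}") from last_err


def embed_texts(texts: list[str]):
    """Embed passages with the 1024-dim e5 model configured above."""
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(LLM_CONFIG["models"]["embedding_specialist"]["model_id"])
    # e5 models expect "query: " / "passage: " input prefixes (see model card).
    return model.encode([f"passage: {t}" for t in texts], normalize_embeddings=True)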