from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import subprocess
import signal
import os
import requests
import time
from typing import Optional

app = FastAPI()

# Predefined list of available models (TheBloke only - verified, fits 18GB Space)
AVAILABLE_MODELS = {
    # === General Purpose (Default) ===
    "deepseek-chat": "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
    # === Financial & Summarization Models ===
    "mistral-7b": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF:mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    "openhermes-7b": "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF:openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    # === Coding Models ===
    "deepseek-coder": "TheBloke/deepseek-coder-6.7B-instruct-GGUF:deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    # === Lightweight/Fast ===
    "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
}

# Global state
current_model = "deepseek-chat"  # Default model
llama_process: Optional[subprocess.Popen] = None
LLAMA_SERVER_PORT = 8080
LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"


class ModelSwitchRequest(BaseModel):
    model_name: str


class ChatCompletionRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7


def start_llama_server(model_id: str) -> subprocess.Popen:
    """Start llama-server with specified model (optimized for speed)."""
    cmd = [
        "llama-server",
        "-hf", model_id,
        "--host", "0.0.0.0",
        "--port", str(LLAMA_SERVER_PORT),
        "-c", "2048",        # Context size
        "-t", "4",           # CPU threads (adjust based on cores)
        "-ngl", "0",         # GPU layers (0 for CPU-only)
        "--cont-batching",   # Enable continuous batching for speed
        "-b", "512",         # Batch size
    ]

    print(f"Starting llama-server with model: {model_id}")
    print("This may take 2-3 minutes to download and load the model...")

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        preexec_fn=os.setsid if os.name != 'nt' else None,
        text=True,
        bufsize=1
    )

    # Wait for server to be ready (increased timeout for model download)
    max_retries = 300  # 5 minutes
    for i in range(max_retries):
        # Check if process died
        if process.poll() is not None:
            stdout, _ = process.communicate()
            print(f"llama-server exited with code {process.returncode}")
            print(f"Output: {stdout}")
            raise RuntimeError("llama-server process died")

        try:
            # Try root endpoint instead of /health
            response = requests.get(f"{LLAMA_SERVER_URL}/", timeout=2)
            if response.status_code in [200, 404]:  # 404 is ok, means server is up
                print(f"llama-server ready after {i+1} seconds")
                return process
        except requests.exceptions.ConnectionError:
            # Server not ready yet
            pass
        except Exception:
            # Other errors, keep waiting
            pass

        time.sleep(1)

    raise RuntimeError("llama-server failed to start within 5 minutes")


def stop_llama_server():
    """Stop the running llama-server."""
    global llama_process
    if llama_process:
        print("Stopping llama-server...")
        try:
            if os.name != 'nt':
                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
            else:
                llama_process.terminate()
            llama_process.wait(timeout=10)
        except Exception:
            if os.name != 'nt':
                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
            else:
                llama_process.kill()
        llama_process = None
        time.sleep(2)  # Give it time to fully shut down


@app.on_event("startup")
async def startup_event():
    """Start with default model."""
    global llama_process
    model_id = AVAILABLE_MODELS[current_model]
    llama_process = start_llama_server(model_id)


@app.on_event("shutdown")
async def shutdown_event():
    """Clean shutdown."""
    stop_llama_server()


@app.get("/")
async def root():
    return {
        "status": "DeepSeek API with dynamic model switching",
        "current_model": current_model,
        "available_models": list(AVAILABLE_MODELS.keys())
    }


@app.get("/models")
async def list_models():
    """List all available models."""
    return {
        "current_model": current_model,
        "available_models": list(AVAILABLE_MODELS.keys())
    }


@app.post("/switch-model")
async def switch_model(request: ModelSwitchRequest):
    """Switch to a different model."""
    global current_model, llama_process

    if request.model_name not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{request.model_name}' not found. Available: {list(AVAILABLE_MODELS.keys())}"
        )

    if request.model_name == current_model:
        return {"message": f"Already using model: {current_model}"}

    # Stop current server
    stop_llama_server()

    # Start with new model
    model_id = AVAILABLE_MODELS[request.model_name]
    llama_process = start_llama_server(model_id)
    current_model = request.model_name

    return {
        "message": f"Switched to model: {current_model}",
        "model": current_model
    }


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint."""
    try:
        # Forward to llama-server
        response = requests.post(
            f"{LLAMA_SERVER_URL}/v1/chat/completions",
            json={
                "messages": request.messages,
                "max_tokens": request.max_tokens,
                "temperature": request.temperature,
            },
            timeout=300
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")