import os
import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel
import uvicorn

# Configuration
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

# Create model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Download the model if it doesn't exist
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")
# Initialize FastAPI
app = FastAPI(
    title="DeepSeek-R1 OpenAI-Compatible API",
    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
    version="1.0.0"
)

# CORS Configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,       # context window size
        n_threads=4,      # CPU threads used for inference
        n_gpu_layers=0,   # CPU-only inference
        verbose=False
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")
# Root endpoint with documentation
@app.get("/", response_class=HTMLResponse)
async def root():
    # SPACE_HOST is set by Hugging Face Spaces and holds the bare hostname (no scheme)
    space_host = os.environ.get("SPACE_HOST")
    base_url = f"https://{space_host}" if space_host else "http://localhost:7860"
    return f"""
    <html>
    <head>
        <title>DeepSeek-R1 OpenAI API</title>
        <style>
            body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
            .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
            a {{ color: #007bff; text-decoration: none; }}
            code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
        </style>
    </head>
    <body>
        <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
        <div class="warning">
            <h3>⚠️ Important Notice</h3>
            <p>For private use, please duplicate this Space:<br>
            1. Click your profile picture in the top-right<br>
            2. Select "Duplicate Space"<br>
            3. Set visibility to Private</p>
        </div>
        <h2>API Documentation</h2>
        <ul>
            <li><a href="/docs">Interactive Swagger Documentation</a></li>
            <li><a href="/redoc">ReDoc Documentation</a></li>
        </ul>
        <h2>API Endpoints</h2>
        <h3>Chat Completion</h3>
        <p><code>POST /v1/chat/completions</code></p>
        <p>Parameters:</p>
        <ul>
            <li><strong>messages</strong>: List of message objects</li>
            <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
            <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
            <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
        </ul>
        <h2>Example Request</h2>
        <pre>
curl -X POST "{base_url}/v1/chat/completions" \\
     -H "Content-Type: application/json" \\
     -d '{{
           "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
           "max_tokens": 150
         }}'
        </pre>
    </body>
    </html>
    """
# OpenAI-Compatible Request Schema
class ChatCompletionRequest(BaseModel):
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    messages: list[dict]
    max_tokens: int = 128
    temperature: float = 0.7
    top_p: float = 0.9
    stream: bool = False

# OpenAI-Compatible Response Schema
class ChatCompletionResponse(BaseModel):
    id: str = "chatcmpl-12345"
    object: str = "chat.completion"
    created: int = 1693161600
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    choices: list[dict]
    usage: dict
# Chat completion endpoint (OpenAI-compatible)
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
    try:
        # Flatten the chat history into a simple role-prefixed prompt
        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
        prompt += "\nassistant:"
        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=["</s>"]
        )
        return ChatCompletionResponse(
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response['choices'][0]['text'].strip()
                },
                "finish_reason": "stop"
            }],
            # llama-cpp-python already reports OpenAI-style token counts
            usage=response["usage"]
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Health check endpoint
@app.get("/health")
def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
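Since the server exposes the standard /v1/chat/completions route, any OpenAI-compatible client can talk to it once the Space is running. The snippet below is a minimal sketch of such a client call; the Space URL is a placeholder to replace with your own, the api_key value is a dummy because the server above does no authentication, and it assumes the openai Python package (v1 or later) is installed.

from openai import OpenAI

# Placeholder URL: substitute your own Space's hostname.
client = OpenAI(
    base_url="https://your-username-your-space.hf.space/v1",
    api_key="not-needed",  # the server above ignores authentication
)

completion = client.chat.completions.create(
    model="DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "Explain quantum computing"}],
    max_tokens=150,
)
print(completion.choices[0].message.content)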