import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, List
import uvicorn
import os

# Setup Intel optimizations
os.environ["KMP_BLOCKTIME"] = "1"
os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
os.environ["OMP_NUM_THREADS"] = "16"

app = FastAPI(title="NSW-1 Inference API", version="1.0.0")

# Global model and tokenizer
model = None
tokenizer = None


class GenerateRequest(BaseModel):
    prompt: str
    max_length: Optional[int] = 200
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    top_k: Optional[int] = 50
    num_return_sequences: Optional[int] = 1


class GenerateResponse(BaseModel):
    generated_text: List[str]
    model: str = "Opentrouter-ai/NSW-1"


class HealthResponse(BaseModel):
    status: str
    model_loaded: bool


@app.on_event("startup")
async def load_model():
    """Load model and tokenizer on startup"""
    global model, tokenizer

    model_path = os.getenv("MODEL_PATH", "Opentrouter-ai/NSW-1")
    print(f"Loading model from {model_path}...")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Causal LM tokenizers often ship without a pad token; fall back to EOS
    # so that padding in the /generate endpoint does not raise an error.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True
    )

    # Apply Intel optimizations
    model = ipex.optimize(model, dtype=torch.float32)
    model.eval()
    print("Model loaded successfully!")


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "model_loaded": model is not None
    }


@app.post("/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    """Generate text from prompt"""
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        # Tokenize input
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt",
            padding=True,
            truncation=True
        )

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=request.max_length,
                temperature=request.temperature,
                top_p=request.top_p,
                top_k=request.top_k,
                num_return_sequences=request.num_return_sequences,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode outputs
        generated_texts = [
            tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

        return {
            "generated_text": generated_texts
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/finance-analysis")
async def finance_analysis(request: GenerateRequest):
    """Specialized endpoint for finance queries"""
    finance_prompt = f"Financial Analysis:\n{request.prompt}\n\nAnalysis:"
    request.prompt = finance_prompt
    return await generate(request)


@app.post("/safety-check")
async def safety_check(request: GenerateRequest):
    """Check content safety"""
    safety_prompt = f"Content Safety Analysis:\nContent: {request.prompt}\n\nSafety Assessment:"
    request.prompt = safety_prompt
    request.max_length = 150
    return await generate(request)


if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        workers=1  # Use single worker for CPU inference
    )
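
# Example usage (a minimal sketch, assuming the server above is running and
# reachable at localhost:8000; the prompt text is illustrative only, and the
# /finance-analysis and /safety-check endpoints accept the same JSON body):
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Summarize the outlook for small-cap equities.", "max_length": 200}'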