# Multi-stage build for DeepXR/Helion-2.5-Rnd
# Optimized for production inference with vLLM

# Stage 1: Base image with CUDA and Python
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8

# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip and install build tools
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel

# Stage 2: Install Python dependencies
FROM base AS builder

WORKDIR /build

# Install PyTorch with CUDA 12.1 support
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121

# Install vLLM and core dependencies
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4

# Install additional ML libraries
# (extras like [standard] are quoted so the shell does not treat them as globs)
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    "uvicorn[standard]==0.29.0" \
    aiohttp==3.9.3 \
    "ray[default]==2.10.0"

# Install monitoring and optimization tools
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0

# Stage 3: Final runtime image
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Set runtime environment variables
# (ENV does not carry over between stages, so these are declared again here)
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1

# Install runtime dependencies only
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy Python packages from the builder stage
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Create application directory
WORKDIR /app

# Create necessary directories
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache

# Copy inference code and configuration
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/

# Set permissions
RUN chmod +x /app/inference/*.py

# Create non-root user for security
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models

USER helion

# Health check (shell form, so ${PORT} is expanded at run time)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1

# Expose ports
EXPOSE 8000 8001 8002

# Default command. Shell form is required here: the JSON (exec) form of CMD
# does not perform environment-variable substitution, so ${MODEL_PATH} and
# friends would have been passed to the server as literal strings.
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"

# Labels
LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"
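
# Example usage (a sketch; the image tag, host weight directory, and GPU
# count below are illustrative assumptions, not defined by this file).
# TENSOR_PARALLEL_SIZE=2 implies at least two visible GPUs, and the model
# weights are expected at ${MODEL_PATH} (/models/helion) inside the container:
#
#   docker build -t helion-2.5-rnd .
#   docker run --gpus all \
#     -p 8000:8000 \
#     -v /path/to/helion-weights:/models/helion \
#     -e TENSOR_PARALLEL_SIZE=2 \
#     helion-2.5-rnd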