# Multi-stage build for DeepXR/Helion-2.5-Rnd
# Optimized for production inference with vLLM

# Stage 1: Base image with CUDA and Python
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8

# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip and install build tools
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel

# Stage 2: Install Python dependencies
FROM base AS builder

WORKDIR /build

# Install PyTorch with CUDA 12.1 support
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121

# Install vLLM and core dependencies
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4

# Install additional ML libraries
# (extras like [standard] are quoted so the shell does not treat them as globs)
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    "uvicorn[standard]==0.29.0" \
    aiohttp==3.9.3 \
    "ray[default]==2.10.0"

# Install monitoring and optimization tools
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0

# Stage 3: Final runtime image
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Set runtime environment variables
# (ENV does not carry over between stages, so these are declared again here)
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1

# Install runtime dependencies only
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy Python packages from the builder stage
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Create application directory
WORKDIR /app

# Create necessary directories
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache

# Copy inference code and configuration
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/

# Set permissions
RUN chmod +x /app/inference/*.py

# Create non-root user for security
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models

USER helion

# Health check (shell form, so ${PORT} is expanded at run time)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1

# Expose ports
EXPOSE 8000 8001 8002

# Default command. Shell form is required here: the JSON (exec) form of CMD
# does not perform environment-variable substitution, so ${MODEL_PATH} and
# friends would have been passed to the server as literal strings.
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"

# Labels
LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"
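
# Example usage (a sketch; the image tag, host weight directory, and GPU
# count below are illustrative assumptions, not defined by this file).
# TENSOR_PARALLEL_SIZE=2 implies at least two visible GPUs, and the model
# weights are expected at ${MODEL_PATH} (/models/helion) inside the container:
#
#   docker build -t helion-2.5-rnd .
#   docker run --gpus all \
#     -p 8000:8000 \
#     -v /path/to/helion-weights:/models/helion \
#     -e TENSOR_PARALLEL_SIZE=2 \
#     helion-2.5-rnd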