diff --git a/README.md b/README.md index 6ea5141..bca2e16 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ These services require building custom Docker images from source. | [Clash](./src/clash) | 1.18.0 | | [ClickHouse](./src/clickhouse) | 24.11.1 | | [Conductor](./src/conductor) | latest | +| [Convex](./src/convex) | 33cef775 | | [DeepTutor](./apps/deeptutor) | latest | | [Dify](./apps/dify) | 0.18.2 | | [DNSMasq](./src/dnsmasq) | 2.91 | @@ -77,6 +78,7 @@ These services require building custom Docker images from source. | [LibreOffice](./src/libreoffice) | latest | | [libSQL Server](./src/libsql) | latest | | [LiteLLM](./src/litellm) | main-stable | +| [llama-swap](./src/llama-swap) | cpu | | [llama.cpp](./src/llama.cpp) | server | | [LMDeploy](./src/lmdeploy) | v0.11.1 | | [Logstash](./src/logstash) | 8.16.1 | diff --git a/README.zh.md b/README.zh.md index 75fbba1..82d1d94 100644 --- a/README.zh.md +++ b/README.zh.md @@ -37,6 +37,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [Clash](./src/clash) | 1.18.0 | | [ClickHouse](./src/clickhouse) | 24.11.1 | | [Conductor](./src/conductor) | latest | +| [Convex](./src/convex) | 33cef775 | | [DeepTutor](./apps/deeptutor) | latest | | [Dify](./apps/dify) | 0.18.2 | | [DNSMasq](./src/dnsmasq) | 2.91 | @@ -77,6 +78,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [LibreOffice](./src/libreoffice) | latest | | [libSQL Server](./src/libsql) | latest | | [LiteLLM](./src/litellm) | main-stable | +| [llama-swap](./src/llama-swap) | cpu | | [llama.cpp](./src/llama.cpp) | server | | [LMDeploy](./src/lmdeploy) | v0.11.1 | | [Logstash](./src/logstash) | 8.16.1 | diff --git a/apps/opik/docker-compose.yaml b/apps/opik/docker-compose.yaml index 5313df5..6c55b24 100644 --- a/apps/opik/docker-compose.yaml +++ b/apps/opik/docker-compose.yaml @@ -182,6 +182,7 @@ services: minio-init: <<: *defaults image: ${GLOBAL_REGISTRY:-}minio/mc:${MINIO_MC_VERSION:-RELEASE.2025-03-12T17-29-24Z} + restart: on-failure depends_on: 
minio: condition: service_healthy diff --git a/builds/mineru/Dockerfile b/builds/mineru/Dockerfile index 560aa73..5cbd953 100644 --- a/builds/mineru/Dockerfile +++ b/builds/mineru/Dockerfile @@ -1,7 +1,6 @@ # Use the official vllm image for gpu with Ampere、Ada Lovelace、Hopper architecture (8.0 <= Compute Capability <= 9.0) # Compute Capability version query (https://developer.nvidia.com/cuda-gpus) -# only support x86_64 architecture -FROM vllm/vllm-openai:v0.10.1.1 +FROM vllm/vllm-openai:v0.10.2 # Use the official vllm image for gpu with Volta、Turing、Blackwell architecture (7.0 < Compute Capability < 8.0 or Compute Capability >= 10.0) # support x86_64 architecture and ARM(AArch64) architecture diff --git a/builds/mineru/china.Dockerfile b/builds/mineru/china.Dockerfile new file mode 100644 index 0000000..c00db1c --- /dev/null +++ b/builds/mineru/china.Dockerfile @@ -0,0 +1,28 @@ +# Use DaoCloud mirrored vllm image for China region for gpu with Ampere、Ada Lovelace、Hopper architecture (8.0 <= Compute Capability <= 9.0) +# Compute Capability version query (https://developer.nvidia.com/cuda-gpus) +FROM docker.m.daocloud.io/vllm/vllm-openai:v0.10.2 + +# Use DaoCloud mirrored vllm image for China region for gpu with Volta、Turing、Blackwell architecture (7.0 < Compute Capability < 8.0 or Compute Capability >= 10.0) +# support x86_64 architecture and ARM(AArch64) architecture +# FROM docker.m.daocloud.io/vllm/vllm-openai:v0.11.0 + +# Install libgl for opencv support & Noto fonts for Chinese characters +RUN apt-get update && \ + apt-get install -y \ + fonts-noto-core \ + fonts-noto-cjk \ + fontconfig \ + libgl1 && \ + fc-cache -fv && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install mineru latest +RUN python3 -m pip install -U 'mineru[core]>=2.7.0' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages && \ + python3 -m pip cache purge + +# Download models and update the configuration file +RUN /bin/bash -c "mineru-models-download -s modelscope -m 
all" + +# Set the entry point to activate the virtual environment and run the command line tool +ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"] diff --git a/src/convex/.env.example b/src/convex/.env.example new file mode 100644 index 0000000..e9adf8d --- /dev/null +++ b/src/convex/.env.example @@ -0,0 +1,158 @@ +# Convex Configuration + +# ============================================================================= +# Versions +# ============================================================================= +CONVEX_BACKEND_VERSION=33cef775a8a6228cbacee4a09ac2c4073d62ed13 +CONVEX_DASHBOARD_VERSION=33cef775a8a6228cbacee4a09ac2c4073d62ed13 +POSTGRES_VERSION=17-alpine + +# ============================================================================= +# Port Configuration +# ============================================================================= +CONVEX_BACKEND_PORT_OVERRIDE=3210 +CONVEX_SITE_PROXY_PORT_OVERRIDE=3211 +CONVEX_DASHBOARD_PORT_OVERRIDE=6791 + +# ============================================================================= +# Instance Configuration +# ============================================================================= +# Name of your Convex instance +INSTANCE_NAME=convex-self-hosted + +# Secret key for instance authentication (generate a strong random string) +# Example: openssl rand -hex 32 +INSTANCE_SECRET= + +# ============================================================================= +# Origins +# ============================================================================= +# URL where the Convex backend is accessible +CONVEX_CLOUD_ORIGIN=http://127.0.0.1:3210 + +# URL where the Convex site proxy is accessible +CONVEX_SITE_ORIGIN=http://127.0.0.1:3211 + +# URL for the dashboard to connect to the backend +NEXT_PUBLIC_DEPLOYMENT_URL=http://127.0.0.1:3210 + +# ============================================================================= +# Database Configuration +# 
============================================================================= +# PostgreSQL password (change in production) +POSTGRES_PASSWORD=convex + +# Full PostgreSQL connection URL (optional, constructed from above if not set) +# POSTGRES_URL=postgresql://postgres:convex@postgres:5432/convex + +# MySQL URL (alternative to PostgreSQL, leave empty to use PostgreSQL) +# MYSQL_URL= + +# ============================================================================= +# Application Limits +# ============================================================================= +# Maximum concurrent mutations +APPLICATION_MAX_CONCURRENT_MUTATIONS=16 + +# Maximum concurrent Node.js actions +APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=16 + +# Maximum concurrent queries +APPLICATION_MAX_CONCURRENT_QUERIES=16 + +# Maximum concurrent V8 actions +APPLICATION_MAX_CONCURRENT_V8_ACTIONS=16 + +# User action timeout in seconds (empty for default) +ACTIONS_USER_TIMEOUT_SECS= + +# ============================================================================= +# SSL/TLS Settings +# ============================================================================= +# Set to false to require SSL (recommended for production) +DO_NOT_REQUIRE_SSL=true + +# ============================================================================= +# Data Retention +# ============================================================================= +# Document retention delay in seconds (default: 2 days) +DOCUMENT_RETENTION_DELAY=172800 + +# ============================================================================= +# Telemetry and Metrics +# ============================================================================= +# Disable telemetry beacon (set to true to disable) +DISABLE_BEACON=false + +# Disable the Prometheus-compatible /metrics endpoint (set to false to enable it) +DISABLE_METRICS_ENDPOINT=true + +# ============================================================================= +# Logging +# 
============================================================================= +# Rust log level (error, warn, info, debug, trace) +RUST_LOG=info + +# Enable Rust backtrace (1, full, or empty) +RUST_BACKTRACE= + +# Redact logs sent to clients +REDACT_LOGS_TO_CLIENT= + +# HTTP server timeout in seconds +HTTP_SERVER_TIMEOUT_SECONDS= + +# ============================================================================= +# AWS S3 Configuration (Optional - for external storage) +# ============================================================================= +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= +# AWS_REGION= +# AWS_SESSION_TOKEN= +# S3_ENDPOINT_URL= +# S3_STORAGE_EXPORTS_BUCKET= +# S3_STORAGE_FILES_BUCKET= +# S3_STORAGE_MODULES_BUCKET= +# S3_STORAGE_SEARCH_BUCKET= +# S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET= +# AWS_S3_DISABLE_CHECKSUMS= +# AWS_S3_DISABLE_SSE= +# AWS_S3_FORCE_PATH_STYLE= + +# ============================================================================= +# Development Settings +# ============================================================================= +# Development version override +CONVEX_RELEASE_VERSION_DEV= + +# Load Monaco editor internally in dashboard +NEXT_PUBLIC_LOAD_MONACO_INTERNALLY= + +# ============================================================================= +# Timezone +# ============================================================================= +TZ=UTC + +# ============================================================================= +# Resource Limits - Convex Backend +# ============================================================================= +CONVEX_BACKEND_CPU_LIMIT=2.0 +CONVEX_BACKEND_CPU_RESERVATION=0.5 +CONVEX_BACKEND_MEMORY_LIMIT=2G +CONVEX_BACKEND_MEMORY_RESERVATION=512M + +# ============================================================================= +# Resource Limits - Convex Dashboard +# ============================================================================= +CONVEX_DASHBOARD_CPU_LIMIT=0.5 
+CONVEX_DASHBOARD_CPU_RESERVATION=0.25 +CONVEX_DASHBOARD_MEMORY_LIMIT=256M +CONVEX_DASHBOARD_MEMORY_RESERVATION=128M + +# ============================================================================= +# Resource Limits - PostgreSQL +# ============================================================================= +POSTGRES_CPU_LIMIT=1.0 +POSTGRES_CPU_RESERVATION=0.25 +POSTGRES_MEMORY_LIMIT=1G +POSTGRES_MEMORY_RESERVATION=256M diff --git a/src/convex/README.md b/src/convex/README.md new file mode 100644 index 0000000..265a917 --- /dev/null +++ b/src/convex/README.md @@ -0,0 +1,123 @@ +# Convex + +Convex is an open-source reactive database designed to make life easy for web app developers, whether human or LLM. + +## Features + +- **Reactive Queries**: Queries automatically update when underlying data changes +- **Real-time Subscriptions**: Live UI updates without manual polling +- **Serverless Functions**: Write backend logic in TypeScript/JavaScript +- **Automatic Caching**: Built-in intelligent caching for optimal performance +- **Type Safety**: Full TypeScript support with generated types +- **Scalable Architecture**: Designed to handle high-throughput applications + +## Quick Start + +1. Copy `.env.example` to `.env`: + + ```bash + cp .env.example .env + ``` + +2. Generate an instance secret (required for production): + + ```bash + openssl rand -hex 32 + ``` + + Then set `INSTANCE_SECRET` in your `.env` file. + +3. Start Convex: + + ```bash + docker compose up -d + ``` + +4. Wait for services to be healthy (check with `docker compose ps`) + +5. Access the Dashboard at `http://localhost:6791` + +6. 
Backend API is available at `http://localhost:3210` + +## Default Configuration + +| Service | Port | Description | +| -------------- | ---- | ------------------------------- | +| Convex Backend | 3210 | Main API and WebSocket endpoint | +| Site Proxy | 3211 | Site hosting proxy | +| Dashboard | 6791 | Web UI for managing Convex | +| PostgreSQL | 5432 | Database (internal) | + +**Authentication**: Set `INSTANCE_SECRET` for production use. + +## Environment Variables + +Key environment variables (see `.env.example` for full list): + +| Variable | Description | Default | +| --------------------------------- | --------------------------------- | ----------------------- | +| `CONVEX_BACKEND_PORT_OVERRIDE` | Host port for backend API | `3210` | +| `CONVEX_SITE_PROXY_PORT_OVERRIDE` | Host port for site proxy | `3211` | +| `CONVEX_DASHBOARD_PORT_OVERRIDE` | Host port for dashboard | `6791` | +| `INSTANCE_NAME` | Name of the Convex instance | `convex-self-hosted` | +| `INSTANCE_SECRET` | Secret key for authentication | (required) | +| `CONVEX_CLOUD_ORIGIN` | URL for backend access | `http://127.0.0.1:3210` | +| `CONVEX_SITE_ORIGIN` | URL for site proxy access | `http://127.0.0.1:3211` | +| `POSTGRES_PASSWORD` | PostgreSQL password | `convex` | +| `RUST_LOG` | Log level (error/warn/info/debug) | `info` | +| `TZ` | Timezone | `UTC` | + +## Resource Requirements + +**Minimum**: + +- CPU: 1 core +- RAM: 1GB +- Disk: 5GB + +**Recommended**: + +- CPU: 2+ cores +- RAM: 2GB+ +- Disk: 20GB+ + +## Volumes + +- `convex_data`: Convex backend data storage +- `postgres_data`: PostgreSQL database data + +## Using with Your Application + +To use this self-hosted Convex backend with your application: + +1. Set the `CONVEX_SELF_HOSTED_URL` environment variable in your app: + + ```bash + CONVEX_SELF_HOSTED_URL=http://localhost:3210 + ``` + +2. Set the `CONVEX_SELF_HOSTED_ADMIN_KEY` environment variable: + + ```bash + CONVEX_SELF_HOSTED_ADMIN_KEY=your-instance-secret + ``` + +3. 
Deploy your Convex functions: + + ```bash + npx convex dev + ``` + +For more details, see the [Convex Self-Hosting Documentation](https://stack.convex.dev/self-hosted-develop-and-deploy). + +## Security Notes + +- **Always set a strong `INSTANCE_SECRET`** in production +- Enable SSL/TLS by setting `DO_NOT_REQUIRE_SSL=false` and using a reverse proxy +- Use strong database passwords +- Restrict network access to Convex services +- Consider using AWS S3 for external storage in production + +## License + +Apache-2.0 (<https://github.com/get-convex/convex-backend/blob/main/LICENSE>) diff --git a/src/convex/README.zh.md b/src/convex/README.zh.md new file mode 100644 index 0000000..9a2deb4 --- /dev/null +++ b/src/convex/README.zh.md @@ -0,0 +1,123 @@ +# Convex + +Convex 是一个开源的响应式数据库,旨在让 Web 应用开发者(无论是人类还是 LLM)的生活更加轻松。 + +## 功能特性 + +- **响应式查询**:当底层数据变化时,查询会自动更新 +- **实时订阅**:无需手动轮询即可实现实时 UI 更新 +- **无服务器函数**:使用 TypeScript/JavaScript 编写后端逻辑 +- **自动缓存**:内置智能缓存以获得最佳性能 +- **类型安全**:完整的 TypeScript 支持,并生成类型定义 +- **可扩展架构**:专为高吞吐量应用而设计 + +## 快速开始 + +1. 复制 `.env.example` 到 `.env`: + + ```bash + cp .env.example .env + ``` + +2. 生成实例密钥(生产环境必需): + + ```bash + openssl rand -hex 32 + ``` + + 然后在 `.env` 文件中设置 `INSTANCE_SECRET`。 + +3. 启动 Convex: + + ```bash + docker compose up -d + ``` + +4. 等待服务健康(使用 `docker compose ps` 检查) + +5. 访问 Dashboard:`http://localhost:6791` + +6. 
后端 API 地址:`http://localhost:3210` + +## 默认配置 + +| 服务 | 端口 | 说明 | +| -------------- | ---- | ------------------------ | +| Convex Backend | 3210 | 主 API 和 WebSocket 端点 | +| Site Proxy | 3211 | 站点托管代理 | +| Dashboard | 6791 | 管理 Convex 的 Web UI | +| PostgreSQL | 5432 | 数据库(内部) | + +**认证**:生产环境请设置 `INSTANCE_SECRET`。 + +## 环境变量 + +关键环境变量(完整列表请参见 `.env.example`): + +| 变量 | 说明 | 默认值 | +| --------------------------------- | --------------------------------- | ----------------------- | +| `CONVEX_BACKEND_PORT_OVERRIDE` | 后端 API 的主机端口 | `3210` | +| `CONVEX_SITE_PROXY_PORT_OVERRIDE` | 站点代理的主机端口 | `3211` | +| `CONVEX_DASHBOARD_PORT_OVERRIDE` | Dashboard 的主机端口 | `6791` | +| `INSTANCE_NAME` | Convex 实例名称 | `convex-self-hosted` | +| `INSTANCE_SECRET` | 认证密钥 | (必需) | +| `CONVEX_CLOUD_ORIGIN` | 后端访问 URL | `http://127.0.0.1:3210` | +| `CONVEX_SITE_ORIGIN` | 站点代理访问 URL | `http://127.0.0.1:3211` | +| `POSTGRES_PASSWORD` | PostgreSQL 密码 | `convex` | +| `RUST_LOG` | 日志级别(error/warn/info/debug) | `info` | +| `TZ` | 时区 | `UTC` | + +## 资源需求 + +**最低配置**: + +- CPU:1 核 +- 内存:1GB +- 磁盘:5GB + +**推荐配置**: + +- CPU:2+ 核 +- 内存:2GB+ +- 磁盘:20GB+ + +## 数据卷 + +- `convex_data`:Convex 后端数据存储 +- `postgres_data`:PostgreSQL 数据库数据 + +## 在应用中使用 + +要将此自托管 Convex 后端与您的应用一起使用: + +1. 在应用中设置 `CONVEX_SELF_HOSTED_URL` 环境变量: + + ```bash + CONVEX_SELF_HOSTED_URL=http://localhost:3210 + ``` + +2. 设置 `CONVEX_SELF_HOSTED_ADMIN_KEY` 环境变量: + + ```bash + CONVEX_SELF_HOSTED_ADMIN_KEY=your-instance-secret + ``` + +3. 
部署您的 Convex 函数: + + ```bash + npx convex dev + ``` + +更多详情,请参阅 [Convex 自托管文档](https://stack.convex.dev/self-hosted-develop-and-deploy)。 + +## 安全说明 + +- **生产环境务必设置强 `INSTANCE_SECRET`** +- 通过设置 `DO_NOT_REQUIRE_SSL=false` 并使用反向代理来启用 SSL/TLS +- 使用强数据库密码 +- 限制对 Convex 服务的网络访问 +- 生产环境考虑使用 AWS S3 进行外部存储 + +## 许可证 + +Apache-2.0() diff --git a/src/convex/docker-compose.yaml b/src/convex/docker-compose.yaml new file mode 100644 index 0000000..3ec93a1 --- /dev/null +++ b/src/convex/docker-compose.yaml @@ -0,0 +1,195 @@ +# Convex - Open-source Reactive Database +# https://github.com/get-convex/convex-backend +# +# Convex is an open-source reactive database designed to make life easy for +# web app developers. It provides real-time data synchronization, automatic +# caching, and a powerful query language. +# +# Key Features: +# - Reactive queries that automatically update when data changes +# - Real-time subscriptions for live UI updates +# - Built-in authentication and authorization +# - Serverless functions with TypeScript/JavaScript +# - Automatic scaling and caching +# +# Default Credentials: +# - Dashboard at http://localhost:6791 +# - Backend API at http://localhost:3210 +# - Site proxy at http://localhost:3211 +# +# Security Notes: +# - Set a strong INSTANCE_SECRET in production +# - Enable SSL/TLS in production +# - Use strong database passwords +# - Restrict network access to Convex services +# +# License: Apache-2.0 (https://github.com/get-convex/convex-backend/blob/main/LICENSE) + +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: '3' + +services: + convex-backend: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}get-convex/convex-backend:${CONVEX_BACKEND_VERSION:-33cef775a8a6228cbacee4a09ac2c4073d62ed13} + stop_grace_period: 10s + stop_signal: SIGINT + ports: + - '${CONVEX_BACKEND_PORT_OVERRIDE:-3210}:3210' + - '${CONVEX_SITE_PROXY_PORT_OVERRIDE:-3211}:3211' + volumes: + - 
convex_data:/convex/data + environment: + # Instance configuration + - INSTANCE_NAME=${INSTANCE_NAME:-convex-self-hosted} + - INSTANCE_SECRET=${INSTANCE_SECRET} + + # Origins + - CONVEX_CLOUD_ORIGIN=${CONVEX_CLOUD_ORIGIN:-http://127.0.0.1:3210} + - CONVEX_SITE_ORIGIN=${CONVEX_SITE_ORIGIN:-http://127.0.0.1:3211} + + # Database configuration (PostgreSQL) + - POSTGRES_URL=${POSTGRES_URL:-postgresql://postgres:${POSTGRES_PASSWORD:-convex}@postgres:5432/convex} + + # Application limits + - APPLICATION_MAX_CONCURRENT_MUTATIONS=${APPLICATION_MAX_CONCURRENT_MUTATIONS:-16} + - APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=${APPLICATION_MAX_CONCURRENT_NODE_ACTIONS:-16} + - APPLICATION_MAX_CONCURRENT_QUERIES=${APPLICATION_MAX_CONCURRENT_QUERIES:-16} + - APPLICATION_MAX_CONCURRENT_V8_ACTIONS=${APPLICATION_MAX_CONCURRENT_V8_ACTIONS:-16} + + # Actions timeout + - ACTIONS_USER_TIMEOUT_SECS=${ACTIONS_USER_TIMEOUT_SECS:-} + + # SSL/TLS settings + - DO_NOT_REQUIRE_SSL=${DO_NOT_REQUIRE_SSL:-true} + + # Document retention (default 2 days in seconds) + - DOCUMENT_RETENTION_DELAY=${DOCUMENT_RETENTION_DELAY:-172800} + + # Metrics and beacon + - DISABLE_BEACON=${DISABLE_BEACON:-false} + - DISABLE_METRICS_ENDPOINT=${DISABLE_METRICS_ENDPOINT:-true} + + # Logging + - RUST_LOG=${RUST_LOG:-info} + - RUST_BACKTRACE=${RUST_BACKTRACE:-} + - REDACT_LOGS_TO_CLIENT=${REDACT_LOGS_TO_CLIENT:-} + + # HTTP server timeout + - HTTP_SERVER_TIMEOUT_SECONDS=${HTTP_SERVER_TIMEOUT_SECONDS:-} + + # AWS S3 configuration (optional, for external storage) + - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} + - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} + - AWS_REGION=${AWS_REGION:-} + - AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-} + - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-} + - S3_STORAGE_EXPORTS_BUCKET=${S3_STORAGE_EXPORTS_BUCKET:-} + - S3_STORAGE_FILES_BUCKET=${S3_STORAGE_FILES_BUCKET:-} + - S3_STORAGE_MODULES_BUCKET=${S3_STORAGE_MODULES_BUCKET:-} + - S3_STORAGE_SEARCH_BUCKET=${S3_STORAGE_SEARCH_BUCKET:-} + - 
S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET=${S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET:-} + - AWS_S3_DISABLE_CHECKSUMS=${AWS_S3_DISABLE_CHECKSUMS:-} + - AWS_S3_DISABLE_SSE=${AWS_S3_DISABLE_SSE:-} + - AWS_S3_FORCE_PATH_STYLE=${AWS_S3_FORCE_PATH_STYLE:-} + + # MySQL URL (alternative to PostgreSQL) + - MYSQL_URL=${MYSQL_URL:-} + + # Development settings + - CONVEX_RELEASE_VERSION_DEV=${CONVEX_RELEASE_VERSION_DEV:-} + + # Timezone + - TZ=${TZ:-UTC} + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: + - CMD + - curl + - -f + - http://localhost:3210/version + interval: 5s + timeout: 5s + retries: 5 + start_period: 30s + deploy: + resources: + limits: + cpus: '${CONVEX_BACKEND_CPU_LIMIT:-2.0}' + memory: '${CONVEX_BACKEND_MEMORY_LIMIT:-2G}' + reservations: + cpus: '${CONVEX_BACKEND_CPU_RESERVATION:-0.5}' + memory: '${CONVEX_BACKEND_MEMORY_RESERVATION:-512M}' + + convex-dashboard: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}get-convex/convex-dashboard:${CONVEX_DASHBOARD_VERSION:-33cef775a8a6228cbacee4a09ac2c4073d62ed13} + stop_grace_period: 10s + stop_signal: SIGINT + ports: + - '${CONVEX_DASHBOARD_PORT_OVERRIDE:-6791}:6791' + environment: + - NEXT_PUBLIC_DEPLOYMENT_URL=${NEXT_PUBLIC_DEPLOYMENT_URL:-http://127.0.0.1:3210} + - NEXT_PUBLIC_LOAD_MONACO_INTERNALLY=${NEXT_PUBLIC_LOAD_MONACO_INTERNALLY:-} + depends_on: + convex-backend: + condition: service_healthy + healthcheck: + test: + - CMD + - wget + - --quiet + - --tries=1 + - --spider + - http://localhost:6791 + interval: 10s + timeout: 5s + retries: 3 + start_period: 10s + deploy: + resources: + limits: + cpus: '${CONVEX_DASHBOARD_CPU_LIMIT:-0.5}' + memory: '${CONVEX_DASHBOARD_MEMORY_LIMIT:-256M}' + reservations: + cpus: '${CONVEX_DASHBOARD_CPU_RESERVATION:-0.25}' + memory: '${CONVEX_DASHBOARD_MEMORY_RESERVATION:-128M}' + + postgres: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}postgres:${POSTGRES_VERSION:-17-alpine} + environment: + - POSTGRES_DB=convex + - POSTGRES_USER=postgres + - 
POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-convex} + - POSTGRES_INITDB_ARGS=--encoding=UTF8 + - TZ=${TZ:-UTC} + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: + - CMD-SHELL + - pg_isready -U postgres + interval: 5s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: '${POSTGRES_CPU_LIMIT:-1.0}' + memory: '${POSTGRES_MEMORY_LIMIT:-1G}' + reservations: + cpus: '${POSTGRES_CPU_RESERVATION:-0.25}' + memory: '${POSTGRES_MEMORY_RESERVATION:-256M}' + +volumes: + convex_data: + postgres_data: diff --git a/src/llama-swap/.env.example b/src/llama-swap/.env.example new file mode 100644 index 0000000..22897e3 --- /dev/null +++ b/src/llama-swap/.env.example @@ -0,0 +1,62 @@ +# ============================================================================= +# llama-swap Configuration +# https://github.com/mostlygeek/llama-swap +# Reliable model swapping for any local OpenAI/Anthropic compatible server +# ============================================================================= + +# ----------------------------------------------------------------------------- +# General Settings +# ----------------------------------------------------------------------------- + +# Timezone for the container (default: UTC) +TZ=UTC + +# GitHub Container Registry prefix (default: ghcr.io/) +GHCR_REGISTRY=ghcr.io/ + +# ----------------------------------------------------------------------------- +# Image Variants +# ----------------------------------------------------------------------------- + +# CPU-only image version tag (default: cpu) +# Available: cpu, cuda, vulkan, rocm, intel, musa +# Tagged releases example: v197-cuda-b8193 +LLAMA_SWAP_VERSION=cpu + +# NVIDIA CUDA image version tag (used with the `gpu` profile) +LLAMA_SWAP_CUDA_VERSION=cuda + +# AMD GPU image version tag (used with the `gpu-amd` profile) +# Options: vulkan (Vulkan/AMD), rocm (ROCm/AMD) +LLAMA_SWAP_AMD_VERSION=vulkan + +# 
----------------------------------------------------------------------------- +# Network Settings +# ----------------------------------------------------------------------------- + +# Host port override for the llama-swap API (default: 9292) +# The Web UI and OpenAI-compatible API are both served on this port +LLAMA_SWAP_PORT_OVERRIDE=9292 + +# ----------------------------------------------------------------------------- +# GPU Settings (used with `gpu` profile) +# ----------------------------------------------------------------------------- + +# Number of NVIDIA GPUs to use (default: 1) +LLAMA_SWAP_GPU_COUNT=1 + +# ----------------------------------------------------------------------------- +# Resource Limits +# ----------------------------------------------------------------------------- + +# CPU limit (in cores) +LLAMA_SWAP_CPU_LIMIT=4.0 + +# CPU reservation (in cores) +LLAMA_SWAP_CPU_RESERVATION=2.0 + +# Memory limit (e.g., 8G, 16G) +LLAMA_SWAP_MEMORY_LIMIT=8G + +# Memory reservation (e.g., 4G, 8G) +LLAMA_SWAP_MEMORY_RESERVATION=4G diff --git a/src/llama-swap/README.md b/src/llama-swap/README.md new file mode 100644 index 0000000..777359a --- /dev/null +++ b/src/llama-swap/README.md @@ -0,0 +1,196 @@ +# llama-swap + +[llama-swap](https://github.com/mostlygeek/llama-swap) is a lightweight reverse proxy that provides reliable on-demand model swapping for any local OpenAI/Anthropic-compatible inference server (e.g., llama.cpp, vllm). Only one model is loaded at a time, and it is automatically swapped out when a different model is requested, making it easy to work with many models on a single machine. + +See also: [README.zh.md](./README.zh.md) + +## Features + +- **On-demand model swapping**: Automatically load/unload models based on API requests with zero manual intervention. +- **OpenAI/Anthropic compatible**: Drop-in replacement for any client that uses the OpenAI or Anthropic chat completion API. 
+- **Multi-backend support**: Works with llama.cpp (llama-server), vllm, and any OpenAI-compatible server. +- **Real-time Web UI**: Built-in interface for monitoring logs, inspecting requests, and manually managing models. +- **TTL-based unloading**: Models can be configured to unload automatically after a period of inactivity. +- **HuggingFace model downloads**: Reference HuggingFace models directly in `config.yaml` and they are downloaded on first use. +- **Multi-GPU support**: Works with NVIDIA CUDA, AMD ROCm/Vulkan, Intel, and CPU-only setups. + +## Quick Start + +1. Copy the example environment file: + + ```bash + cp .env.example .env + ``` + +2. Edit `config.yaml` to add your models. The provided `config.yaml` includes a commented example for a local GGUF model and a HuggingFace download. See [Configuration](#configuration) for details. + +3. Start the service (CPU-only by default): + + ```bash + docker compose up -d + ``` + +4. For NVIDIA GPU support: + + ```bash + docker compose --profile gpu up -d + ``` + +5. For AMD GPU support (Vulkan): + + ```bash + docker compose --profile gpu-amd up -d + ``` + +The API and Web UI are available at: `http://localhost:9292` + +## Services + +| Service | Profile | Description | +| ----------------- | ----------- | --------------------------------- | +| `llama-swap` | _(default)_ | CPU-only inference | +| `llama-swap-cuda` | `gpu` | NVIDIA CUDA GPU inference | +| `llama-swap-amd` | `gpu-amd` | AMD GPU inference (Vulkan / ROCm) | + +> **Note**: Only start one service at a time. All three services bind to the same host port (`LLAMA_SWAP_PORT_OVERRIDE`). + +## Configuration + +### `config.yaml` + +The `config.yaml` file defines the models llama-swap manages. It is mounted read-only at `/app/config.yaml` inside the container. Edit the provided `config.yaml` to add your models. 
+ +Minimal example: + +```yaml +healthCheckTimeout: 300 + +models: + my-model: + cmd: /app/llama-server --port ${PORT} --model /root/.cache/llama.cpp/model.gguf --ctx-size 4096 + proxy: 'http://localhost:${PORT}' + ttl: 900 +``` + +- `${PORT}` is automatically assigned by llama-swap. +- `ttl` (seconds): unload the model after this many seconds of inactivity. +- `cmd`: the command to start the inference server. +- `proxy`: the address llama-swap forwards requests to. + +For downloading models from HuggingFace on first use: + +```yaml +models: + Qwen2.5-7B: + cmd: /app/llama-server --port ${PORT} -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ctx-size 8192 --n-gpu-layers 99 + proxy: 'http://localhost:${PORT}' +``` + +See the [official configuration documentation](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md) for all options including `groups`, `hooks`, `macros`, `aliases`, `filters`, and more. + +### Models Volume + +The named volume `llama_swap_models` is mounted to `/root/.cache/llama.cpp` inside the container. 
To place local GGUF model files inside the volume, you can use: + +```bash +# Copy a model into the named volume +docker run --rm -v llama_swap_models:/data -v /path/to/model.gguf:/src/model.gguf alpine cp /src/model.gguf /data/model.gguf +``` + +Alternatively, change the volume definition in `docker-compose.yaml` to use a host path: + +```yaml +volumes: + llama_swap_models: + driver: local + driver_opts: + type: none + o: bind + device: /path/to/your/models +``` + +## Environment Variables + +| Variable | Default | Description | +| ------------------------------- | ---------- | -------------------------------------------------- | +| `TZ` | `UTC` | Container timezone | +| `GHCR_REGISTRY` | `ghcr.io/` | GitHub Container Registry prefix | +| `LLAMA_SWAP_VERSION` | `cpu` | Image tag for the default CPU service | +| `LLAMA_SWAP_CUDA_VERSION` | `cuda` | Image tag for the CUDA service | +| `LLAMA_SWAP_AMD_VERSION` | `vulkan` | Image tag for the AMD service (`vulkan` or `rocm`) | +| `LLAMA_SWAP_PORT_OVERRIDE` | `9292` | Host port for the API and Web UI | +| `LLAMA_SWAP_GPU_COUNT` | `1` | Number of NVIDIA GPUs to use (CUDA profile) | +| `LLAMA_SWAP_CPU_LIMIT` | `4.0` | CPU limit in cores | +| `LLAMA_SWAP_CPU_RESERVATION` | `2.0` | CPU reservation in cores | +| `LLAMA_SWAP_MEMORY_LIMIT` | `8G` | Memory limit | +| `LLAMA_SWAP_MEMORY_RESERVATION` | `4G` | Memory reservation | + +## Default Ports + +| Port | Description | +| ------ | ------------------------------------------ | +| `9292` | OpenAI/Anthropic-compatible API and Web UI | + +## API Usage + +llama-swap exposes an OpenAI-compatible API. 
Use any OpenAI client by pointing it to `http://localhost:9292`: + +```bash +# List available models +curl http://localhost:9292/v1/models + +# Chat completion (automatically loads the model if not running) +curl http://localhost:9292/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "my-model", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +The Web UI is available at `http://localhost:9292` and provides real-time log streaming, request inspection, and manual model management. + +## NVIDIA GPU Setup + +Requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). + +```bash +docker compose --profile gpu up -d +``` + +For non-root security hardening, use the `cuda-non-root` image tag: + +```bash +LLAMA_SWAP_CUDA_VERSION=cuda-non-root +``` + +## AMD GPU Setup + +Requires the `/dev/kfd` and `/dev/dri` devices to be accessible on the host. + +```bash +docker compose --profile gpu-amd up -d +``` + +Use `rocm` instead of `vulkan` for full ROCm support: + +```bash +LLAMA_SWAP_AMD_VERSION=rocm docker compose --profile gpu-amd up -d +``` + +## Security Notes + +- By default, the container runs as root. Use the `cuda-non-root` or `rocm-non-root` image tags for improved security on GPU deployments. +- The `config.yaml` is mounted read-only (`ro`). +- Consider placing llama-swap behind a reverse proxy (e.g., Nginx, Caddy) when exposing it beyond localhost. + +## References + +- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap) +- [Configuration Documentation](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md) +- [Container Security](https://github.com/mostlygeek/llama-swap/blob/main/docs/container-security.md) +- [Docker Compose Wiki](https://github.com/mostlygeek/llama-swap/wiki/Docker-Compose-Example) + +## License + +llama-swap is released under the MIT License. 
See the [LICENSE](https://github.com/mostlygeek/llama-swap/blob/main/LICENSE) file for details. diff --git a/src/llama-swap/README.zh.md b/src/llama-swap/README.zh.md new file mode 100644 index 0000000..0816e96 --- /dev/null +++ b/src/llama-swap/README.zh.md @@ -0,0 +1,196 @@ +# llama-swap + +[llama-swap](https://github.com/mostlygeek/llama-swap) 是一个轻量级反向代理,为任何本地 OpenAI/Anthropic 兼容的推理服务器(如 llama.cpp、vllm 等)提供可靠的按需模型切换功能。同一时间只加载一个模型,当收到对不同模型的请求时,llama-swap 会自动切换,让你可以在单台机器上轻松使用多个模型。 + +参见:[README.md](./README.md) + +## 功能特性 + +- **按需模型切换**:根据 API 请求自动加载/卸载模型,无需手动干预。 +- **兼容 OpenAI/Anthropic**:可直接替代任何使用 OpenAI 或 Anthropic 聊天补全 API 的客户端。 +- **多后端支持**:适用于 llama.cpp(llama-server)、vllm 及任何 OpenAI 兼容服务器。 +- **实时 Web UI**:内置界面,可监控日志、检查请求、手动管理模型。 +- **基于 TTL 的自动卸载**:可配置模型在闲置一段时间后自动卸载。 +- **HuggingFace 模型下载**:在 `config.yaml` 中直接引用 HuggingFace 模型,首次使用时自动下载。 +- **多 GPU 支持**:支持 NVIDIA CUDA、AMD ROCm/Vulkan、Intel 及纯 CPU 部署。 + +## 快速开始 + +1. 复制环境变量示例文件: + + ```bash + cp .env.example .env + ``` + +2. 编辑 `config.yaml`,添加你的模型配置。提供的 `config.yaml` 包含本地 GGUF 模型和 HuggingFace 下载的注释示例。详见[配置说明](#配置说明)。 + +3. 启动服务(默认仅使用 CPU): + + ```bash + docker compose up -d + ``` + +4. 启用 NVIDIA GPU 支持: + + ```bash + docker compose --profile gpu up -d + ``` + +5. 
启用 AMD GPU 支持(Vulkan): + + ```bash + docker compose --profile gpu-amd up -d + ``` + +API 和 Web UI 地址:`http://localhost:9292` + +## 服务说明 + +| 服务名称 | Profile | 说明 | +| ----------------- | ---------- | ----------------------------- | +| `llama-swap` | _(默认)_ | 纯 CPU 推理 | +| `llama-swap-cuda` | `gpu` | NVIDIA CUDA GPU 推理 | +| `llama-swap-amd` | `gpu-amd` | AMD GPU 推理(Vulkan / ROCm) | + +> **注意**:每次只启动一个服务,三个服务均绑定到同一主机端口(`LLAMA_SWAP_PORT_OVERRIDE`)。 + +## 配置说明 + +### `config.yaml` + +`config.yaml` 文件定义了 llama-swap 管理的模型列表,以只读方式挂载到容器内的 `/app/config.yaml`。编辑提供的 `config.yaml` 即可添加你的模型。 + +最简示例: + +```yaml +healthCheckTimeout: 300 + +models: + my-model: + cmd: /app/llama-server --port ${PORT} --model /root/.cache/llama.cpp/model.gguf --ctx-size 4096 + proxy: 'http://localhost:${PORT}' + ttl: 900 +``` + +- `${PORT}` 由 llama-swap 自动分配。 +- `ttl`(秒):模型闲置超过该时长后自动卸载。 +- `cmd`:启动推理服务器的命令。 +- `proxy`:llama-swap 转发请求的地址。 + +直接使用 HuggingFace 模型(首次使用时自动下载): + +```yaml +models: + Qwen2.5-7B: + cmd: /app/llama-server --port ${PORT} -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ctx-size 8192 --n-gpu-layers 99 + proxy: 'http://localhost:${PORT}' +``` + +完整配置选项(包括 `groups`、`hooks`、`macros`、`aliases`、`filters` 等)请参阅[官方配置文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)。 + +### 模型卷 + +命名卷 `llama_swap_models` 挂载到容器内的 `/root/.cache/llama.cpp`。可以通过以下方式将本地 GGUF 模型文件放入卷中: + +```bash +# 将模型文件复制到命名卷 +docker run --rm -v llama_swap_models:/data -v /path/to/model.gguf:/src/model.gguf alpine cp /src/model.gguf /data/model.gguf +``` + +或者将 `docker-compose.yaml` 中的卷定义改为主机路径绑定: + +```yaml +volumes: + llama_swap_models: + driver: local + driver_opts: + type: none + o: bind + device: /path/to/your/models +``` + +## 环境变量 + +| 变量名 | 默认值 | 说明 | +| ------------------------------- | ---------- | -------------------------------------- | +| `TZ` | `UTC` | 容器时区 | +| `GHCR_REGISTRY` | `ghcr.io/` | GitHub 容器镜像仓库前缀 | +| `LLAMA_SWAP_VERSION` | `cpu` | 默认 CPU 服务镜像标签 | +| `LLAMA_SWAP_CUDA_VERSION` 
| `cuda` | CUDA 服务镜像标签 | +| `LLAMA_SWAP_AMD_VERSION` | `vulkan` | AMD 服务镜像标签(`vulkan` 或 `rocm`) | +| `LLAMA_SWAP_PORT_OVERRIDE` | `9292` | API 和 Web UI 的主机端口 | +| `LLAMA_SWAP_GPU_COUNT` | `1` | 使用的 NVIDIA GPU 数量(gpu profile) | +| `LLAMA_SWAP_CPU_LIMIT` | `4.0` | CPU 上限(核心数) | +| `LLAMA_SWAP_CPU_RESERVATION` | `2.0` | CPU 预留(核心数) | +| `LLAMA_SWAP_MEMORY_LIMIT` | `8G` | 内存上限 | +| `LLAMA_SWAP_MEMORY_RESERVATION` | `4G` | 内存预留 | + +## 默认端口 + +| 端口 | 说明 | +| ------ | ----------------------------------- | +| `9292` | OpenAI/Anthropic 兼容 API 及 Web UI | + +## API 使用示例 + +llama-swap 暴露 OpenAI 兼容 API。将任何 OpenAI 客户端指向 `http://localhost:9292` 即可使用: + +```bash +# 列出可用模型 +curl http://localhost:9292/v1/models + +# 聊天补全(若模型未运行则自动加载) +curl http://localhost:9292/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "my-model", + "messages": [{"role": "user", "content": "你好!"}] + }' +``` + +Web UI 可通过 `http://localhost:9292` 访问,提供实时日志流、请求检查和手动模型管理功能。 + +## NVIDIA GPU 配置 + +需要安装 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)。 + +```bash +docker compose --profile gpu up -d +``` + +如需非 root 安全加固,可使用 `cuda-non-root` 镜像标签: + +```bash +LLAMA_SWAP_CUDA_VERSION=cuda-non-root docker compose --profile gpu up -d +``` + +## AMD GPU 配置 + +需要主机上 `/dev/dri` 和 `/dev/kfd` 设备可访问。 + +```bash +docker compose --profile gpu-amd up -d +``` + +如需完整 ROCm 支持,可使用 `rocm` 替代 `vulkan`: + +```bash +LLAMA_SWAP_AMD_VERSION=rocm docker compose --profile gpu-amd up -d +``` + +## 安全说明 + +- 默认情况下容器以 root 用户运行。GPU 部署时建议使用 `cuda-non-root` 或 `rocm-non-root` 镜像标签提升安全性。 +- `config.yaml` 以只读方式(`ro`)挂载。 +- 若需在 localhost 之外暴露服务,建议在 llama-swap 前部署反向代理(如 Nginx、Caddy)。 + +## 参考链接 + +- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap) +- [配置文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md) +- [容器安全文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/container-security.md) +- [Docker Compose 
Wiki](https://github.com/mostlygeek/llama-swap/wiki/Docker-Compose-Example) + +## 许可证 + +llama-swap 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/mostlygeek/llama-swap/blob/main/LICENSE) 文件。 diff --git a/src/llama-swap/config.yaml b/src/llama-swap/config.yaml new file mode 100644 index 0000000..eda7e43 --- /dev/null +++ b/src/llama-swap/config.yaml @@ -0,0 +1,47 @@ +# llama-swap configuration file +# https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md +# +# This is the main configuration file for llama-swap. +# Mount this file to /app/config.yaml inside the container. +# +# llama-swap will automatically swap models on demand: +# - Only the requested model is loaded at a time. +# - Idle models are unloaded when a new one is requested. + +# Maximum time (in seconds) to wait for a model to become healthy. +# A high value is useful when downloading models from HuggingFace. +healthCheckTimeout: 300 + +# Macro definitions: reusable command snippets for model configuration. +# Reference with ${macro-name} inside cmd fields. +macros: + "llama-server": > + /app/llama-server + --port ${PORT} + +# Model definitions +models: + # Example: a local GGUF model stored in the models volume. + # The volume `llama_swap_models` is mounted to /root/.cache/llama.cpp inside + # the container. Place your .gguf files there and reference them with + # /root/.cache/llama.cpp/<model-name>.gguf + "my-local-model": + # ${PORT} is automatically assigned by llama-swap + cmd: > + ${llama-server} + --model /root/.cache/llama.cpp/model.gguf + --ctx-size 4096 + --n-gpu-layers 0 + proxy: "http://localhost:${PORT}" + # Automatically unload the model after 15 minutes of inactivity + ttl: 900 + + # Example: download a model from HuggingFace on first use (requires internet access) + # "Qwen2.5-7B-Instruct": + # cmd: > + # ${llama-server} + # -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M + # --ctx-size 8192 + # --n-gpu-layers 99 + # proxy: "http://localhost:${PORT}" + # ttl: 900 diff --git a/src/llama-swap/docker-compose.yaml b/src/llama-swap/docker-compose.yaml new file mode 100644 index 0000000..4155a7a --- /dev/null +++ b/src/llama-swap/docker-compose.yaml @@ -0,0 +1,126 @@ +# Docker Compose configuration for llama-swap +# https://github.com/mostlygeek/llama-swap +# Reliable model swapping for any local OpenAI/Anthropic compatible server + +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: '3' + +services: + # llama-swap - CPU variant (default) + llama-swap: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_VERSION:-cpu} + ports: + - '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080' + volumes: + - ./config.yaml:/app/config.yaml:ro + - llama_swap_models:/root/.cache/llama.cpp + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + - CMD + - wget + - --quiet + - --tries=1 + - --spider + - 'http://localhost:8080/v1/models' + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0} + memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0} + memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G} + + # llama-swap - NVIDIA CUDA variant + # Requires NVIDIA Container Toolkit:
https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html + llama-swap-cuda: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_CUDA_VERSION:-cuda} + ports: + - '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080' + volumes: + - ./config.yaml:/app/config.yaml:ro + - llama_swap_models:/root/.cache/llama.cpp + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + - CMD + - wget + - --quiet + - --tries=1 + - --spider + - 'http://localhost:8080/v1/models' + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0} + memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0} + memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G} + devices: + - driver: nvidia + count: ${LLAMA_SWAP_GPU_COUNT:-1} + capabilities: [gpu] + profiles: + - gpu + + # llama-swap - AMD ROCm / Vulkan variant (AMD GPU) + # For AMD GPUs, ensure /dev/dri and /dev/kfd are accessible + llama-swap-amd: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_AMD_VERSION:-vulkan} + ports: + - '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080' + volumes: + - ./config.yaml:/app/config.yaml:ro + - llama_swap_models:/root/.cache/llama.cpp + devices: + - /dev/dri:/dev/dri + - /dev/kfd:/dev/kfd + group_add: + - video + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + - CMD + - wget + - --quiet + - --tries=1 + - --spider + - 'http://localhost:8080/v1/models' + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0} + memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0} + memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G} + profiles: + - gpu-amd + +volumes: + llama_swap_models: