feat: add services
- Introduced Convex, an open-source reactive database, with README and environment variable configurations. - Added Chinese translation for Convex documentation. - Created docker-compose configuration for Convex services. - Introduced llama-swap, a model swapping proxy for OpenAI/Anthropic compatible servers, with comprehensive README and example configuration. - Added Chinese translation for llama-swap documentation. - Included example environment file and docker-compose setup for llama-swap. - Configured health checks and resource limits for both Convex and llama-swap services.
This commit is contained in:
@@ -37,6 +37,7 @@ These services require building custom Docker images from source.
|
||||
| [Clash](./src/clash) | 1.18.0 |
|
||||
| [ClickHouse](./src/clickhouse) | 24.11.1 |
|
||||
| [Conductor](./src/conductor) | latest |
|
||||
| [Convex](./src/convex) | 33cef775 |
|
||||
| [DeepTutor](./apps/deeptutor) | latest |
|
||||
| [Dify](./apps/dify) | 0.18.2 |
|
||||
| [DNSMasq](./src/dnsmasq) | 2.91 |
|
||||
@@ -77,6 +78,7 @@ These services require building custom Docker images from source.
|
||||
| [LibreOffice](./src/libreoffice) | latest |
|
||||
| [libSQL Server](./src/libsql) | latest |
|
||||
| [LiteLLM](./src/litellm) | main-stable |
|
||||
| [llama-swap](./src/llama-swap) | cpu |
|
||||
| [llama.cpp](./src/llama.cpp) | server |
|
||||
| [LMDeploy](./src/lmdeploy) | v0.11.1 |
|
||||
| [Logstash](./src/logstash) | 8.16.1 |
|
||||
|
||||
@@ -37,6 +37,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件,
|
||||
| [Clash](./src/clash) | 1.18.0 |
|
||||
| [ClickHouse](./src/clickhouse) | 24.11.1 |
|
||||
| [Conductor](./src/conductor) | latest |
|
||||
| [Convex](./src/convex) | 33cef775 |
|
||||
| [DeepTutor](./apps/deeptutor) | latest |
|
||||
| [Dify](./apps/dify) | 0.18.2 |
|
||||
| [DNSMasq](./src/dnsmasq) | 2.91 |
|
||||
@@ -77,6 +78,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件,
|
||||
| [LibreOffice](./src/libreoffice) | latest |
|
||||
| [libSQL Server](./src/libsql) | latest |
|
||||
| [LiteLLM](./src/litellm) | main-stable |
|
||||
| [llama-swap](./src/llama-swap) | cpu |
|
||||
| [llama.cpp](./src/llama.cpp) | server |
|
||||
| [LMDeploy](./src/lmdeploy) | v0.11.1 |
|
||||
| [Logstash](./src/logstash) | 8.16.1 |
|
||||
|
||||
@@ -182,6 +182,7 @@ services:
|
||||
minio-init:
|
||||
<<: *defaults
|
||||
image: ${GLOBAL_REGISTRY:-}minio/mc:${MINIO_MC_VERSION:-RELEASE.2025-03-12T17-29-24Z}
|
||||
restart: on-failure
|
||||
depends_on:
|
||||
minio:
|
||||
condition: service_healthy
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Use the official vllm image for gpu with Ampere、Ada Lovelace、Hopper architecture (8.0 <= Compute Capability <= 9.0)
|
||||
# Compute Capability version query (https://developer.nvidia.com/cuda-gpus)
|
||||
# only support x86_64 architecture
|
||||
FROM vllm/vllm-openai:v0.10.1.1
|
||||
FROM vllm/vllm-openai:v0.10.2
|
||||
|
||||
# Use the official vllm image for gpu with Volta、Turing、Blackwell architecture (7.0 < Compute Capability < 8.0 or Compute Capability >= 10.0)
|
||||
# support x86_64 architecture and ARM(AArch64) architecture
|
||||
|
||||
28
builds/mineru/china.Dockerfile
Normal file
28
builds/mineru/china.Dockerfile
Normal file
@@ -0,0 +1,28 @@
|
||||
# Use DaoCloud mirrored vllm image for China region for gpu with Ampere、Ada Lovelace、Hopper architecture (8.0 <= Compute Capability <= 9.0)
|
||||
# Compute Capability version query (https://developer.nvidia.com/cuda-gpus)
|
||||
FROM docker.m.daocloud.io/vllm/vllm-openai:v0.10.2
|
||||
|
||||
# Use DaoCloud mirrored vllm image for China region for gpu with Volta、Turing、Blackwell architecture (7.0 < Compute Capability < 8.0 or Compute Capability >= 10.0)
|
||||
# support x86_64 architecture and ARM(AArch64) architecture
|
||||
# FROM docker.m.daocloud.io/vllm/vllm-openai:v0.11.0
|
||||
|
||||
# Install libgl for opencv support & Noto fonts for Chinese characters
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
fonts-noto-core \
|
||||
fonts-noto-cjk \
|
||||
fontconfig \
|
||||
libgl1 && \
|
||||
fc-cache -fv && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install mineru latest
|
||||
RUN python3 -m pip install -U 'mineru[core]>=2.7.0' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Download models and update the configuration file
|
||||
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
|
||||
|
||||
# Set the entry point to activate the virtual environment and run the command line tool
|
||||
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
|
||||
158
src/convex/.env.example
Normal file
158
src/convex/.env.example
Normal file
@@ -0,0 +1,158 @@
|
||||
# Convex Configuration
|
||||
|
||||
# =============================================================================
|
||||
# Versions
|
||||
# =============================================================================
|
||||
CONVEX_BACKEND_VERSION=33cef775a8a6228cbacee4a09ac2c4073d62ed13
|
||||
CONVEX_DASHBOARD_VERSION=33cef775a8a6228cbacee4a09ac2c4073d62ed13
|
||||
POSTGRES_VERSION=17-alpine
|
||||
|
||||
# =============================================================================
|
||||
# Port Configuration
|
||||
# =============================================================================
|
||||
CONVEX_BACKEND_PORT_OVERRIDE=3210
|
||||
CONVEX_SITE_PROXY_PORT_OVERRIDE=3211
|
||||
CONVEX_DASHBOARD_PORT_OVERRIDE=6791
|
||||
|
||||
# =============================================================================
|
||||
# Instance Configuration
|
||||
# =============================================================================
|
||||
# Name of your Convex instance
|
||||
INSTANCE_NAME=convex-self-hosted
|
||||
|
||||
# Secret key for instance authentication (generate a strong random string)
|
||||
# Example: openssl rand -hex 32
|
||||
INSTANCE_SECRET=
|
||||
|
||||
# =============================================================================
|
||||
# Origins
|
||||
# =============================================================================
|
||||
# URL where the Convex backend is accessible
|
||||
CONVEX_CLOUD_ORIGIN=http://127.0.0.1:3210
|
||||
|
||||
# URL where the Convex site proxy is accessible
|
||||
CONVEX_SITE_ORIGIN=http://127.0.0.1:3211
|
||||
|
||||
# URL for the dashboard to connect to the backend
|
||||
NEXT_PUBLIC_DEPLOYMENT_URL=http://127.0.0.1:3210
|
||||
|
||||
# =============================================================================
|
||||
# Database Configuration
|
||||
# =============================================================================
|
||||
# PostgreSQL password (change in production)
|
||||
POSTGRES_PASSWORD=convex
|
||||
|
||||
# Full PostgreSQL connection URL (optional, constructed from above if not set)
|
||||
# POSTGRES_URL=postgresql://postgres:convex@postgres:5432/convex
|
||||
|
||||
# MySQL URL (alternative to PostgreSQL, leave empty to use PostgreSQL)
|
||||
# MYSQL_URL=
|
||||
|
||||
# =============================================================================
|
||||
# Application Limits
|
||||
# =============================================================================
|
||||
# Maximum concurrent mutations
|
||||
APPLICATION_MAX_CONCURRENT_MUTATIONS=16
|
||||
|
||||
# Maximum concurrent Node.js actions
|
||||
APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=16
|
||||
|
||||
# Maximum concurrent queries
|
||||
APPLICATION_MAX_CONCURRENT_QUERIES=16
|
||||
|
||||
# Maximum concurrent V8 actions
|
||||
APPLICATION_MAX_CONCURRENT_V8_ACTIONS=16
|
||||
|
||||
# User action timeout in seconds (empty for default)
|
||||
ACTIONS_USER_TIMEOUT_SECS=
|
||||
|
||||
# =============================================================================
|
||||
# SSL/TLS Settings
|
||||
# =============================================================================
|
||||
# Set to false to require SSL (recommended for production)
|
||||
DO_NOT_REQUIRE_SSL=true
|
||||
|
||||
# =============================================================================
|
||||
# Data Retention
|
||||
# =============================================================================
|
||||
# Document retention delay in seconds (default: 2 days)
|
||||
DOCUMENT_RETENTION_DELAY=172800
|
||||
|
||||
# =============================================================================
|
||||
# Telemetry and Metrics
|
||||
# =============================================================================
|
||||
# Disable telemetry beacon (set to true to disable)
|
||||
DISABLE_BEACON=false
|
||||
|
||||
# Enable Prometheus-compatible /metrics endpoint
|
||||
DISABLE_METRICS_ENDPOINT=true
|
||||
|
||||
# =============================================================================
|
||||
# Logging
|
||||
# =============================================================================
|
||||
# Rust log level (error, warn, info, debug, trace)
|
||||
RUST_LOG=info
|
||||
|
||||
# Enable Rust backtrace (1, full, or empty)
|
||||
RUST_BACKTRACE=
|
||||
|
||||
# Redact logs sent to clients
|
||||
REDACT_LOGS_TO_CLIENT=
|
||||
|
||||
# HTTP server timeout in seconds
|
||||
HTTP_SERVER_TIMEOUT_SECONDS=
|
||||
|
||||
# =============================================================================
|
||||
# AWS S3 Configuration (Optional - for external storage)
|
||||
# =============================================================================
|
||||
# AWS_ACCESS_KEY_ID=
|
||||
# AWS_SECRET_ACCESS_KEY=
|
||||
# AWS_REGION=
|
||||
# AWS_SESSION_TOKEN=
|
||||
# S3_ENDPOINT_URL=
|
||||
# S3_STORAGE_EXPORTS_BUCKET=
|
||||
# S3_STORAGE_FILES_BUCKET=
|
||||
# S3_STORAGE_MODULES_BUCKET=
|
||||
# S3_STORAGE_SEARCH_BUCKET=
|
||||
# S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET=
|
||||
# AWS_S3_DISABLE_CHECKSUMS=
|
||||
# AWS_S3_DISABLE_SSE=
|
||||
# AWS_S3_FORCE_PATH_STYLE=
|
||||
|
||||
# =============================================================================
|
||||
# Development Settings
|
||||
# =============================================================================
|
||||
# Development version override
|
||||
CONVEX_RELEASE_VERSION_DEV=
|
||||
|
||||
# Load Monaco editor internally in dashboard
|
||||
NEXT_PUBLIC_LOAD_MONACO_INTERNALLY=
|
||||
|
||||
# =============================================================================
|
||||
# Timezone
|
||||
# =============================================================================
|
||||
TZ=UTC
|
||||
|
||||
# =============================================================================
|
||||
# Resource Limits - Convex Backend
|
||||
# =============================================================================
|
||||
CONVEX_BACKEND_CPU_LIMIT=2.0
|
||||
CONVEX_BACKEND_CPU_RESERVATION=0.5
|
||||
CONVEX_BACKEND_MEMORY_LIMIT=2G
|
||||
CONVEX_BACKEND_MEMORY_RESERVATION=512M
|
||||
|
||||
# =============================================================================
|
||||
# Resource Limits - Convex Dashboard
|
||||
# =============================================================================
|
||||
CONVEX_DASHBOARD_CPU_LIMIT=0.5
|
||||
CONVEX_DASHBOARD_CPU_RESERVATION=0.25
|
||||
CONVEX_DASHBOARD_MEMORY_LIMIT=256M
|
||||
CONVEX_DASHBOARD_MEMORY_RESERVATION=128M
|
||||
|
||||
# =============================================================================
|
||||
# Resource Limits - PostgreSQL
|
||||
# =============================================================================
|
||||
POSTGRES_CPU_LIMIT=1.0
|
||||
POSTGRES_CPU_RESERVATION=0.25
|
||||
POSTGRES_MEMORY_LIMIT=1G
|
||||
POSTGRES_MEMORY_RESERVATION=256M
|
||||
123
src/convex/README.md
Normal file
123
src/convex/README.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Convex
|
||||
|
||||
Convex is an open-source reactive database designed to make life easy for web app developers, whether human or LLM.
|
||||
|
||||
## Features
|
||||
|
||||
- **Reactive Queries**: Queries automatically update when underlying data changes
|
||||
- **Real-time Subscriptions**: Live UI updates without manual polling
|
||||
- **Serverless Functions**: Write backend logic in TypeScript/JavaScript
|
||||
- **Automatic Caching**: Built-in intelligent caching for optimal performance
|
||||
- **Type Safety**: Full TypeScript support with generated types
|
||||
- **Scalable Architecture**: Designed to handle high-throughput applications
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. Copy `.env.example` to `.env`:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. Generate an instance secret (required for production):
|
||||
|
||||
```bash
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
Then set `INSTANCE_SECRET` in your `.env` file.
|
||||
|
||||
3. Start Convex:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. Wait for services to be healthy (check with `docker compose ps`)
|
||||
|
||||
5. Access the Dashboard at `http://localhost:6791`
|
||||
|
||||
6. Backend API is available at `http://localhost:3210`
|
||||
|
||||
## Default Configuration
|
||||
|
||||
| Service | Port | Description |
|
||||
| -------------- | ---- | ------------------------------- |
|
||||
| Convex Backend | 3210 | Main API and WebSocket endpoint |
|
||||
| Site Proxy | 3211 | Site hosting proxy |
|
||||
| Dashboard | 6791 | Web UI for managing Convex |
|
||||
| PostgreSQL | 5432 | Database (internal) |
|
||||
|
||||
**Authentication**: Set `INSTANCE_SECRET` for production use.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
Key environment variables (see `.env.example` for full list):
|
||||
|
||||
| Variable | Description | Default |
|
||||
| --------------------------------- | --------------------------------- | ----------------------- |
|
||||
| `CONVEX_BACKEND_PORT_OVERRIDE` | Host port for backend API | `3210` |
|
||||
| `CONVEX_SITE_PROXY_PORT_OVERRIDE` | Host port for site proxy | `3211` |
|
||||
| `CONVEX_DASHBOARD_PORT_OVERRIDE` | Host port for dashboard | `6791` |
|
||||
| `INSTANCE_NAME` | Name of the Convex instance | `convex-self-hosted` |
|
||||
| `INSTANCE_SECRET` | Secret key for authentication | (required) |
|
||||
| `CONVEX_CLOUD_ORIGIN` | URL for backend access | `http://127.0.0.1:3210` |
|
||||
| `CONVEX_SITE_ORIGIN` | URL for site proxy access | `http://127.0.0.1:3211` |
|
||||
| `POSTGRES_PASSWORD` | PostgreSQL password | `convex` |
|
||||
| `RUST_LOG` | Log level (error/warn/info/debug) | `info` |
|
||||
| `TZ` | Timezone | `UTC` |
|
||||
|
||||
## Resource Requirements
|
||||
|
||||
**Minimum**:
|
||||
|
||||
- CPU: 1 core
|
||||
- RAM: 1GB
|
||||
- Disk: 5GB
|
||||
|
||||
**Recommended**:
|
||||
|
||||
- CPU: 2+ cores
|
||||
- RAM: 2GB+
|
||||
- Disk: 20GB+
|
||||
|
||||
## Volumes
|
||||
|
||||
- `convex_data`: Convex backend data storage
|
||||
- `postgres_data`: PostgreSQL database data
|
||||
|
||||
## Using with Your Application
|
||||
|
||||
To use this self-hosted Convex backend with your application:
|
||||
|
||||
1. Set the `CONVEX_SELF_HOSTED_URL` environment variable in your app:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_URL=http://localhost:3210
|
||||
```
|
||||
|
||||
2. Set the `CONVEX_SELF_HOSTED_ADMIN_KEY` environment variable:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_ADMIN_KEY=your-instance-secret
|
||||
```
|
||||
|
||||
3. Deploy your Convex functions:
|
||||
|
||||
```bash
|
||||
npx convex dev
|
||||
```
|
||||
|
||||
For more details, see the [Convex Self-Hosting Documentation](https://stack.convex.dev/self-hosted-develop-and-deploy).
|
||||
|
||||
## Security Notes
|
||||
|
||||
- **Always set a strong `INSTANCE_SECRET`** in production
|
||||
- Enable SSL/TLS by setting `DO_NOT_REQUIRE_SSL=false` and using a reverse proxy
|
||||
- Use strong database passwords
|
||||
- Restrict network access to Convex services
|
||||
- Consider using AWS S3 for external storage in production
|
||||
|
||||
## License
|
||||
|
||||
Apache-2.0 (<https://github.com/get-convex/convex-backend/blob/main/LICENSE>)
|
||||
123
src/convex/README.zh.md
Normal file
123
src/convex/README.zh.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Convex
|
||||
|
||||
Convex 是一个开源的响应式数据库,旨在让 Web 应用开发者(无论是人类还是 LLM)的生活更加轻松。
|
||||
|
||||
## 功能特性
|
||||
|
||||
- **响应式查询**:当底层数据变化时,查询会自动更新
|
||||
- **实时订阅**:无需手动轮询即可实现实时 UI 更新
|
||||
- **无服务器函数**:使用 TypeScript/JavaScript 编写后端逻辑
|
||||
- **自动缓存**:内置智能缓存以获得最佳性能
|
||||
- **类型安全**:完整的 TypeScript 支持,并生成类型定义
|
||||
- **可扩展架构**:专为高吞吐量应用而设计
|
||||
|
||||
## 快速开始
|
||||
|
||||
1. 复制 `.env.example` 到 `.env`:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. 生成实例密钥(生产环境必需):
|
||||
|
||||
```bash
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
然后在 `.env` 文件中设置 `INSTANCE_SECRET`。
|
||||
|
||||
3. 启动 Convex:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. 等待服务健康(使用 `docker compose ps` 检查)
|
||||
|
||||
5. 访问 Dashboard:`http://localhost:6791`
|
||||
|
||||
6. 后端 API 地址:`http://localhost:3210`
|
||||
|
||||
## 默认配置
|
||||
|
||||
| 服务 | 端口 | 说明 |
|
||||
| -------------- | ---- | ------------------------ |
|
||||
| Convex Backend | 3210 | 主 API 和 WebSocket 端点 |
|
||||
| Site Proxy | 3211 | 站点托管代理 |
|
||||
| Dashboard | 6791 | 管理 Convex 的 Web UI |
|
||||
| PostgreSQL | 5432 | 数据库(内部) |
|
||||
|
||||
**认证**:生产环境请设置 `INSTANCE_SECRET`。
|
||||
|
||||
## 环境变量
|
||||
|
||||
关键环境变量(完整列表请参见 `.env.example`):
|
||||
|
||||
| 变量 | 说明 | 默认值 |
|
||||
| --------------------------------- | --------------------------------- | ----------------------- |
|
||||
| `CONVEX_BACKEND_PORT_OVERRIDE` | 后端 API 的主机端口 | `3210` |
|
||||
| `CONVEX_SITE_PROXY_PORT_OVERRIDE` | 站点代理的主机端口 | `3211` |
|
||||
| `CONVEX_DASHBOARD_PORT_OVERRIDE` | Dashboard 的主机端口 | `6791` |
|
||||
| `INSTANCE_NAME` | Convex 实例名称 | `convex-self-hosted` |
|
||||
| `INSTANCE_SECRET` | 认证密钥 | (必需) |
|
||||
| `CONVEX_CLOUD_ORIGIN` | 后端访问 URL | `http://127.0.0.1:3210` |
|
||||
| `CONVEX_SITE_ORIGIN` | 站点代理访问 URL | `http://127.0.0.1:3211` |
|
||||
| `POSTGRES_PASSWORD` | PostgreSQL 密码 | `convex` |
|
||||
| `RUST_LOG` | 日志级别(error/warn/info/debug) | `info` |
|
||||
| `TZ` | 时区 | `UTC` |
|
||||
|
||||
## 资源需求
|
||||
|
||||
**最低配置**:
|
||||
|
||||
- CPU:1 核
|
||||
- 内存:1GB
|
||||
- 磁盘:5GB
|
||||
|
||||
**推荐配置**:
|
||||
|
||||
- CPU:2+ 核
|
||||
- 内存:2GB+
|
||||
- 磁盘:20GB+
|
||||
|
||||
## 数据卷
|
||||
|
||||
- `convex_data`:Convex 后端数据存储
|
||||
- `postgres_data`:PostgreSQL 数据库数据
|
||||
|
||||
## 在应用中使用
|
||||
|
||||
要将此自托管 Convex 后端与您的应用一起使用:
|
||||
|
||||
1. 在应用中设置 `CONVEX_SELF_HOSTED_URL` 环境变量:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_URL=http://localhost:3210
|
||||
```
|
||||
|
||||
2. 设置 `CONVEX_SELF_HOSTED_ADMIN_KEY` 环境变量:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_ADMIN_KEY=your-instance-secret
|
||||
```
|
||||
|
||||
3. 部署您的 Convex 函数:
|
||||
|
||||
```bash
|
||||
npx convex dev
|
||||
```
|
||||
|
||||
更多详情,请参阅 [Convex 自托管文档](https://stack.convex.dev/self-hosted-develop-and-deploy)。
|
||||
|
||||
## 安全说明
|
||||
|
||||
- **生产环境务必设置强 `INSTANCE_SECRET`**
|
||||
- 通过设置 `DO_NOT_REQUIRE_SSL=false` 并使用反向代理来启用 SSL/TLS
|
||||
- 使用强数据库密码
|
||||
- 限制对 Convex 服务的网络访问
|
||||
- 生产环境考虑使用 AWS S3 进行外部存储
|
||||
|
||||
## 许可证
|
||||
|
||||
Apache-2.0(<https://github.com/get-convex/convex-backend/blob/main/LICENSE>)
|
||||
195
src/convex/docker-compose.yaml
Normal file
195
src/convex/docker-compose.yaml
Normal file
@@ -0,0 +1,195 @@
|
||||
# Convex - Open-source Reactive Database
|
||||
# https://github.com/get-convex/convex-backend
|
||||
#
|
||||
# Convex is an open-source reactive database designed to make life easy for
|
||||
# web app developers. It provides real-time data synchronization, automatic
|
||||
# caching, and a powerful query language.
|
||||
#
|
||||
# Key Features:
|
||||
# - Reactive queries that automatically update when data changes
|
||||
# - Real-time subscriptions for live UI updates
|
||||
# - Built-in authentication and authorization
|
||||
# - Serverless functions with TypeScript/JavaScript
|
||||
# - Automatic scaling and caching
|
||||
#
|
||||
# Default Credentials:
|
||||
# - Dashboard at http://localhost:6791
|
||||
# - Backend API at http://localhost:3210
|
||||
# - Site proxy at http://localhost:3211
|
||||
#
|
||||
# Security Notes:
|
||||
# - Set a strong INSTANCE_SECRET in production
|
||||
# - Enable SSL/TLS in production
|
||||
# - Use strong database passwords
|
||||
# - Restrict network access to Convex services
|
||||
#
|
||||
# License: Apache-2.0 (https://github.com/get-convex/convex-backend/blob/main/LICENSE)
|
||||
|
||||
x-defaults: &defaults
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: 100m
|
||||
max-file: '3'
|
||||
|
||||
services:
|
||||
convex-backend:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}get-convex/convex-backend:${CONVEX_BACKEND_VERSION:-33cef775a8a6228cbacee4a09ac2c4073d62ed13}
|
||||
stop_grace_period: 10s
|
||||
stop_signal: SIGINT
|
||||
ports:
|
||||
- '${CONVEX_BACKEND_PORT_OVERRIDE:-3210}:3210'
|
||||
- '${CONVEX_SITE_PROXY_PORT_OVERRIDE:-3211}:3211'
|
||||
volumes:
|
||||
- convex_data:/convex/data
|
||||
environment:
|
||||
# Instance configuration
|
||||
- INSTANCE_NAME=${INSTANCE_NAME:-convex-self-hosted}
|
||||
- INSTANCE_SECRET=${INSTANCE_SECRET}
|
||||
|
||||
# Origins
|
||||
- CONVEX_CLOUD_ORIGIN=${CONVEX_CLOUD_ORIGIN:-http://127.0.0.1:3210}
|
||||
- CONVEX_SITE_ORIGIN=${CONVEX_SITE_ORIGIN:-http://127.0.0.1:3211}
|
||||
|
||||
# Database configuration (PostgreSQL)
|
||||
- POSTGRES_URL=${POSTGRES_URL:-postgresql://postgres:${POSTGRES_PASSWORD:-convex}@postgres:5432/convex}
|
||||
|
||||
# Application limits
|
||||
- APPLICATION_MAX_CONCURRENT_MUTATIONS=${APPLICATION_MAX_CONCURRENT_MUTATIONS:-16}
|
||||
- APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=${APPLICATION_MAX_CONCURRENT_NODE_ACTIONS:-16}
|
||||
- APPLICATION_MAX_CONCURRENT_QUERIES=${APPLICATION_MAX_CONCURRENT_QUERIES:-16}
|
||||
- APPLICATION_MAX_CONCURRENT_V8_ACTIONS=${APPLICATION_MAX_CONCURRENT_V8_ACTIONS:-16}
|
||||
|
||||
# Actions timeout
|
||||
- ACTIONS_USER_TIMEOUT_SECS=${ACTIONS_USER_TIMEOUT_SECS:-}
|
||||
|
||||
# SSL/TLS settings
|
||||
- DO_NOT_REQUIRE_SSL=${DO_NOT_REQUIRE_SSL:-true}
|
||||
|
||||
# Document retention (default 2 days in seconds)
|
||||
- DOCUMENT_RETENTION_DELAY=${DOCUMENT_RETENTION_DELAY:-172800}
|
||||
|
||||
# Metrics and beacon
|
||||
- DISABLE_BEACON=${DISABLE_BEACON:-false}
|
||||
- DISABLE_METRICS_ENDPOINT=${DISABLE_METRICS_ENDPOINT:-true}
|
||||
|
||||
# Logging
|
||||
- RUST_LOG=${RUST_LOG:-info}
|
||||
- RUST_BACKTRACE=${RUST_BACKTRACE:-}
|
||||
- REDACT_LOGS_TO_CLIENT=${REDACT_LOGS_TO_CLIENT:-}
|
||||
|
||||
# HTTP server timeout
|
||||
- HTTP_SERVER_TIMEOUT_SECONDS=${HTTP_SERVER_TIMEOUT_SECONDS:-}
|
||||
|
||||
# AWS S3 configuration (optional, for external storage)
|
||||
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}
|
||||
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
|
||||
- AWS_REGION=${AWS_REGION:-}
|
||||
- AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-}
|
||||
- S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
|
||||
- S3_STORAGE_EXPORTS_BUCKET=${S3_STORAGE_EXPORTS_BUCKET:-}
|
||||
- S3_STORAGE_FILES_BUCKET=${S3_STORAGE_FILES_BUCKET:-}
|
||||
- S3_STORAGE_MODULES_BUCKET=${S3_STORAGE_MODULES_BUCKET:-}
|
||||
- S3_STORAGE_SEARCH_BUCKET=${S3_STORAGE_SEARCH_BUCKET:-}
|
||||
- S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET=${S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET:-}
|
||||
- AWS_S3_DISABLE_CHECKSUMS=${AWS_S3_DISABLE_CHECKSUMS:-}
|
||||
- AWS_S3_DISABLE_SSE=${AWS_S3_DISABLE_SSE:-}
|
||||
- AWS_S3_FORCE_PATH_STYLE=${AWS_S3_FORCE_PATH_STYLE:-}
|
||||
|
||||
# MySQL URL (alternative to PostgreSQL)
|
||||
- MYSQL_URL=${MYSQL_URL:-}
|
||||
|
||||
# Development settings
|
||||
- CONVEX_RELEASE_VERSION_DEV=${CONVEX_RELEASE_VERSION_DEV:-}
|
||||
|
||||
# Timezone
|
||||
- TZ=${TZ:-UTC}
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- curl
|
||||
- -f
|
||||
- http://localhost:3210/version
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${CONVEX_BACKEND_CPU_LIMIT:-2.0}'
|
||||
memory: '${CONVEX_BACKEND_MEMORY_LIMIT:-2G}'
|
||||
reservations:
|
||||
cpus: '${CONVEX_BACKEND_CPU_RESERVATION:-0.5}'
|
||||
memory: '${CONVEX_BACKEND_MEMORY_RESERVATION:-512M}'
|
||||
|
||||
convex-dashboard:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}get-convex/convex-dashboard:${CONVEX_DASHBOARD_VERSION:-33cef775a8a6228cbacee4a09ac2c4073d62ed13}
|
||||
stop_grace_period: 10s
|
||||
stop_signal: SIGINT
|
||||
ports:
|
||||
- '${CONVEX_DASHBOARD_PORT_OVERRIDE:-6791}:6791'
|
||||
environment:
|
||||
- NEXT_PUBLIC_DEPLOYMENT_URL=${NEXT_PUBLIC_DEPLOYMENT_URL:-http://127.0.0.1:3210}
|
||||
- NEXT_PUBLIC_LOAD_MONACO_INTERNALLY=${NEXT_PUBLIC_LOAD_MONACO_INTERNALLY:-}
|
||||
depends_on:
|
||||
convex-backend:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- http://localhost:6791
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${CONVEX_DASHBOARD_CPU_LIMIT:-0.5}'
|
||||
memory: '${CONVEX_DASHBOARD_MEMORY_LIMIT:-256M}'
|
||||
reservations:
|
||||
cpus: '${CONVEX_DASHBOARD_CPU_RESERVATION:-0.25}'
|
||||
memory: '${CONVEX_DASHBOARD_MEMORY_RESERVATION:-128M}'
|
||||
|
||||
postgres:
|
||||
<<: *defaults
|
||||
image: ${GLOBAL_REGISTRY:-}postgres:${POSTGRES_VERSION:-17-alpine}
|
||||
environment:
|
||||
- POSTGRES_DB=convex
|
||||
- POSTGRES_USER=postgres
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-convex}
|
||||
- POSTGRES_INITDB_ARGS=--encoding=UTF8
|
||||
- TZ=${TZ:-UTC}
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD-SHELL
|
||||
- pg_isready -U postgres
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 10s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${POSTGRES_CPU_LIMIT:-1.0}'
|
||||
memory: '${POSTGRES_MEMORY_LIMIT:-1G}'
|
||||
reservations:
|
||||
cpus: '${POSTGRES_CPU_RESERVATION:-0.25}'
|
||||
memory: '${POSTGRES_MEMORY_RESERVATION:-256M}'
|
||||
|
||||
volumes:
|
||||
convex_data:
|
||||
postgres_data:
|
||||
62
src/llama-swap/.env.example
Normal file
62
src/llama-swap/.env.example
Normal file
@@ -0,0 +1,62 @@
|
||||
# =============================================================================
|
||||
# llama-swap Configuration
|
||||
# https://github.com/mostlygeek/llama-swap
|
||||
# Reliable model swapping for any local OpenAI/Anthropic compatible server
|
||||
# =============================================================================
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# General Settings
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Timezone for the container (default: UTC)
|
||||
TZ=UTC
|
||||
|
||||
# GitHub Container Registry prefix (default: ghcr.io/)
|
||||
GHCR_REGISTRY=ghcr.io/
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Image Variants
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# CPU-only image version tag (default: cpu)
|
||||
# Available: cpu, cuda, vulkan, rocm, intel, musa
|
||||
# Tagged releases example: v197-cuda-b8193
|
||||
LLAMA_SWAP_VERSION=cpu
|
||||
|
||||
# NVIDIA CUDA image version tag (used with the `gpu` profile)
|
||||
LLAMA_SWAP_CUDA_VERSION=cuda
|
||||
|
||||
# AMD GPU image version tag (used with the `gpu-amd` profile)
|
||||
# Options: vulkan (Vulkan/AMD), rocm (ROCm/AMD)
|
||||
LLAMA_SWAP_AMD_VERSION=vulkan
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Network Settings
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Host port override for the llama-swap API (default: 9292)
|
||||
# The Web UI and OpenAI-compatible API are both served on this port
|
||||
LLAMA_SWAP_PORT_OVERRIDE=9292
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# GPU Settings (used with `gpu` profile)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Number of NVIDIA GPUs to use (default: 1)
|
||||
LLAMA_SWAP_GPU_COUNT=1
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Resource Limits
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# CPU limit (in cores)
|
||||
LLAMA_SWAP_CPU_LIMIT=4.0
|
||||
|
||||
# CPU reservation (in cores)
|
||||
LLAMA_SWAP_CPU_RESERVATION=2.0
|
||||
|
||||
# Memory limit (e.g., 8G, 16G)
|
||||
LLAMA_SWAP_MEMORY_LIMIT=8G
|
||||
|
||||
# Memory reservation (e.g., 4G, 8G)
|
||||
LLAMA_SWAP_MEMORY_RESERVATION=4G
|
||||
196
src/llama-swap/README.md
Normal file
196
src/llama-swap/README.md
Normal file
@@ -0,0 +1,196 @@
|
||||
# llama-swap
|
||||
|
||||
[llama-swap](https://github.com/mostlygeek/llama-swap) is a lightweight reverse proxy that provides reliable on-demand model swapping for any local OpenAI/Anthropic-compatible inference server (e.g., llama.cpp, vllm). Only one model is loaded at a time, and it is automatically swapped out when a different model is requested, making it easy to work with many models on a single machine.
|
||||
|
||||
See also: [README.zh.md](./README.zh.md)
|
||||
|
||||
## Features
|
||||
|
||||
- **On-demand model swapping**: Automatically load/unload models based on API requests with zero manual intervention.
|
||||
- **OpenAI/Anthropic compatible**: Drop-in replacement for any client that uses the OpenAI or Anthropic chat completion API.
|
||||
- **Multi-backend support**: Works with llama.cpp (llama-server), vllm, and any OpenAI-compatible server.
|
||||
- **Real-time Web UI**: Built-in interface for monitoring logs, inspecting requests, and manually managing models.
|
||||
- **TTL-based unloading**: Models can be configured to unload automatically after a period of inactivity.
|
||||
- **HuggingFace model downloads**: Reference HuggingFace models directly in `config.yaml` and they are downloaded on first use.
|
||||
- **Multi-GPU support**: Works with NVIDIA CUDA, AMD ROCm/Vulkan, Intel, and CPU-only setups.
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. Copy the example environment file:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. Edit `config.yaml` to add your models. The provided `config.yaml` includes a commented example for a local GGUF model and a HuggingFace download. See [Configuration](#configuration) for details.
|
||||
|
||||
3. Start the service (CPU-only by default):
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. For NVIDIA GPU support:
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
5. For AMD GPU support (Vulkan):
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
The API and Web UI are available at: `http://localhost:9292`
|
||||
|
||||
## Services
|
||||
|
||||
| Service | Profile | Description |
|
||||
| ----------------- | ----------- | --------------------------------- |
|
||||
| `llama-swap` | _(default)_ | CPU-only inference |
|
||||
| `llama-swap-cuda` | `gpu` | NVIDIA CUDA GPU inference |
|
||||
| `llama-swap-amd` | `gpu-amd` | AMD GPU inference (Vulkan / ROCm) |
|
||||
|
||||
> **Note**: Only start one service at a time. All three services bind to the same host port (`LLAMA_SWAP_PORT_OVERRIDE`).
|
||||
|
||||
## Configuration
|
||||
|
||||
### `config.yaml`
|
||||
|
||||
The `config.yaml` file defines the models llama-swap manages. It is mounted read-only at `/app/config.yaml` inside the container. Edit the provided `config.yaml` to add your models.
|
||||
|
||||
Minimal example:
|
||||
|
||||
```yaml
|
||||
healthCheckTimeout: 300
|
||||
|
||||
models:
|
||||
my-model:
|
||||
cmd: /app/llama-server --port ${PORT} --model /root/.cache/llama.cpp/model.gguf --ctx-size 4096
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
ttl: 900
|
||||
```
|
||||
|
||||
- `${PORT}` is automatically assigned by llama-swap.
|
||||
- `ttl` (seconds): unload the model after this many seconds of inactivity.
|
||||
- `cmd`: the command to start the inference server.
|
||||
- `proxy`: the address llama-swap forwards requests to.
|
||||
|
||||
For downloading models from HuggingFace on first use:
|
||||
|
||||
```yaml
|
||||
models:
|
||||
Qwen2.5-7B:
|
||||
cmd: /app/llama-server --port ${PORT} -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ctx-size 8192 --n-gpu-layers 99
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
```
|
||||
|
||||
See the [official configuration documentation](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md) for all options including `groups`, `hooks`, `macros`, `aliases`, `filters`, and more.
|
||||
|
||||
### Models Volume
|
||||
|
||||
The named volume `llama_swap_models` is mounted to `/root/.cache/llama.cpp` inside the container. To place local GGUF model files inside the volume, you can use:
|
||||
|
||||
```bash
|
||||
# Copy a model into the named volume
|
||||
docker run --rm -v llama_swap_models:/data -v /path/to/model.gguf:/src/model.gguf alpine cp /src/model.gguf /data/model.gguf
|
||||
```
|
||||
|
||||
Alternatively, change the volume definition in `docker-compose.yaml` to use a host path:
|
||||
|
||||
```yaml
|
||||
volumes:
|
||||
llama_swap_models:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: /path/to/your/models
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
| ------------------------------- | ---------- | -------------------------------------------------- |
|
||||
| `TZ` | `UTC` | Container timezone |
|
||||
| `GHCR_REGISTRY` | `ghcr.io/` | GitHub Container Registry prefix |
|
||||
| `LLAMA_SWAP_VERSION` | `cpu` | Image tag for the default CPU service |
|
||||
| `LLAMA_SWAP_CUDA_VERSION` | `cuda` | Image tag for the CUDA service |
|
||||
| `LLAMA_SWAP_AMD_VERSION` | `vulkan` | Image tag for the AMD service (`vulkan` or `rocm`) |
|
||||
| `LLAMA_SWAP_PORT_OVERRIDE` | `9292` | Host port for the API and Web UI |
|
||||
| `LLAMA_SWAP_GPU_COUNT` | `1` | Number of NVIDIA GPUs to use (CUDA profile) |
|
||||
| `LLAMA_SWAP_CPU_LIMIT` | `4.0` | CPU limit in cores |
|
||||
| `LLAMA_SWAP_CPU_RESERVATION` | `2.0` | CPU reservation in cores |
|
||||
| `LLAMA_SWAP_MEMORY_LIMIT` | `8G` | Memory limit |
|
||||
| `LLAMA_SWAP_MEMORY_RESERVATION` | `4G` | Memory reservation |
|
||||
|
||||
## Default Ports
|
||||
|
||||
| Port | Description |
|
||||
| ------ | ------------------------------------------ |
|
||||
| `9292` | OpenAI/Anthropic-compatible API and Web UI |
|
||||
|
||||
## API Usage
|
||||
|
||||
llama-swap exposes an OpenAI-compatible API. Use any OpenAI client by pointing it to `http://localhost:9292`:
|
||||
|
||||
```bash
|
||||
# List available models
|
||||
curl http://localhost:9292/v1/models
|
||||
|
||||
# Chat completion (automatically loads the model if not running)
|
||||
curl http://localhost:9292/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [{"role": "user", "content": "Hello!"}]
|
||||
}'
|
||||
```
|
||||
|
||||
The Web UI is available at `http://localhost:9292` and provides real-time log streaming, request inspection, and manual model management.
|
||||
|
||||
## NVIDIA GPU Setup
|
||||
|
||||
Requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
For non-root security hardening, use the `cuda-non-root` image tag:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_CUDA_VERSION=cuda-non-root docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
## AMD GPU Setup
|
||||
|
||||
Requires the `/dev/dri` and `/dev/kfd` devices to be accessible on the host.
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
Use `rocm` instead of `vulkan` for full ROCm support:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_AMD_VERSION=rocm docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
## Security Notes
|
||||
|
||||
- By default, the container runs as root. Use the `cuda-non-root` or `rocm-non-root` image tags for improved security on GPU deployments.
|
||||
- The `config.yaml` is mounted read-only (`ro`).
|
||||
- Consider placing llama-swap behind a reverse proxy (e.g., Nginx, Caddy) when exposing it beyond localhost.
|
||||
|
||||
## References
|
||||
|
||||
- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
|
||||
- [Configuration Documentation](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)
|
||||
- [Container Security](https://github.com/mostlygeek/llama-swap/blob/main/docs/container-security.md)
|
||||
- [Docker Compose Wiki](https://github.com/mostlygeek/llama-swap/wiki/Docker-Compose-Example)
|
||||
|
||||
## License
|
||||
|
||||
llama-swap is released under the MIT License. See the [LICENSE](https://github.com/mostlygeek/llama-swap/blob/main/LICENSE) file for details.
|
||||
196
src/llama-swap/README.zh.md
Normal file
196
src/llama-swap/README.zh.md
Normal file
@@ -0,0 +1,196 @@
|
||||
# llama-swap
|
||||
|
||||
[llama-swap](https://github.com/mostlygeek/llama-swap) 是一个轻量级反向代理,为任何本地 OpenAI/Anthropic 兼容的推理服务器(如 llama.cpp、vllm 等)提供可靠的按需模型切换功能。同一时间只加载一个模型,当收到对不同模型的请求时,llama-swap 会自动切换,让你可以在单台机器上轻松使用多个模型。
|
||||
|
||||
参见:[README.md](./README.md)
|
||||
|
||||
## 功能特性
|
||||
|
||||
- **按需模型切换**:根据 API 请求自动加载/卸载模型,无需手动干预。
|
||||
- **兼容 OpenAI/Anthropic**:可直接替代任何使用 OpenAI 或 Anthropic 聊天补全 API 的客户端。
|
||||
- **多后端支持**:适用于 llama.cpp(llama-server)、vllm 及任何 OpenAI 兼容服务器。
|
||||
- **实时 Web UI**:内置界面,可监控日志、检查请求、手动管理模型。
|
||||
- **基于 TTL 的自动卸载**:可配置模型在闲置一段时间后自动卸载。
|
||||
- **HuggingFace 模型下载**:在 `config.yaml` 中直接引用 HuggingFace 模型,首次使用时自动下载。
|
||||
- **多 GPU 支持**:支持 NVIDIA CUDA、AMD ROCm/Vulkan、Intel 及纯 CPU 部署。
|
||||
|
||||
## 快速开始
|
||||
|
||||
1. 复制环境变量示例文件:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. 编辑 `config.yaml`,添加你的模型配置。提供的 `config.yaml` 包含本地 GGUF 模型和 HuggingFace 下载的注释示例。详见[配置说明](#配置说明)。
|
||||
|
||||
3. 启动服务(默认仅使用 CPU):
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. 启用 NVIDIA GPU 支持:
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
5. 启用 AMD GPU 支持(Vulkan):
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
API 和 Web UI 地址:`http://localhost:9292`
|
||||
|
||||
## 服务说明
|
||||
|
||||
| 服务名称 | Profile | 说明 |
|
||||
| ----------------- | ---------- | ----------------------------- |
|
||||
| `llama-swap` | _(默认)_ | 纯 CPU 推理 |
|
||||
| `llama-swap-cuda` | `gpu` | NVIDIA CUDA GPU 推理 |
|
||||
| `llama-swap-amd` | `gpu-amd` | AMD GPU 推理(Vulkan / ROCm) |
|
||||
|
||||
> **注意**:每次只启动一个服务,三个服务均绑定到同一主机端口(`LLAMA_SWAP_PORT_OVERRIDE`)。
|
||||
|
||||
## 配置说明
|
||||
|
||||
### `config.yaml`
|
||||
|
||||
`config.yaml` 文件定义了 llama-swap 管理的模型列表,以只读方式挂载到容器内的 `/app/config.yaml`。编辑提供的 `config.yaml` 即可添加你的模型。
|
||||
|
||||
最简示例:
|
||||
|
||||
```yaml
|
||||
healthCheckTimeout: 300
|
||||
|
||||
models:
|
||||
my-model:
|
||||
cmd: /app/llama-server --port ${PORT} --model /root/.cache/llama.cpp/model.gguf --ctx-size 4096
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
ttl: 900
|
||||
```
|
||||
|
||||
- `${PORT}` 由 llama-swap 自动分配。
|
||||
- `ttl`(秒):模型闲置超过该时长后自动卸载。
|
||||
- `cmd`:启动推理服务器的命令。
|
||||
- `proxy`:llama-swap 转发请求的地址。
|
||||
|
||||
直接使用 HuggingFace 模型(首次使用时自动下载):
|
||||
|
||||
```yaml
|
||||
models:
|
||||
Qwen2.5-7B:
|
||||
cmd: /app/llama-server --port ${PORT} -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ctx-size 8192 --n-gpu-layers 99
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
```
|
||||
|
||||
完整配置选项(包括 `groups`、`hooks`、`macros`、`aliases`、`filters` 等)请参阅[官方配置文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)。
|
||||
|
||||
### 模型卷
|
||||
|
||||
命名卷 `llama_swap_models` 挂载到容器内的 `/root/.cache/llama.cpp`。可以通过以下方式将本地 GGUF 模型文件放入卷中:
|
||||
|
||||
```bash
|
||||
# 将模型文件复制到命名卷
|
||||
docker run --rm -v llama_swap_models:/data -v /path/to/model.gguf:/src/model.gguf alpine cp /src/model.gguf /data/model.gguf
|
||||
```
|
||||
|
||||
或者将 `docker-compose.yaml` 中的卷定义改为主机路径绑定:
|
||||
|
||||
```yaml
|
||||
volumes:
|
||||
llama_swap_models:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: /path/to/your/models
|
||||
```
|
||||
|
||||
## 环境变量
|
||||
|
||||
| 变量名 | 默认值 | 说明 |
|
||||
| ------------------------------- | ---------- | -------------------------------------- |
|
||||
| `TZ` | `UTC` | 容器时区 |
|
||||
| `GHCR_REGISTRY` | `ghcr.io/` | GitHub 容器镜像仓库前缀 |
|
||||
| `LLAMA_SWAP_VERSION` | `cpu` | 默认 CPU 服务镜像标签 |
|
||||
| `LLAMA_SWAP_CUDA_VERSION` | `cuda` | CUDA 服务镜像标签 |
|
||||
| `LLAMA_SWAP_AMD_VERSION` | `vulkan` | AMD 服务镜像标签(`vulkan` 或 `rocm`) |
|
||||
| `LLAMA_SWAP_PORT_OVERRIDE` | `9292` | API 和 Web UI 的主机端口 |
|
||||
| `LLAMA_SWAP_GPU_COUNT` | `1` | 使用的 NVIDIA GPU 数量(gpu profile) |
|
||||
| `LLAMA_SWAP_CPU_LIMIT` | `4.0` | CPU 上限(核心数) |
|
||||
| `LLAMA_SWAP_CPU_RESERVATION` | `2.0` | CPU 预留(核心数) |
|
||||
| `LLAMA_SWAP_MEMORY_LIMIT` | `8G` | 内存上限 |
|
||||
| `LLAMA_SWAP_MEMORY_RESERVATION` | `4G` | 内存预留 |
|
||||
|
||||
## 默认端口
|
||||
|
||||
| 端口 | 说明 |
|
||||
| ------ | ----------------------------------- |
|
||||
| `9292` | OpenAI/Anthropic 兼容 API 及 Web UI |
|
||||
|
||||
## API 使用示例
|
||||
|
||||
llama-swap 暴露 OpenAI 兼容 API。将任何 OpenAI 客户端指向 `http://localhost:9292` 即可使用:
|
||||
|
||||
```bash
|
||||
# 列出可用模型
|
||||
curl http://localhost:9292/v1/models
|
||||
|
||||
# 聊天补全(若模型未运行则自动加载)
|
||||
curl http://localhost:9292/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [{"role": "user", "content": "你好!"}]
|
||||
}'
|
||||
```
|
||||
|
||||
Web UI 可通过 `http://localhost:9292` 访问,提供实时日志流、请求检查和手动模型管理功能。
|
||||
|
||||
## NVIDIA GPU 配置
|
||||
|
||||
需要安装 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)。
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
如需非 root 安全加固,可使用 `cuda-non-root` 镜像标签:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_CUDA_VERSION=cuda-non-root docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
## AMD GPU 配置
|
||||
|
||||
需要主机上 `/dev/dri` 和 `/dev/kfd` 设备可访问。
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
如需完整 ROCm 支持,可使用 `rocm` 替代 `vulkan`:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_AMD_VERSION=rocm docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
## 安全说明
|
||||
|
||||
- 默认情况下容器以 root 用户运行。GPU 部署时建议使用 `cuda-non-root` 或 `rocm-non-root` 镜像标签提升安全性。
|
||||
- `config.yaml` 以只读方式(`ro`)挂载。
|
||||
- 若需在 localhost 之外暴露服务,建议在 llama-swap 前部署反向代理(如 Nginx、Caddy)。
|
||||
|
||||
## 参考链接
|
||||
|
||||
- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
|
||||
- [配置文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)
|
||||
- [容器安全文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/container-security.md)
|
||||
- [Docker Compose Wiki](https://github.com/mostlygeek/llama-swap/wiki/Docker-Compose-Example)
|
||||
|
||||
## 许可证
|
||||
|
||||
llama-swap 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/mostlygeek/llama-swap/blob/main/LICENSE) 文件。
|
||||
47
src/llama-swap/config.yaml
Normal file
47
src/llama-swap/config.yaml
Normal file
@@ -0,0 +1,47 @@
|
||||
# llama-swap configuration file
|
||||
# https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md
|
||||
#
|
||||
# This is the main configuration file for llama-swap.
|
||||
# Mount this file to /app/config.yaml inside the container.
|
||||
#
|
||||
# llama-swap will automatically swap models on demand:
|
||||
# - Only the requested model is loaded at a time.
|
||||
# - Idle models are unloaded when a new one is requested.
|
||||
|
||||
# Maximum time (in seconds) to wait for a model to become healthy.
|
||||
# A high value is useful when downloading models from HuggingFace.
|
||||
healthCheckTimeout: 300
|
||||
|
||||
# Macro definitions: reusable command snippets for model configuration.
|
||||
# Reference with ${macro-name} inside cmd fields.
|
||||
macros:
|
||||
"llama-server": >
|
||||
/app/llama-server
|
||||
--port ${PORT}
|
||||
|
||||
# Model definitions
|
||||
models:
|
||||
# Example: a local GGUF model stored in the models volume.
|
||||
# The volume `llama_swap_models` is mounted to /root/.cache/llama.cpp inside
|
||||
# the container. Place your .gguf files there and reference them with
|
||||
# /root/.cache/llama.cpp/<filename>.gguf
|
||||
"my-local-model":
|
||||
# ${PORT} is automatically assigned by llama-swap
|
||||
cmd: >
|
||||
${llama-server}
|
||||
--model /root/.cache/llama.cpp/model.gguf
|
||||
--ctx-size 4096
|
||||
--n-gpu-layers 0
|
||||
proxy: "http://localhost:${PORT}"
|
||||
# Automatically unload the model after 15 minutes of inactivity
|
||||
ttl: 900
|
||||
|
||||
# Example: download a model from HuggingFace on first use (requires internet access)
|
||||
# "Qwen2.5-7B-Instruct":
|
||||
# cmd: >
|
||||
# ${llama-server}
|
||||
# -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
# --ctx-size 8192
|
||||
# --n-gpu-layers 99
|
||||
# proxy: "http://localhost:${PORT}"
|
||||
# ttl: 900
|
||||
126
src/llama-swap/docker-compose.yaml
Normal file
126
src/llama-swap/docker-compose.yaml
Normal file
@@ -0,0 +1,126 @@
|
||||
# Docker Compose configuration for llama-swap
|
||||
# https://github.com/mostlygeek/llama-swap
|
||||
# Reliable model swapping for any local OpenAI/Anthropic compatible server
|
||||
|
||||
x-defaults: &defaults
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: 100m
|
||||
max-file: '3'
|
||||
|
||||
services:
|
||||
# llama-swap - CPU variant (default)
|
||||
llama-swap:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_VERSION:-cpu}
|
||||
ports:
|
||||
- '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080'
|
||||
volumes:
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- llama_swap_models:/root/.cache/llama.cpp
|
||||
environment:
|
||||
- TZ=${TZ:-UTC}
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- 'http://localhost:8080/v1/models'
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G}
|
||||
reservations:
|
||||
cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G}
|
||||
|
||||
# llama-swap - NVIDIA CUDA variant
|
||||
# Requires NVIDIA Container Toolkit: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
|
||||
llama-swap-cuda:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_CUDA_VERSION:-cuda}
|
||||
ports:
|
||||
- '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080'
|
||||
volumes:
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- llama_swap_models:/root/.cache/llama.cpp
|
||||
environment:
|
||||
- TZ=${TZ:-UTC}
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- 'http://localhost:8080/v1/models'
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G}
|
||||
reservations:
|
||||
cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G}
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: ${LLAMA_SWAP_GPU_COUNT:-1}
|
||||
capabilities: [gpu]
|
||||
profiles:
|
||||
- gpu
|
||||
|
||||
# llama-swap - AMD ROCm / Vulkan variant (AMD GPU)
|
||||
# For AMD GPUs, ensure /dev/dri and /dev/kfd are accessible
|
||||
llama-swap-amd:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_AMD_VERSION:-vulkan}
|
||||
ports:
|
||||
- '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080'
|
||||
volumes:
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- llama_swap_models:/root/.cache/llama.cpp
|
||||
devices:
|
||||
- /dev/dri:/dev/dri
|
||||
- /dev/kfd:/dev/kfd
|
||||
group_add:
|
||||
- video
|
||||
environment:
|
||||
- TZ=${TZ:-UTC}
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- 'http://localhost:8080/v1/models'
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G}
|
||||
reservations:
|
||||
cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G}
|
||||
profiles:
|
||||
- gpu-amd
|
||||
|
||||
volumes:
|
||||
llama_swap_models:
|
||||
Reference in New Issue
Block a user