feat: add services
- Introduced Convex, an open-source reactive database, with README and environment variable configurations. - Added Chinese translation for Convex documentation. - Created docker-compose configuration for Convex services. - Introduced llama-swap, a model swapping proxy for OpenAI/Anthropic compatible servers, with comprehensive README and example configuration. - Added Chinese translation for llama-swap documentation. - Included example environment file and docker-compose setup for llama-swap. - Configured health checks and resource limits for both Convex and llama-swap services.
This commit is contained in:
@@ -37,6 +37,7 @@ These services require building custom Docker images from source.
|
||||
| [Clash](./src/clash) | 1.18.0 |
|
||||
| [ClickHouse](./src/clickhouse) | 24.11.1 |
|
||||
| [Conductor](./src/conductor) | latest |
|
||||
| [Convex](./src/convex) | 33cef775 |
|
||||
| [DeepTutor](./apps/deeptutor) | latest |
|
||||
| [Dify](./apps/dify) | 0.18.2 |
|
||||
| [DNSMasq](./src/dnsmasq) | 2.91 |
|
||||
@@ -77,6 +78,7 @@ These services require building custom Docker images from source.
|
||||
| [LibreOffice](./src/libreoffice) | latest |
|
||||
| [libSQL Server](./src/libsql) | latest |
|
||||
| [LiteLLM](./src/litellm) | main-stable |
|
||||
| [llama-swap](./src/llama-swap) | cpu |
|
||||
| [llama.cpp](./src/llama.cpp) | server |
|
||||
| [LMDeploy](./src/lmdeploy) | v0.11.1 |
|
||||
| [Logstash](./src/logstash) | 8.16.1 |
|
||||
|
||||
@@ -37,6 +37,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件,
|
||||
| [Clash](./src/clash) | 1.18.0 |
|
||||
| [ClickHouse](./src/clickhouse) | 24.11.1 |
|
||||
| [Conductor](./src/conductor) | latest |
|
||||
| [Convex](./src/convex) | 33cef775 |
|
||||
| [DeepTutor](./apps/deeptutor) | latest |
|
||||
| [Dify](./apps/dify) | 0.18.2 |
|
||||
| [DNSMasq](./src/dnsmasq) | 2.91 |
|
||||
@@ -77,6 +78,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件,
|
||||
| [LibreOffice](./src/libreoffice) | latest |
|
||||
| [libSQL Server](./src/libsql) | latest |
|
||||
| [LiteLLM](./src/litellm) | main-stable |
|
||||
| [llama-swap](./src/llama-swap) | cpu |
|
||||
| [llama.cpp](./src/llama.cpp) | server |
|
||||
| [LMDeploy](./src/lmdeploy) | v0.11.1 |
|
||||
| [Logstash](./src/logstash) | 8.16.1 |
|
||||
|
||||
@@ -182,6 +182,7 @@ services:
|
||||
minio-init:
|
||||
<<: *defaults
|
||||
image: ${GLOBAL_REGISTRY:-}minio/mc:${MINIO_MC_VERSION:-RELEASE.2025-03-12T17-29-24Z}
|
||||
restart: on-failure
|
||||
depends_on:
|
||||
minio:
|
||||
condition: service_healthy
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Use the official vllm image for gpu with Ampere、Ada Lovelace、Hopper architecture (8.0 <= Compute Capability <= 9.0)
|
||||
# Compute Capability version query (https://developer.nvidia.com/cuda-gpus)
|
||||
# only support x86_64 architecture
|
||||
FROM vllm/vllm-openai:v0.10.1.1
|
||||
FROM vllm/vllm-openai:v0.10.2
|
||||
|
||||
# Use the official vllm image for gpu with Volta、Turing、Blackwell architecture (7.0 < Compute Capability < 8.0 or Compute Capability >= 10.0)
|
||||
# support x86_64 architecture and ARM(AArch64) architecture
|
||||
|
||||
28
builds/mineru/china.Dockerfile
Normal file
28
builds/mineru/china.Dockerfile
Normal file
@@ -0,0 +1,28 @@
|
||||
# Use DaoCloud mirrored vllm image for China region for gpu with Ampere、Ada Lovelace、Hopper architecture (8.0 <= Compute Capability <= 9.0)
|
||||
# Compute Capability version query (https://developer.nvidia.com/cuda-gpus)
|
||||
FROM docker.m.daocloud.io/vllm/vllm-openai:v0.10.2
|
||||
|
||||
# Use DaoCloud mirrored vllm image for China region for gpu with Volta、Turing、Blackwell architecture (7.0 < Compute Capability < 8.0 or Compute Capability >= 10.0)
|
||||
# support x86_64 architecture and ARM(AArch64) architecture
|
||||
# FROM docker.m.daocloud.io/vllm/vllm-openai:v0.11.0
|
||||
|
||||
# Install libgl for opencv support & Noto fonts for Chinese characters
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
fonts-noto-core \
|
||||
fonts-noto-cjk \
|
||||
fontconfig \
|
||||
libgl1 && \
|
||||
fc-cache -fv && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install mineru latest
|
||||
RUN python3 -m pip install -U 'mineru[core]>=2.7.0' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Download models and update the configuration file
|
||||
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
|
||||
|
||||
# Set the entry point to activate the virtual environment and run the command line tool
|
||||
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
|
||||
158
src/convex/.env.example
Normal file
158
src/convex/.env.example
Normal file
@@ -0,0 +1,158 @@
|
||||
# Convex Configuration
|
||||
|
||||
# =============================================================================
|
||||
# Versions
|
||||
# =============================================================================
|
||||
CONVEX_BACKEND_VERSION=33cef775a8a6228cbacee4a09ac2c4073d62ed13
|
||||
CONVEX_DASHBOARD_VERSION=33cef775a8a6228cbacee4a09ac2c4073d62ed13
|
||||
POSTGRES_VERSION=17-alpine
|
||||
|
||||
# =============================================================================
|
||||
# Port Configuration
|
||||
# =============================================================================
|
||||
CONVEX_BACKEND_PORT_OVERRIDE=3210
|
||||
CONVEX_SITE_PROXY_PORT_OVERRIDE=3211
|
||||
CONVEX_DASHBOARD_PORT_OVERRIDE=6791
|
||||
|
||||
# =============================================================================
|
||||
# Instance Configuration
|
||||
# =============================================================================
|
||||
# Name of your Convex instance
|
||||
INSTANCE_NAME=convex-self-hosted
|
||||
|
||||
# Secret key for instance authentication (generate a strong random string)
|
||||
# Example: openssl rand -hex 32
|
||||
INSTANCE_SECRET=
|
||||
|
||||
# =============================================================================
|
||||
# Origins
|
||||
# =============================================================================
|
||||
# URL where the Convex backend is accessible
|
||||
CONVEX_CLOUD_ORIGIN=http://127.0.0.1:3210
|
||||
|
||||
# URL where the Convex site proxy is accessible
|
||||
CONVEX_SITE_ORIGIN=http://127.0.0.1:3211
|
||||
|
||||
# URL for the dashboard to connect to the backend
|
||||
NEXT_PUBLIC_DEPLOYMENT_URL=http://127.0.0.1:3210
|
||||
|
||||
# =============================================================================
|
||||
# Database Configuration
|
||||
# =============================================================================
|
||||
# PostgreSQL password (change in production)
|
||||
POSTGRES_PASSWORD=convex
|
||||
|
||||
# Full PostgreSQL connection URL (optional, constructed from above if not set)
|
||||
# POSTGRES_URL=postgresql://postgres:convex@postgres:5432/convex
|
||||
|
||||
# MySQL URL (alternative to PostgreSQL, leave empty to use PostgreSQL)
|
||||
# MYSQL_URL=
|
||||
|
||||
# =============================================================================
|
||||
# Application Limits
|
||||
# =============================================================================
|
||||
# Maximum concurrent mutations
|
||||
APPLICATION_MAX_CONCURRENT_MUTATIONS=16
|
||||
|
||||
# Maximum concurrent Node.js actions
|
||||
APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=16
|
||||
|
||||
# Maximum concurrent queries
|
||||
APPLICATION_MAX_CONCURRENT_QUERIES=16
|
||||
|
||||
# Maximum concurrent V8 actions
|
||||
APPLICATION_MAX_CONCURRENT_V8_ACTIONS=16
|
||||
|
||||
# User action timeout in seconds (empty for default)
|
||||
ACTIONS_USER_TIMEOUT_SECS=
|
||||
|
||||
# =============================================================================
|
||||
# SSL/TLS Settings
|
||||
# =============================================================================
|
||||
# Set to false to require SSL (recommended for production)
|
||||
DO_NOT_REQUIRE_SSL=true
|
||||
|
||||
# =============================================================================
|
||||
# Data Retention
|
||||
# =============================================================================
|
||||
# Document retention delay in seconds (default: 2 days)
|
||||
DOCUMENT_RETENTION_DELAY=172800
|
||||
|
||||
# =============================================================================
|
||||
# Telemetry and Metrics
|
||||
# =============================================================================
|
||||
# Disable telemetry beacon (set to true to disable)
|
||||
DISABLE_BEACON=false
|
||||
|
||||
# Enable Prometheus-compatible /metrics endpoint
|
||||
DISABLE_METRICS_ENDPOINT=true
|
||||
|
||||
# =============================================================================
|
||||
# Logging
|
||||
# =============================================================================
|
||||
# Rust log level (error, warn, info, debug, trace)
|
||||
RUST_LOG=info
|
||||
|
||||
# Enable Rust backtrace (1, full, or empty)
|
||||
RUST_BACKTRACE=
|
||||
|
||||
# Redact logs sent to clients
|
||||
REDACT_LOGS_TO_CLIENT=
|
||||
|
||||
# HTTP server timeout in seconds
|
||||
HTTP_SERVER_TIMEOUT_SECONDS=
|
||||
|
||||
# =============================================================================
|
||||
# AWS S3 Configuration (Optional - for external storage)
|
||||
# =============================================================================
|
||||
# AWS_ACCESS_KEY_ID=
|
||||
# AWS_SECRET_ACCESS_KEY=
|
||||
# AWS_REGION=
|
||||
# AWS_SESSION_TOKEN=
|
||||
# S3_ENDPOINT_URL=
|
||||
# S3_STORAGE_EXPORTS_BUCKET=
|
||||
# S3_STORAGE_FILES_BUCKET=
|
||||
# S3_STORAGE_MODULES_BUCKET=
|
||||
# S3_STORAGE_SEARCH_BUCKET=
|
||||
# S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET=
|
||||
# AWS_S3_DISABLE_CHECKSUMS=
|
||||
# AWS_S3_DISABLE_SSE=
|
||||
# AWS_S3_FORCE_PATH_STYLE=
|
||||
|
||||
# =============================================================================
|
||||
# Development Settings
|
||||
# =============================================================================
|
||||
# Development version override
|
||||
CONVEX_RELEASE_VERSION_DEV=
|
||||
|
||||
# Load Monaco editor internally in dashboard
|
||||
NEXT_PUBLIC_LOAD_MONACO_INTERNALLY=
|
||||
|
||||
# =============================================================================
|
||||
# Timezone
|
||||
# =============================================================================
|
||||
TZ=UTC
|
||||
|
||||
# =============================================================================
|
||||
# Resource Limits - Convex Backend
|
||||
# =============================================================================
|
||||
CONVEX_BACKEND_CPU_LIMIT=2.0
|
||||
CONVEX_BACKEND_CPU_RESERVATION=0.5
|
||||
CONVEX_BACKEND_MEMORY_LIMIT=2G
|
||||
CONVEX_BACKEND_MEMORY_RESERVATION=512M
|
||||
|
||||
# =============================================================================
|
||||
# Resource Limits - Convex Dashboard
|
||||
# =============================================================================
|
||||
CONVEX_DASHBOARD_CPU_LIMIT=0.5
|
||||
CONVEX_DASHBOARD_CPU_RESERVATION=0.25
|
||||
CONVEX_DASHBOARD_MEMORY_LIMIT=256M
|
||||
CONVEX_DASHBOARD_MEMORY_RESERVATION=128M
|
||||
|
||||
# =============================================================================
|
||||
# Resource Limits - PostgreSQL
|
||||
# =============================================================================
|
||||
POSTGRES_CPU_LIMIT=1.0
|
||||
POSTGRES_CPU_RESERVATION=0.25
|
||||
POSTGRES_MEMORY_LIMIT=1G
|
||||
POSTGRES_MEMORY_RESERVATION=256M
|
||||
123
src/convex/README.md
Normal file
123
src/convex/README.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Convex
|
||||
|
||||
Convex is an open-source reactive database designed to make life easy for web app developers, whether human or LLM.
|
||||
|
||||
## Features
|
||||
|
||||
- **Reactive Queries**: Queries automatically update when underlying data changes
|
||||
- **Real-time Subscriptions**: Live UI updates without manual polling
|
||||
- **Serverless Functions**: Write backend logic in TypeScript/JavaScript
|
||||
- **Automatic Caching**: Built-in intelligent caching for optimal performance
|
||||
- **Type Safety**: Full TypeScript support with generated types
|
||||
- **Scalable Architecture**: Designed to handle high-throughput applications
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. Copy `.env.example` to `.env`:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. Generate an instance secret (required for production):
|
||||
|
||||
```bash
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
Then set `INSTANCE_SECRET` in your `.env` file.
|
||||
|
||||
3. Start Convex:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. Wait for services to be healthy (check with `docker compose ps`)
|
||||
|
||||
5. Access the Dashboard at `http://localhost:6791`
|
||||
|
||||
6. Backend API is available at `http://localhost:3210`
|
||||
|
||||
## Default Configuration
|
||||
|
||||
| Service | Port | Description |
|
||||
| -------------- | ---- | ------------------------------- |
|
||||
| Convex Backend | 3210 | Main API and WebSocket endpoint |
|
||||
| Site Proxy | 3211 | Site hosting proxy |
|
||||
| Dashboard | 6791 | Web UI for managing Convex |
|
||||
| PostgreSQL | 5432 | Database (internal) |
|
||||
|
||||
**Authentication**: Set `INSTANCE_SECRET` for production use.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
Key environment variables (see `.env.example` for full list):
|
||||
|
||||
| Variable | Description | Default |
|
||||
| --------------------------------- | --------------------------------- | ----------------------- |
|
||||
| `CONVEX_BACKEND_PORT_OVERRIDE` | Host port for backend API | `3210` |
|
||||
| `CONVEX_SITE_PROXY_PORT_OVERRIDE` | Host port for site proxy | `3211` |
|
||||
| `CONVEX_DASHBOARD_PORT_OVERRIDE` | Host port for dashboard | `6791` |
|
||||
| `INSTANCE_NAME` | Name of the Convex instance | `convex-self-hosted` |
|
||||
| `INSTANCE_SECRET` | Secret key for authentication | (required) |
|
||||
| `CONVEX_CLOUD_ORIGIN` | URL for backend access | `http://127.0.0.1:3210` |
|
||||
| `CONVEX_SITE_ORIGIN` | URL for site proxy access | `http://127.0.0.1:3211` |
|
||||
| `POSTGRES_PASSWORD` | PostgreSQL password | `convex` |
|
||||
| `RUST_LOG` | Log level (error/warn/info/debug) | `info` |
|
||||
| `TZ` | Timezone | `UTC` |
|
||||
|
||||
## Resource Requirements
|
||||
|
||||
**Minimum**:
|
||||
|
||||
- CPU: 1 core
|
||||
- RAM: 1GB
|
||||
- Disk: 5GB
|
||||
|
||||
**Recommended**:
|
||||
|
||||
- CPU: 2+ cores
|
||||
- RAM: 2GB+
|
||||
- Disk: 20GB+
|
||||
|
||||
## Volumes
|
||||
|
||||
- `convex_data`: Convex backend data storage
|
||||
- `postgres_data`: PostgreSQL database data
|
||||
|
||||
## Using with Your Application
|
||||
|
||||
To use this self-hosted Convex backend with your application:
|
||||
|
||||
1. Set the `CONVEX_SELF_HOSTED_URL` environment variable in your app:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_URL=http://localhost:3210
|
||||
```
|
||||
|
||||
2. Set the `CONVEX_SELF_HOSTED_ADMIN_KEY` environment variable:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_ADMIN_KEY=your-instance-secret
|
||||
```
|
||||
|
||||
3. Deploy your Convex functions:
|
||||
|
||||
```bash
|
||||
npx convex dev
|
||||
```
|
||||
|
||||
For more details, see the [Convex Self-Hosting Documentation](https://stack.convex.dev/self-hosted-develop-and-deploy).
|
||||
|
||||
## Security Notes
|
||||
|
||||
- **Always set a strong `INSTANCE_SECRET`** in production
|
||||
- Enable SSL/TLS by setting `DO_NOT_REQUIRE_SSL=false` and using a reverse proxy
|
||||
- Use strong database passwords
|
||||
- Restrict network access to Convex services
|
||||
- Consider using AWS S3 for external storage in production
|
||||
|
||||
## License
|
||||
|
||||
Apache-2.0 (<https://github.com/get-convex/convex-backend/blob/main/LICENSE>)
|
||||
123
src/convex/README.zh.md
Normal file
123
src/convex/README.zh.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Convex
|
||||
|
||||
Convex 是一个开源的响应式数据库,旨在让 Web 应用开发者(无论是人类还是 LLM)的生活更加轻松。
|
||||
|
||||
## 功能特性
|
||||
|
||||
- **响应式查询**:当底层数据变化时,查询会自动更新
|
||||
- **实时订阅**:无需手动轮询即可实现实时 UI 更新
|
||||
- **无服务器函数**:使用 TypeScript/JavaScript 编写后端逻辑
|
||||
- **自动缓存**:内置智能缓存以获得最佳性能
|
||||
- **类型安全**:完整的 TypeScript 支持,并生成类型定义
|
||||
- **可扩展架构**:专为高吞吐量应用而设计
|
||||
|
||||
## 快速开始
|
||||
|
||||
1. 复制 `.env.example` 到 `.env`:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. 生成实例密钥(生产环境必需):
|
||||
|
||||
```bash
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
然后在 `.env` 文件中设置 `INSTANCE_SECRET`。
|
||||
|
||||
3. 启动 Convex:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. 等待服务健康(使用 `docker compose ps` 检查)
|
||||
|
||||
5. 访问 Dashboard:`http://localhost:6791`
|
||||
|
||||
6. 后端 API 地址:`http://localhost:3210`
|
||||
|
||||
## 默认配置
|
||||
|
||||
| 服务 | 端口 | 说明 |
|
||||
| -------------- | ---- | ------------------------ |
|
||||
| Convex Backend | 3210 | 主 API 和 WebSocket 端点 |
|
||||
| Site Proxy | 3211 | 站点托管代理 |
|
||||
| Dashboard | 6791 | 管理 Convex 的 Web UI |
|
||||
| PostgreSQL | 5432 | 数据库(内部) |
|
||||
|
||||
**认证**:生产环境请设置 `INSTANCE_SECRET`。
|
||||
|
||||
## 环境变量
|
||||
|
||||
关键环境变量(完整列表请参见 `.env.example`):
|
||||
|
||||
| 变量 | 说明 | 默认值 |
|
||||
| --------------------------------- | --------------------------------- | ----------------------- |
|
||||
| `CONVEX_BACKEND_PORT_OVERRIDE` | 后端 API 的主机端口 | `3210` |
|
||||
| `CONVEX_SITE_PROXY_PORT_OVERRIDE` | 站点代理的主机端口 | `3211` |
|
||||
| `CONVEX_DASHBOARD_PORT_OVERRIDE` | Dashboard 的主机端口 | `6791` |
|
||||
| `INSTANCE_NAME` | Convex 实例名称 | `convex-self-hosted` |
|
||||
| `INSTANCE_SECRET` | 认证密钥 | (必需) |
|
||||
| `CONVEX_CLOUD_ORIGIN` | 后端访问 URL | `http://127.0.0.1:3210` |
|
||||
| `CONVEX_SITE_ORIGIN` | 站点代理访问 URL | `http://127.0.0.1:3211` |
|
||||
| `POSTGRES_PASSWORD` | PostgreSQL 密码 | `convex` |
|
||||
| `RUST_LOG` | 日志级别(error/warn/info/debug) | `info` |
|
||||
| `TZ` | 时区 | `UTC` |
|
||||
|
||||
## 资源需求
|
||||
|
||||
**最低配置**:
|
||||
|
||||
- CPU:1 核
|
||||
- 内存:1GB
|
||||
- 磁盘:5GB
|
||||
|
||||
**推荐配置**:
|
||||
|
||||
- CPU:2+ 核
|
||||
- 内存:2GB+
|
||||
- 磁盘:20GB+
|
||||
|
||||
## 数据卷
|
||||
|
||||
- `convex_data`:Convex 后端数据存储
|
||||
- `postgres_data`:PostgreSQL 数据库数据
|
||||
|
||||
## 在应用中使用
|
||||
|
||||
要将此自托管 Convex 后端与您的应用一起使用:
|
||||
|
||||
1. 在应用中设置 `CONVEX_SELF_HOSTED_URL` 环境变量:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_URL=http://localhost:3210
|
||||
```
|
||||
|
||||
2. 设置 `CONVEX_SELF_HOSTED_ADMIN_KEY` 环境变量:
|
||||
|
||||
```bash
|
||||
CONVEX_SELF_HOSTED_ADMIN_KEY=your-instance-secret
|
||||
```
|
||||
|
||||
3. 部署您的 Convex 函数:
|
||||
|
||||
```bash
|
||||
npx convex dev
|
||||
```
|
||||
|
||||
更多详情,请参阅 [Convex 自托管文档](https://stack.convex.dev/self-hosted-develop-and-deploy)。
|
||||
|
||||
## 安全说明
|
||||
|
||||
- **生产环境务必设置强 `INSTANCE_SECRET`**
|
||||
- 通过设置 `DO_NOT_REQUIRE_SSL=false` 并使用反向代理来启用 SSL/TLS
|
||||
- 使用强数据库密码
|
||||
- 限制对 Convex 服务的网络访问
|
||||
- 生产环境考虑使用 AWS S3 进行外部存储
|
||||
|
||||
## 许可证
|
||||
|
||||
Apache-2.0(<https://github.com/get-convex/convex-backend/blob/main/LICENSE>)
|
||||
195
src/convex/docker-compose.yaml
Normal file
195
src/convex/docker-compose.yaml
Normal file
@@ -0,0 +1,195 @@
|
||||
# Convex - Open-source Reactive Database
|
||||
# https://github.com/get-convex/convex-backend
|
||||
#
|
||||
# Convex is an open-source reactive database designed to make life easy for
|
||||
# web app developers. It provides real-time data synchronization, automatic
|
||||
# caching, and a powerful query language.
|
||||
#
|
||||
# Key Features:
|
||||
# - Reactive queries that automatically update when data changes
|
||||
# - Real-time subscriptions for live UI updates
|
||||
# - Built-in authentication and authorization
|
||||
# - Serverless functions with TypeScript/JavaScript
|
||||
# - Automatic scaling and caching
|
||||
#
|
||||
# Default Credentials:
|
||||
# - Dashboard at http://localhost:6791
|
||||
# - Backend API at http://localhost:3210
|
||||
# - Site proxy at http://localhost:3211
|
||||
#
|
||||
# Security Notes:
|
||||
# - Set a strong INSTANCE_SECRET in production
|
||||
# - Enable SSL/TLS in production
|
||||
# - Use strong database passwords
|
||||
# - Restrict network access to Convex services
|
||||
#
|
||||
# License: Apache-2.0 (https://github.com/get-convex/convex-backend/blob/main/LICENSE)
|
||||
|
||||
x-defaults: &defaults
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: 100m
|
||||
max-file: '3'
|
||||
|
||||
services:
|
||||
convex-backend:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}get-convex/convex-backend:${CONVEX_BACKEND_VERSION:-33cef775a8a6228cbacee4a09ac2c4073d62ed13}
|
||||
stop_grace_period: 10s
|
||||
stop_signal: SIGINT
|
||||
ports:
|
||||
- '${CONVEX_BACKEND_PORT_OVERRIDE:-3210}:3210'
|
||||
- '${CONVEX_SITE_PROXY_PORT_OVERRIDE:-3211}:3211'
|
||||
volumes:
|
||||
- convex_data:/convex/data
|
||||
environment:
|
||||
# Instance configuration
|
||||
- INSTANCE_NAME=${INSTANCE_NAME:-convex-self-hosted}
|
||||
- INSTANCE_SECRET=${INSTANCE_SECRET}
|
||||
|
||||
# Origins
|
||||
- CONVEX_CLOUD_ORIGIN=${CONVEX_CLOUD_ORIGIN:-http://127.0.0.1:3210}
|
||||
- CONVEX_SITE_ORIGIN=${CONVEX_SITE_ORIGIN:-http://127.0.0.1:3211}
|
||||
|
||||
# Database configuration (PostgreSQL)
|
||||
- POSTGRES_URL=${POSTGRES_URL:-postgresql://postgres:${POSTGRES_PASSWORD:-convex}@postgres:5432/convex}
|
||||
|
||||
# Application limits
|
||||
- APPLICATION_MAX_CONCURRENT_MUTATIONS=${APPLICATION_MAX_CONCURRENT_MUTATIONS:-16}
|
||||
- APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=${APPLICATION_MAX_CONCURRENT_NODE_ACTIONS:-16}
|
||||
- APPLICATION_MAX_CONCURRENT_QUERIES=${APPLICATION_MAX_CONCURRENT_QUERIES:-16}
|
||||
- APPLICATION_MAX_CONCURRENT_V8_ACTIONS=${APPLICATION_MAX_CONCURRENT_V8_ACTIONS:-16}
|
||||
|
||||
# Actions timeout
|
||||
- ACTIONS_USER_TIMEOUT_SECS=${ACTIONS_USER_TIMEOUT_SECS:-}
|
||||
|
||||
# SSL/TLS settings
|
||||
- DO_NOT_REQUIRE_SSL=${DO_NOT_REQUIRE_SSL:-true}
|
||||
|
||||
# Document retention (default 2 days in seconds)
|
||||
- DOCUMENT_RETENTION_DELAY=${DOCUMENT_RETENTION_DELAY:-172800}
|
||||
|
||||
# Metrics and beacon
|
||||
- DISABLE_BEACON=${DISABLE_BEACON:-false}
|
||||
- DISABLE_METRICS_ENDPOINT=${DISABLE_METRICS_ENDPOINT:-true}
|
||||
|
||||
# Logging
|
||||
- RUST_LOG=${RUST_LOG:-info}
|
||||
- RUST_BACKTRACE=${RUST_BACKTRACE:-}
|
||||
- REDACT_LOGS_TO_CLIENT=${REDACT_LOGS_TO_CLIENT:-}
|
||||
|
||||
# HTTP server timeout
|
||||
- HTTP_SERVER_TIMEOUT_SECONDS=${HTTP_SERVER_TIMEOUT_SECONDS:-}
|
||||
|
||||
# AWS S3 configuration (optional, for external storage)
|
||||
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}
|
||||
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
|
||||
- AWS_REGION=${AWS_REGION:-}
|
||||
- AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-}
|
||||
- S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
|
||||
- S3_STORAGE_EXPORTS_BUCKET=${S3_STORAGE_EXPORTS_BUCKET:-}
|
||||
- S3_STORAGE_FILES_BUCKET=${S3_STORAGE_FILES_BUCKET:-}
|
||||
- S3_STORAGE_MODULES_BUCKET=${S3_STORAGE_MODULES_BUCKET:-}
|
||||
- S3_STORAGE_SEARCH_BUCKET=${S3_STORAGE_SEARCH_BUCKET:-}
|
||||
- S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET=${S3_STORAGE_SNAPSHOT_IMPORTS_BUCKET:-}
|
||||
- AWS_S3_DISABLE_CHECKSUMS=${AWS_S3_DISABLE_CHECKSUMS:-}
|
||||
- AWS_S3_DISABLE_SSE=${AWS_S3_DISABLE_SSE:-}
|
||||
- AWS_S3_FORCE_PATH_STYLE=${AWS_S3_FORCE_PATH_STYLE:-}
|
||||
|
||||
# MySQL URL (alternative to PostgreSQL)
|
||||
- MYSQL_URL=${MYSQL_URL:-}
|
||||
|
||||
# Development settings
|
||||
- CONVEX_RELEASE_VERSION_DEV=${CONVEX_RELEASE_VERSION_DEV:-}
|
||||
|
||||
# Timezone
|
||||
- TZ=${TZ:-UTC}
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- curl
|
||||
- -f
|
||||
- http://localhost:3210/version
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${CONVEX_BACKEND_CPU_LIMIT:-2.0}'
|
||||
memory: '${CONVEX_BACKEND_MEMORY_LIMIT:-2G}'
|
||||
reservations:
|
||||
cpus: '${CONVEX_BACKEND_CPU_RESERVATION:-0.5}'
|
||||
memory: '${CONVEX_BACKEND_MEMORY_RESERVATION:-512M}'
|
||||
|
||||
convex-dashboard:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}get-convex/convex-dashboard:${CONVEX_DASHBOARD_VERSION:-33cef775a8a6228cbacee4a09ac2c4073d62ed13}
|
||||
stop_grace_period: 10s
|
||||
stop_signal: SIGINT
|
||||
ports:
|
||||
- '${CONVEX_DASHBOARD_PORT_OVERRIDE:-6791}:6791'
|
||||
environment:
|
||||
- NEXT_PUBLIC_DEPLOYMENT_URL=${NEXT_PUBLIC_DEPLOYMENT_URL:-http://127.0.0.1:3210}
|
||||
- NEXT_PUBLIC_LOAD_MONACO_INTERNALLY=${NEXT_PUBLIC_LOAD_MONACO_INTERNALLY:-}
|
||||
depends_on:
|
||||
convex-backend:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- http://localhost:6791
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${CONVEX_DASHBOARD_CPU_LIMIT:-0.5}'
|
||||
memory: '${CONVEX_DASHBOARD_MEMORY_LIMIT:-256M}'
|
||||
reservations:
|
||||
cpus: '${CONVEX_DASHBOARD_CPU_RESERVATION:-0.25}'
|
||||
memory: '${CONVEX_DASHBOARD_MEMORY_RESERVATION:-128M}'
|
||||
|
||||
postgres:
|
||||
<<: *defaults
|
||||
image: ${GLOBAL_REGISTRY:-}postgres:${POSTGRES_VERSION:-17-alpine}
|
||||
environment:
|
||||
- POSTGRES_DB=convex
|
||||
- POSTGRES_USER=postgres
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-convex}
|
||||
- POSTGRES_INITDB_ARGS=--encoding=UTF8
|
||||
- TZ=${TZ:-UTC}
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD-SHELL
|
||||
- pg_isready -U postgres
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 10s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${POSTGRES_CPU_LIMIT:-1.0}'
|
||||
memory: '${POSTGRES_MEMORY_LIMIT:-1G}'
|
||||
reservations:
|
||||
cpus: '${POSTGRES_CPU_RESERVATION:-0.25}'
|
||||
memory: '${POSTGRES_MEMORY_RESERVATION:-256M}'
|
||||
|
||||
volumes:
|
||||
convex_data:
|
||||
postgres_data:
|
||||
62
src/llama-swap/.env.example
Normal file
62
src/llama-swap/.env.example
Normal file
@@ -0,0 +1,62 @@
|
||||
# =============================================================================
|
||||
# llama-swap Configuration
|
||||
# https://github.com/mostlygeek/llama-swap
|
||||
# Reliable model swapping for any local OpenAI/Anthropic compatible server
|
||||
# =============================================================================
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# General Settings
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Timezone for the container (default: UTC)
|
||||
TZ=UTC
|
||||
|
||||
# GitHub Container Registry prefix (default: ghcr.io/)
|
||||
GHCR_REGISTRY=ghcr.io/
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Image Variants
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# CPU-only image version tag (default: cpu)
|
||||
# Available: cpu, cuda, vulkan, rocm, intel, musa
|
||||
# Tagged releases example: v197-cuda-b8193
|
||||
LLAMA_SWAP_VERSION=cpu
|
||||
|
||||
# NVIDIA CUDA image version tag (used with the `gpu` profile)
|
||||
LLAMA_SWAP_CUDA_VERSION=cuda
|
||||
|
||||
# AMD GPU image version tag (used with the `gpu-amd` profile)
|
||||
# Options: vulkan (Vulkan/AMD), rocm (ROCm/AMD)
|
||||
LLAMA_SWAP_AMD_VERSION=vulkan
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Network Settings
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Host port override for the llama-swap API (default: 9292)
|
||||
# The Web UI and OpenAI-compatible API are both served on this port
|
||||
LLAMA_SWAP_PORT_OVERRIDE=9292
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# GPU Settings (used with `gpu` profile)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Number of NVIDIA GPUs to use (default: 1)
|
||||
LLAMA_SWAP_GPU_COUNT=1
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Resource Limits
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# CPU limit (in cores)
|
||||
LLAMA_SWAP_CPU_LIMIT=4.0
|
||||
|
||||
# CPU reservation (in cores)
|
||||
LLAMA_SWAP_CPU_RESERVATION=2.0
|
||||
|
||||
# Memory limit (e.g., 8G, 16G)
|
||||
LLAMA_SWAP_MEMORY_LIMIT=8G
|
||||
|
||||
# Memory reservation (e.g., 4G, 8G)
|
||||
LLAMA_SWAP_MEMORY_RESERVATION=4G
|
||||
196
src/llama-swap/README.md
Normal file
196
src/llama-swap/README.md
Normal file
@@ -0,0 +1,196 @@
|
||||
# llama-swap
|
||||
|
||||
[llama-swap](https://github.com/mostlygeek/llama-swap) is a lightweight reverse proxy that provides reliable on-demand model swapping for any local OpenAI/Anthropic-compatible inference server (e.g., llama.cpp, vllm). Only one model is loaded at a time, and it is automatically swapped out when a different model is requested, making it easy to work with many models on a single machine.
|
||||
|
||||
See also: [README.zh.md](./README.zh.md)
|
||||
|
||||
## Features
|
||||
|
||||
- **On-demand model swapping**: Automatically load/unload models based on API requests with zero manual intervention.
|
||||
- **OpenAI/Anthropic compatible**: Drop-in replacement for any client that uses the OpenAI or Anthropic chat completion API.
|
||||
- **Multi-backend support**: Works with llama.cpp (llama-server), vllm, and any OpenAI-compatible server.
|
||||
- **Real-time Web UI**: Built-in interface for monitoring logs, inspecting requests, and manually managing models.
|
||||
- **TTL-based unloading**: Models can be configured to unload automatically after a period of inactivity.
|
||||
- **HuggingFace model downloads**: Reference HuggingFace models directly in `config.yaml` and they are downloaded on first use.
|
||||
- **Multi-GPU support**: Works with NVIDIA CUDA, AMD ROCm/Vulkan, Intel, and CPU-only setups.
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. Copy the example environment file:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. Edit `config.yaml` to add your models. The provided `config.yaml` includes a commented example for a local GGUF model and a HuggingFace download. See [Configuration](#configuration) for details.
|
||||
|
||||
3. Start the service (CPU-only by default):
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. For NVIDIA GPU support:
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
5. For AMD GPU support (Vulkan):
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
The API and Web UI are available at: `http://localhost:9292`
|
||||
|
||||
## Services
|
||||
|
||||
| Service | Profile | Description |
|
||||
| ----------------- | ----------- | --------------------------------- |
|
||||
| `llama-swap` | _(default)_ | CPU-only inference |
|
||||
| `llama-swap-cuda` | `gpu` | NVIDIA CUDA GPU inference |
|
||||
| `llama-swap-amd` | `gpu-amd` | AMD GPU inference (Vulkan / ROCm) |
|
||||
|
||||
> **Note**: Only start one service at a time. All three services bind to the same host port (`LLAMA_SWAP_PORT_OVERRIDE`).
|
||||
|
||||
## Configuration
|
||||
|
||||
### `config.yaml`
|
||||
|
||||
The `config.yaml` file defines the models llama-swap manages. It is mounted read-only at `/app/config.yaml` inside the container. Edit the provided `config.yaml` to add your models.
|
||||
|
||||
Minimal example:
|
||||
|
||||
```yaml
|
||||
healthCheckTimeout: 300
|
||||
|
||||
models:
|
||||
my-model:
|
||||
cmd: /app/llama-server --port ${PORT} --model /root/.cache/llama.cpp/model.gguf --ctx-size 4096
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
ttl: 900
|
||||
```
|
||||
|
||||
- `${PORT}` is automatically assigned by llama-swap.
|
||||
- `ttl` (seconds): unload the model after this many seconds of inactivity.
|
||||
- `cmd`: the command to start the inference server.
|
||||
- `proxy`: the address llama-swap forwards requests to.
|
||||
|
||||
For downloading models from HuggingFace on first use:
|
||||
|
||||
```yaml
|
||||
models:
|
||||
Qwen2.5-7B:
|
||||
cmd: /app/llama-server --port ${PORT} -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ctx-size 8192 --n-gpu-layers 99
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
```
|
||||
|
||||
See the [official configuration documentation](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md) for all options including `groups`, `hooks`, `macros`, `aliases`, `filters`, and more.
|
||||
|
||||
### Models Volume
|
||||
|
||||
The named volume `llama_swap_models` is mounted to `/root/.cache/llama.cpp` inside the container. To place local GGUF model files inside the volume, you can use:
|
||||
|
||||
```bash
|
||||
# Copy a model into the named volume
|
||||
docker run --rm -v llama_swap_models:/data -v /path/to/model.gguf:/src/model.gguf alpine cp /src/model.gguf /data/model.gguf
|
||||
```
|
||||
|
||||
Alternatively, change the volume definition in `docker-compose.yaml` to use a host path:
|
||||
|
||||
```yaml
|
||||
volumes:
|
||||
llama_swap_models:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: /path/to/your/models
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
| ------------------------------- | ---------- | -------------------------------------------------- |
|
||||
| `TZ` | `UTC` | Container timezone |
|
||||
| `GHCR_REGISTRY` | `ghcr.io/` | GitHub Container Registry prefix |
|
||||
| `LLAMA_SWAP_VERSION` | `cpu` | Image tag for the default CPU service |
|
||||
| `LLAMA_SWAP_CUDA_VERSION` | `cuda` | Image tag for the CUDA service |
|
||||
| `LLAMA_SWAP_AMD_VERSION` | `vulkan` | Image tag for the AMD service (`vulkan` or `rocm`) |
|
||||
| `LLAMA_SWAP_PORT_OVERRIDE` | `9292` | Host port for the API and Web UI |
|
||||
| `LLAMA_SWAP_GPU_COUNT` | `1` | Number of NVIDIA GPUs to use (CUDA profile) |
|
||||
| `LLAMA_SWAP_CPU_LIMIT` | `4.0` | CPU limit in cores |
|
||||
| `LLAMA_SWAP_CPU_RESERVATION` | `2.0` | CPU reservation in cores |
|
||||
| `LLAMA_SWAP_MEMORY_LIMIT` | `8G` | Memory limit |
|
||||
| `LLAMA_SWAP_MEMORY_RESERVATION` | `4G` | Memory reservation |
|
||||
|
||||
## Default Ports
|
||||
|
||||
| Port | Description |
|
||||
| ------ | ------------------------------------------ |
|
||||
| `9292` | OpenAI/Anthropic-compatible API and Web UI |
|
||||
|
||||
## API Usage
|
||||
|
||||
llama-swap exposes an OpenAI-compatible API. Use any OpenAI client by pointing it to `http://localhost:9292`:
|
||||
|
||||
```bash
|
||||
# List available models
|
||||
curl http://localhost:9292/v1/models
|
||||
|
||||
# Chat completion (automatically loads the model if not running)
|
||||
curl http://localhost:9292/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [{"role": "user", "content": "Hello!"}]
|
||||
}'
|
||||
```
|
||||
|
||||
The Web UI is available at `http://localhost:9292` and provides real-time log streaming, request inspection, and manual model management.
|
||||
|
||||
## NVIDIA GPU Setup
|
||||
|
||||
Requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
For non-root security hardening, use the `cuda-non-root` image tag:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_CUDA_VERSION=cuda-non-root docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
## AMD GPU Setup
|
||||
|
||||
Requires the `/dev/dri` and `/dev/kfd` devices to be accessible on the host.
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
Use `rocm` instead of `vulkan` for full ROCm support:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_AMD_VERSION=rocm docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
## Security Notes
|
||||
|
||||
- By default, the container runs as root. Use the `cuda-non-root` or `rocm-non-root` image tags for improved security on GPU deployments.
|
||||
- The `config.yaml` is mounted read-only (`ro`).
|
||||
- Consider placing llama-swap behind a reverse proxy (e.g., Nginx, Caddy) when exposing it beyond localhost.
|
||||
|
||||
## References
|
||||
|
||||
- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
|
||||
- [Configuration Documentation](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)
|
||||
- [Container Security](https://github.com/mostlygeek/llama-swap/blob/main/docs/container-security.md)
|
||||
- [Docker Compose Wiki](https://github.com/mostlygeek/llama-swap/wiki/Docker-Compose-Example)
|
||||
|
||||
## License
|
||||
|
||||
llama-swap is released under the MIT License. See the [LICENSE](https://github.com/mostlygeek/llama-swap/blob/main/LICENSE) file for details.
|
||||
196
src/llama-swap/README.zh.md
Normal file
196
src/llama-swap/README.zh.md
Normal file
@@ -0,0 +1,196 @@
|
||||
# llama-swap
|
||||
|
||||
[llama-swap](https://github.com/mostlygeek/llama-swap) 是一个轻量级反向代理,为任何本地 OpenAI/Anthropic 兼容的推理服务器(如 llama.cpp、vllm 等)提供可靠的按需模型切换功能。同一时间只加载一个模型,当收到对不同模型的请求时,llama-swap 会自动切换,让你可以在单台机器上轻松使用多个模型。
|
||||
|
||||
参见:[README.md](./README.md)
|
||||
|
||||
## 功能特性
|
||||
|
||||
- **按需模型切换**:根据 API 请求自动加载/卸载模型,无需手动干预。
|
||||
- **兼容 OpenAI/Anthropic**:可直接替代任何使用 OpenAI 或 Anthropic 聊天补全 API 的客户端。
|
||||
- **多后端支持**:适用于 llama.cpp(llama-server)、vllm 及任何 OpenAI 兼容服务器。
|
||||
- **实时 Web UI**:内置界面,可监控日志、检查请求、手动管理模型。
|
||||
- **基于 TTL 的自动卸载**:可配置模型在闲置一段时间后自动卸载。
|
||||
- **HuggingFace 模型下载**:在 `config.yaml` 中直接引用 HuggingFace 模型,首次使用时自动下载。
|
||||
- **多 GPU 支持**:支持 NVIDIA CUDA、AMD ROCm/Vulkan、Intel 及纯 CPU 部署。
|
||||
|
||||
## 快速开始
|
||||
|
||||
1. 复制环境变量示例文件:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. 编辑 `config.yaml`,添加你的模型配置。提供的 `config.yaml` 包含本地 GGUF 模型和 HuggingFace 下载的注释示例。详见[配置说明](#配置说明)。
|
||||
|
||||
3. 启动服务(默认仅使用 CPU):
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
4. 启用 NVIDIA GPU 支持:
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
5. 启用 AMD GPU 支持(Vulkan):
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
API 和 Web UI 地址:`http://localhost:9292`
|
||||
|
||||
## 服务说明
|
||||
|
||||
| 服务名称 | Profile | 说明 |
|
||||
| ----------------- | ---------- | ----------------------------- |
|
||||
| `llama-swap` | _(默认)_ | 纯 CPU 推理 |
|
||||
| `llama-swap-cuda` | `gpu` | NVIDIA CUDA GPU 推理 |
|
||||
| `llama-swap-amd` | `gpu-amd` | AMD GPU 推理(Vulkan / ROCm) |
|
||||
|
||||
> **注意**:每次只启动一个服务,三个服务均绑定到同一主机端口(`LLAMA_SWAP_PORT_OVERRIDE`)。
|
||||
|
||||
## 配置说明
|
||||
|
||||
### `config.yaml`
|
||||
|
||||
`config.yaml` 文件定义了 llama-swap 管理的模型列表,以只读方式挂载到容器内的 `/app/config.yaml`。编辑提供的 `config.yaml` 即可添加你的模型。
|
||||
|
||||
最简示例:
|
||||
|
||||
```yaml
|
||||
healthCheckTimeout: 300
|
||||
|
||||
models:
|
||||
my-model:
|
||||
cmd: /app/llama-server --port ${PORT} --model /root/.cache/llama.cpp/model.gguf --ctx-size 4096
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
ttl: 900
|
||||
```
|
||||
|
||||
- `${PORT}` 由 llama-swap 自动分配。
|
||||
- `ttl`(秒):模型闲置超过该时长后自动卸载。
|
||||
- `cmd`:启动推理服务器的命令。
|
||||
- `proxy`:llama-swap 转发请求的地址。
|
||||
|
||||
直接使用 HuggingFace 模型(首次使用时自动下载):
|
||||
|
||||
```yaml
|
||||
models:
|
||||
Qwen2.5-7B:
|
||||
cmd: /app/llama-server --port ${PORT} -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ctx-size 8192 --n-gpu-layers 99
|
||||
proxy: 'http://localhost:${PORT}'
|
||||
```
|
||||
|
||||
完整配置选项(包括 `groups`、`hooks`、`macros`、`aliases`、`filters` 等)请参阅[官方配置文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)。
|
||||
|
||||
### 模型卷
|
||||
|
||||
命名卷 `llama_swap_models` 挂载到容器内的 `/root/.cache/llama.cpp`。可以通过以下方式将本地 GGUF 模型文件放入卷中:
|
||||
|
||||
```bash
|
||||
# 将模型文件复制到命名卷
|
||||
docker run --rm -v llama_swap_models:/data -v /path/to/model.gguf:/src/model.gguf alpine cp /src/model.gguf /data/model.gguf
|
||||
```
|
||||
|
||||
或者将 `docker-compose.yaml` 中的卷定义改为主机路径绑定:
|
||||
|
||||
```yaml
|
||||
volumes:
|
||||
llama_swap_models:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: /path/to/your/models
|
||||
```
|
||||
|
||||
## 环境变量
|
||||
|
||||
| 变量名 | 默认值 | 说明 |
|
||||
| ------------------------------- | ---------- | -------------------------------------- |
|
||||
| `TZ` | `UTC` | 容器时区 |
|
||||
| `GHCR_REGISTRY` | `ghcr.io/` | GitHub 容器镜像仓库前缀 |
|
||||
| `LLAMA_SWAP_VERSION` | `cpu` | 默认 CPU 服务镜像标签 |
|
||||
| `LLAMA_SWAP_CUDA_VERSION` | `cuda` | CUDA 服务镜像标签 |
|
||||
| `LLAMA_SWAP_AMD_VERSION` | `vulkan` | AMD 服务镜像标签(`vulkan` 或 `rocm`) |
|
||||
| `LLAMA_SWAP_PORT_OVERRIDE` | `9292` | API 和 Web UI 的主机端口 |
|
||||
| `LLAMA_SWAP_GPU_COUNT` | `1` | 使用的 NVIDIA GPU 数量(gpu profile) |
|
||||
| `LLAMA_SWAP_CPU_LIMIT` | `4.0` | CPU 上限(核心数) |
|
||||
| `LLAMA_SWAP_CPU_RESERVATION` | `2.0` | CPU 预留(核心数) |
|
||||
| `LLAMA_SWAP_MEMORY_LIMIT` | `8G` | 内存上限 |
|
||||
| `LLAMA_SWAP_MEMORY_RESERVATION` | `4G` | 内存预留 |
|
||||
|
||||
## 默认端口
|
||||
|
||||
| 端口 | 说明 |
|
||||
| ------ | ----------------------------------- |
|
||||
| `9292` | OpenAI/Anthropic 兼容 API 及 Web UI |
|
||||
|
||||
## API 使用示例
|
||||
|
||||
llama-swap 暴露 OpenAI 兼容 API。将任何 OpenAI 客户端指向 `http://localhost:9292` 即可使用:
|
||||
|
||||
```bash
|
||||
# 列出可用模型
|
||||
curl http://localhost:9292/v1/models
|
||||
|
||||
# 聊天补全(若模型未运行则自动加载)
|
||||
curl http://localhost:9292/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my-model",
|
||||
"messages": [{"role": "user", "content": "你好!"}]
|
||||
}'
|
||||
```
|
||||
|
||||
Web UI 可通过 `http://localhost:9292` 访问,提供实时日志流、请求检查和手动模型管理功能。
|
||||
|
||||
## NVIDIA GPU 配置
|
||||
|
||||
需要安装 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)。
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
如需非 root 安全加固,可使用 `cuda-non-root` 镜像标签:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_CUDA_VERSION=cuda-non-root docker compose --profile gpu up -d
|
||||
```
|
||||
|
||||
## AMD GPU 配置
|
||||
|
||||
需要主机上 `/dev/dri` 和 `/dev/kfd` 设备可访问。
|
||||
|
||||
```bash
|
||||
docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
如需完整 ROCm 支持,可使用 `rocm` 替代 `vulkan`:
|
||||
|
||||
```bash
|
||||
LLAMA_SWAP_AMD_VERSION=rocm docker compose --profile gpu-amd up -d
|
||||
```
|
||||
|
||||
## 安全说明
|
||||
|
||||
- 默认情况下容器以 root 用户运行。GPU 部署时建议使用 `cuda-non-root` 或 `rocm-non-root` 镜像标签提升安全性。
|
||||
- `config.yaml` 以只读方式(`ro`)挂载。
|
||||
- 若需在 localhost 之外暴露服务,建议在 llama-swap 前部署反向代理(如 Nginx、Caddy)。
|
||||
|
||||
## 参考链接
|
||||
|
||||
- [llama-swap GitHub](https://github.com/mostlygeek/llama-swap)
|
||||
- [配置文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md)
|
||||
- [容器安全文档](https://github.com/mostlygeek/llama-swap/blob/main/docs/container-security.md)
|
||||
- [Docker Compose Wiki](https://github.com/mostlygeek/llama-swap/wiki/Docker-Compose-Example)
|
||||
|
||||
## 许可证
|
||||
|
||||
llama-swap 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/mostlygeek/llama-swap/blob/main/LICENSE) 文件。
|
||||
47
src/llama-swap/config.yaml
Normal file
47
src/llama-swap/config.yaml
Normal file
@@ -0,0 +1,47 @@
|
||||
# llama-swap configuration file
|
||||
# https://github.com/mostlygeek/llama-swap/blob/main/docs/config.md
|
||||
#
|
||||
# This is the main configuration file for llama-swap.
|
||||
# Mount this file to /app/config.yaml inside the container.
|
||||
#
|
||||
# llama-swap will automatically swap models on demand:
|
||||
# - Only the requested model is loaded at a time.
|
||||
# - Idle models are unloaded when a new one is requested.
|
||||
|
||||
# Maximum time (in seconds) to wait for a model to become healthy.
|
||||
# A high value is useful when downloading models from HuggingFace.
|
||||
healthCheckTimeout: 300
|
||||
|
||||
# Macro definitions: reusable command snippets for model configuration.
|
||||
# Reference with ${macro-name} inside cmd fields.
|
||||
macros:
|
||||
"llama-server": >
|
||||
/app/llama-server
|
||||
--port ${PORT}
|
||||
|
||||
# Model definitions
|
||||
models:
|
||||
# Example: a local GGUF model stored in the models volume.
|
||||
# The volume `llama_swap_models` is mounted to /root/.cache/llama.cpp inside
|
||||
# the container. Place your .gguf files there and reference them with
|
||||
# /root/.cache/llama.cpp/<filename>.gguf
|
||||
"my-local-model":
|
||||
# ${PORT} is automatically assigned by llama-swap
|
||||
cmd: >
|
||||
${llama-server}
|
||||
--model /root/.cache/llama.cpp/model.gguf
|
||||
--ctx-size 4096
|
||||
--n-gpu-layers 0
|
||||
proxy: "http://localhost:${PORT}"
|
||||
# Automatically unload the model after 15 minutes of inactivity
|
||||
ttl: 900
|
||||
|
||||
# Example: download a model from HuggingFace on first use (requires internet access)
|
||||
# "Qwen2.5-7B-Instruct":
|
||||
# cmd: >
|
||||
# ${llama-server}
|
||||
# -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
# --ctx-size 8192
|
||||
# --n-gpu-layers 99
|
||||
# proxy: "http://localhost:${PORT}"
|
||||
# ttl: 900
|
||||
126
src/llama-swap/docker-compose.yaml
Normal file
126
src/llama-swap/docker-compose.yaml
Normal file
@@ -0,0 +1,126 @@
|
||||
# Docker Compose configuration for llama-swap
|
||||
# https://github.com/mostlygeek/llama-swap
|
||||
# Reliable model swapping for any local OpenAI/Anthropic compatible server
|
||||
|
||||
x-defaults: &defaults
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: 100m
|
||||
max-file: '3'
|
||||
|
||||
services:
|
||||
# llama-swap - CPU variant (default)
|
||||
llama-swap:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_VERSION:-cpu}
|
||||
ports:
|
||||
- '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080'
|
||||
volumes:
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- llama_swap_models:/root/.cache/llama.cpp
|
||||
environment:
|
||||
- TZ=${TZ:-UTC}
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- 'http://localhost:8080/v1/models'
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G}
|
||||
reservations:
|
||||
cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G}
|
||||
|
||||
# llama-swap - NVIDIA CUDA variant
|
||||
# Requires NVIDIA Container Toolkit: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
|
||||
llama-swap-cuda:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_CUDA_VERSION:-cuda}
|
||||
ports:
|
||||
- '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080'
|
||||
volumes:
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- llama_swap_models:/root/.cache/llama.cpp
|
||||
environment:
|
||||
- TZ=${TZ:-UTC}
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- 'http://localhost:8080/v1/models'
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G}
|
||||
reservations:
|
||||
cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G}
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: ${LLAMA_SWAP_GPU_COUNT:-1}
|
||||
capabilities: [gpu]
|
||||
profiles:
|
||||
- gpu
|
||||
|
||||
# llama-swap - AMD ROCm / Vulkan variant (AMD GPU)
|
||||
# For AMD GPUs, ensure /dev/dri and /dev/kfd are accessible
|
||||
llama-swap-amd:
|
||||
<<: *defaults
|
||||
image: ${GHCR_REGISTRY:-ghcr.io/}mostlygeek/llama-swap:${LLAMA_SWAP_AMD_VERSION:-vulkan}
|
||||
ports:
|
||||
- '${LLAMA_SWAP_PORT_OVERRIDE:-9292}:8080'
|
||||
volumes:
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- llama_swap_models:/root/.cache/llama.cpp
|
||||
devices:
|
||||
- /dev/dri:/dev/dri
|
||||
- /dev/kfd:/dev/kfd
|
||||
group_add:
|
||||
- video
|
||||
environment:
|
||||
- TZ=${TZ:-UTC}
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --quiet
|
||||
- --tries=1
|
||||
- --spider
|
||||
- 'http://localhost:8080/v1/models'
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: ${LLAMA_SWAP_CPU_LIMIT:-4.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_LIMIT:-8G}
|
||||
reservations:
|
||||
cpus: ${LLAMA_SWAP_CPU_RESERVATION:-2.0}
|
||||
memory: ${LLAMA_SWAP_MEMORY_RESERVATION:-4G}
|
||||
profiles:
|
||||
- gpu-amd
|
||||
|
||||
volumes:
|
||||
llama_swap_models:
|
||||
Reference in New Issue
Block a user