feat: add nexa-sdk

This commit is contained in:
Sun-ZhenXing
2025-11-16 00:12:14 +08:00
parent 5f9820e7db
commit 1c42cb2800
9 changed files with 616 additions and 5 deletions

View File

@@ -39,7 +39,7 @@ mineru -p demo.pdf -o ./output -b vlm-http-client -u http://localhost:30000
## Configuration
- `MINERU_DOCKER_IMAGE`: The Docker image for MinerU, default is `alexsuntop/mineru:2.6.4`.
- `MINERU_VERSION`: The MinerU Docker image version, default is `2.6.4`.
- `MINERU_PORT_OVERRIDE_VLLM`: The host port for the VLLM server, default is `30000`.
- `MINERU_PORT_OVERRIDE_API`: The host port for the API service, default is `8000`.
- `MINERU_PORT_OVERRIDE_GRADIO`: The host port for the Gradio WebUI, default is `7860`.

View File

@@ -39,7 +39,7 @@ mineru -p demo.pdf -o ./output -b vlm-http-client -u http://localhost:30000
## Configuration
- `MINERU_DOCKER_IMAGE`: The Docker image for MinerU, default is `alexsuntop/mineru:2.6.4`.
- `MINERU_VERSION`: The MinerU Docker image version, default is `2.6.4`.
- `MINERU_PORT_OVERRIDE_VLLM`: The host port for the VLLM server, default is `30000`.
- `MINERU_PORT_OVERRIDE_API`: The host port for the API service, default is `8000`.
- `MINERU_PORT_OVERRIDE_GRADIO`: The host port for the Gradio WebUI, default is `7860`.

View File

@@ -8,7 +8,7 @@ x-defaults: &defaults
x-mineru-vllm: &mineru-vllm
  <<: *defaults
  image: ${MINERU_DOCKER_IMAGE:-alexsuntop/mineru:2.6.4}
  image: ${GLOBAL_REGISTRY:-}alexsuntop/mineru:${MINERU_VERSION:-2.6.4}
  build:
    context: .
    dockerfile: Dockerfile

View File

@@ -0,0 +1,41 @@
# Global registry for container images (optional)
# GLOBAL_REGISTRY=
# Nexa SDK image versions (CPU and CUDA variants)
NEXA_SDK_CPU_VERSION=0.2.57
NEXA_SDK_CUDA_VERSION=0.2.57-cuda
# Timezone configuration
TZ=UTC
# Port override for host binding
NEXA_SDK_PORT_OVERRIDE=8080
# Server configuration
NEXA_HOST=0.0.0.0:8080
NEXA_KEEPALIVE=300
NEXA_ORIGINS=*
# HuggingFace token for accessing private models (optional)
NEXA_HFTOKEN=
# Logging level (none, debug, info, warn, error)
NEXA_LOG=none
# Model to run (can be any Nexa-compatible model)
# Examples: gemma-2-2b-instruct, qwen3-4b, llama-3-8b, mistral-7b
NEXA_MODEL=gemma-2-2b-instruct
# GPU configuration (for gpu profile only)
# Number of GPU layers to offload (-1 for all layers)
NEXA_GPU_LAYERS=-1
# Shared memory size
NEXA_SHM_SIZE=2g
# Resource limits
NEXA_SDK_CPU_LIMIT=4.0
NEXA_SDK_MEMORY_LIMIT=8G
# Resource reservations
NEXA_SDK_CPU_RESERVATION=2.0
NEXA_SDK_MEMORY_RESERVATION=4G

View File

@@ -1,6 +1,8 @@
# https://github.com/NexaAI/nexa-sdk/issues/684
FROM ubuntu:24.04
#FROM nvidia/cuda:12.4.1-base-ubuntu22.04
FROM ubuntu:22.04
RUN apt update && apt install -y libgomp1 curl ffmpeg sox
RUN curl -fsSL https://github.com/NexaAI/nexa-sdk/releases/latest/download/nexa-cli_linux_x86_64.sh | sh
EXPOSE 8080
CMD [ "nexa", "serve", "--host", "0.0.0.0:8080" ]

View File

@@ -0,0 +1,8 @@
# https://github.com/NexaAI/nexa-sdk/issues/684
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04
RUN apt update && apt install -y libgomp1 curl ffmpeg sox
RUN curl -fsSL https://github.com/NexaAI/nexa-sdk/releases/latest/download/nexa-cli_linux_x86_64.sh | sh
EXPOSE 8080
CMD [ "nexa", "serve", "--host", "0.0.0.0:8080" ]

builds/nexa-sdk/README.md Normal file
View File

@@ -0,0 +1,233 @@
# Nexa SDK
Nexa SDK is a comprehensive toolkit for running AI models locally. It provides inference for various model types including LLM, VLM (Vision Language Models), TTS (Text-to-Speech), ASR (Automatic Speech Recognition), and more. Built with performance in mind, it supports both CPU and GPU acceleration.
## Features
- **Multi-Model Support**: Run LLM, VLM, TTS, ASR, embedding, reranking, and image generation models
- **OpenAI-Compatible API**: Provides standard OpenAI API endpoints for easy integration
- **GPU Acceleration**: Optional GPU support via NVIDIA CUDA for faster inference
- **Resource Management**: Configurable CPU/memory limits and GPU layer offloading
- **Model Caching**: Persistent model storage for faster startup
- **Profile Support**: Easy switching between CPU-only and GPU-accelerated modes
## Quick Start
### Prerequisites
- Docker and Docker Compose
- For GPU support: NVIDIA Docker runtime and compatible GPU
### Basic Usage (CPU)
```bash
# Copy environment file
cp .env.example .env
# Edit .env to configure your model and settings
# NEXA_MODEL=gemma-2-2b-instruct
# Start the service with CPU profile
docker compose --profile cpu up -d
```
### GPU-Accelerated Usage
```bash
# Copy environment file
cp .env.example .env
# Configure for GPU usage
# NEXA_MODEL=gemma-2-2b-instruct
# NEXA_GPU_LAYERS=-1 # -1 means all layers on GPU
# Start the service with GPU profile
docker compose --profile gpu up -d
```
## Configuration
### Environment Variables
| Variable | Default | Description |
| ------------------------ | --------------------- | ------------------------------------------------------ |
| `NEXA_SDK_CPU_VERSION`   | `0.2.57`              | Nexa SDK Docker image version (CPU build)              |
| `NEXA_SDK_CUDA_VERSION`  | `0.2.57-cuda`         | Nexa SDK Docker image version (CUDA build)             |
| `NEXA_SDK_PORT_OVERRIDE` | `8080` | Host port for API access |
| `NEXA_MODEL` | `gemma-2-2b-instruct` | Model to load (e.g., qwen3-4b, llama-3-8b, mistral-7b) |
| `NEXA_HOST` | `0.0.0.0:8080` | Server bind address |
| `NEXA_KEEPALIVE` | `300` | Model keepalive timeout in seconds |
| `NEXA_ORIGINS` | `*` | CORS allowed origins |
| `NEXA_HFTOKEN` | - | HuggingFace token for private models |
| `NEXA_LOG` | `none` | Logging level (none, debug, info, warn, error) |
| `NEXA_GPU_LAYERS` | `-1` | GPU layers to offload (-1 = all, 0 = CPU only) |
| `NEXA_SHM_SIZE` | `2g` | Shared memory size |
| `TZ` | `UTC` | Container timezone |
### Resource Limits
| Variable | Default | Description |
| ----------------------------- | ------- | ------------------ |
| `NEXA_SDK_CPU_LIMIT` | `4.0` | Maximum CPU cores |
| `NEXA_SDK_MEMORY_LIMIT` | `8G` | Maximum memory |
| `NEXA_SDK_CPU_RESERVATION` | `2.0` | Reserved CPU cores |
| `NEXA_SDK_MEMORY_RESERVATION` | `4G` | Reserved memory |
### Profiles
- `cpu`: Run with CPU-only inference (profiles are opt-in, so one must be passed explicitly; see the example after this list)
- `gpu`: Run with GPU acceleration (requires NVIDIA GPU)
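Profiles are opt-in: a bare `docker compose up -d` starts neither service, so always pass the profile you want. A quick sketch:
```bash
# Starts nothing, because both services are gated behind profiles
docker compose up -d

# Render the effective configuration for a profile, then start it
docker compose --profile gpu config
docker compose --profile gpu up -d
```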
## Usage Examples
### Test the API
```bash
# Check available models
curl http://localhost:8080/v1/models
# Chat completion
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-2-2b-instruct",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
```
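Responses can also be streamed. A sketch, assuming the endpoint honors the OpenAI-style `stream` flag:
```bash
# Stream tokens as they are generated (Server-Sent Events)
curl -N http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-2-2b-instruct",
    "messages": [{"role": "user", "content": "Write a haiku about containers."}],
    "stream": true
  }'
```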
### Using Different Models
Edit `.env` to change the model:
```bash
# Small models for limited resources
NEXA_MODEL=gemma-2-2b-instruct
# or
NEXA_MODEL=qwen3-4b
# Larger models for better quality
NEXA_MODEL=llama-3-8b
# or
NEXA_MODEL=mistral-7b
```
### GPU Configuration
For GPU acceleration, adjust the number of layers:
```bash
# Offload all layers to GPU (fastest)
NEXA_GPU_LAYERS=-1
# Offload 30 layers (hybrid mode)
NEXA_GPU_LAYERS=30
# CPU only
NEXA_GPU_LAYERS=0
```
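To confirm that layers are actually being offloaded, check GPU utilization inside the running container (a sketch; `nexa-sdk-cuda` is the GPU service name from this build's compose file, and `nvidia-smi` is assumed to be exposed by the NVIDIA runtime):
```bash
# Snapshot GPU memory and utilization while a request is being processed
docker compose --profile gpu exec nexa-sdk-cuda nvidia-smi
```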
## Model Management
Models are automatically downloaded on first run and cached in the `nexa_models` volume. The default cache location inside the container is `/root/.cache/nexa`.
To use a different model:
1. Update `NEXA_MODEL` in `.env`
2. Recreate the service so the new command takes effect: `docker compose --profile <cpu|gpu> up -d --force-recreate` (a plain `restart` reuses the old container command), as shown below
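For example, switching the CPU service to `qwen3-4b` (a minimal sketch; the `sed` edit is just one way to update `.env`):
```bash
# Point NEXA_MODEL at the new model
sed -i 's/^NEXA_MODEL=.*/NEXA_MODEL=qwen3-4b/' .env

# Recreate the container so the new model is loaded, then verify
docker compose --profile cpu up -d --force-recreate
curl http://localhost:8080/v1/models
```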
## API Endpoints
Nexa SDK provides OpenAI-compatible API endpoints:
- `GET /v1/models` - List available models
- `POST /v1/chat/completions` - Chat completions
- `POST /v1/completions` - Text completions
- `POST /v1/embeddings` - Text embeddings
- `GET /health` - Health check
- `GET /docs` - API documentation (Swagger UI)
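For instance, the embeddings endpoint follows the usual OpenAI request shape. A sketch, assuming an embedding-capable model is loaded (`your-embedding-model` is a placeholder):
```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "your-embedding-model",
    "input": "Nexa SDK runs models locally."
  }'
```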
## Troubleshooting
### Out of Memory
Increase memory limits or use a smaller model:
```bash
NEXA_SDK_MEMORY_LIMIT=16G
NEXA_SDK_MEMORY_RESERVATION=8G
# Or switch to a smaller model
NEXA_MODEL=gemma-2-2b-instruct
```
### GPU Not Detected
Ensure NVIDIA Docker runtime is installed:
```bash
# Check GPU availability
docker run --rm --gpus all nvidia/cuda:12.8.1-base-ubuntu22.04 nvidia-smi
```
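If the check above fails, the NVIDIA Container Toolkit is usually missing. A minimal install sketch for an Ubuntu host, assuming NVIDIA's apt repository is already configured:
```bash
# Install the toolkit, register it with Docker, and restart the daemon
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```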
### Model Download Issues
Set HuggingFace token if accessing private models:
```bash
NEXA_HFTOKEN=your_hf_token_here
```
### Slow Performance
- Use GPU profile for better performance
- Increase `NEXA_GPU_LAYERS` to offload more computation to GPU
- Allocate more resources or use a smaller model
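Before tuning anything, check whether the container is actually hitting its CPU or memory limits:
```bash
# One-shot snapshot of CPU and memory usage for all running containers
docker stats --no-stream
```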
## Advanced Configuration
### Custom Model Path
If you want to use local model files, mount them as a volume:
```yaml
volumes:
  - ./models:/models
  - nexa_models:/root/.cache/nexa
```
Then reference the model by its path in the command.
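Because the compose file passes `${NEXA_MODEL}` straight to the container command, one hypothetical way to do this is to point the variable at the mounted file (illustrative only; it assumes the CLI accepts a local file path, and the path below is made up):
```bash
# Illustrative: pass a mounted model file instead of a hub model name
NEXA_MODEL=/models/my-model.gguf docker compose --profile cpu up -d --force-recreate
```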
### HTTPS Configuration
Set environment variables for HTTPS:
```bash
NEXA_ENABLEHTTPS=true
```
Mount certificate files:
```yaml
volumes:
  - ./certs/cert.pem:/app/cert.pem:ro
  - ./certs/key.pem:/app/key.pem:ro
```
## Health Check
The service includes a health check that verifies the API is responding:
```bash
curl http://localhost:8080/v1/models
```
## License
Nexa SDK is developed by Nexa AI. Please refer to the [official repository](https://github.com/NexaAI/nexa-sdk) for license information.
## Links
- [Official Repository](https://github.com/NexaAI/nexa-sdk)
- [Nexa AI Website](https://nexa.ai)
- [Documentation](https://docs.nexa.ai)
- [Model Hub](https://sdk.nexa.ai)

View File

@@ -0,0 +1,233 @@
# Nexa SDK
Nexa SDK is a comprehensive toolkit for running AI models locally. It provides inference for various model types including LLM, VLM (Vision Language Models), TTS (Text-to-Speech), ASR (Automatic Speech Recognition), and more. Built with performance in mind, it supports both CPU and GPU acceleration.
## Features
- **Multi-Model Support**: Run LLM, VLM, TTS, ASR, embedding, reranking, and image generation models
- **OpenAI-Compatible API**: Provides standard OpenAI API endpoints for easy integration
- **GPU Acceleration**: Optional GPU support via NVIDIA CUDA for faster inference
- **Resource Management**: Configurable CPU/memory limits and GPU layer offloading
- **Model Caching**: Persistent model storage for faster startup
- **Profile Support**: Easy switching between CPU-only and GPU-accelerated modes
## Quick Start
### Prerequisites
- Docker and Docker Compose
- For GPU support: NVIDIA Docker runtime and a compatible GPU
### Basic Usage (CPU)
```bash
# Copy environment file
cp .env.example .env
# Edit .env to configure your model and settings
# NEXA_MODEL=gemma-2-2b-instruct
# Start the service with the CPU profile
docker compose --profile cpu up -d
```
### GPU-Accelerated Usage
```bash
# Copy environment file
cp .env.example .env
# Configure for GPU usage
# NEXA_MODEL=gemma-2-2b-instruct
# NEXA_GPU_LAYERS=-1  # -1 means all layers on GPU
# Start the service with the GPU profile
docker compose --profile gpu up -d
```
## Configuration
### Environment Variables
| Variable                 | Default               | Description                                            |
| ------------------------ | --------------------- | ------------------------------------------------------ |
| `NEXA_SDK_CPU_VERSION`   | `0.2.57`              | Nexa SDK Docker image version (CPU build)              |
| `NEXA_SDK_CUDA_VERSION`  | `0.2.57-cuda`         | Nexa SDK Docker image version (CUDA build)             |
| `NEXA_SDK_PORT_OVERRIDE` | `8080`                | Host port for API access                               |
| `NEXA_MODEL`             | `gemma-2-2b-instruct` | Model to load (e.g., qwen3-4b, llama-3-8b, mistral-7b) |
| `NEXA_HOST`              | `0.0.0.0:8080`        | Server bind address                                    |
| `NEXA_KEEPALIVE`         | `300`                 | Model keepalive timeout in seconds                     |
| `NEXA_ORIGINS`           | `*`                   | CORS allowed origins                                   |
| `NEXA_HFTOKEN`           | -                     | HuggingFace token for private models                   |
| `NEXA_LOG`               | `none`                | Logging level (none, debug, info, warn, error)         |
| `NEXA_GPU_LAYERS`        | `-1`                  | GPU layers to offload (-1 = all, 0 = CPU only)         |
| `NEXA_SHM_SIZE`          | `2g`                  | Shared memory size                                     |
| `TZ`                     | `UTC`                 | Container timezone                                     |
### Resource Limits
| Variable                      | Default | Description        |
| ----------------------------- | ------- | ------------------ |
| `NEXA_SDK_CPU_LIMIT`          | `4.0`   | Maximum CPU cores  |
| `NEXA_SDK_MEMORY_LIMIT`       | `8G`    | Maximum memory     |
| `NEXA_SDK_CPU_RESERVATION`    | `2.0`   | Reserved CPU cores |
| `NEXA_SDK_MEMORY_RESERVATION` | `4G`    | Reserved memory    |
### Profiles
- `cpu`: Run with CPU-only inference (profiles are opt-in, so one must be passed explicitly; see the example after this list)
- `gpu`: Run with GPU acceleration (requires an NVIDIA GPU)
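Profiles are opt-in: a bare `docker compose up -d` starts neither service, so always pass the profile you want. A quick sketch:
```bash
# Starts nothing, because both services are gated behind profiles
docker compose up -d

# Render the effective configuration for a profile, then start it
docker compose --profile gpu config
docker compose --profile gpu up -d
```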
## Usage Examples
### Test the API
```bash
# Check available models
curl http://localhost:8080/v1/models
# Chat completion
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-2-2b-instruct",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
```
### Using Different Models
Edit `.env` to change the model:
```bash
# Small models for limited resources
NEXA_MODEL=gemma-2-2b-instruct
# or
NEXA_MODEL=qwen3-4b
# Larger models for better quality
NEXA_MODEL=llama-3-8b
# or
NEXA_MODEL=mistral-7b
```
### GPU Configuration
For GPU acceleration, adjust the number of offloaded layers:
```bash
# Offload all layers to GPU (fastest)
NEXA_GPU_LAYERS=-1
# Offload 30 layers (hybrid mode)
NEXA_GPU_LAYERS=30
# CPU only
NEXA_GPU_LAYERS=0
```
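To confirm that layers are actually being offloaded, check GPU utilization inside the running container (a sketch; `nexa-sdk-cuda` is the GPU service name from this build's compose file, and `nvidia-smi` is assumed to be exposed by the NVIDIA runtime):
```bash
# Snapshot GPU memory and utilization while a request is being processed
docker compose --profile gpu exec nexa-sdk-cuda nvidia-smi
```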
## Model Management
Models are automatically downloaded on first run and cached in the `nexa_models` volume. The default cache location inside the container is `/root/.cache/nexa`.
To use a different model:
1. Update `NEXA_MODEL` in `.env`
2. Recreate the service so the new command takes effect: `docker compose --profile <cpu|gpu> up -d --force-recreate` (a plain `restart` reuses the old container command)
## API Endpoints
Nexa SDK provides OpenAI-compatible API endpoints:
- `GET /v1/models` - List available models
- `POST /v1/chat/completions` - Chat completions
- `POST /v1/completions` - Text completions
- `POST /v1/embeddings` - Text embeddings
- `GET /health` - Health check
- `GET /docs` - API documentation (Swagger UI)
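For instance, the embeddings endpoint follows the usual OpenAI request shape. A sketch, assuming an embedding-capable model is loaded (`your-embedding-model` is a placeholder):
```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "your-embedding-model",
    "input": "Nexa SDK runs models locally."
  }'
```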
## Troubleshooting
### Out of Memory
Increase memory limits or use a smaller model:
```bash
NEXA_SDK_MEMORY_LIMIT=16G
NEXA_SDK_MEMORY_RESERVATION=8G
# Or switch to a smaller model
NEXA_MODEL=gemma-2-2b-instruct
```
### GPU Not Detected
Ensure the NVIDIA Docker runtime is installed:
```bash
# Check GPU availability
docker run --rm --gpus all nvidia/cuda:12.8.1-base-ubuntu22.04 nvidia-smi
```
### Model Download Issues
Set a HuggingFace token if accessing private models:
```bash
NEXA_HFTOKEN=your_hf_token_here
```
### Slow Performance
- Use the GPU profile for better performance
- Increase `NEXA_GPU_LAYERS` to offload more computation to the GPU
- Allocate more resources or use a smaller model
## Advanced Configuration
### Custom Model Path
If you want to use local model files, mount them as a volume:
```yaml
volumes:
  - ./models:/models
  - nexa_models:/root/.cache/nexa
```
Then reference the model by its path in the command.
### HTTPS Configuration
Set the environment variable for HTTPS:
```bash
NEXA_ENABLEHTTPS=true
```
Mount the certificate files:
```yaml
volumes:
  - ./certs/cert.pem:/app/cert.pem:ro
  - ./certs/key.pem:/app/key.pem:ro
```
## Health Check
The service includes a health check that verifies the API is responding:
```bash
curl http://localhost:8080/v1/models
```
## License
Nexa SDK is developed by Nexa AI. Please refer to the [official repository](https://github.com/NexaAI/nexa-sdk) for license information.
## Links
- [Official Repository](https://github.com/NexaAI/nexa-sdk)
- [Nexa AI Website](https://nexa.ai)
- [Documentation](https://docs.nexa.ai)
- [Model Hub](https://sdk.nexa.ai)

View File

@@ -0,0 +1,94 @@
x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: 100m
      max-file: "3"

services:
  nexa-sdk:
    <<: *defaults
    build:
      context: .
      dockerfile: Dockerfile
    image: ${GLOBAL_REGISTRY:-}alexsuntop/nexa-sdk:${NEXA_SDK_CPU_VERSION:-0.2.57}
    ports:
      - "${NEXA_SDK_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - nexa_models:/root/.cache/nexa
    environment:
      - TZ=${TZ:-UTC}
      - NEXA_HOST=${NEXA_HOST:-0.0.0.0:8080}
      - NEXA_KEEPALIVE=${NEXA_KEEPALIVE:-300}
      - NEXA_ORIGINS=${NEXA_ORIGINS:-*}
      - NEXA_HFTOKEN=${NEXA_HFTOKEN:-}
      - NEXA_LOG=${NEXA_LOG:-none}
    command: >
      nexa server
      ${NEXA_MODEL:-gemma-2-2b-instruct}
    ipc: host
    shm_size: ${NEXA_SHM_SIZE:-2g}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          cpus: ${NEXA_SDK_CPU_LIMIT:-4.0}
          memory: ${NEXA_SDK_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${NEXA_SDK_CPU_RESERVATION:-2.0}
          memory: ${NEXA_SDK_MEMORY_RESERVATION:-4G}
    profiles:
      - cpu

  nexa-sdk-cuda:
    <<: *defaults
    build:
      context: .
      dockerfile: Dockerfile.cuda
    image: ${GLOBAL_REGISTRY:-}alexsuntop/nexa-sdk:${NEXA_SDK_CUDA_VERSION:-0.2.57-cuda}
    ports:
      - "${NEXA_SDK_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - nexa_models:/root/.cache/nexa
    environment:
      - TZ=${TZ:-UTC}
      - NEXA_HOST=${NEXA_HOST:-0.0.0.0:8080}
      - NEXA_KEEPALIVE=${NEXA_KEEPALIVE:-300}
      - NEXA_ORIGINS=${NEXA_ORIGINS:-*}
      - NEXA_HFTOKEN=${NEXA_HFTOKEN:-}
      - NEXA_LOG=${NEXA_LOG:-none}
    command: >
      nexa server
      ${NEXA_MODEL:-gemma-2-2b-instruct}
      -ngl ${NEXA_GPU_LAYERS:--1}
    ipc: host
    shm_size: ${NEXA_SHM_SIZE:-2g}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          cpus: ${NEXA_SDK_CPU_LIMIT:-4.0}
          memory: ${NEXA_SDK_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${NEXA_SDK_CPU_RESERVATION:-2.0}
          memory: ${NEXA_SDK_MEMORY_RESERVATION:-4G}
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
    profiles:
      - gpu

volumes:
  nexa_models: