feat: add nexa-sdk
@@ -39,7 +39,7 @@ mineru -p demo.pdf -o ./output -b vlm-http-client -u http://localhost:30000
## Configuration

- `MINERU_DOCKER_IMAGE`: The Docker image for MinerU, default is `alexsuntop/mineru:2.6.4`.
- `MINERU_VERSION`: The version for MinerU, default is `2.6.4`.
- `MINERU_PORT_OVERRIDE_VLLM`: The host port for the VLLM server, default is `30000`.
- `MINERU_PORT_OVERRIDE_API`: The host port for the API service, default is `8000`.
- `MINERU_PORT_OVERRIDE_GRADIO`: The host port for the Gradio WebUI, default is `7860`.
@@ -39,7 +39,7 @@ mineru -p demo.pdf -o ./output -b vlm-http-client -u http://localhost:30000
## 配置

- `MINERU_DOCKER_IMAGE`: MinerU 的 Docker 镜像,默认为 `alexsuntop/mineru:2.6.4`。
- `MINERU_VERSION`: MinerU 的 Docker 镜像版本,默认为 `2.6.4`。
- `MINERU_PORT_OVERRIDE_VLLM`: VLLM 服务器的主机端口,默认为 `30000`。
- `MINERU_PORT_OVERRIDE_API`: API 服务的主机端口,默认为 `8000`。
- `MINERU_PORT_OVERRIDE_GRADIO`: Gradio WebUI 的主机端口,默认为 `7860`。
@@ -8,7 +8,7 @@ x-defaults: &defaults
x-mineru-vllm: &mineru-vllm
  <<: *defaults
  image: ${MINERU_DOCKER_IMAGE:-alexsuntop/mineru:2.6.4}
  image: ${GLOBAL_REGISTRY:-}alexsuntop/mineru:${MINERU_VERSION:-2.6.4}
  build:
    context: .
    dockerfile: Dockerfile
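The practical effect of this change is easiest to see with the values spelled out. A minimal sketch, assuming standard Docker Compose variable interpolation; the registry host below is illustrative and not part of this commit:

```bash
# .env (hypothetical overrides for the variables referenced above)
# Note: the registry value is prepended verbatim, so keep the trailing slash
GLOBAL_REGISTRY=registry.example.com/
MINERU_VERSION=2.6.4

# ${GLOBAL_REGISTRY:-}alexsuntop/mineru:${MINERU_VERSION:-2.6.4} then resolves to:
#   registry.example.com/alexsuntop/mineru:2.6.4
# With both variables unset it falls back to:
#   alexsuntop/mineru:2.6.4
```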
builds/nexa-sdk/.env.example (new file, 41 lines)
@@ -0,0 +1,41 @@
# Global registry for container images (optional)
# GLOBAL_REGISTRY=

# Nexa SDK version
NEXA_SDK_VERSION=latest

# Timezone configuration
TZ=UTC

# Port override for host binding
NEXA_SDK_PORT_OVERRIDE=8080

# Server configuration
NEXA_HOST=0.0.0.0:8080
NEXA_KEEPALIVE=300
NEXA_ORIGINS=*

# HuggingFace token for accessing private models (optional)
NEXA_HFTOKEN=

# Logging level (none, debug, info, warn, error)
NEXA_LOG=none

# Model to run (can be any Nexa-compatible model)
# Examples: gemma-2-2b-instruct, qwen3-4b, llama-3-8b, mistral-7b
NEXA_MODEL=gemma-2-2b-instruct

# GPU configuration (for gpu profile only)
# Number of GPU layers to offload (-1 for all layers)
NEXA_GPU_LAYERS=-1

# Shared memory size
NEXA_SHM_SIZE=2g

# Resource limits
NEXA_SDK_CPU_LIMIT=4.0
NEXA_SDK_MEMORY_LIMIT=8G

# Resource reservations
NEXA_SDK_CPU_RESERVATION=2.0
NEXA_SDK_MEMORY_RESERVATION=4G
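To sanity-check how these values are consumed, the resolved configuration can be rendered without starting anything. A small sketch, assuming the compose file added later in this commit sits next to the `.env`:

```bash
# Copy the template, then print the fully interpolated compose config
cp .env.example .env
docker compose --profile cpu config

# List only the services enabled by a given profile
docker compose --profile cpu config --services
```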
@@ -1,6 +1,8 @@
# https://github.com/NexaAI/nexa-sdk/issues/684
FROM ubuntu:24.04
#FROM nvidia/cuda:12.4.1-base-ubuntu22.04
FROM ubuntu:22.04

RUN apt update && apt install -y libgomp1 curl ffmpeg sox
RUN curl -fsSL https://github.com/NexaAI/nexa-sdk/releases/latest/download/nexa-cli_linux_x86_64.sh | sh

EXPOSE 8080
CMD [ "nexa", "serve", "--host", "0.0.0.0:8080" ]
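For a quick standalone test of this image, plain `docker` is enough. A rough sketch; the tag name is arbitrary and the named volume mirrors the cache path used by the compose file:

```bash
# Build the CPU image from the nexa-sdk build context
docker build -t nexa-sdk:local builds/nexa-sdk

# Run it with the API on host port 8080 and a named volume for the model cache
docker run --rm -p 8080:8080 -v nexa_models:/root/.cache/nexa nexa-sdk:local

# In another shell, confirm the server is answering
curl http://localhost:8080/v1/models
```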
builds/nexa-sdk/Dockerfile.cuda (new file, 8 lines)
@@ -0,0 +1,8 @@
# https://github.com/NexaAI/nexa-sdk/issues/684
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04

RUN apt update && apt install -y libgomp1 curl ffmpeg sox
RUN curl -fsSL https://github.com/NexaAI/nexa-sdk/releases/latest/download/nexa-cli_linux_x86_64.sh | sh

EXPOSE 8080
CMD [ "nexa", "serve", "--host", "0.0.0.0:8080" ]
builds/nexa-sdk/README.md (new file, 233 lines)
@@ -0,0 +1,233 @@
# Nexa SDK

Nexa SDK is a comprehensive toolkit for running AI models locally. It provides inference for various model types including LLM, VLM (Vision Language Models), TTS (Text-to-Speech), ASR (Automatic Speech Recognition), and more. Built with performance in mind, it supports both CPU and GPU acceleration.

## Features

- **Multi-Model Support**: Run LLM, VLM, TTS, ASR, embedding, reranking, and image generation models
- **OpenAI-Compatible API**: Provides standard OpenAI API endpoints for easy integration
- **GPU Acceleration**: Optional GPU support via NVIDIA CUDA for faster inference
- **Resource Management**: Configurable CPU/memory limits and GPU layer offloading
- **Model Caching**: Persistent model storage for faster startup
- **Profile Support**: Easy switching between CPU-only and GPU-accelerated modes

## Quick Start

### Prerequisites

- Docker and Docker Compose
- For GPU support: NVIDIA Docker runtime and compatible GPU
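A quick way to verify the prerequisites before continuing (standard Docker and NVIDIA tooling, nothing specific to Nexa SDK):

```bash
# Docker Engine and the Compose plugin
docker --version
docker compose version

# GPU profile only: the NVIDIA driver should already be visible on the host
nvidia-smi
```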
### Basic Usage (CPU)

```bash
# Copy environment file
cp .env.example .env

# Edit .env to configure your model and settings
# NEXA_MODEL=gemma-2-2b-instruct

# Start the service with CPU profile
docker compose --profile cpu up -d
```

### GPU-Accelerated Usage

```bash
# Copy environment file
cp .env.example .env

# Configure for GPU usage
# NEXA_MODEL=gemma-2-2b-instruct
# NEXA_GPU_LAYERS=-1  # -1 means all layers on GPU

# Start the service with GPU profile
docker compose --profile gpu up -d
```

## Configuration

### Environment Variables

| Variable                 | Default               | Description                                            |
| ------------------------ | --------------------- | ------------------------------------------------------ |
| `NEXA_SDK_VERSION`       | `latest`              | Nexa SDK Docker image version                          |
| `NEXA_SDK_PORT_OVERRIDE` | `8080`                | Host port for API access                               |
| `NEXA_MODEL`             | `gemma-2-2b-instruct` | Model to load (e.g., qwen3-4b, llama-3-8b, mistral-7b) |
| `NEXA_HOST`              | `0.0.0.0:8080`        | Server bind address                                    |
| `NEXA_KEEPALIVE`         | `300`                 | Model keepalive timeout in seconds                     |
| `NEXA_ORIGINS`           | `*`                   | CORS allowed origins                                   |
| `NEXA_HFTOKEN`           | -                     | HuggingFace token for private models                   |
| `NEXA_LOG`               | `none`                | Logging level (none, debug, info, warn, error)         |
| `NEXA_GPU_LAYERS`        | `-1`                  | GPU layers to offload (-1 = all, 0 = CPU only)         |
| `NEXA_SHM_SIZE`          | `2g`                  | Shared memory size                                     |
| `TZ`                     | `UTC`                 | Container timezone                                     |

### Resource Limits

| Variable                      | Default | Description        |
| ----------------------------- | ------- | ------------------ |
| `NEXA_SDK_CPU_LIMIT`          | `4.0`   | Maximum CPU cores  |
| `NEXA_SDK_MEMORY_LIMIT`       | `8G`    | Maximum memory     |
| `NEXA_SDK_CPU_RESERVATION`    | `2.0`   | Reserved CPU cores |
| `NEXA_SDK_MEMORY_RESERVATION` | `4G`    | Reserved memory    |
### Profiles

- `cpu`: Run with CPU-only inference (a profile must always be specified; there is no default)
- `gpu`: Run with GPU acceleration (requires NVIDIA GPU)
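Both services sit behind a profile, so one must always be selected. Besides the `--profile` flag, Compose also honors the standard `COMPOSE_PROFILES` variable; a short sketch:

```bash
# Equivalent ways to select the CPU profile
docker compose --profile cpu up -d
COMPOSE_PROFILES=cpu docker compose up -d

# Remember the profile when stopping as well
docker compose --profile cpu down
```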
## Usage Examples

### Test the API

```bash
# Check available models
curl http://localhost:8080/v1/models

# Chat completion
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-2-2b-instruct",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
```

### Using Different Models

Edit `.env` to change the model:

```bash
# Small models for limited resources
NEXA_MODEL=gemma-2-2b-instruct
# or
NEXA_MODEL=qwen3-4b

# Larger models for better quality
NEXA_MODEL=llama-3-8b
# or
NEXA_MODEL=mistral-7b
```

### GPU Configuration

For GPU acceleration, adjust the number of layers:

```bash
# Offload all layers to GPU (fastest)
NEXA_GPU_LAYERS=-1

# Offload 30 layers (hybrid mode)
NEXA_GPU_LAYERS=30

# CPU only
NEXA_GPU_LAYERS=0
```

## Model Management

Models are automatically downloaded on first run and cached in the `nexa_models` volume. The default cache location inside the container is `/root/.cache/nexa`.

To use a different model:

1. Update `NEXA_MODEL` in `.env`
2. Recreate the service so the new value takes effect: `docker compose --profile <cpu|gpu> up -d` (a plain `restart` does not re-read `.env`); see the sketch below
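For example, switching to another model and checking the cache afterwards might look like the following. This is a sketch that assumes the `cpu` profile and the `nexa-sdk` service name from the compose file in this commit:

```bash
# Point the stack at a different model, then recreate so the new command takes effect
sed -i 's/^NEXA_MODEL=.*/NEXA_MODEL=qwen3-4b/' .env
docker compose --profile cpu up -d

# Inspect what has been downloaded into the model cache volume
docker compose --profile cpu exec nexa-sdk ls -lh /root/.cache/nexa
```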
## API Endpoints

Nexa SDK provides OpenAI-compatible API endpoints:

- `GET /v1/models` - List available models
- `POST /v1/chat/completions` - Chat completions
- `POST /v1/completions` - Text completions
- `POST /v1/embeddings` - Text embeddings
- `GET /health` - Health check
- `GET /docs` - API documentation (Swagger UI)
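The earlier examples only exercise chat completions; the remaining endpoints follow the same pattern. A short sketch, assuming they behave like their standard OpenAI counterparts and, for embeddings, that an embedding-capable model is loaded:

```bash
# Liveness probe
curl http://localhost:8080/health

# Text embeddings (the model name must match what the server has loaded)
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-2-2b-instruct",
    "input": "Nexa SDK runs models locally."
  }'
```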
## Troubleshooting

### Out of Memory

Increase memory limits or use a smaller model:

```bash
NEXA_SDK_MEMORY_LIMIT=16G
NEXA_SDK_MEMORY_RESERVATION=8G
# Or switch to a smaller model
NEXA_MODEL=gemma-2-2b-instruct
```

### GPU Not Detected

Ensure NVIDIA Docker runtime is installed:

```bash
# Check GPU availability
docker run --rm --gpus all nvidia/cuda:12.8.1-base-ubuntu22.04 nvidia-smi
```

### Model Download Issues

Set HuggingFace token if accessing private models:

```bash
NEXA_HFTOKEN=your_hf_token_here
```

### Slow Performance

- Use GPU profile for better performance
- Increase `NEXA_GPU_LAYERS` to offload more computation to GPU
- Allocate more resources or use a smaller model
## Advanced Configuration

### Custom Model Path

If you want to use local model files, mount them as a volume:

```yaml
volumes:
  - ./models:/models
  - nexa_models:/root/.cache/nexa
```

Then reference the model by its mounted path in the service `command` instead of a model name.
### HTTPS Configuration

Set environment variables for HTTPS:

```bash
NEXA_ENABLEHTTPS=true
```

Mount certificate files:

```yaml
volumes:
  - ./certs/cert.pem:/app/cert.pem:ro
  - ./certs/key.pem:/app/key.pem:ro
```

## Health Check

The service includes a health check that verifies the API is responding:

```bash
curl http://localhost:8080/v1/models
```
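Docker itself tracks the result of this check, so the status can also be read without probing the endpoint by hand (standard Docker commands; the service name comes from the compose file):

```bash
# Overall status, including health, for the running profile
docker compose --profile cpu ps

# Just the health state of the nexa-sdk container
docker inspect --format '{{.State.Health.Status}}' "$(docker compose --profile cpu ps -q nexa-sdk)"
```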
## License

Nexa SDK is developed by Nexa AI. Please refer to the [official repository](https://github.com/NexaAI/nexa-sdk) for license information.

## Links

- [Official Repository](https://github.com/NexaAI/nexa-sdk)
- [Nexa AI Website](https://nexa.ai)
- [Documentation](https://docs.nexa.ai)
- [Model Hub](https://sdk.nexa.ai)
builds/nexa-sdk/README.zh.md (new file, 233 lines)
@@ -0,0 +1,233 @@
# Nexa SDK

Nexa SDK 是一个功能全面的本地 AI 模型运行工具包。它支持多种模型类型的推理,包括 LLM、VLM(视觉语言模型)、TTS(文本转语音)、ASR(自动语音识别)等。该工具专注于性能优化,支持 CPU 和 GPU 加速。

## 特性

- **多模型支持**:运行 LLM、VLM、TTS、ASR、嵌入、重排序和图像生成模型
- **OpenAI 兼容 API**:提供标准的 OpenAI API 端点,便于集成
- **GPU 加速**:通过 NVIDIA CUDA 提供可选的 GPU 支持,实现更快的推理速度
- **资源管理**:可配置的 CPU/内存限制和 GPU 层卸载
- **模型缓存**:持久化模型存储,加快启动速度
- **配置文件支持**:轻松在 CPU 模式和 GPU 加速模式之间切换

## 快速开始

### 前置要求

- Docker 和 Docker Compose
- GPU 支持需要:NVIDIA Docker runtime 和兼容的 GPU

### 基本使用(CPU)

```bash
# 复制环境配置文件
cp .env.example .env

# 编辑 .env 配置模型和设置
# NEXA_MODEL=gemma-2-2b-instruct

# 使用 CPU 配置文件启动服务
docker compose --profile cpu up -d
```

### GPU 加速使用

```bash
# 复制环境配置文件
cp .env.example .env

# 配置 GPU 使用
# NEXA_MODEL=gemma-2-2b-instruct
# NEXA_GPU_LAYERS=-1  # -1 表示所有层都在 GPU 上

# 使用 GPU 配置文件启动服务
docker compose --profile gpu up -d
```
## 配置

### 环境变量

| 变量                     | 默认值                | 说明                                                 |
| ------------------------ | --------------------- | ---------------------------------------------------- |
| `NEXA_SDK_VERSION`       | `latest`              | Nexa SDK Docker 镜像版本                             |
| `NEXA_SDK_PORT_OVERRIDE` | `8080`                | API 访问的主机端口                                   |
| `NEXA_MODEL`             | `gemma-2-2b-instruct` | 要加载的模型(如 qwen3-4b、llama-3-8b、mistral-7b)  |
| `NEXA_HOST`              | `0.0.0.0:8080`        | 服务器绑定地址                                       |
| `NEXA_KEEPALIVE`         | `300`                 | 模型保活超时时间(秒)                               |
| `NEXA_ORIGINS`           | `*`                   | CORS 允许的源                                        |
| `NEXA_HFTOKEN`           | -                     | 用于私有模型的 HuggingFace 令牌                      |
| `NEXA_LOG`               | `none`                | 日志级别(none、debug、info、warn、error)           |
| `NEXA_GPU_LAYERS`        | `-1`                  | 卸载到 GPU 的层数(-1 = 全部,0 = 仅 CPU)           |
| `NEXA_SHM_SIZE`          | `2g`                  | 共享内存大小                                         |
| `TZ`                     | `UTC`                 | 容器时区                                             |

### 资源限制

| 变量                          | 默认值 | 说明            |
| ----------------------------- | ------ | --------------- |
| `NEXA_SDK_CPU_LIMIT`          | `4.0`  | 最大 CPU 核心数 |
| `NEXA_SDK_MEMORY_LIMIT`       | `8G`   | 最大内存        |
| `NEXA_SDK_CPU_RESERVATION`    | `2.0`  | 预留 CPU 核心数 |
| `NEXA_SDK_MEMORY_RESERVATION` | `4G`   | 预留内存        |

### 配置文件

- `cpu`:使用 CPU 推理运行(必须显式指定配置文件,没有默认值)
- `gpu`:使用 GPU 加速运行(需要 NVIDIA GPU)
## 使用示例

### 测试 API

```bash
# 检查可用模型
curl http://localhost:8080/v1/models

# 聊天完成
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-2-2b-instruct",
    "messages": [
      {"role": "user", "content": "你好!"}
    ]
  }'
```

### 使用不同的模型

编辑 `.env` 更改模型:

```bash
# 资源受限时使用小模型
NEXA_MODEL=gemma-2-2b-instruct
# 或
NEXA_MODEL=qwen3-4b

# 追求更好质量时使用大模型
NEXA_MODEL=llama-3-8b
# 或
NEXA_MODEL=mistral-7b
```

### GPU 配置

对于 GPU 加速,调整层数:

```bash
# 将所有层卸载到 GPU(最快)
NEXA_GPU_LAYERS=-1

# 卸载 30 层(混合模式)
NEXA_GPU_LAYERS=30

# 仅 CPU
NEXA_GPU_LAYERS=0
```

## 模型管理

模型会在首次运行时自动下载,并缓存在 `nexa_models` 卷中。容器内的默认缓存位置是 `/root/.cache/nexa`。

要使用不同的模型:

1. 在 `.env` 中更新 `NEXA_MODEL`
2. 重新创建服务以使新值生效:`docker compose --profile <cpu|gpu> up -d`(仅执行 `restart` 不会重新读取 `.env`)
## API 端点

Nexa SDK 提供 OpenAI 兼容的 API 端点:

- `GET /v1/models` - 列出可用模型
- `POST /v1/chat/completions` - 聊天完成
- `POST /v1/completions` - 文本完成
- `POST /v1/embeddings` - 文本嵌入
- `GET /health` - 健康检查
- `GET /docs` - API 文档(Swagger UI)

## 故障排除

### 内存不足

增加内存限制或使用更小的模型:

```bash
NEXA_SDK_MEMORY_LIMIT=16G
NEXA_SDK_MEMORY_RESERVATION=8G
# 或切换到更小的模型
NEXA_MODEL=gemma-2-2b-instruct
```

### GPU 未检测到

确保已安装 NVIDIA Docker runtime:

```bash
# 检查 GPU 可用性
docker run --rm --gpus all nvidia/cuda:12.8.1-base-ubuntu22.04 nvidia-smi
```

### 模型下载问题

如果访问私有模型,设置 HuggingFace 令牌:

```bash
NEXA_HFTOKEN=your_hf_token_here
```

### 性能缓慢

- 使用 GPU 配置文件以获得更好的性能
- 增加 `NEXA_GPU_LAYERS` 以将更多计算卸载到 GPU
- 分配更多资源或使用更小的模型

## 高级配置

### 自定义模型路径

如果要使用本地模型文件,将它们挂载为卷:

```yaml
volumes:
  - ./models:/models
  - nexa_models:/root/.cache/nexa
```

然后在命令中通过路径引用模型。

### HTTPS 配置

设置 HTTPS 的环境变量:

```bash
NEXA_ENABLEHTTPS=true
```

挂载证书文件:

```yaml
volumes:
  - ./certs/cert.pem:/app/cert.pem:ro
  - ./certs/key.pem:/app/key.pem:ro
```

## 健康检查

服务包含验证 API 是否响应的健康检查:

```bash
curl http://localhost:8080/v1/models
```

## 许可证

Nexa SDK 由 Nexa AI 开发。许可证信息请参考[官方仓库](https://github.com/NexaAI/nexa-sdk)。

## 链接

- [官方仓库](https://github.com/NexaAI/nexa-sdk)
- [Nexa AI 网站](https://nexa.ai)
- [文档](https://docs.nexa.ai)
- [模型中心](https://sdk.nexa.ai)
builds/nexa-sdk/docker-compose.yaml (new file, 94 lines)
@@ -0,0 +1,94 @@
x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: 100m
      max-file: "3"

services:
  nexa-sdk:
    <<: *defaults
    build:
      context: .
      dockerfile: Dockerfile
    image: ${GLOBAL_REGISTRY:-}alexsuntop/nexa-sdk:${NEXA_SDK_CPU_VERSION:-0.2.57}
    ports:
      - "${NEXA_SDK_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - nexa_models:/root/.cache/nexa
    environment:
      - TZ=${TZ:-UTC}
      - NEXA_HOST=${NEXA_HOST:-0.0.0.0:8080}
      - NEXA_KEEPALIVE=${NEXA_KEEPALIVE:-300}
      - NEXA_ORIGINS=${NEXA_ORIGINS:-*}
      - NEXA_HFTOKEN=${NEXA_HFTOKEN:-}
      - NEXA_LOG=${NEXA_LOG:-none}
    command: >
      nexa server
      ${NEXA_MODEL:-gemma-2-2b-instruct}
    ipc: host
    shm_size: ${NEXA_SHM_SIZE:-2g}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          cpus: ${NEXA_SDK_CPU_LIMIT:-4.0}
          memory: ${NEXA_SDK_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${NEXA_SDK_CPU_RESERVATION:-2.0}
          memory: ${NEXA_SDK_MEMORY_RESERVATION:-4G}
    profiles:
      - cpu

  nexa-sdk-cuda:
    <<: *defaults
    build:
      context: .
      dockerfile: Dockerfile.cuda
    image: ${GLOBAL_REGISTRY:-}alexsuntop/nexa-sdk:${NEXA_SDK_CUDA_VERSION:-0.2.57-cuda}
    ports:
      - "${NEXA_SDK_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - nexa_models:/root/.cache/nexa
    environment:
      - TZ=${TZ:-UTC}
      - NEXA_HOST=${NEXA_HOST:-0.0.0.0:8080}
      - NEXA_KEEPALIVE=${NEXA_KEEPALIVE:-300}
      - NEXA_ORIGINS=${NEXA_ORIGINS:-*}
      - NEXA_HFTOKEN=${NEXA_HFTOKEN:-}
      - NEXA_LOG=${NEXA_LOG:-none}
    command: >
      nexa server
      ${NEXA_MODEL:-gemma-2-2b-instruct}
      -ngl ${NEXA_GPU_LAYERS:--1}
    ipc: host
    shm_size: ${NEXA_SHM_SIZE:-2g}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/v1/models"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          cpus: ${NEXA_SDK_CPU_LIMIT:-4.0}
          memory: ${NEXA_SDK_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${NEXA_SDK_CPU_RESERVATION:-2.0}
          memory: ${NEXA_SDK_MEMORY_RESERVATION:-4G}
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
    profiles:
      - gpu
volumes:
  nexa_models: