diff --git a/README.md b/README.md index e5c2f09..b2d131e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ These services require building custom Docker images from source. | [Clash](./src/clash) | 1.18.0 | | [ClickHouse](./src/clickhouse) | 24.11.1 | | [Conductor](./src/conductor) | latest | +| [DeepTutor](./apps/deeptutor) | latest | | [Dify](./apps/dify) | 0.18.2 | | [DNSMasq](./src/dnsmasq) | 2.91 | | [Dockge](./src/dockge) | 1 | @@ -72,6 +73,7 @@ These services require building custom Docker images from source. | [LibreOffice](./src/libreoffice) | latest | | [libSQL Server](./src/libsql) | latest | | [LiteLLM](./src/litellm) | main-stable | +| [llama.cpp](./src/llama.cpp) | server | | [LMDeploy](./src/lmdeploy) | v0.11.1 | | [Logstash](./src/logstash) | 8.16.1 | | [MariaDB Galera Cluster](./src/mariadb-galera) | 11.7.2 | diff --git a/README.zh.md b/README.zh.md index 4308efa..68fc15e 100644 --- a/README.zh.md +++ b/README.zh.md @@ -34,6 +34,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [Clash](./src/clash) | 1.18.0 | | [ClickHouse](./src/clickhouse) | 24.11.1 | | [Conductor](./src/conductor) | latest | +| [DeepTutor](./apps/deeptutor) | latest | | [Dify](./apps/dify) | 0.18.2 | | [DNSMasq](./src/dnsmasq) | 2.91 | | [Dockge](./src/dockge) | 1 | @@ -72,6 +73,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [LibreOffice](./src/libreoffice) | latest | | [libSQL Server](./src/libsql) | latest | | [LiteLLM](./src/litellm) | main-stable | +| [llama.cpp](./src/llama.cpp) | server | | [LMDeploy](./src/lmdeploy) | v0.11.1 | | [Logstash](./src/logstash) | 8.16.1 | | [MariaDB Galera Cluster](./src/mariadb-galera) | 11.7.2 | diff --git a/apps/deeptutor/.env.example b/apps/deeptutor/.env.example new file mode 100644 index 0000000..1a8419c --- /dev/null +++ b/apps/deeptutor/.env.example @@ -0,0 +1,97 @@ +# DeepTutor Configuration +# Copy this file to .env and fill in your API keys + +#! ================================================== +#! General Settings +#! ================================================== + +# Timezone (default: UTC) +TZ=UTC + +# User and Group ID for file permissions (default: 1000) +# Adjust if your host user has a different UID/GID +PUID=1000 +PGID=1000 + +# Global registry prefix (optional) +# Example: registry.example.com/ or leave empty for Docker Hub/GHCR +GLOBAL_REGISTRY= + +#! ================================================== +#! DeepTutor Version +#! ================================================== + +# Image version (default: latest) +# Available tags: latest, v0.5.x +# See: https://github.com/HKUDS/DeepTutor/pkgs/container/deeptutor +DEEPTUTOR_VERSION=latest + +#! ================================================== +#! Port Configuration +#! ================================================== + +# Backend port (internal: 8001) +BACKEND_PORT=8001 +# Host port override for backend +DEEPTUTOR_BACKEND_PORT_OVERRIDE=8001 + +# Frontend port (internal: 3782) +FRONTEND_PORT=3782 +# Host port override for frontend +DEEPTUTOR_FRONTEND_PORT_OVERRIDE=3782 + +#! ================================================== +#! API Base URLs +#! ================================================== + +# Internal API base URL (used by frontend to communicate with backend) +NEXT_PUBLIC_API_BASE=http://localhost:8001 + +# External API base URL (for cloud deployment, set to your public URL) +# Example: https://your-server.com:8001 +# For local deployment, use the same as NEXT_PUBLIC_API_BASE +NEXT_PUBLIC_API_BASE_EXTERNAL=http://localhost:8001 + +#! 
================================================== +#! LLM API Keys (Required) +#! ================================================== + +# OpenAI API Key (Required) +# Get from: https://platform.openai.com/api-keys +OPENAI_API_KEY=sk-your-openai-api-key-here + +# OpenAI Base URL (default: https://api.openai.com/v1) +# For OpenAI-compatible APIs (e.g., Azure OpenAI, custom endpoints) +OPENAI_BASE_URL=https://api.openai.com/v1 + +# Default LLM Model (default: gpt-4o) +# Options: gpt-4o, gpt-4-turbo, gpt-4, gpt-3.5-turbo, etc. +DEFAULT_MODEL=gpt-4o + +#! ================================================== +#! Additional LLM API Keys (Optional) +#! ================================================== + +# Anthropic API Key (Optional, for Claude models) +# Get from: https://console.anthropic.com/ +ANTHROPIC_API_KEY= + +# Perplexity API Key (Optional, for web search) +# Get from: https://www.perplexity.ai/settings/api +PERPLEXITY_API_KEY= + +# DashScope API Key (Optional, for Alibaba Cloud models) +# Get from: https://dashscope.console.aliyun.com/ +DASHSCOPE_API_KEY= + +#! ================================================== +#! Resource Limits +#! ================================================== + +# CPU limits (default: 4.00 cores limit, 1.00 cores reservation) +DEEPTUTOR_CPU_LIMIT=4.00 +DEEPTUTOR_CPU_RESERVATION=1.00 + +# Memory limits (default: 8G limit, 2G reservation) +DEEPTUTOR_MEMORY_LIMIT=8G +DEEPTUTOR_MEMORY_RESERVATION=2G diff --git a/apps/deeptutor/README.md b/apps/deeptutor/README.md new file mode 100644 index 0000000..8453cbd --- /dev/null +++ b/apps/deeptutor/README.md @@ -0,0 +1,248 @@ +# DeepTutor + +[中文说明](README.zh.md) | English + +## Overview + +DeepTutor is an AI-powered personalized learning assistant that transforms any document into an interactive learning experience with multi-agent intelligence. It helps you solve problems, generate questions, conduct research, collaborate on writing, organize notes, and guides you through learning paths. + +**Project:** +**License:** Apache-2.0 +**Documentation:** + +## Features + +- **Problem Solving** — Detailed step-by-step solutions with visual diagrams +- **Question Generation** — Adaptive questions based on your knowledge level +- **Research Assistant** — Deep research with multi-agent collaboration +- **Co-Writer** — Interactive idea generation and writing assistance +- **Smart Notebook** — Organize and retrieve learning materials efficiently +- **Guided Learning** — Personalized learning paths and progress tracking +- **Multi-Agent System** — Specialized agents for different learning tasks +- **RAG Integration** — LightRAG and RAG-Anything for knowledge retrieval +- **Code Execution** — Built-in code playground for practice + +## Quick Start + +### Prerequisites + +- Docker and Docker Compose +- OpenAI API key (required) +- Optional: Anthropic, Perplexity, or DashScope API keys + +### Installation + +1. **Clone this repository** + + ```bash + git clone + cd apps/deeptutor + ``` + +2. **Configure environment** + + ```bash + cp .env.example .env + # Edit .env and add your API keys + ``` + + **Required configuration:** + - `OPENAI_API_KEY` — Your OpenAI API key + + **Optional configuration:** + - `ANTHROPIC_API_KEY` — For Claude models + - `PERPLEXITY_API_KEY` — For web search + - `DASHSCOPE_API_KEY` — For Alibaba Cloud models + - Adjust ports if needed (default: 8001 for backend, 3782 for frontend) + - Set `NEXT_PUBLIC_API_BASE_EXTERNAL` for cloud deployments + +3. 
**Optional: Custom agent configuration** + + Create a `config/agents.yaml` file to customize agent behaviors (see [documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for details). + +4. **Start the service** + + ```bash + docker compose up -d + ``` + + First run takes approximately 30-60 seconds to initialize. + +5. **Access the application** + + - **Frontend:** + - **Backend API:** + - **API Documentation:** + +## Usage + +### Create Knowledge Base + +1. Navigate to +2. Click "New Knowledge Base" +3. Upload documents (supports PDF, DOCX, TXT, Markdown, HTML, etc.) +4. Wait for processing to complete + +### Learning Modes + +- **Solve** — Get step-by-step solutions to problems +- **Question** — Generate practice questions based on your materials +- **Research** — Deep research with multi-agent collaboration +- **Co-Writer** — Interactive writing and idea generation +- **Notebook** — Organize and manage your learning materials +- **Guide** — Follow personalized learning paths + +### Advanced Features + +- **Code Execution** — Practice coding directly in the interface +- **Visual Diagrams** — Automatic diagram generation for complex concepts +- **Export** — Download your work as PDF or Markdown +- **Multi-language** — Support for multiple languages + +## Configuration + +### Environment Variables + +Key environment variables (see [.env.example](.env.example) for all options): + +| Variable | Default | Description | +| ------------------------ | ---------- | ------------------------- | +| `OPENAI_API_KEY` | (required) | Your OpenAI API key | +| `DEFAULT_MODEL` | `gpt-4o` | Default LLM model | +| `BACKEND_PORT` | `8001` | Backend server port | +| `FRONTEND_PORT` | `3782` | Frontend application port | +| `DEEPTUTOR_CPU_LIMIT` | `4.00` | CPU limit (cores) | +| `DEEPTUTOR_MEMORY_LIMIT` | `8G` | Memory limit | + +### Ports + +- **8001** — Backend API server +- **3782** — Frontend web interface + +### Volumes + +- `deeptutor_data` — User data, knowledge bases, and learning materials +- `./config` — Custom agent configurations (optional) + +## Resource Requirements + +**Minimum:** + +- CPU: 1 core +- Memory: 2GB +- Disk: 2GB + space for knowledge bases + +**Recommended:** + +- CPU: 4 cores +- Memory: 8GB +- Disk: 10GB+ + +## Supported Models + +DeepTutor supports multiple LLM providers: + +- **OpenAI** — GPT-4, GPT-4 Turbo, GPT-3.5 Turbo +- **Anthropic** — Claude 3 (Opus, Sonnet, Haiku) +- **Perplexity** — For web search integration +- **DashScope** — Alibaba Cloud models +- **OpenAI-compatible APIs** — Any API compatible with OpenAI format + +## Troubleshooting + +### Backend fails to start + +- Verify `OPENAI_API_KEY` is set correctly in `.env` +- Check logs: `docker compose logs -f` +- Ensure ports 8001 and 3782 are not in use +- Verify sufficient disk space for volumes + +### Frontend cannot connect to backend + +- Confirm backend is running: visit +- For cloud deployments, set `NEXT_PUBLIC_API_BASE_EXTERNAL` to your public URL +- Check firewall settings + +### Knowledge base processing fails + +- Ensure sufficient memory (recommended 8GB+) +- Check document format is supported +- Review logs for specific errors + +### API rate limits + +- Monitor your API usage on provider dashboards +- Consider upgrading your API plan +- Use different models for different tasks + +## Security Notes + +- **API Keys** — Keep your API keys secure, never commit them to version control +- **Network Exposure** — For production deployments, use HTTPS and proper authentication +- **Data Privacy** — 
User data is stored in Docker volumes; ensure proper backup and security +- **Resource Limits** — Set appropriate CPU and memory limits to prevent resource exhaustion + +## Updates + +To update to the latest version: + +```bash +# Pull the latest image +docker compose pull + +# Recreate containers +docker compose up -d +``` + +To update to a specific version, edit `DEEPTUTOR_VERSION` in `.env` and run: + +```bash +docker compose up -d +``` + +## Advanced Usage + +### Custom Agent Configuration + +Create `config/agents.yaml` to customize agent behaviors: + +```yaml +agents: + solver: + model: gpt-4o + temperature: 0.7 + researcher: + model: gpt-4-turbo + max_tokens: 4000 +``` + +See [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for detailed configuration options. + +### Cloud Deployment + +For cloud deployment, additional configuration is needed: + +1. Set public URL in `.env`: + + ```env + NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001 + ``` + +2. Configure reverse proxy (nginx/Caddy) for HTTPS +3. Ensure proper firewall rules +4. Consider using environment-specific secrets management + +### Using Different Embedding Models + +DeepTutor uses `text-embedding-3-large` by default. To use different embedding models, refer to the [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html). + +## Links + +- **GitHub:** +- **Documentation:** +- **Issues:** +- **Discussions:** + +## License + +DeepTutor is licensed under the Apache-2.0 License. See the [official repository](https://github.com/HKUDS/DeepTutor) for details. diff --git a/apps/deeptutor/README.zh.md b/apps/deeptutor/README.zh.md new file mode 100644 index 0000000..37da444 --- /dev/null +++ b/apps/deeptutor/README.zh.md @@ -0,0 +1,248 @@ +# DeepTutor + +中文说明 | [English](README.md) + +## 概述 + +DeepTutor 是一个 AI 驱动的个性化学习助手,通过多智能体系统将任何文档转化为交互式学习体验。它可以帮助您解决问题、生成题目、进行研究、协作写作、整理笔记,并引导您完成学习路径。 + +**项目地址:** +**许可证:** Apache-2.0 +**文档:** + +## 功能特性 + +- **问题求解** — 提供详细的分步解决方案和可视化图表 +- **题目生成** — 根据您的知识水平生成自适应题目 +- **研究助手** — 通过多智能体协作进行深度研究 +- **协作写作** — 交互式创意生成和写作辅助 +- **智能笔记** — 高效组织和检索学习材料 +- **引导学习** — 个性化学习路径和进度跟踪 +- **多智能体系统** — 针对不同学习任务的专业智能体 +- **RAG 集成** — 使用 LightRAG 和 RAG-Anything 进行知识检索 +- **代码执行** — 内置代码练习环境 + +## 快速开始 + +### 前置要求 + +- Docker 和 Docker Compose +- OpenAI API 密钥(必需) +- 可选:Anthropic、Perplexity 或 DashScope API 密钥 + +### 安装步骤 + +1. **克隆仓库** + + ```bash + git clone + cd apps/deeptutor + ``` + +2. **配置环境变量** + + ```bash + cp .env.example .env + # 编辑 .env 文件并添加您的 API 密钥 + ``` + + **必需配置:** + - `OPENAI_API_KEY` — 您的 OpenAI API 密钥 + + **可选配置:** + - `ANTHROPIC_API_KEY` — 用于 Claude 模型 + - `PERPLEXITY_API_KEY` — 用于网络搜索 + - `DASHSCOPE_API_KEY` — 用于阿里云模型 + - 如需调整端口(默认:后端 8001,前端 3782) + - 云端部署时设置 `NEXT_PUBLIC_API_BASE_EXTERNAL` + +3. **可选:自定义智能体配置** + + 创建 `config/agents.yaml` 文件以自定义智能体行为(详见[文档](https://hkuds.github.io/DeepTutor/guide/config.html))。 + +4. **启动服务** + + ```bash + docker compose up -d + ``` + + 首次运行需要约 30-60 秒初始化。 + +5. **访问应用** + + - **前端界面:** + - **后端 API:** + - **API 文档:** + +## 使用方法 + +### 创建知识库 + +1. 访问 +2. 点击"新建知识库" +3. 上传文档(支持 PDF、DOCX、TXT、Markdown、HTML 等) +4. 
等待处理完成 + +### 学习模式 + +- **求解(Solve)** — 获取问题的分步解决方案 +- **题目(Question)** — 基于学习材料生成练习题 +- **研究(Research)** — 通过多智能体协作进行深度研究 +- **协作写作(Co-Writer)** — 交互式写作和创意生成 +- **笔记(Notebook)** — 组织和管理学习材料 +- **引导(Guide)** — 遵循个性化学习路径 + +### 高级功能 + +- **代码执行** — 在界面中直接练习编码 +- **可视化图表** — 为复杂概念自动生成图表 +- **导出** — 将您的工作下载为 PDF 或 Markdown +- **多语言支持** — 支持多种语言 + +## 配置说明 + +### 环境变量 + +主要环境变量(所有选项见 [.env.example](.env.example)): + +| 变量 | 默认值 | 描述 | +| ------------------------ | -------- | -------------------- | +| `OPENAI_API_KEY` | (必需) | 您的 OpenAI API 密钥 | +| `DEFAULT_MODEL` | `gpt-4o` | 默认 LLM 模型 | +| `BACKEND_PORT` | `8001` | 后端服务器端口 | +| `FRONTEND_PORT` | `3782` | 前端应用端口 | +| `DEEPTUTOR_CPU_LIMIT` | `4.00` | CPU 限制(核心数) | +| `DEEPTUTOR_MEMORY_LIMIT` | `8G` | 内存限制 | + +### 端口说明 + +- **8001** — 后端 API 服务器 +- **3782** — 前端 Web 界面 + +### 数据卷 + +- `deeptutor_data` — 用户数据、知识库和学习材料 +- `./config` — 自定义智能体配置(可选) + +## 资源要求 + +**最低配置:** + +- CPU:1 核心 +- 内存:2GB +- 磁盘:2GB + 知识库所需空间 + +**推荐配置:** + +- CPU:4 核心 +- 内存:8GB +- 磁盘:10GB+ + +## 支持的模型 + +DeepTutor 支持多个 LLM 提供商: + +- **OpenAI** — GPT-4、GPT-4 Turbo、GPT-3.5 Turbo +- **Anthropic** — Claude 3(Opus、Sonnet、Haiku) +- **Perplexity** — 用于网络搜索集成 +- **DashScope** — 阿里云模型 +- **OpenAI 兼容 API** — 任何与 OpenAI 格式兼容的 API + +## 故障排查 + +### 后端启动失败 + +- 验证 `.env` 中的 `OPENAI_API_KEY` 是否正确设置 +- 查看日志:`docker compose logs -f` +- 确保端口 8001 和 3782 未被占用 +- 验证数据卷有足够的磁盘空间 + +### 前端无法连接后端 + +- 确认后端正在运行:访问 +- 云端部署时,将 `NEXT_PUBLIC_API_BASE_EXTERNAL` 设置为您的公网 URL +- 检查防火墙设置 + +### 知识库处理失败 + +- 确保有足够的内存(推荐 8GB+) +- 检查文档格式是否支持 +- 查看日志了解具体错误 + +### API 速率限制 + +- 在提供商控制台监控 API 使用情况 +- 考虑升级 API 计划 +- 为不同任务使用不同模型 + +## 安全提示 + +- **API 密钥** — 妥善保管您的 API 密钥,切勿提交到版本控制系统 +- **网络暴露** — 生产环境部署时,使用 HTTPS 和适当的身份验证 +- **数据隐私** — 用户数据存储在 Docker 卷中,请确保适当的备份和安全措施 +- **资源限制** — 设置合适的 CPU 和内存限制以防止资源耗尽 + +## 更新 + +更新到最新版本: + +```bash +# 拉取最新镜像 +docker compose pull + +# 重新创建容器 +docker compose up -d +``` + +更新到特定版本,编辑 `.env` 中的 `DEEPTUTOR_VERSION` 并运行: + +```bash +docker compose up -d +``` + +## 高级用法 + +### 自定义智能体配置 + +创建 `config/agents.yaml` 以自定义智能体行为: + +```yaml +agents: + solver: + model: gpt-4o + temperature: 0.7 + researcher: + model: gpt-4-turbo + max_tokens: 4000 +``` + +详细配置选项请参见[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。 + +### 云端部署 + +云端部署需要额外配置: + +1. 在 `.env` 中设置公网 URL: + + ```env + NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001 + ``` + +2. 配置反向代理(nginx/Caddy)以支持 HTTPS +3. 确保适当的防火墙规则 +4. 
考虑使用特定环境的密钥管理 + +### 使用不同的嵌入模型 + +DeepTutor 默认使用 `text-embedding-3-large`。要使用不同的嵌入模型,请参考[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。 + +## 相关链接 + +- **GitHub:** +- **文档:** +- **问题反馈:** +- **讨论区:** + +## 许可证 + +DeepTutor 使用 Apache-2.0 许可证。详情请参见[官方仓库](https://github.com/HKUDS/DeepTutor)。 diff --git a/apps/deeptutor/docker-compose.yaml b/apps/deeptutor/docker-compose.yaml new file mode 100644 index 0000000..86aec5a --- /dev/null +++ b/apps/deeptutor/docker-compose.yaml @@ -0,0 +1,68 @@ +# DeepTutor: AI-Powered Personalized Learning Assistant +# https://github.com/HKUDS/DeepTutor +# Transform any document into an interactive learning experience with multi-agent intelligence + +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + deeptutor: + <<: *defaults + image: ${GLOBAL_REGISTRY:-ghcr.io}/hkuds/deeptutor:${DEEPTUTOR_VERSION:-latest} + ports: + - "${DEEPTUTOR_BACKEND_PORT_OVERRIDE:-8001}:${BACKEND_PORT:-8001}" + - "${DEEPTUTOR_FRONTEND_PORT_OVERRIDE:-3782}:${FRONTEND_PORT:-3782}" + volumes: + - deeptutor_data:/app/data + - ./config:/app/config:ro + environment: + - TZ=${TZ:-UTC} + # Backend port + - BACKEND_PORT=${BACKEND_PORT:-8001} + # Frontend port + - FRONTEND_PORT=${FRONTEND_PORT:-3782} + # API base URLs + - NEXT_PUBLIC_API_BASE=${NEXT_PUBLIC_API_BASE:-http://localhost:8001} + - NEXT_PUBLIC_API_BASE_EXTERNAL=${NEXT_PUBLIC_API_BASE_EXTERNAL:-http://localhost:8001} + # LLM API Keys + - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - PERPLEXITY_API_KEY=${PERPLEXITY_API_KEY:-} + - DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-} + # Default LLM model + - DEFAULT_MODEL=${DEFAULT_MODEL:-gpt-4o} + # User ID and Group ID for permission management + - PUID=${PUID:-1000} + - PGID=${PGID:-1000} + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:${BACKEND_PORT:-8001}/health", + "||", + "exit", + "1", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + deploy: + resources: + limits: + cpus: ${DEEPTUTOR_CPU_LIMIT:-4.00} + memory: ${DEEPTUTOR_MEMORY_LIMIT:-8G} + reservations: + cpus: ${DEEPTUTOR_CPU_RESERVATION:-1.00} + memory: ${DEEPTUTOR_MEMORY_RESERVATION:-2G} + +volumes: + deeptutor_data: diff --git a/src/llama.cpp/.env.example b/src/llama.cpp/.env.example new file mode 100644 index 0000000..90a2b18 --- /dev/null +++ b/src/llama.cpp/.env.example @@ -0,0 +1,106 @@ +# ============================================================================= +# llama.cpp Configuration +# https://github.com/ggml-org/llama.cpp +# LLM inference in C/C++ with support for various hardware accelerators +# ============================================================================= + +# ----------------------------------------------------------------------------- +# General Settings +# ----------------------------------------------------------------------------- + +# Timezone for the container (default: UTC) +TZ=UTC + +# Global registry prefix (optional) +# Example: docker.io/, ghcr.io/, registry.example.com/ +GHCR_REGISTRY=ghcr.io/ + +# ----------------------------------------------------------------------------- +# Server Configuration +# ----------------------------------------------------------------------------- + +# Server image variant +# Options: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU), +# server-musa (Moore Threads GPU), server-intel (Intel GPU), +# 
server-vulkan (Vulkan GPU) +LLAMA_CPP_SERVER_VARIANT=server + +# Server port override (default: 8080) +LLAMA_CPP_SERVER_PORT_OVERRIDE=8080 + +# Model path inside the container +# You need to mount your model file to this path +# Example: /models/llama-2-7b-chat.Q4_K_M.gguf +LLAMA_CPP_MODEL_PATH=/models/model.gguf + +# Context size (number of tokens) +# Larger values allow for more context but require more memory +# Default: 512, Common values: 512, 2048, 4096, 8192, 16384, 32768 +LLAMA_CPP_CONTEXT_SIZE=512 + +# Number of GPU layers to offload +# 0 = CPU only, 99 = all layers on GPU (for GPU variants) +# For CPU variant, keep this at 0 +LLAMA_CPP_GPU_LAYERS=0 + +# Number of GPUs to use (for CUDA variant) +LLAMA_CPP_GPU_COUNT=1 + +# Server CPU limit (in cores) +LLAMA_CPP_SERVER_CPU_LIMIT=4.0 + +# Server CPU reservation (in cores) +LLAMA_CPP_SERVER_CPU_RESERVATION=2.0 + +# Server memory limit +LLAMA_CPP_SERVER_MEMORY_LIMIT=8G + +# Server memory reservation +LLAMA_CPP_SERVER_MEMORY_RESERVATION=4G + +# ----------------------------------------------------------------------------- +# CLI Configuration (Light variant) +# ----------------------------------------------------------------------------- + +# CLI image variant +# Options: light (CPU), light-cuda (NVIDIA GPU), light-rocm (AMD GPU), +# light-musa (Moore Threads GPU), light-intel (Intel GPU), +# light-vulkan (Vulkan GPU) +LLAMA_CPP_CLI_VARIANT=light + +# Default prompt for CLI mode +LLAMA_CPP_PROMPT=Hello, how are you? + +# CLI CPU limit (in cores) +LLAMA_CPP_CLI_CPU_LIMIT=2.0 + +# CLI CPU reservation (in cores) +LLAMA_CPP_CLI_CPU_RESERVATION=1.0 + +# CLI memory limit +LLAMA_CPP_CLI_MEMORY_LIMIT=4G + +# CLI memory reservation +LLAMA_CPP_CLI_MEMORY_RESERVATION=2G + +# ----------------------------------------------------------------------------- +# Full Toolkit Configuration +# ----------------------------------------------------------------------------- + +# Full image variant (includes model conversion tools) +# Options: full (CPU), full-cuda (NVIDIA GPU), full-rocm (AMD GPU), +# full-musa (Moore Threads GPU), full-intel (Intel GPU), +# full-vulkan (Vulkan GPU) +LLAMA_CPP_FULL_VARIANT=full + +# Full CPU limit (in cores) +LLAMA_CPP_FULL_CPU_LIMIT=2.0 + +# Full CPU reservation (in cores) +LLAMA_CPP_FULL_CPU_RESERVATION=1.0 + +# Full memory limit +LLAMA_CPP_FULL_MEMORY_LIMIT=4G + +# Full memory reservation +LLAMA_CPP_FULL_MEMORY_RESERVATION=2G diff --git a/src/llama.cpp/README.md b/src/llama.cpp/README.md new file mode 100644 index 0000000..bdf4c42 --- /dev/null +++ b/src/llama.cpp/README.md @@ -0,0 +1,245 @@ +# llama.cpp + +[中文文档](README.zh.md) + +[llama.cpp](https://github.com/ggml-org/llama.cpp) is a high-performance C/C++ implementation for LLM inference with support for various hardware accelerators. 
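+
+In server mode it exposes an OpenAI-compatible HTTP API, so plain `curl` or any OpenAI-style client can drive a local model. As a quick taste — assuming the `server` profile from this directory is already running on the default port 8080 with a model mounted (see Quick Start below) — a request plus a `jq` one-liner to pull out just the reply might look like this:
+
+```bash
+# Ask the local server a question and print only the assistant's reply.
+# Requires curl and jq; the "model" field can be omitted because llama-server
+# answers with whichever model it was started with.
+curl -s http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "messages": [{"role": "user", "content": "Explain GGUF in one sentence."}],
+        "max_tokens": 128
+      }' | jq -r '.choices[0].message.content'
+```
+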
+ +## Features + +- **Fast Inference**: Optimized C/C++ implementation for efficient LLM inference +- **Multiple Backends**: CPU, CUDA (NVIDIA), ROCm (AMD), MUSA (Moore Threads), Intel GPU, Vulkan +- **OpenAI-compatible API**: Server mode with OpenAI-compatible REST API +- **CLI Support**: Interactive command-line interface for quick testing +- **Model Conversion**: Full toolkit includes tools to convert and quantize models +- **GGUF Format**: Support for the efficient GGUF model format +- **Cross-platform**: Linux (x86-64, ARM64, s390x), Windows, macOS + +## Prerequisites + +- Docker and Docker Compose installed +- At least 4GB of RAM (8GB+ recommended) +- For GPU variants: + - **CUDA**: NVIDIA GPU with [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) + - **ROCm**: AMD GPU with proper ROCm drivers + - **MUSA**: Moore Threads GPU with mt-container-toolkit +- GGUF format model file (e.g., from [Hugging Face](https://huggingface.co/models?library=gguf)) + +## Quick Start + +### 1. Server Mode (CPU) + +```bash +# Copy and configure environment +cp .env.example .env + +# Edit .env and set your model path +# LLAMA_CPP_MODEL_PATH=/models/your-model.gguf + +# Place your GGUF model in a directory, then update docker-compose.yaml +# to mount it, e.g.: +# volumes: +# - ./models:/models + +# Start the server +docker compose --profile server up -d + +# Test the server (OpenAI-compatible API) +curl http://localhost:8080/v1/models + +# Chat completion request +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' +``` + +### 2. Server Mode with NVIDIA GPU + +```bash +# Edit .env +# Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU + +# Start GPU-accelerated server +docker compose --profile cuda up -d + +# The server will automatically use NVIDIA GPU +``` + +### 3. Server Mode with AMD GPU + +```bash +# Edit .env +# Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU + +# Start GPU-accelerated server +docker compose --profile rocm up -d + +# The server will automatically use AMD GPU +``` + +### 4. CLI Mode + +```bash +# Edit .env and configure model path and prompt + +# Run CLI +docker compose --profile cli up + +# For interactive mode, use: +docker compose run --rm llama-cpp-cli \ + -m /models/your-model.gguf \ + -p "Your prompt here" \ + -n 512 +``` + +### 5. Full Toolkit (Model Conversion) + +```bash +# Start the full container +docker compose --profile full up -d + +# Execute commands inside the container +docker compose exec llama-cpp-full bash + +# Inside container, you can use conversion tools +# Example: Convert a Hugging Face model +# python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf +``` + +## Configuration + +### Environment Variables + +Key environment variables (see [.env.example](.env.example) for all options): + +| Variable | Description | Default | +| -------------------------------- | ------------------------------------------------------------- | -------------------- | +| `LLAMA_CPP_SERVER_VARIANT` | Server image variant (server, server-cuda, server-rocm, etc.) 
| `server` | +| `LLAMA_CPP_MODEL_PATH` | Model file path inside container | `/models/model.gguf` | +| `LLAMA_CPP_CONTEXT_SIZE` | Context window size in tokens | `512` | +| `LLAMA_CPP_GPU_LAYERS` | Number of layers to offload to GPU (0=CPU only, 99=all) | `0` | +| `LLAMA_CPP_SERVER_PORT_OVERRIDE` | Server port on host | `8080` | +| `LLAMA_CPP_SERVER_MEMORY_LIMIT` | Memory limit for server | `8G` | + +### Available Profiles + +- `server`: CPU-only server +- `cuda`: NVIDIA GPU server (requires nvidia-container-toolkit) +- `rocm`: AMD GPU server (requires ROCm) +- `cli`: Command-line interface +- `full`: Full toolkit with model conversion tools +- `gpu`: Generic GPU profile (includes cuda and rocm) + +### Image Variants + +Each variant comes in multiple flavors: + +- **server**: Only `llama-server` executable (API server) +- **light**: Only `llama-cli` and `llama-completion` executables +- **full**: Complete toolkit including model conversion tools + +Backend options: + +- Base (CPU) +- `-cuda` (NVIDIA GPU) +- `-rocm` (AMD GPU) +- `-musa` (Moore Threads GPU) +- `-intel` (Intel GPU with SYCL) +- `-vulkan` (Vulkan GPU) + +## Server API + +The server provides an OpenAI-compatible API: + +- `GET /health` - Health check +- `GET /v1/models` - List available models +- `POST /v1/chat/completions` - Chat completion +- `POST /v1/completions` - Text completion +- `POST /v1/embeddings` - Generate embeddings + +See the [llama.cpp server documentation](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) for full API details. + +## Model Sources + +Download GGUF models from: + +- [Hugging Face GGUF Models](https://huggingface.co/models?library=gguf) +- [TheBloke's GGUF Collection](https://huggingface.co/TheBloke) +- Convert your own models using the full toolkit + +Popular quantization formats: + +- `Q4_K_M`: Good balance of quality and size (recommended) +- `Q5_K_M`: Higher quality, larger size +- `Q8_0`: Very high quality, large size +- `Q2_K`: Smallest size, lower quality + +## Resource Requirements + +Minimum requirements by model size: + +| Model Size | RAM (CPU) | VRAM (GPU) | Context Size | +| ---------- | --------- | ---------- | ------------ | +| 7B Q4_K_M | 6GB | 4GB | 2048 | +| 13B Q4_K_M | 10GB | 8GB | 2048 | +| 34B Q4_K_M | 24GB | 20GB | 2048 | +| 70B Q4_K_M | 48GB | 40GB | 2048 | + +Larger context sizes require proportionally more memory. + +## Performance Tuning + +For CPU inference: + +- Increase `LLAMA_CPP_SERVER_CPU_LIMIT` for more cores +- Optimize threads with `-t` flag (default: auto) + +For GPU inference: + +- Set `LLAMA_CPP_GPU_LAYERS=99` to offload all layers +- Increase context size for longer conversations +- Monitor GPU memory usage + +## Security Notes + +- The server binds to `0.0.0.0` by default - ensure proper network security +- No authentication is enabled by default +- Consider using a reverse proxy (nginx, Caddy) for production deployments +- Limit resource usage to prevent system exhaustion + +## Troubleshooting + +### Out of Memory + +- Reduce `LLAMA_CPP_CONTEXT_SIZE` +- Use a smaller quantized model (e.g., Q4 instead of Q8) +- Reduce `LLAMA_CPP_GPU_LAYERS` if using GPU + +### GPU Not Detected + +**NVIDIA**: Verify nvidia-container-toolkit is installed: + +```bash +docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi +``` + +**AMD**: Ensure ROCm drivers and `/dev/kfd`, `/dev/dri` are accessible. 
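+
+If the toolkit check above passes but you still suspect the model is running on the CPU, the server's startup log is the most direct signal: llama.cpp reports how many layers were placed on the GPU while loading the model. A rough sketch (the exact log wording differs between llama.cpp versions, so treat the grep pattern as a loose filter rather than an exact match):
+
+```bash
+# Show recent startup output from the CUDA server and keep only lines that
+# mention the GPU backend or layer offloading; adjust the service name for rocm.
+docker compose --profile cuda logs --tail=200 llama-cpp-server-cuda \
+  | grep -iE "cuda|offload|gpu"
+```
+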
+ +### Slow Inference + +- Check CPU/GPU utilization +- Increase resource limits in `.env` +- For GPU: Verify all layers are offloaded (`LLAMA_CPP_GPU_LAYERS=99`) + +## Documentation + +- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp) +- [Docker Documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md) +- [Server API Docs](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) + +## License + +llama.cpp is released under the MIT License. See the [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) file for details. diff --git a/src/llama.cpp/README.zh.md b/src/llama.cpp/README.zh.md new file mode 100644 index 0000000..baf37d0 --- /dev/null +++ b/src/llama.cpp/README.zh.md @@ -0,0 +1,244 @@ +# llama.cpp + +[English Documentation](README.md) + +[llama.cpp](https://github.com/ggml-org/llama.cpp) 是一个高性能的 C/C++ 实现的大语言模型推理引擎,支持多种硬件加速器。 + +## 功能特性 + +- **高速推理**:优化的 C/C++ 实现,提供高效的 LLM 推理 +- **多种后端**:支持 CPU、CUDA(NVIDIA)、ROCm(AMD)、MUSA(摩尔线程)、Intel GPU、Vulkan +- **OpenAI 兼容 API**:服务器模式提供 OpenAI 兼容的 REST API +- **CLI 支持**:交互式命令行界面,方便快速测试 +- **模型转换**:完整工具包包含模型转换和量化工具 +- **GGUF 格式**:支持高效的 GGUF 模型格式 +- **跨平台**:支持 Linux(x86-64、ARM64、s390x)、Windows、macOS + +## 前置要求 + +- 已安装 Docker 和 Docker Compose +- 至少 4GB 内存(推荐 8GB 以上) +- GPU 版本需要: + - **CUDA**:NVIDIA GPU 及 [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) + - **ROCm**:AMD GPU 及相应的 ROCm 驱动 + - **MUSA**:摩尔线程 GPU 及 mt-container-toolkit +- GGUF 格式的模型文件(例如从 [Hugging Face](https://huggingface.co/models?library=gguf) 下载) + +## 快速开始 + +### 1. 服务器模式(CPU) + +```bash +# 复制并配置环境变量 +cp .env.example .env + +# 编辑 .env 并设置模型路径 +# LLAMA_CPP_MODEL_PATH=/models/your-model.gguf + +# 将 GGUF 模型放在目录中,然后更新 docker-compose.yaml 挂载,例如: +# volumes: +# - ./models:/models + +# 启动服务器 +docker compose --profile server up -d + +# 测试服务器(OpenAI 兼容 API) +curl http://localhost:8080/v1/models + +# 聊天补全请求 +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "user", "content": "你好!"} + ] + }' +``` + +### 2. 服务器模式(NVIDIA GPU) + +```bash +# 编辑 .env +# 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU + +# 启动 GPU 加速服务器 +docker compose --profile cuda up -d + +# 服务器将自动使用 NVIDIA GPU +``` + +### 3. 服务器模式(AMD GPU) + +```bash +# 编辑 .env +# 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU + +# 启动 GPU 加速服务器 +docker compose --profile rocm up -d + +# 服务器将自动使用 AMD GPU +``` + +### 4. CLI 模式 + +```bash +# 编辑 .env 并配置模型路径和提示词 + +# 运行 CLI +docker compose --profile cli up + +# 交互模式: +docker compose run --rm llama-cpp-cli \ + -m /models/your-model.gguf \ + -p "你的提示词" \ + -n 512 +``` + +### 5. 
完整工具包(模型转换) + +```bash +# 启动完整容器 +docker compose --profile full up -d + +# 在容器内执行命令 +docker compose exec llama-cpp-full bash + +# 在容器内可以使用转换工具 +# 示例:转换 Hugging Face 模型 +# python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf +``` + +## 配置说明 + +### 环境变量 + +主要环境变量(完整选项请查看 [.env.example](.env.example)): + +| 变量 | 说明 | 默认值 | +| -------------------------------- | ----------------------------------------------------- | -------------------- | +| `LLAMA_CPP_SERVER_VARIANT` | 服务器镜像变体(server、server-cuda、server-rocm 等) | `server` | +| `LLAMA_CPP_MODEL_PATH` | 容器内模型文件路径 | `/models/model.gguf` | +| `LLAMA_CPP_CONTEXT_SIZE` | 上下文窗口大小(token 数) | `512` | +| `LLAMA_CPP_GPU_LAYERS` | 卸载到 GPU 的层数(0=仅 CPU,99=全部) | `0` | +| `LLAMA_CPP_SERVER_PORT_OVERRIDE` | 主机端口 | `8080` | +| `LLAMA_CPP_SERVER_MEMORY_LIMIT` | 服务器内存限制 | `8G` | + +### 可用配置文件 + +- `server`:仅 CPU 服务器 +- `cuda`:NVIDIA GPU 服务器(需要 nvidia-container-toolkit) +- `rocm`:AMD GPU 服务器(需要 ROCm) +- `cli`:命令行界面 +- `full`:包含模型转换工具的完整工具包 +- `gpu`:通用 GPU 配置(包括 cuda 和 rocm) + +### 镜像变体 + +每个变体都有多种类型: + +- **server**:仅包含 `llama-server` 可执行文件(API 服务器) +- **light**:仅包含 `llama-cli` 和 `llama-completion` 可执行文件 +- **full**:完整工具包,包括模型转换工具 + +后端选项: + +- 基础版(CPU) +- `-cuda`(NVIDIA GPU) +- `-rocm`(AMD GPU) +- `-musa`(摩尔线程 GPU) +- `-intel`(Intel GPU,支持 SYCL) +- `-vulkan`(Vulkan GPU) + +## 服务器 API + +服务器提供 OpenAI 兼容的 API: + +- `GET /health` - 健康检查 +- `GET /v1/models` - 列出可用模型 +- `POST /v1/chat/completions` - 聊天补全 +- `POST /v1/completions` - 文本补全 +- `POST /v1/embeddings` - 生成嵌入向量 + +完整 API 详情请参阅 [llama.cpp 服务器文档](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md)。 + +## 模型来源 + +下载 GGUF 模型: + +- [Hugging Face GGUF 模型](https://huggingface.co/models?library=gguf) +- [TheBloke 的 GGUF 合集](https://huggingface.co/TheBloke) +- 使用完整工具包转换您自己的模型 + +常用量化格式: + +- `Q4_K_M`:质量和大小的良好平衡(推荐) +- `Q5_K_M`:更高质量,更大体积 +- `Q8_0`:非常高的质量,大体积 +- `Q2_K`:最小体积,较低质量 + +## 资源需求 + +按模型大小的最低要求: + +| 模型大小 | 内存(CPU) | 显存(GPU) | 上下文大小 | +| ---------- | ----------- | ----------- | ---------- | +| 7B Q4_K_M | 6GB | 4GB | 2048 | +| 13B Q4_K_M | 10GB | 8GB | 2048 | +| 34B Q4_K_M | 24GB | 20GB | 2048 | +| 70B Q4_K_M | 48GB | 40GB | 2048 | + +更大的上下文大小需要成比例的更多内存。 + +## 性能调优 + +CPU 推理: + +- 增加 `LLAMA_CPP_SERVER_CPU_LIMIT` 以使用更多核心 +- 使用 `-t` 参数优化线程数(默认:自动) + +GPU 推理: + +- 设置 `LLAMA_CPP_GPU_LAYERS=99` 卸载所有层 +- 增加上下文大小以支持更长对话 +- 监控 GPU 内存使用 + +## 安全注意事项 + +- 服务器默认绑定到 `0.0.0.0` - 请确保网络安全 +- 默认未启用身份验证 +- 生产环境建议使用反向代理(nginx、Caddy) +- 限制资源使用以防止系统资源耗尽 + +## 故障排除 + +### 内存不足 + +- 减小 `LLAMA_CPP_CONTEXT_SIZE` +- 使用更小的量化模型(例如 Q4 而不是 Q8) +- 减少 `LLAMA_CPP_GPU_LAYERS`(如果使用 GPU) + +### GPU 未检测到 + +**NVIDIA**:验证 nvidia-container-toolkit 是否已安装: + +```bash +docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi +``` + +**AMD**:确保 ROCm 驱动已安装且 `/dev/kfd`、`/dev/dri` 可访问。 + +### 推理速度慢 + +- 检查 CPU/GPU 利用率 +- 增加 `.env` 中的资源限制 +- GPU:验证所有层都已卸载(`LLAMA_CPP_GPU_LAYERS=99`) + +## 文档 + +- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp) +- [Docker 文档](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md) +- [服务器 API 文档](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) + +## 许可证 + +llama.cpp 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) 文件。 diff --git a/src/llama.cpp/docker-compose.yaml b/src/llama.cpp/docker-compose.yaml new file mode 100644 index 0000000..564d6b1 --- /dev/null +++ b/src/llama.cpp/docker-compose.yaml @@ -0,0 +1,210 @@ +# Docker Compose configuration for llama.cpp +# 
https://github.com/ggml-org/llama.cpp +# LLM inference in C/C++ with support for various hardware accelerators + +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + # llama.cpp server - OpenAI-compatible API server + # Variant: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU) + llama-cpp-server: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_SERVER_VARIANT:-server} + ports: + - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080" + volumes: + - llama_cpp_models:/models + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "--port" + - "8080" + - "--host" + - "0.0.0.0" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + - "--n-gpu-layers" + - "${LLAMA_CPP_GPU_LAYERS:-0}" + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + [ + "CMD", + "wget", + "--quiet", + "--tries=1", + "--spider", + "http://localhost:8080/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G} + profiles: + - server + + # llama.cpp server with NVIDIA GPU support + llama-cpp-server-cuda: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-cuda + ports: + - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080" + volumes: + - llama_cpp_models:/models + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "--port" + - "8080" + - "--host" + - "0.0.0.0" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + - "--n-gpu-layers" + - "${LLAMA_CPP_GPU_LAYERS:-99}" + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + [ + "CMD", + "wget", + "--quiet", + "--tries=1", + "--spider", + "http://localhost:8080/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G} + devices: + - driver: nvidia + count: ${LLAMA_CPP_GPU_COUNT:-1} + capabilities: [gpu] + profiles: + - gpu + - cuda + + # llama.cpp server with AMD ROCm GPU support + llama-cpp-server-rocm: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-rocm + ports: + - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080" + volumes: + - llama_cpp_models:/models + devices: + - /dev/kfd + - /dev/dri + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "--port" + - "8080" + - "--host" + - "0.0.0.0" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + - "--n-gpu-layers" + - "${LLAMA_CPP_GPU_LAYERS:-99}" + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + [ + "CMD", + "wget", + "--quiet", + "--tries=1", + "--spider", + "http://localhost:8080/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G} + profiles: + - gpu + - rocm + + # llama.cpp CLI (light) - Interactive command-line interface + llama-cpp-cli: + <<: *defaults + image: 
${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_CLI_VARIANT:-light} + volumes: + - llama_cpp_models:/models + entrypoint: /app/llama-cli + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "-p" + - "${LLAMA_CPP_PROMPT:-Hello, how are you?}" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + environment: + - TZ=${TZ:-UTC} + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_CLI_CPU_LIMIT:-2.0} + memory: ${LLAMA_CPP_CLI_MEMORY_LIMIT:-4G} + reservations: + cpus: ${LLAMA_CPP_CLI_CPU_RESERVATION:-1.0} + memory: ${LLAMA_CPP_CLI_MEMORY_RESERVATION:-2G} + profiles: + - cli + + # llama.cpp full - Complete toolkit including model conversion tools + llama-cpp-full: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_FULL_VARIANT:-full} + volumes: + - llama_cpp_models:/models + command: ["sleep", "infinity"] + environment: + - TZ=${TZ:-UTC} + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_FULL_CPU_LIMIT:-2.0} + memory: ${LLAMA_CPP_FULL_MEMORY_LIMIT:-4G} + reservations: + cpus: ${LLAMA_CPP_FULL_CPU_RESERVATION:-1.0} + memory: ${LLAMA_CPP_FULL_MEMORY_RESERVATION:-2G} + profiles: + - full + +volumes: + llama_cpp_models:
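+
+# NOTE: models live in the named volume above by default. As mentioned in the
+# README's Quick Start, an alternative is to bind-mount a local directory with
+# your GGUF files instead, e.g. replace "llama_cpp_models:/models" in a
+# service's volumes list with a host path:
+#
+#   volumes:
+#     - ./models:/models
+#
+# ("./models" is only an example path; point it at wherever your models are stored.)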