feat: Add Chinese documentation and Docker Compose configurations for DeepTutor and llama.cpp

- Created README.zh.md for DeepTutor with comprehensive features, installation steps, and usage instructions in Chinese. - Added docker-compose.yaml for DeepTutor to define services, environment variables, and resource limits. - Introduced .env.example for llama.cpp with configuration options for server settings and resource management. - Added README.md and README.zh.md for llama.cpp detailing features, prerequisites, quick start guides, and API documentation. - Implemented docker-compose.yaml for llama.cpp to support various server configurations (CPU, CUDA, ROCm) and CLI usage.
2026-02-01 16:08:44 +08:00
parent e2ac465417
commit 28ed2462af
10 changed files with 1470 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ These services require building custom Docker images from source.
 | [Clash](./src/clash)                                           | 1.18.0               |
 | [ClickHouse](./src/clickhouse)                                 | 24.11.1              |
 | [Conductor](./src/conductor)                                   | latest               |
 | [DeepTutor](./apps/deeptutor)                                  | latest               |
 | [Dify](./apps/dify)                                            | 0.18.2               |
 | [DNSMasq](./src/dnsmasq)                                       | 2.91                 |
 | [Dockge](./src/dockge)                                         | 1                    |
@@ -72,6 +73,7 @@ These services require building custom Docker images from source.
 | [LibreOffice](./src/libreoffice)                               | latest               |
 | [libSQL Server](./src/libsql)                                  | latest               |
 | [LiteLLM](./src/litellm)                                       | main-stable          |
 | [llama.cpp](./src/llama.cpp)                                   | server               |
 | [LMDeploy](./src/lmdeploy)                                     | v0.11.1              |
 | [Logstash](./src/logstash)                                     | 8.16.1               |
 | [MariaDB Galera Cluster](./src/mariadb-galera)                 | 11.7.2               |
--- a/README.zh.md
+++ b/README.zh.md
@@ -34,6 +34,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件，
 | [Clash](./src/clash)                                           | 1.18.0                |
 | [ClickHouse](./src/clickhouse)                                 | 24.11.1               |
 | [Conductor](./src/conductor)                                   | latest                |
 | [DeepTutor](./apps/deeptutor)                                  | latest                |
 | [Dify](./apps/dify)                                            | 0.18.2                |
 | [DNSMasq](./src/dnsmasq)                                       | 2.91                  |
 | [Dockge](./src/dockge)                                         | 1                     |
@@ -72,6 +73,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件，
 | [LibreOffice](./src/libreoffice)                               | latest                |
 | [libSQL Server](./src/libsql)                                  | latest                |
 | [LiteLLM](./src/litellm)                                       | main-stable           |
 | [llama.cpp](./src/llama.cpp)                                   | server                |
 | [LMDeploy](./src/lmdeploy)                                     | v0.11.1               |
 | [Logstash](./src/logstash)                                     | 8.16.1                |
 | [MariaDB Galera Cluster](./src/mariadb-galera)                 | 11.7.2                |
--- a/apps/deeptutor/.env.example
+++ b/apps/deeptutor/.env.example
@@ -0,0 +1,97 @@
 # DeepTutor Configuration
 # Copy this file to .env and fill in your API keys
 #! ==================================================
 #! General Settings
 #! ==================================================
 # Timezone (default: UTC)
 TZ=UTC
 # User and Group ID for file permissions (default: 1000)
 # Adjust if your host user has a different UID/GID
 PUID=1000
 PGID=1000
 # Global registry prefix (optional)
 # Example: registry.example.com/ or leave empty for Docker Hub/GHCR
 GLOBAL_REGISTRY=
 #! ==================================================
 #! DeepTutor Version
 #! ==================================================
 # Image version (default: latest)
 # Available tags: latest, v0.5.x
 # See: https://github.com/HKUDS/DeepTutor/pkgs/container/deeptutor
 DEEPTUTOR_VERSION=latest
 #! ==================================================
 #! Port Configuration
 #! ==================================================
 # Backend port (internal: 8001)
 BACKEND_PORT=8001
 # Host port override for backend
 DEEPTUTOR_BACKEND_PORT_OVERRIDE=8001
 # Frontend port (internal: 3782)
 FRONTEND_PORT=3782
 # Host port override for frontend
 DEEPTUTOR_FRONTEND_PORT_OVERRIDE=3782
 #! ==================================================
 #! API Base URLs
 #! ==================================================
 # Internal API base URL (used by frontend to communicate with backend)
 NEXT_PUBLIC_API_BASE=http://localhost:8001
 # External API base URL (for cloud deployment, set to your public URL)
 # Example: https://your-server.com:8001
 # For local deployment, use the same as NEXT_PUBLIC_API_BASE
 NEXT_PUBLIC_API_BASE_EXTERNAL=http://localhost:8001
 #! ==================================================
 #! LLM API Keys (Required)
 #! ==================================================
 # OpenAI API Key (Required)
 # Get from: https://platform.openai.com/api-keys
 OPENAI_API_KEY=sk-your-openai-api-key-here
 # OpenAI Base URL (default: https://api.openai.com/v1)
 # For OpenAI-compatible APIs (e.g., Azure OpenAI, custom endpoints)
 OPENAI_BASE_URL=https://api.openai.com/v1
 # Default LLM Model (default: gpt-4o)
 # Options: gpt-4o, gpt-4-turbo, gpt-4, gpt-3.5-turbo, etc.
 DEFAULT_MODEL=gpt-4o
 #! ==================================================
 #! Additional LLM API Keys (Optional)
 #! ==================================================
 # Anthropic API Key (Optional, for Claude models)
 # Get from: https://console.anthropic.com/
 ANTHROPIC_API_KEY=
 # Perplexity API Key (Optional, for web search)
 # Get from: https://www.perplexity.ai/settings/api
 PERPLEXITY_API_KEY=
 # DashScope API Key (Optional, for Alibaba Cloud models)
 # Get from: https://dashscope.console.aliyun.com/
 DASHSCOPE_API_KEY=
 #! ==================================================
 #! Resource Limits
 #! ==================================================
 # CPU limits (default: 4.00 cores limit, 1.00 cores reservation)
 DEEPTUTOR_CPU_LIMIT=4.00
 DEEPTUTOR_CPU_RESERVATION=1.00
 # Memory limits (default: 8G limit, 2G reservation)
 DEEPTUTOR_MEMORY_LIMIT=8G
 DEEPTUTOR_MEMORY_RESERVATION=2G
--- a/apps/deeptutor/README.md
+++ b/apps/deeptutor/README.md
@@ -0,0 +1,248 @@
 # DeepTutor
 [中文说明](README.zh.md) | English
 ## Overview
 DeepTutor is an AI-powered personalized learning assistant that transforms any document into an interactive learning experience with multi-agent intelligence. It helps you solve problems, generate questions, conduct research, collaborate on writing, organize notes, and guides you through learning paths.
 **Project:** <https://github.com/HKUDS/DeepTutor>  
 **License:** Apache-2.0  
 **Documentation:** <https://hkuds.github.io/DeepTutor/>
 ## Features
 - **Problem Solving** — Detailed step-by-step solutions with visual diagrams
 - **Question Generation** — Adaptive questions based on your knowledge level
 - **Research Assistant** — Deep research with multi-agent collaboration
 - **Co-Writer** — Interactive idea generation and writing assistance
 - **Smart Notebook** — Organize and retrieve learning materials efficiently
 - **Guided Learning** — Personalized learning paths and progress tracking
 - **Multi-Agent System** — Specialized agents for different learning tasks
 - **RAG Integration** — LightRAG and RAG-Anything for knowledge retrieval
 - **Code Execution** — Built-in code playground for practice
 ## Quick Start
 ### Prerequisites
 - Docker and Docker Compose
 - OpenAI API key (required)
 - Optional: Anthropic, Perplexity, or DashScope API keys
 ### Installation
 1. **Clone this repository**
    ```bash
    git clone <your-compose-anything-repo>
    cd apps/deeptutor
    ```
 2. **Configure environment**
    ```bash
    cp .env.example .env
    # Edit .env and add your API keys
    ```
    **Required configuration:**
    - `OPENAI_API_KEY` — Your OpenAI API key
    **Optional configuration:**
    - `ANTHROPIC_API_KEY` — For Claude models
    - `PERPLEXITY_API_KEY` — For web search
    - `DASHSCOPE_API_KEY` — For Alibaba Cloud models
    - Adjust ports if needed (default: 8001 for backend, 3782 for frontend)
    - Set `NEXT_PUBLIC_API_BASE_EXTERNAL` for cloud deployments
 3. **Optional: Custom agent configuration**
    Create a `config/agents.yaml` file to customize agent behaviors (see [documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for details).
 4. **Start the service**
    ```bash
    docker compose up -d
    ```
    First run takes approximately 30-60 seconds to initialize.
 5. **Access the application**
      - **Frontend:** <http://localhost:3782>
      - **Backend API:** <http://localhost:8001>
      - **API Documentation:** <http://localhost:8001/docs>
 ## Usage
 ### Create Knowledge Base
 1. Navigate to <http://localhost:3782/knowledge>
 2. Click "New Knowledge Base"
 3. Upload documents (supports PDF, DOCX, TXT, Markdown, HTML, etc.)
 4. Wait for processing to complete
 ### Learning Modes
 - **Solve** — Get step-by-step solutions to problems
 - **Question** — Generate practice questions based on your materials
 - **Research** — Deep research with multi-agent collaboration
 - **Co-Writer** — Interactive writing and idea generation
 - **Notebook** — Organize and manage your learning materials
 - **Guide** — Follow personalized learning paths
 ### Advanced Features
 - **Code Execution** — Practice coding directly in the interface
 - **Visual Diagrams** — Automatic diagram generation for complex concepts
 - **Export** — Download your work as PDF or Markdown
 - **Multi-language** — Support for multiple languages
 ## Configuration
 ### Environment Variables
 Key environment variables (see [.env.example](.env.example) for all options):
 | Variable                 | Default    | Description               |
 | ------------------------ | ---------- | ------------------------- |
 | `OPENAI_API_KEY`         | (required) | Your OpenAI API key       |
 | `DEFAULT_MODEL`          | `gpt-4o`   | Default LLM model         |
 | `BACKEND_PORT`           | `8001`     | Backend server port       |
 | `FRONTEND_PORT`          | `3782`     | Frontend application port |
 | `DEEPTUTOR_CPU_LIMIT`    | `4.00`     | CPU limit (cores)         |
 | `DEEPTUTOR_MEMORY_LIMIT` | `8G`       | Memory limit              |
 ### Ports
 - **8001** — Backend API server
 - **3782** — Frontend web interface
 ### Volumes
 - `deeptutor_data` — User data, knowledge bases, and learning materials
 - `./config` — Custom agent configurations (optional, mounted read-only)
 ## Resource Requirements
 **Minimum:**
 - CPU: 1 core
 - Memory: 2GB
 - Disk: 2GB + space for knowledge bases
 **Recommended:**
 - CPU: 4 cores
 - Memory: 8GB
 - Disk: 10GB+
 ## Supported Models
 DeepTutor supports multiple LLM providers:
 - **OpenAI** — GPT-4o (default), GPT-4 Turbo, GPT-4, GPT-3.5 Turbo
 - **Anthropic** — Claude 3 (Opus, Sonnet, Haiku)
 - **Perplexity** — For web search integration
 - **DashScope** — Alibaba Cloud models
 - **OpenAI-compatible APIs** — Any API compatible with OpenAI format
 ## Troubleshooting
 ### Backend fails to start
 - Verify `OPENAI_API_KEY` is set correctly in `.env`
 - Check logs: `docker compose logs -f`
 - Ensure ports 8001 and 3782 are not in use
 - Verify sufficient disk space for volumes
 ### Frontend cannot connect to backend
 - Confirm backend is running: visit <http://localhost:8001/docs>
 - For cloud deployments, set `NEXT_PUBLIC_API_BASE_EXTERNAL` to your public URL
 - Check firewall settings
 ### Knowledge base processing fails
 - Ensure sufficient memory (recommended 8GB+)
 - Check document format is supported
 - Review logs for specific errors
 ### API rate limits
 - Monitor your API usage on provider dashboards
 - Consider upgrading your API plan
 - Use different models for different tasks
 ## Security Notes
 - **API Keys** — Keep your API keys secure, never commit them to version control
 - **Network Exposure** — For production deployments, use HTTPS and proper authentication
 - **Data Privacy** — User data is stored in Docker volumes; ensure proper backup and security
 - **Resource Limits** — Set appropriate CPU and memory limits to prevent resource exhaustion
 ## Updates
 To update to the latest version:
 ```bash
 # Pull the latest image
 docker compose pull
 # Recreate containers
 docker compose up -d
 ```
 To update to a specific version, edit `DEEPTUTOR_VERSION` in `.env` and run:
 ```bash
 docker compose up -d
 ```
 ## Advanced Usage
 ### Custom Agent Configuration
 Create `config/agents.yaml` to customize agent behaviors:
 ```yaml
 agents:
  solver:
    model: gpt-4o
    temperature: 0.7
  researcher:
    model: gpt-4-turbo
    max_tokens: 4000
 ```
 See [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for detailed configuration options.
 ### Cloud Deployment
 For cloud deployment, additional configuration is needed:
 1. Set public URL in `.env`:
    ```env
    NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001
    ```
 2. Configure reverse proxy (nginx/Caddy) for HTTPS
 3. Ensure proper firewall rules
 4. Consider using environment-specific secrets management
 ### Using Different Embedding Models
 DeepTutor uses `text-embedding-3-large` by default. To use different embedding models, refer to the [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html).
 ## Links
 - **GitHub:** <https://github.com/HKUDS/DeepTutor>
 - **Documentation:** <https://hkuds.github.io/DeepTutor/>
 - **Issues:** <https://github.com/HKUDS/DeepTutor/issues>
 - **Discussions:** <https://github.com/HKUDS/DeepTutor/discussions>
 ## License
 DeepTutor is licensed under the Apache-2.0 License. See the [official repository](https://github.com/HKUDS/DeepTutor) for details.
--- a/apps/deeptutor/README.zh.md
+++ b/apps/deeptutor/README.zh.md
@@ -0,0 +1,248 @@
 # DeepTutor
 中文说明 | [English](README.md)
 ## 概述
 DeepTutor 是一个 AI 驱动的个性化学习助手，通过多智能体系统将任何文档转化为交互式学习体验。它可以帮助您解决问题、生成题目、进行研究、协作写作、整理笔记，并引导您完成学习路径。
 **项目地址：** <https://github.com/HKUDS/DeepTutor>  
 **许可证：** Apache-2.0  
 **文档：** <https://hkuds.github.io/DeepTutor/>
 ## 功能特性
 - **问题求解** — 提供详细的分步解决方案和可视化图表
 - **题目生成** — 根据您的知识水平生成自适应题目
 - **研究助手** — 通过多智能体协作进行深度研究
 - **协作写作** — 交互式创意生成和写作辅助
 - **智能笔记** — 高效组织和检索学习材料
 - **引导学习** — 个性化学习路径和进度跟踪
 - **多智能体系统** — 针对不同学习任务的专业智能体
 - **RAG 集成** — 使用 LightRAG 和 RAG-Anything 进行知识检索
 - **代码执行** — 内置代码练习环境
 ## 快速开始
 ### 前置要求
 - Docker 和 Docker Compose
 - OpenAI API 密钥（必需）
 - 可选：Anthropic、Perplexity 或 DashScope API 密钥
 ### 安装步骤
 1. **克隆仓库**
    ```bash
    git clone <your-compose-anything-repo>
    cd apps/deeptutor
    ```
 2. **配置环境变量**
    ```bash
    cp .env.example .env
    # 编辑 .env 文件并添加您的 API 密钥
    ```
    **必需配置：**
    - `OPENAI_API_KEY` — 您的 OpenAI API 密钥
    **可选配置：**
    - `ANTHROPIC_API_KEY` — 用于 Claude 模型
    - `PERPLEXITY_API_KEY` — 用于网络搜索
    - `DASHSCOPE_API_KEY` — 用于阿里云模型
    - 按需调整端口（默认：后端 8001，前端 3782）
    - 云端部署时设置 `NEXT_PUBLIC_API_BASE_EXTERNAL`
 3. **可选：自定义智能体配置**
    创建 `config/agents.yaml` 文件以自定义智能体行为（详见[文档](https://hkuds.github.io/DeepTutor/guide/config.html)）。
 4. **启动服务**
    ```bash
    docker compose up -d
    ```
    首次运行需要约 30-60 秒初始化。
 5. **访问应用**
      - **前端界面：** <http://localhost:3782>
      - **后端 API：** <http://localhost:8001>
      - **API 文档：** <http://localhost:8001/docs>
 ## 使用方法
 ### 创建知识库
 1. 访问 <http://localhost:3782/knowledge>
 2. 点击"新建知识库"
 3. 上传文档（支持 PDF、DOCX、TXT、Markdown、HTML 等）
 4. 等待处理完成
 ### 学习模式
 - **求解（Solve）** — 获取问题的分步解决方案
 - **题目（Question）** — 基于学习材料生成练习题
 - **研究（Research）** — 通过多智能体协作进行深度研究
 - **协作写作（Co-Writer）** — 交互式写作和创意生成
 - **笔记（Notebook）** — 组织和管理学习材料
 - **引导（Guide）** — 遵循个性化学习路径
 ### 高级功能
 - **代码执行** — 在界面中直接练习编码
 - **可视化图表** — 为复杂概念自动生成图表
 - **导出** — 将您的工作下载为 PDF 或 Markdown
 - **多语言支持** — 支持多种语言
 ## 配置说明
 ### 环境变量
 主要环境变量（所有选项见 [.env.example](.env.example)）：
 | 变量                     | 默认值   | 描述                 |
 | ------------------------ | -------- | -------------------- |
 | `OPENAI_API_KEY`         | （必需） | 您的 OpenAI API 密钥 |
 | `DEFAULT_MODEL`          | `gpt-4o` | 默认 LLM 模型        |
 | `BACKEND_PORT`           | `8001`   | 后端服务器端口       |
 | `FRONTEND_PORT`          | `3782`   | 前端应用端口         |
 | `DEEPTUTOR_CPU_LIMIT`    | `4.00`   | CPU 限制（核心数）   |
 | `DEEPTUTOR_MEMORY_LIMIT` | `8G`     | 内存限制             |
 ### 端口说明
 - **8001** — 后端 API 服务器
 - **3782** — 前端 Web 界面
 ### 数据卷
 - `deeptutor_data` — 用户数据、知识库和学习材料
 - `./config` — 自定义智能体配置（可选，以只读方式挂载）
 ## 资源要求
 **最低配置：**
 - CPU：1 核心
 - 内存：2GB
 - 磁盘：2GB + 知识库所需空间
 **推荐配置：**
 - CPU：4 核心
 - 内存：8GB
 - 磁盘：10GB+
 ## 支持的模型
 DeepTutor 支持多个 LLM 提供商：
 - **OpenAI** — GPT-4o（默认）、GPT-4 Turbo、GPT-4、GPT-3.5 Turbo
 - **Anthropic** — Claude 3（Opus、Sonnet、Haiku）
 - **Perplexity** — 用于网络搜索集成
 - **DashScope** — 阿里云模型
 - **OpenAI 兼容 API** — 任何与 OpenAI 格式兼容的 API
 ## 故障排查
 ### 后端启动失败
 - 验证 `.env` 中的 `OPENAI_API_KEY` 是否正确设置
 - 查看日志：`docker compose logs -f`
 - 确保端口 8001 和 3782 未被占用
 - 验证数据卷有足够的磁盘空间
 ### 前端无法连接后端
 - 确认后端正在运行：访问 <http://localhost:8001/docs>
 - 云端部署时，将 `NEXT_PUBLIC_API_BASE_EXTERNAL` 设置为您的公网 URL
 - 检查防火墙设置
 ### 知识库处理失败
 - 确保有足够的内存（推荐 8GB+）
 - 检查文档格式是否支持
 - 查看日志了解具体错误
 ### API 速率限制
 - 在提供商控制台监控 API 使用情况
 - 考虑升级 API 计划
 - 为不同任务使用不同模型
 ## 安全提示
 - **API 密钥** — 妥善保管您的 API 密钥，切勿提交到版本控制系统
 - **网络暴露** — 生产环境部署时，使用 HTTPS 和适当的身份验证
 - **数据隐私** — 用户数据存储在 Docker 卷中，请确保适当的备份和安全措施
 - **资源限制** — 设置合适的 CPU 和内存限制以防止资源耗尽
 ## 更新
 更新到最新版本：
 ```bash
 # 拉取最新镜像
 docker compose pull
 # 重新创建容器
 docker compose up -d
 ```
 更新到特定版本，编辑 `.env` 中的 `DEEPTUTOR_VERSION` 并运行：
 ```bash
 docker compose up -d
 ```
 ## 高级用法
 ### 自定义智能体配置
 创建 `config/agents.yaml` 以自定义智能体行为：
 ```yaml
 agents:
  solver:
    model: gpt-4o
    temperature: 0.7
  researcher:
    model: gpt-4-turbo
    max_tokens: 4000
 ```
 详细配置选项请参见[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。
 ### 云端部署
 云端部署需要额外配置：
 1. 在 `.env` 中设置公网 URL：
    ```env
    NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001
    ```
 2. 配置反向代理（nginx/Caddy）以支持 HTTPS
 3. 确保适当的防火墙规则
 4. 考虑使用特定环境的密钥管理
 ### 使用不同的嵌入模型
 DeepTutor 默认使用 `text-embedding-3-large`。要使用不同的嵌入模型，请参考[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。
 ## 相关链接
 - **GitHub：** <https://github.com/HKUDS/DeepTutor>
 - **文档：** <https://hkuds.github.io/DeepTutor/>
 - **问题反馈：** <https://github.com/HKUDS/DeepTutor/issues>
 - **讨论区：** <https://github.com/HKUDS/DeepTutor/discussions>
 ## 许可证
 DeepTutor 使用 Apache-2.0 许可证。详情请参见[官方仓库](https://github.com/HKUDS/DeepTutor)。
--- a/apps/deeptutor/docker-compose.yaml
+++ b/apps/deeptutor/docker-compose.yaml
@@ -0,0 +1,68 @@
 # DeepTutor: AI-Powered Personalized Learning Assistant
 # https://github.com/HKUDS/DeepTutor
 # Transform any document into an interactive learning experience with multi-agent intelligence
 #
 # Single-service stack: one container publishes both the backend API and the
 # frontend UI. All tunables are read from .env (see .env.example).
 # Shared service defaults: restart policy and size-capped json-file logging.
 x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: 100m
      max-file: "3"
 services:
  deeptutor:
    <<: *defaults
    # GLOBAL_REGISTRY is a registry *prefix* and must carry its own trailing
    # slash (e.g. "registry.example.com/"), as documented in .env.example.
    # When unset or empty it falls back to "ghcr.io/".
    image: ${GLOBAL_REGISTRY:-ghcr.io/}hkuds/deeptutor:${DEEPTUTOR_VERSION:-latest}
    ports:
      # host port (override) : container port
      - "${DEEPTUTOR_BACKEND_PORT_OVERRIDE:-8001}:${BACKEND_PORT:-8001}"
      - "${DEEPTUTOR_FRONTEND_PORT_OVERRIDE:-3782}:${FRONTEND_PORT:-3782}"
    volumes:
      # Named volume mounted at /app/data
      - deeptutor_data:/app/data
      # Optional custom configuration, mounted read-only
      - ./config:/app/config:ro
    environment:
      - TZ=${TZ:-UTC}
      # Backend port
      - BACKEND_PORT=${BACKEND_PORT:-8001}
      # Frontend port
      - FRONTEND_PORT=${FRONTEND_PORT:-3782}
      # API base URLs
      - NEXT_PUBLIC_API_BASE=${NEXT_PUBLIC_API_BASE:-http://localhost:8001}
      - NEXT_PUBLIC_API_BASE_EXTERNAL=${NEXT_PUBLIC_API_BASE_EXTERNAL:-http://localhost:8001}
      # LLM API Keys
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - PERPLEXITY_API_KEY=${PERPLEXITY_API_KEY:-}
      - DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-}
      # Default LLM model
      - DEFAULT_MODEL=${DEFAULT_MODEL:-gpt-4o}
      # User ID and Group ID for permission management
      - PUID=${PUID:-1000}
      - PGID=${PGID:-1000}
    healthcheck:
      # Exec form ("CMD") runs curl directly without a shell, so shell
      # operators such as "|| exit 1" must not be listed here: curl would
      # receive them as extra URLs and the probe would always fail.
      # "curl -f" already exits non-zero on an HTTP error response.
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:${BACKEND_PORT:-8001}/health"
      interval: 30s
      timeout: 10s
      retries: 3
      # Grace period before failed probes count against the container
      start_period: 60s
    deploy:
      resources:
        limits:
          cpus: ${DEEPTUTOR_CPU_LIMIT:-4.00}
          memory: ${DEEPTUTOR_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${DEEPTUTOR_CPU_RESERVATION:-1.00}
          memory: ${DEEPTUTOR_MEMORY_RESERVATION:-2G}
 volumes:
  deeptutor_data:
--- a/src/llama.cpp/.env.example
+++ b/src/llama.cpp/.env.example
@@ -0,0 +1,106 @@
 # =============================================================================
 # llama.cpp Configuration
 # https://github.com/ggml-org/llama.cpp
 # LLM inference in C/C++ with support for various hardware accelerators
 # =============================================================================
 # -----------------------------------------------------------------------------
 # General Settings
 # -----------------------------------------------------------------------------
 # Timezone for the container (default: UTC)
 TZ=UTC
 # Container registry prefix (optional; keep the trailing slash)
 # Example: docker.io/, ghcr.io/, registry.example.com/
 GHCR_REGISTRY=ghcr.io/
 # -----------------------------------------------------------------------------
 # Server Configuration
 # -----------------------------------------------------------------------------
 # Server image variant
 # Options: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU),
 #          server-musa (Moore Threads GPU), server-intel (Intel GPU),
 #          server-vulkan (Vulkan GPU)
 LLAMA_CPP_SERVER_VARIANT=server
 # Server port override (default: 8080)
 LLAMA_CPP_SERVER_PORT_OVERRIDE=8080
 # Model path inside the container
 # You need to mount your model file to this path
 # Example: /models/llama-2-7b-chat.Q4_K_M.gguf
 LLAMA_CPP_MODEL_PATH=/models/model.gguf
 # Context size (number of tokens)
 # Larger values allow for more context but require more memory
 # Default: 512, Common values: 512, 2048, 4096, 8192, 16384, 32768
 LLAMA_CPP_CONTEXT_SIZE=512
 # Number of GPU layers to offload
 # 0 = CPU only, 99 = all layers on GPU (for GPU variants)
 # For CPU variant, keep this at 0
 LLAMA_CPP_GPU_LAYERS=0
 # Number of GPUs to use (for CUDA variant)
 LLAMA_CPP_GPU_COUNT=1
 # Server CPU limit (in cores)
 LLAMA_CPP_SERVER_CPU_LIMIT=4.0
 # Server CPU reservation (in cores)
 LLAMA_CPP_SERVER_CPU_RESERVATION=2.0
 # Server memory limit
 LLAMA_CPP_SERVER_MEMORY_LIMIT=8G
 # Server memory reservation
 LLAMA_CPP_SERVER_MEMORY_RESERVATION=4G
 # -----------------------------------------------------------------------------
 # CLI Configuration (Light variant)
 # -----------------------------------------------------------------------------
 # CLI image variant
 # Options: light (CPU), light-cuda (NVIDIA GPU), light-rocm (AMD GPU),
 #          light-musa (Moore Threads GPU), light-intel (Intel GPU),
 #          light-vulkan (Vulkan GPU)
 LLAMA_CPP_CLI_VARIANT=light
 # Default prompt for CLI mode
 LLAMA_CPP_PROMPT=Hello, how are you?
 # CLI CPU limit (in cores)
 LLAMA_CPP_CLI_CPU_LIMIT=2.0
 # CLI CPU reservation (in cores)
 LLAMA_CPP_CLI_CPU_RESERVATION=1.0
 # CLI memory limit
 LLAMA_CPP_CLI_MEMORY_LIMIT=4G
 # CLI memory reservation
 LLAMA_CPP_CLI_MEMORY_RESERVATION=2G
 # -----------------------------------------------------------------------------
 # Full Toolkit Configuration
 # -----------------------------------------------------------------------------
 # Full image variant (includes model conversion tools)
 # Options: full (CPU), full-cuda (NVIDIA GPU), full-rocm (AMD GPU),
 #          full-musa (Moore Threads GPU), full-intel (Intel GPU),
 #          full-vulkan (Vulkan GPU)
 LLAMA_CPP_FULL_VARIANT=full
 # Full CPU limit (in cores)
 LLAMA_CPP_FULL_CPU_LIMIT=2.0
 # Full CPU reservation (in cores)
 LLAMA_CPP_FULL_CPU_RESERVATION=1.0
 # Full memory limit
 LLAMA_CPP_FULL_MEMORY_LIMIT=4G
 # Full memory reservation
 LLAMA_CPP_FULL_MEMORY_RESERVATION=2G
--- a/src/llama.cpp/README.md
+++ b/src/llama.cpp/README.md
@@ -0,0 +1,245 @@
 # llama.cpp
 [中文文档](README.zh.md)
 [llama.cpp](https://github.com/ggml-org/llama.cpp) is a high-performance C/C++ implementation for LLM inference with support for various hardware accelerators.
 ## Features
 - **Fast Inference**: Optimized C/C++ implementation for efficient LLM inference
 - **Multiple Backends**: CPU, CUDA (NVIDIA), ROCm (AMD), MUSA (Moore Threads), Intel GPU, Vulkan
 - **OpenAI-compatible API**: Server mode with OpenAI-compatible REST API
 - **CLI Support**: Interactive command-line interface for quick testing
 - **Model Conversion**: Full toolkit includes tools to convert and quantize models
 - **GGUF Format**: Support for the efficient GGUF model format
 - **Cross-platform**: Linux (x86-64, ARM64, s390x), Windows, macOS
 ## Prerequisites
 - Docker and Docker Compose installed
 - At least 4GB of RAM (8GB+ recommended)
 - For GPU variants:
  - **CUDA**: NVIDIA GPU with [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
  - **ROCm**: AMD GPU with proper ROCm drivers
  - **MUSA**: Moore Threads GPU with mt-container-toolkit
 - GGUF format model file (e.g., from [Hugging Face](https://huggingface.co/models?library=gguf))
 ## Quick Start
 ### 1. Server Mode (CPU)
 ```bash
 # Copy and configure environment
 cp .env.example .env
 # Edit .env and set your model path
 # LLAMA_CPP_MODEL_PATH=/models/your-model.gguf
 # Place your GGUF model in a directory, then update docker-compose.yaml
 # to mount it, e.g.:
 # volumes:
 #   - ./models:/models
 # Start the server
 docker compose --profile server up -d
 # Test the server (OpenAI-compatible API)
 curl http://localhost:8080/v1/models
 # Chat completion request
 curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
 ```
 ### 2. Server Mode with NVIDIA GPU
 ```bash
 # Edit .env
 # Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU
 # Start GPU-accelerated server
 docker compose --profile cuda up -d
 # The server will automatically use NVIDIA GPU
 ```
 ### 3. Server Mode with AMD GPU
 ```bash
 # Edit .env
 # Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU
 # Start GPU-accelerated server
 docker compose --profile rocm up -d
 # The server will automatically use AMD GPU
 ```
 ### 4. CLI Mode
 ```bash
 # Edit .env and configure model path and prompt
 # Run CLI
 docker compose --profile cli up
 # For interactive mode, use:
 docker compose run --rm llama-cpp-cli \
  -m /models/your-model.gguf \
  -p "Your prompt here" \
  -n 512
 ```
 ### 5. Full Toolkit (Model Conversion)
 ```bash
 # Start the full container
 docker compose --profile full up -d
 # Execute commands inside the container
 docker compose exec llama-cpp-full bash
 # Inside container, you can use conversion tools
 # Example: Convert a Hugging Face model
 # python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf
 ```
 ## Configuration
 ### Environment Variables
 Key environment variables (see [.env.example](.env.example) for all options):
 | Variable                         | Description                                                   | Default              |
 | -------------------------------- | ------------------------------------------------------------- | -------------------- |
 | `LLAMA_CPP_SERVER_VARIANT`       | Server image variant (server, server-cuda, server-rocm, etc.) | `server`             |
 | `LLAMA_CPP_MODEL_PATH`           | Model file path inside container                              | `/models/model.gguf` |
 | `LLAMA_CPP_CONTEXT_SIZE`         | Context window size in tokens                                 | `512`                |
 | `LLAMA_CPP_GPU_LAYERS`           | Number of layers to offload to GPU (0=CPU only, 99=all)       | `0`                  |
 | `LLAMA_CPP_SERVER_PORT_OVERRIDE` | Server port on host                                           | `8080`               |
 | `LLAMA_CPP_SERVER_MEMORY_LIMIT`  | Memory limit for server                                       | `8G`                 |
 ### Available Profiles
 - `server`: CPU-only server
 - `cuda`: NVIDIA GPU server (requires nvidia-container-toolkit)
 - `rocm`: AMD GPU server (requires ROCm)
 - `cli`: Command-line interface
 - `full`: Full toolkit with model conversion tools
 - `gpu`: Generic GPU profile (includes cuda and rocm)
 ### Image Variants
 Images are published in three types, each available for every backend listed below:
 - **server**: Only `llama-server` executable (API server)
 - **light**: Only `llama-cli` and `llama-completion` executables
 - **full**: Complete toolkit including model conversion tools
 Backend options:
 - Base (CPU)
 - `-cuda` (NVIDIA GPU)
 - `-rocm` (AMD GPU)
 - `-musa` (Moore Threads GPU)
 - `-intel` (Intel GPU with SYCL)
 - `-vulkan` (Vulkan GPU)
 ## Server API
 The server provides an OpenAI-compatible API:
 - `GET /health` - Health check
 - `GET /v1/models` - List available models
 - `POST /v1/chat/completions` - Chat completion
 - `POST /v1/completions` - Text completion
 - `POST /v1/embeddings` - Generate embeddings
 See the [llama.cpp server documentation](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) for full API details.
 ## Model Sources
 Download GGUF models from:
 - [Hugging Face GGUF Models](https://huggingface.co/models?library=gguf)
 - [TheBloke's GGUF Collection](https://huggingface.co/TheBloke)
 - Convert your own models using the full toolkit
 Popular quantization formats:
 - `Q4_K_M`: Good balance of quality and size (recommended)
 - `Q5_K_M`: Higher quality, larger size
 - `Q8_0`: Very high quality, large size
 - `Q2_K`: Smallest size, lower quality
 ## Resource Requirements
 Minimum requirements by model size:
 | Model Size | RAM (CPU) | VRAM (GPU) | Context Size |
 | ---------- | --------- | ---------- | ------------ |
 | 7B Q4_K_M  | 6GB       | 4GB        | 2048         |
 | 13B Q4_K_M | 10GB      | 8GB        | 2048         |
 | 34B Q4_K_M | 24GB      | 20GB       | 2048         |
 | 70B Q4_K_M | 48GB      | 40GB       | 2048         |
 Larger context sizes require proportionally more memory.
 ## Performance Tuning
 For CPU inference:
 - Increase `LLAMA_CPP_SERVER_CPU_LIMIT` for more cores
 - Optimize threads with `-t` flag (default: auto)
 For GPU inference:
 - Set `LLAMA_CPP_GPU_LAYERS=99` to offload all layers
 - Increase context size for longer conversations
 - Monitor GPU memory usage
 ## Security Notes
 - The server binds to `0.0.0.0` by default - ensure proper network security
 - No authentication is enabled by default
 - Consider using a reverse proxy (nginx, Caddy) for production deployments
 - Limit resource usage to prevent system exhaustion
 ## Troubleshooting
 ### Out of Memory
 - Reduce `LLAMA_CPP_CONTEXT_SIZE`
 - Use a smaller quantized model (e.g., Q4 instead of Q8)
 - Reduce `LLAMA_CPP_GPU_LAYERS` if using GPU
 ### GPU Not Detected
 **NVIDIA**: Verify nvidia-container-toolkit is installed:
 ```bash
 docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
 ```
 **AMD**: Ensure ROCm drivers and `/dev/kfd`, `/dev/dri` are accessible.
 ### Slow Inference
 - Check CPU/GPU utilization
 - Increase resource limits in `.env`
 - For GPU: Verify all layers are offloaded (`LLAMA_CPP_GPU_LAYERS=99`)
 ## Documentation
 - [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp)
 - [Docker Documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md)
 - [Server API Docs](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
 ## License
 llama.cpp is released under the MIT License. See the [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) file for details.
--- a/src/llama.cpp/README.zh.md
+++ b/src/llama.cpp/README.zh.md
@@ -0,0 +1,244 @@
 # llama.cpp
 [English Documentation](README.md)
 [llama.cpp](https://github.com/ggml-org/llama.cpp) 是一个高性能的 C/C++ 实现的大语言模型推理引擎，支持多种硬件加速器。
 ## 功能特性
 - **高速推理**：优化的 C/C++ 实现，提供高效的 LLM 推理
 - **多种后端**：支持 CPU、CUDA（NVIDIA）、ROCm（AMD）、MUSA（摩尔线程）、Intel GPU、Vulkan
 - **OpenAI 兼容 API**：服务器模式提供 OpenAI 兼容的 REST API
 - **CLI 支持**：交互式命令行界面，方便快速测试
 - **模型转换**：完整工具包包含模型转换和量化工具
 - **GGUF 格式**：支持高效的 GGUF 模型格式
 - **跨平台**：支持 Linux（x86-64、ARM64、s390x）、Windows、macOS
 ## 前置要求
 - 已安装 Docker 和 Docker Compose
 - 至少 4GB 内存（推荐 8GB 以上）
 - GPU 版本需要：
  - **CUDA**：NVIDIA GPU 及 [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
  - **ROCm**：AMD GPU 及相应的 ROCm 驱动
  - **MUSA**：摩尔线程 GPU 及 mt-container-toolkit
 - GGUF 格式的模型文件（例如从 [Hugging Face](https://huggingface.co/models?library=gguf) 下载）
 ## 快速开始
 ### 1. 服务器模式（CPU）
 ```bash
 # 复制并配置环境变量
 cp .env.example .env
 # 编辑 .env 并设置模型路径
 # LLAMA_CPP_MODEL_PATH=/models/your-model.gguf
 # 将 GGUF 模型放在目录中，然后更新 docker-compose.yaml 挂载，例如：
 # volumes:
 #   - ./models:/models
 # 启动服务器
 docker compose --profile server up -d
 # 测试服务器（OpenAI 兼容 API）
 curl http://localhost:8080/v1/models
 # 聊天补全请求
 curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "你好！"}
    ]
  }'
 ```
 ### 2. 服务器模式（NVIDIA GPU）
 ```bash
 # 编辑 .env
 # 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU
 # 启动 GPU 加速服务器
 docker compose --profile cuda up -d
 # 服务器将自动使用 NVIDIA GPU
 ```
 ### 3. 服务器模式（AMD GPU）
 ```bash
 # 编辑 .env
 # 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU
 # 启动 GPU 加速服务器
 docker compose --profile rocm up -d
 # 服务器将自动使用 AMD GPU
 ```
 ### 4. CLI 模式
 ```bash
 # 编辑 .env 并配置模型路径和提示词
 # 运行 CLI
 docker compose --profile cli up
 # 交互模式：
 docker compose run --rm llama-cpp-cli \
  -m /models/your-model.gguf \
  -p "你的提示词" \
  -n 512
 ```
 ### 5. 完整工具包（模型转换）
 ```bash
 # 启动完整容器
 docker compose --profile full up -d
 # 在容器内执行命令
 docker compose exec llama-cpp-full bash
 # 在容器内可以使用转换工具
 # 示例：转换 Hugging Face 模型
 # python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf
 ```
 ## 配置说明
 ### 环境变量
 主要环境变量（完整选项请查看 [.env.example](.env.example)）：
 | 变量                             | 说明                                                  | 默认值               |
 | -------------------------------- | ----------------------------------------------------- | -------------------- |
 | `LLAMA_CPP_SERVER_VARIANT`       | 服务器镜像变体（server、server-cuda、server-rocm 等） | `server`             |
 | `LLAMA_CPP_MODEL_PATH`           | 容器内模型文件路径                                    | `/models/model.gguf` |
 | `LLAMA_CPP_CONTEXT_SIZE`         | 上下文窗口大小（token 数）                            | `512`                |
 | `LLAMA_CPP_GPU_LAYERS`           | 卸载到 GPU 的层数（0=仅 CPU，99=全部；`cuda`/`rocm` 服务未设置时默认为 99） | `0`                  |
 | `LLAMA_CPP_SERVER_PORT_OVERRIDE` | 主机端口                                              | `8080`               |
 | `LLAMA_CPP_SERVER_MEMORY_LIMIT`  | 服务器内存限制                                        | `8G`                 |
 ### 可用配置文件
 - `server`：仅 CPU 服务器
 - `cuda`：NVIDIA GPU 服务器（需要 nvidia-container-toolkit）
 - `rocm`：AMD GPU 服务器（需要 ROCm）
 - `cli`：命令行界面
 - `full`：包含模型转换工具的完整工具包
 - `gpu`：通用 GPU 配置（会同时启动 cuda 和 rocm 两个服务；二者默认映射到同一主机端口，通常应只选择 `cuda` 或 `rocm` 其中之一）
 ### 镜像变体
 每个变体都有多种类型：
 - **server**：仅包含 `llama-server` 可执行文件（API 服务器）
 - **light**：仅包含 `llama-cli` 和 `llama-completion` 可执行文件
 - **full**：完整工具包，包括模型转换工具
 后端选项：
 - 基础版（CPU）
 - `-cuda`（NVIDIA GPU）
 - `-rocm`（AMD GPU）
 - `-musa`（摩尔线程 GPU）
 - `-intel`（Intel GPU，支持 SYCL）
 - `-vulkan`（Vulkan GPU）
 ## 服务器 API
 服务器提供 OpenAI 兼容的 API：
 - `GET /health` - 健康检查
 - `GET /v1/models` - 列出可用模型
 - `POST /v1/chat/completions` - 聊天补全
 - `POST /v1/completions` - 文本补全
 - `POST /v1/embeddings` - 生成嵌入向量
 完整 API 详情请参阅 [llama.cpp 服务器文档](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)。
 ## 模型来源
 下载 GGUF 模型：
 - [Hugging Face GGUF 模型](https://huggingface.co/models?library=gguf)
 - [TheBloke 的 GGUF 合集](https://huggingface.co/TheBloke)
 - 使用完整工具包转换您自己的模型
 常用量化格式：
 - `Q4_K_M`：质量和大小的良好平衡（推荐）
 - `Q5_K_M`：更高质量，更大体积
 - `Q8_0`：非常高的质量，大体积
 - `Q2_K`：最小体积，较低质量
 ## 资源需求
 按模型大小的最低要求：
 | 模型大小   | 内存（CPU） | 显存（GPU） | 上下文大小 |
 | ---------- | ----------- | ----------- | ---------- |
 | 7B Q4_K_M  | 6GB         | 4GB         | 2048       |
 | 13B Q4_K_M | 10GB        | 8GB         | 2048       |
 | 34B Q4_K_M | 24GB        | 20GB        | 2048       |
 | 70B Q4_K_M | 48GB        | 40GB        | 2048       |
 更大的上下文大小需要成比例的更多内存。
 ## 性能调优
 CPU 推理：
 - 增加 `LLAMA_CPP_SERVER_CPU_LIMIT` 以使用更多核心
 - 使用 `-t` 参数优化线程数（默认：自动）
 GPU 推理：
 - 设置 `LLAMA_CPP_GPU_LAYERS=99` 卸载所有层
 - 增加上下文大小以支持更长对话
 - 监控 GPU 内存使用
 ## 安全注意事项
 - 服务器默认绑定到 `0.0.0.0` - 请确保网络安全
 - 默认未启用身份验证
 - 生产环境建议使用反向代理（nginx、Caddy）
 - 限制资源使用以防止系统资源耗尽
 ## 故障排除
 ### 内存不足
 - 减小 `LLAMA_CPP_CONTEXT_SIZE`
 - 使用更小的量化模型（例如 Q4 而不是 Q8）
 - 减少 `LLAMA_CPP_GPU_LAYERS`（如果使用 GPU）
 ### GPU 未检测到
 **NVIDIA**：验证 nvidia-container-toolkit 是否已安装：
 ```bash
 docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
 ```
 **AMD**：确保 ROCm 驱动已安装且 `/dev/kfd`、`/dev/dri` 可访问。
 ### 推理速度慢
 - 检查 CPU/GPU 利用率
 - 增加 `.env` 中的资源限制
 - GPU：验证所有层都已卸载（`LLAMA_CPP_GPU_LAYERS=99`）
 ## 文档
 - [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp)
 - [Docker 文档](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md)
 - [服务器 API 文档](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md)
 ## 许可证
 llama.cpp 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) 文件。
--- a/src/llama.cpp/docker-compose.yaml
+++ b/src/llama.cpp/docker-compose.yaml
@@ -0,0 +1,210 @@
 # Docker Compose configuration for llama.cpp
 # https://github.com/ggml-org/llama.cpp
 # LLM inference in C/C++ with support for various hardware accelerators
 # Shared service settings, merged into each service below via `<<: *defaults`.
 x-defaults: &defaults
  # Cap container logs: json-file driver, at most 3 files of 100m each.
  logging:
    driver: "json-file"
    options:
      max-file: "3"
      max-size: "100m"
  # Restart the container unless it was explicitly stopped.
  restart: "unless-stopped"
 services:
  # llama.cpp server - OpenAI-compatible API server (CPU image by default).
  # LLAMA_CPP_SERVER_VARIANT selects the image tag: server (CPU),
  # server-cuda (NVIDIA GPU), server-rocm (AMD GPU).
  # Publishes the same host port as the other server services in this file,
  # so only one of them can run at a time with the default port settings.
  llama-cpp-server:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_SERVER_VARIANT:-server}
    ports:
      - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - llama_cpp_models:/models
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "--port"
      - "8080"
      - "--host"
      - "0.0.0.0"
      # Context window size in tokens. llama-server's "-n" means --n-predict
      # (maximum tokens to generate), so the context size goes to --ctx-size.
      - "--ctx-size"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
      # 0 keeps all layers on the CPU for this service.
      - "--n-gpu-layers"
      - "${LLAMA_CPP_GPU_LAYERS:-0}"
    environment:
      - TZ=${TZ:-UTC}
    healthcheck:
      # Probe the server's /health endpoint. The upstream image installs curl
      # (its own HEALTHCHECK uses it); wget is not available there.
      test:
        - "CMD"
        - "curl"
        - "--fail"
        - "--silent"
        - "http://localhost:8080/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G}
    profiles:
      - server
  # llama.cpp server with NVIDIA GPU support (server-cuda image).
  # Started by the `cuda` profile and by the generic `gpu` profile.
  # Publishes the same host port as the other server services in this file,
  # so only one of them can run at a time with the default port settings.
  llama-cpp-server-cuda:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-cuda
    ports:
      - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - llama_cpp_models:/models
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "--port"
      - "8080"
      - "--host"
      - "0.0.0.0"
      # Context window size in tokens. llama-server's "-n" means --n-predict
      # (maximum tokens to generate), so the context size goes to --ctx-size.
      - "--ctx-size"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
      # Defaults to 99 here so that all layers are offloaded to the GPU.
      - "--n-gpu-layers"
      - "${LLAMA_CPP_GPU_LAYERS:-99}"
    environment:
      - TZ=${TZ:-UTC}
    healthcheck:
      # Probe the server's /health endpoint. The upstream image installs curl
      # (its own HEALTHCHECK uses it); wget is not available there.
      test:
        - "CMD"
        - "curl"
        - "--fail"
        - "--silent"
        - "http://localhost:8080/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G}
          # Reserve NVIDIA GPU(s) for the container; count is configurable
          # through LLAMA_CPP_GPU_COUNT (default 1).
          devices:
            - driver: nvidia
              count: ${LLAMA_CPP_GPU_COUNT:-1}
              capabilities: [gpu]
    profiles:
      - gpu
      - cuda
  # llama.cpp server with AMD ROCm GPU support (server-rocm image).
  # Started by the `rocm` profile and by the generic `gpu` profile.
  # Publishes the same host port as the other server services in this file,
  # so only one of them can run at a time with the default port settings.
  llama-cpp-server-rocm:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-rocm
    ports:
      - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - llama_cpp_models:/models
    # Pass the host's ROCm device nodes through to the container.
    devices:
      - /dev/kfd
      - /dev/dri
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "--port"
      - "8080"
      - "--host"
      - "0.0.0.0"
      # Context window size in tokens. llama-server's "-n" means --n-predict
      # (maximum tokens to generate), so the context size goes to --ctx-size.
      - "--ctx-size"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
      # Defaults to 99 here so that all layers are offloaded to the GPU.
      - "--n-gpu-layers"
      - "${LLAMA_CPP_GPU_LAYERS:-99}"
    environment:
      - TZ=${TZ:-UTC}
    healthcheck:
      # Probe the server's /health endpoint. The upstream image installs curl
      # (its own HEALTHCHECK uses it); wget is not available there.
      test:
        - "CMD"
        - "curl"
        - "--fail"
        - "--silent"
        - "http://localhost:8080/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G}
    profiles:
      - gpu
      - rocm
  # llama.cpp CLI (light image) - runs /app/llama-cli once with the
  # configured model and prompt. Started by the `cli` profile.
  llama-cpp-cli:
    <<: *defaults
    # One-shot command: override the shared `unless-stopped` policy so the
    # container is not restarted (and the prompt re-run) every time it exits.
    restart: "no"
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_CLI_VARIANT:-light}
    volumes:
      - llama_cpp_models:/models
    entrypoint: /app/llama-cli
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "-p"
      - "${LLAMA_CPP_PROMPT:-Hello, how are you?}"
      # NOTE(review): llama-cli's "-n" caps the number of generated tokens; it
      # is fed from LLAMA_CPP_CONTEXT_SIZE here - confirm this is intended.
      - "-n"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
    environment:
      - TZ=${TZ:-UTC}
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_CLI_CPU_LIMIT:-2.0}
          memory: ${LLAMA_CPP_CLI_MEMORY_LIMIT:-4G}
        reservations:
          cpus: ${LLAMA_CPP_CLI_CPU_RESERVATION:-1.0}
          memory: ${LLAMA_CPP_CLI_MEMORY_RESERVATION:-2G}
    profiles:
      - cli
  # llama.cpp full - complete toolkit image including model conversion tools.
  # The container idles so tools can be run via `docker compose exec`.
  # Started by the `full` profile.
  llama-cpp-full:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_FULL_VARIANT:-full}
    volumes:
      - llama_cpp_models:/models
    # The full image's entrypoint is /app/tools.sh, which only accepts its own
    # sub-commands (--run, --convert, ...). Passing "sleep infinity" as a
    # command would make it exit immediately, so replace the entrypoint.
    entrypoint: ["sleep", "infinity"]
    environment:
      - TZ=${TZ:-UTC}
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_FULL_CPU_LIMIT:-2.0}
          memory: ${LLAMA_CPP_FULL_MEMORY_LIMIT:-4G}
        reservations:
          cpus: ${LLAMA_CPP_FULL_CPU_RESERVATION:-1.0}
          memory: ${LLAMA_CPP_FULL_MEMORY_RESERVATION:-2G}
    profiles:
      - full
 # Named volume mounted at /models by every service in this file; declared
 # with no options, so Compose's default volume settings apply.
 volumes:
  llama_cpp_models: