diff --git a/README.md b/README.md index e5c2f09..b2d131e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ These services require building custom Docker images from source. | [Clash](./src/clash) | 1.18.0 | | [ClickHouse](./src/clickhouse) | 24.11.1 | | [Conductor](./src/conductor) | latest | +| [DeepTutor](./apps/deeptutor) | latest | | [Dify](./apps/dify) | 0.18.2 | | [DNSMasq](./src/dnsmasq) | 2.91 | | [Dockge](./src/dockge) | 1 | @@ -72,6 +73,7 @@ These services require building custom Docker images from source. | [LibreOffice](./src/libreoffice) | latest | | [libSQL Server](./src/libsql) | latest | | [LiteLLM](./src/litellm) | main-stable | +| [llama.cpp](./src/llama.cpp) | server | | [LMDeploy](./src/lmdeploy) | v0.11.1 | | [Logstash](./src/logstash) | 8.16.1 | | [MariaDB Galera Cluster](./src/mariadb-galera) | 11.7.2 | diff --git a/README.zh.md b/README.zh.md index 4308efa..68fc15e 100644 --- a/README.zh.md +++ b/README.zh.md @@ -34,6 +34,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [Clash](./src/clash) | 1.18.0 | | [ClickHouse](./src/clickhouse) | 24.11.1 | | [Conductor](./src/conductor) | latest | +| [DeepTutor](./apps/deeptutor) | latest | | [Dify](./apps/dify) | 0.18.2 | | [DNSMasq](./src/dnsmasq) | 2.91 | | [Dockge](./src/dockge) | 1 | @@ -72,6 +73,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [LibreOffice](./src/libreoffice) | latest | | [libSQL Server](./src/libsql) | latest | | [LiteLLM](./src/litellm) | main-stable | +| [llama.cpp](./src/llama.cpp) | server | | [LMDeploy](./src/lmdeploy) | v0.11.1 | | [Logstash](./src/logstash) | 8.16.1 | | [MariaDB Galera Cluster](./src/mariadb-galera) | 11.7.2 | diff --git a/apps/deeptutor/.env.example b/apps/deeptutor/.env.example new file mode 100644 index 0000000..1a8419c --- /dev/null +++ b/apps/deeptutor/.env.example @@ -0,0 +1,97 @@ +# DeepTutor Configuration +# Copy this file to .env and fill in your API keys + +#! ================================================== +#! General Settings +#! ================================================== + +# Timezone (default: UTC) +TZ=UTC + +# User and Group ID for file permissions (default: 1000) +# Adjust if your host user has a different UID/GID +PUID=1000 +PGID=1000 + +# Global registry prefix (optional) +# Example: registry.example.com/ or leave empty for Docker Hub/GHCR +GLOBAL_REGISTRY= + +#! ================================================== +#! DeepTutor Version +#! ================================================== + +# Image version (default: latest) +# Available tags: latest, v0.5.x +# See: https://github.com/HKUDS/DeepTutor/pkgs/container/deeptutor +DEEPTUTOR_VERSION=latest + +#! ================================================== +#! Port Configuration +#! ================================================== + +# Backend port (internal: 8001) +BACKEND_PORT=8001 +# Host port override for backend +DEEPTUTOR_BACKEND_PORT_OVERRIDE=8001 + +# Frontend port (internal: 3782) +FRONTEND_PORT=3782 +# Host port override for frontend +DEEPTUTOR_FRONTEND_PORT_OVERRIDE=3782 + +#! ================================================== +#! API Base URLs +#! ================================================== + +# Internal API base URL (used by frontend to communicate with backend) +NEXT_PUBLIC_API_BASE=http://localhost:8001 + +# External API base URL (for cloud deployment, set to your public URL) +# Example: https://your-server.com:8001 +# For local deployment, use the same as NEXT_PUBLIC_API_BASE +NEXT_PUBLIC_API_BASE_EXTERNAL=http://localhost:8001 + +#! 
================================================== +#! LLM API Keys (Required) +#! ================================================== + +# OpenAI API Key (Required) +# Get from: https://platform.openai.com/api-keys +OPENAI_API_KEY=sk-your-openai-api-key-here + +# OpenAI Base URL (default: https://api.openai.com/v1) +# For OpenAI-compatible APIs (e.g., Azure OpenAI, custom endpoints) +OPENAI_BASE_URL=https://api.openai.com/v1 + +# Default LLM Model (default: gpt-4o) +# Options: gpt-4o, gpt-4-turbo, gpt-4, gpt-3.5-turbo, etc. +DEFAULT_MODEL=gpt-4o + +#! ================================================== +#! Additional LLM API Keys (Optional) +#! ================================================== + +# Anthropic API Key (Optional, for Claude models) +# Get from: https://console.anthropic.com/ +ANTHROPIC_API_KEY= + +# Perplexity API Key (Optional, for web search) +# Get from: https://www.perplexity.ai/settings/api +PERPLEXITY_API_KEY= + +# DashScope API Key (Optional, for Alibaba Cloud models) +# Get from: https://dashscope.console.aliyun.com/ +DASHSCOPE_API_KEY= + +#! ================================================== +#! Resource Limits +#! ================================================== + +# CPU limits (default: 4.00 cores limit, 1.00 cores reservation) +DEEPTUTOR_CPU_LIMIT=4.00 +DEEPTUTOR_CPU_RESERVATION=1.00 + +# Memory limits (default: 8G limit, 2G reservation) +DEEPTUTOR_MEMORY_LIMIT=8G +DEEPTUTOR_MEMORY_RESERVATION=2G diff --git a/apps/deeptutor/README.md b/apps/deeptutor/README.md new file mode 100644 index 0000000..8453cbd --- /dev/null +++ b/apps/deeptutor/README.md @@ -0,0 +1,248 @@ +# DeepTutor + +[中文说明](README.zh.md) | English + +## Overview + +DeepTutor is an AI-powered personalized learning assistant that transforms any document into an interactive learning experience with multi-agent intelligence. It helps you solve problems, generate questions, conduct research, collaborate on writing, organize notes, and guides you through learning paths. + +**Project:** +**License:** Apache-2.0 +**Documentation:** + +## Features + +- **Problem Solving** — Detailed step-by-step solutions with visual diagrams +- **Question Generation** — Adaptive questions based on your knowledge level +- **Research Assistant** — Deep research with multi-agent collaboration +- **Co-Writer** — Interactive idea generation and writing assistance +- **Smart Notebook** — Organize and retrieve learning materials efficiently +- **Guided Learning** — Personalized learning paths and progress tracking +- **Multi-Agent System** — Specialized agents for different learning tasks +- **RAG Integration** — LightRAG and RAG-Anything for knowledge retrieval +- **Code Execution** — Built-in code playground for practice + +## Quick Start + +### Prerequisites + +- Docker and Docker Compose +- OpenAI API key (required) +- Optional: Anthropic, Perplexity, or DashScope API keys + +### Installation + +1. **Clone this repository** + + ```bash + git clone + cd apps/deeptutor + ``` + +2. **Configure environment** + + ```bash + cp .env.example .env + # Edit .env and add your API keys + ``` + + **Required configuration:** + - `OPENAI_API_KEY` — Your OpenAI API key + + **Optional configuration:** + - `ANTHROPIC_API_KEY` — For Claude models + - `PERPLEXITY_API_KEY` — For web search + - `DASHSCOPE_API_KEY` — For Alibaba Cloud models + - Adjust ports if needed (default: 8001 for backend, 3782 for frontend) + - Set `NEXT_PUBLIC_API_BASE_EXTERNAL` for cloud deployments + +3. 
**Optional: Custom agent configuration** + + Create a `config/agents.yaml` file to customize agent behaviors (see [documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for details). + +4. **Start the service** + + ```bash + docker compose up -d + ``` + + First run takes approximately 30-60 seconds to initialize. + +5. **Access the application** + + - **Frontend:** + - **Backend API:** + - **API Documentation:** + +## Usage + +### Create Knowledge Base + +1. Navigate to +2. Click "New Knowledge Base" +3. Upload documents (supports PDF, DOCX, TXT, Markdown, HTML, etc.) +4. Wait for processing to complete + +### Learning Modes + +- **Solve** — Get step-by-step solutions to problems +- **Question** — Generate practice questions based on your materials +- **Research** — Deep research with multi-agent collaboration +- **Co-Writer** — Interactive writing and idea generation +- **Notebook** — Organize and manage your learning materials +- **Guide** — Follow personalized learning paths + +### Advanced Features + +- **Code Execution** — Practice coding directly in the interface +- **Visual Diagrams** — Automatic diagram generation for complex concepts +- **Export** — Download your work as PDF or Markdown +- **Multi-language** — Support for multiple languages + +## Configuration + +### Environment Variables + +Key environment variables (see [.env.example](.env.example) for all options): + +| Variable | Default | Description | +| ------------------------ | ---------- | ------------------------- | +| `OPENAI_API_KEY` | (required) | Your OpenAI API key | +| `DEFAULT_MODEL` | `gpt-4o` | Default LLM model | +| `BACKEND_PORT` | `8001` | Backend server port | +| `FRONTEND_PORT` | `3782` | Frontend application port | +| `DEEPTUTOR_CPU_LIMIT` | `4.00` | CPU limit (cores) | +| `DEEPTUTOR_MEMORY_LIMIT` | `8G` | Memory limit | + +### Ports + +- **8001** — Backend API server +- **3782** — Frontend web interface + +### Volumes + +- `deeptutor_data` — User data, knowledge bases, and learning materials +- `./config` — Custom agent configurations (optional) + +## Resource Requirements + +**Minimum:** + +- CPU: 1 core +- Memory: 2GB +- Disk: 2GB + space for knowledge bases + +**Recommended:** + +- CPU: 4 cores +- Memory: 8GB +- Disk: 10GB+ + +## Supported Models + +DeepTutor supports multiple LLM providers: + +- **OpenAI** — GPT-4, GPT-4 Turbo, GPT-3.5 Turbo +- **Anthropic** — Claude 3 (Opus, Sonnet, Haiku) +- **Perplexity** — For web search integration +- **DashScope** — Alibaba Cloud models +- **OpenAI-compatible APIs** — Any API compatible with OpenAI format + +## Troubleshooting + +### Backend fails to start + +- Verify `OPENAI_API_KEY` is set correctly in `.env` +- Check logs: `docker compose logs -f` +- Ensure ports 8001 and 3782 are not in use +- Verify sufficient disk space for volumes + +### Frontend cannot connect to backend + +- Confirm backend is running: visit +- For cloud deployments, set `NEXT_PUBLIC_API_BASE_EXTERNAL` to your public URL +- Check firewall settings + +### Knowledge base processing fails + +- Ensure sufficient memory (recommended 8GB+) +- Check document format is supported +- Review logs for specific errors + +### API rate limits + +- Monitor your API usage on provider dashboards +- Consider upgrading your API plan +- Use different models for different tasks + +## Security Notes + +- **API Keys** — Keep your API keys secure, never commit them to version control +- **Network Exposure** — For production deployments, use HTTPS and proper authentication +- **Data Privacy** — 
User data is stored in Docker volumes; ensure proper backup and security +- **Resource Limits** — Set appropriate CPU and memory limits to prevent resource exhaustion + +## Updates + +To update to the latest version: + +```bash +# Pull the latest image +docker compose pull + +# Recreate containers +docker compose up -d +``` + +To update to a specific version, edit `DEEPTUTOR_VERSION` in `.env` and run: + +```bash +docker compose up -d +``` + +## Advanced Usage + +### Custom Agent Configuration + +Create `config/agents.yaml` to customize agent behaviors: + +```yaml +agents: + solver: + model: gpt-4o + temperature: 0.7 + researcher: + model: gpt-4-turbo + max_tokens: 4000 +``` + +See [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for detailed configuration options. + +### Cloud Deployment + +For cloud deployment, additional configuration is needed: + +1. Set public URL in `.env`: + + ```env + NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001 + ``` + +2. Configure reverse proxy (nginx/Caddy) for HTTPS +3. Ensure proper firewall rules +4. Consider using environment-specific secrets management + +### Using Different Embedding Models + +DeepTutor uses `text-embedding-3-large` by default. To use different embedding models, refer to the [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html). + +## Links + +- **GitHub:** +- **Documentation:** +- **Issues:** +- **Discussions:** + +## License + +DeepTutor is licensed under the Apache-2.0 License. See the [official repository](https://github.com/HKUDS/DeepTutor) for details. diff --git a/apps/deeptutor/README.zh.md b/apps/deeptutor/README.zh.md new file mode 100644 index 0000000..37da444 --- /dev/null +++ b/apps/deeptutor/README.zh.md @@ -0,0 +1,248 @@ +# DeepTutor + +中文说明 | [English](README.md) + +## 概述 + +DeepTutor 是一个 AI 驱动的个性化学习助手,通过多智能体系统将任何文档转化为交互式学习体验。它可以帮助您解决问题、生成题目、进行研究、协作写作、整理笔记,并引导您完成学习路径。 + +**项目地址:** +**许可证:** Apache-2.0 +**文档:** + +## 功能特性 + +- **问题求解** — 提供详细的分步解决方案和可视化图表 +- **题目生成** — 根据您的知识水平生成自适应题目 +- **研究助手** — 通过多智能体协作进行深度研究 +- **协作写作** — 交互式创意生成和写作辅助 +- **智能笔记** — 高效组织和检索学习材料 +- **引导学习** — 个性化学习路径和进度跟踪 +- **多智能体系统** — 针对不同学习任务的专业智能体 +- **RAG 集成** — 使用 LightRAG 和 RAG-Anything 进行知识检索 +- **代码执行** — 内置代码练习环境 + +## 快速开始 + +### 前置要求 + +- Docker 和 Docker Compose +- OpenAI API 密钥(必需) +- 可选:Anthropic、Perplexity 或 DashScope API 密钥 + +### 安装步骤 + +1. **克隆仓库** + + ```bash + git clone + cd apps/deeptutor + ``` + +2. **配置环境变量** + + ```bash + cp .env.example .env + # 编辑 .env 文件并添加您的 API 密钥 + ``` + + **必需配置:** + - `OPENAI_API_KEY` — 您的 OpenAI API 密钥 + + **可选配置:** + - `ANTHROPIC_API_KEY` — 用于 Claude 模型 + - `PERPLEXITY_API_KEY` — 用于网络搜索 + - `DASHSCOPE_API_KEY` — 用于阿里云模型 + - 如需调整端口(默认:后端 8001,前端 3782) + - 云端部署时设置 `NEXT_PUBLIC_API_BASE_EXTERNAL` + +3. **可选:自定义智能体配置** + + 创建 `config/agents.yaml` 文件以自定义智能体行为(详见[文档](https://hkuds.github.io/DeepTutor/guide/config.html))。 + +4. **启动服务** + + ```bash + docker compose up -d + ``` + + 首次运行需要约 30-60 秒初始化。 + +5. **访问应用** + + - **前端界面:** + - **后端 API:** + - **API 文档:** + +## 使用方法 + +### 创建知识库 + +1. 访问 +2. 点击"新建知识库" +3. 上传文档(支持 PDF、DOCX、TXT、Markdown、HTML 等) +4. 
等待处理完成 + +### 学习模式 + +- **求解(Solve)** — 获取问题的分步解决方案 +- **题目(Question)** — 基于学习材料生成练习题 +- **研究(Research)** — 通过多智能体协作进行深度研究 +- **协作写作(Co-Writer)** — 交互式写作和创意生成 +- **笔记(Notebook)** — 组织和管理学习材料 +- **引导(Guide)** — 遵循个性化学习路径 + +### 高级功能 + +- **代码执行** — 在界面中直接练习编码 +- **可视化图表** — 为复杂概念自动生成图表 +- **导出** — 将您的工作下载为 PDF 或 Markdown +- **多语言支持** — 支持多种语言 + +## 配置说明 + +### 环境变量 + +主要环境变量(所有选项见 [.env.example](.env.example)): + +| 变量 | 默认值 | 描述 | +| ------------------------ | -------- | -------------------- | +| `OPENAI_API_KEY` | (必需) | 您的 OpenAI API 密钥 | +| `DEFAULT_MODEL` | `gpt-4o` | 默认 LLM 模型 | +| `BACKEND_PORT` | `8001` | 后端服务器端口 | +| `FRONTEND_PORT` | `3782` | 前端应用端口 | +| `DEEPTUTOR_CPU_LIMIT` | `4.00` | CPU 限制(核心数) | +| `DEEPTUTOR_MEMORY_LIMIT` | `8G` | 内存限制 | + +### 端口说明 + +- **8001** — 后端 API 服务器 +- **3782** — 前端 Web 界面 + +### 数据卷 + +- `deeptutor_data` — 用户数据、知识库和学习材料 +- `./config` — 自定义智能体配置(可选) + +## 资源要求 + +**最低配置:** + +- CPU:1 核心 +- 内存:2GB +- 磁盘:2GB + 知识库所需空间 + +**推荐配置:** + +- CPU:4 核心 +- 内存:8GB +- 磁盘:10GB+ + +## 支持的模型 + +DeepTutor 支持多个 LLM 提供商: + +- **OpenAI** — GPT-4、GPT-4 Turbo、GPT-3.5 Turbo +- **Anthropic** — Claude 3(Opus、Sonnet、Haiku) +- **Perplexity** — 用于网络搜索集成 +- **DashScope** — 阿里云模型 +- **OpenAI 兼容 API** — 任何与 OpenAI 格式兼容的 API + +## 故障排查 + +### 后端启动失败 + +- 验证 `.env` 中的 `OPENAI_API_KEY` 是否正确设置 +- 查看日志:`docker compose logs -f` +- 确保端口 8001 和 3782 未被占用 +- 验证数据卷有足够的磁盘空间 + +### 前端无法连接后端 + +- 确认后端正在运行:访问 +- 云端部署时,将 `NEXT_PUBLIC_API_BASE_EXTERNAL` 设置为您的公网 URL +- 检查防火墙设置 + +### 知识库处理失败 + +- 确保有足够的内存(推荐 8GB+) +- 检查文档格式是否支持 +- 查看日志了解具体错误 + +### API 速率限制 + +- 在提供商控制台监控 API 使用情况 +- 考虑升级 API 计划 +- 为不同任务使用不同模型 + +## 安全提示 + +- **API 密钥** — 妥善保管您的 API 密钥,切勿提交到版本控制系统 +- **网络暴露** — 生产环境部署时,使用 HTTPS 和适当的身份验证 +- **数据隐私** — 用户数据存储在 Docker 卷中,请确保适当的备份和安全措施 +- **资源限制** — 设置合适的 CPU 和内存限制以防止资源耗尽 + +## 更新 + +更新到最新版本: + +```bash +# 拉取最新镜像 +docker compose pull + +# 重新创建容器 +docker compose up -d +``` + +更新到特定版本,编辑 `.env` 中的 `DEEPTUTOR_VERSION` 并运行: + +```bash +docker compose up -d +``` + +## 高级用法 + +### 自定义智能体配置 + +创建 `config/agents.yaml` 以自定义智能体行为: + +```yaml +agents: + solver: + model: gpt-4o + temperature: 0.7 + researcher: + model: gpt-4-turbo + max_tokens: 4000 +``` + +详细配置选项请参见[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。 + +### 云端部署 + +云端部署需要额外配置: + +1. 在 `.env` 中设置公网 URL: + + ```env + NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001 + ``` + +2. 配置反向代理(nginx/Caddy)以支持 HTTPS +3. 确保适当的防火墙规则 +4. 
考虑使用特定环境的密钥管理 + +### 使用不同的嵌入模型 + +DeepTutor 默认使用 `text-embedding-3-large`。要使用不同的嵌入模型,请参考[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。 + +## 相关链接 + +- **GitHub:** +- **文档:** +- **问题反馈:** +- **讨论区:** + +## 许可证 + +DeepTutor 使用 Apache-2.0 许可证。详情请参见[官方仓库](https://github.com/HKUDS/DeepTutor)。 diff --git a/apps/deeptutor/docker-compose.yaml b/apps/deeptutor/docker-compose.yaml new file mode 100644 index 0000000..86aec5a --- /dev/null +++ b/apps/deeptutor/docker-compose.yaml @@ -0,0 +1,68 @@ +# DeepTutor: AI-Powered Personalized Learning Assistant +# https://github.com/HKUDS/DeepTutor +# Transform any document into an interactive learning experience with multi-agent intelligence + +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + deeptutor: + <<: *defaults + image: ${GLOBAL_REGISTRY:-ghcr.io}/hkuds/deeptutor:${DEEPTUTOR_VERSION:-latest} + ports: + - "${DEEPTUTOR_BACKEND_PORT_OVERRIDE:-8001}:${BACKEND_PORT:-8001}" + - "${DEEPTUTOR_FRONTEND_PORT_OVERRIDE:-3782}:${FRONTEND_PORT:-3782}" + volumes: + - deeptutor_data:/app/data + - ./config:/app/config:ro + environment: + - TZ=${TZ:-UTC} + # Backend port + - BACKEND_PORT=${BACKEND_PORT:-8001} + # Frontend port + - FRONTEND_PORT=${FRONTEND_PORT:-3782} + # API base URLs + - NEXT_PUBLIC_API_BASE=${NEXT_PUBLIC_API_BASE:-http://localhost:8001} + - NEXT_PUBLIC_API_BASE_EXTERNAL=${NEXT_PUBLIC_API_BASE_EXTERNAL:-http://localhost:8001} + # LLM API Keys + - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - PERPLEXITY_API_KEY=${PERPLEXITY_API_KEY:-} + - DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-} + # Default LLM model + - DEFAULT_MODEL=${DEFAULT_MODEL:-gpt-4o} + # User ID and Group ID for permission management + - PUID=${PUID:-1000} + - PGID=${PGID:-1000} + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:${BACKEND_PORT:-8001}/health", + "||", + "exit", + "1", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + deploy: + resources: + limits: + cpus: ${DEEPTUTOR_CPU_LIMIT:-4.00} + memory: ${DEEPTUTOR_MEMORY_LIMIT:-8G} + reservations: + cpus: ${DEEPTUTOR_CPU_RESERVATION:-1.00} + memory: ${DEEPTUTOR_MEMORY_RESERVATION:-2G} + +volumes: + deeptutor_data: diff --git a/src/llama.cpp/.env.example b/src/llama.cpp/.env.example new file mode 100644 index 0000000..90a2b18 --- /dev/null +++ b/src/llama.cpp/.env.example @@ -0,0 +1,106 @@ +# ============================================================================= +# llama.cpp Configuration +# https://github.com/ggml-org/llama.cpp +# LLM inference in C/C++ with support for various hardware accelerators +# ============================================================================= + +# ----------------------------------------------------------------------------- +# General Settings +# ----------------------------------------------------------------------------- + +# Timezone for the container (default: UTC) +TZ=UTC + +# Global registry prefix (optional) +# Example: docker.io/, ghcr.io/, registry.example.com/ +GHCR_REGISTRY=ghcr.io/ + +# ----------------------------------------------------------------------------- +# Server Configuration +# ----------------------------------------------------------------------------- + +# Server image variant +# Options: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU), +# server-musa (Moore Threads GPU), server-intel (Intel GPU), +# 
server-vulkan (Vulkan GPU) +LLAMA_CPP_SERVER_VARIANT=server + +# Server port override (default: 8080) +LLAMA_CPP_SERVER_PORT_OVERRIDE=8080 + +# Model path inside the container +# You need to mount your model file to this path +# Example: /models/llama-2-7b-chat.Q4_K_M.gguf +LLAMA_CPP_MODEL_PATH=/models/model.gguf + +# Context size (number of tokens) +# Larger values allow for more context but require more memory +# Default: 512, Common values: 512, 2048, 4096, 8192, 16384, 32768 +LLAMA_CPP_CONTEXT_SIZE=512 + +# Number of GPU layers to offload +# 0 = CPU only, 99 = all layers on GPU (for GPU variants) +# For CPU variant, keep this at 0 +LLAMA_CPP_GPU_LAYERS=0 + +# Number of GPUs to use (for CUDA variant) +LLAMA_CPP_GPU_COUNT=1 + +# Server CPU limit (in cores) +LLAMA_CPP_SERVER_CPU_LIMIT=4.0 + +# Server CPU reservation (in cores) +LLAMA_CPP_SERVER_CPU_RESERVATION=2.0 + +# Server memory limit +LLAMA_CPP_SERVER_MEMORY_LIMIT=8G + +# Server memory reservation +LLAMA_CPP_SERVER_MEMORY_RESERVATION=4G + +# ----------------------------------------------------------------------------- +# CLI Configuration (Light variant) +# ----------------------------------------------------------------------------- + +# CLI image variant +# Options: light (CPU), light-cuda (NVIDIA GPU), light-rocm (AMD GPU), +# light-musa (Moore Threads GPU), light-intel (Intel GPU), +# light-vulkan (Vulkan GPU) +LLAMA_CPP_CLI_VARIANT=light + +# Default prompt for CLI mode +LLAMA_CPP_PROMPT=Hello, how are you? + +# CLI CPU limit (in cores) +LLAMA_CPP_CLI_CPU_LIMIT=2.0 + +# CLI CPU reservation (in cores) +LLAMA_CPP_CLI_CPU_RESERVATION=1.0 + +# CLI memory limit +LLAMA_CPP_CLI_MEMORY_LIMIT=4G + +# CLI memory reservation +LLAMA_CPP_CLI_MEMORY_RESERVATION=2G + +# ----------------------------------------------------------------------------- +# Full Toolkit Configuration +# ----------------------------------------------------------------------------- + +# Full image variant (includes model conversion tools) +# Options: full (CPU), full-cuda (NVIDIA GPU), full-rocm (AMD GPU), +# full-musa (Moore Threads GPU), full-intel (Intel GPU), +# full-vulkan (Vulkan GPU) +LLAMA_CPP_FULL_VARIANT=full + +# Full CPU limit (in cores) +LLAMA_CPP_FULL_CPU_LIMIT=2.0 + +# Full CPU reservation (in cores) +LLAMA_CPP_FULL_CPU_RESERVATION=1.0 + +# Full memory limit +LLAMA_CPP_FULL_MEMORY_LIMIT=4G + +# Full memory reservation +LLAMA_CPP_FULL_MEMORY_RESERVATION=2G diff --git a/src/llama.cpp/README.md b/src/llama.cpp/README.md new file mode 100644 index 0000000..bdf4c42 --- /dev/null +++ b/src/llama.cpp/README.md @@ -0,0 +1,245 @@ +# llama.cpp + +[中文文档](README.zh.md) + +[llama.cpp](https://github.com/ggml-org/llama.cpp) is a high-performance C/C++ implementation for LLM inference with support for various hardware accelerators. 
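+
+In server mode it exposes an OpenAI-compatible HTTP API, so plain `curl` or any OpenAI-style client can drive a local model. As a quick taste — assuming the `server` profile from this directory is already running on the default port 8080 with a model mounted (see Quick Start below) — a request plus a `jq` one-liner to pull out just the reply might look like this:
+
+```bash
+# Ask the local server a question and print only the assistant's reply.
+# Requires curl and jq; the "model" field can be omitted because llama-server
+# answers with whichever model it was started with.
+curl -s http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "messages": [{"role": "user", "content": "Explain GGUF in one sentence."}],
+        "max_tokens": 128
+      }' | jq -r '.choices[0].message.content'
+```
+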
+ +## Features + +- **Fast Inference**: Optimized C/C++ implementation for efficient LLM inference +- **Multiple Backends**: CPU, CUDA (NVIDIA), ROCm (AMD), MUSA (Moore Threads), Intel GPU, Vulkan +- **OpenAI-compatible API**: Server mode with OpenAI-compatible REST API +- **CLI Support**: Interactive command-line interface for quick testing +- **Model Conversion**: Full toolkit includes tools to convert and quantize models +- **GGUF Format**: Support for the efficient GGUF model format +- **Cross-platform**: Linux (x86-64, ARM64, s390x), Windows, macOS + +## Prerequisites + +- Docker and Docker Compose installed +- At least 4GB of RAM (8GB+ recommended) +- For GPU variants: + - **CUDA**: NVIDIA GPU with [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) + - **ROCm**: AMD GPU with proper ROCm drivers + - **MUSA**: Moore Threads GPU with mt-container-toolkit +- GGUF format model file (e.g., from [Hugging Face](https://huggingface.co/models?library=gguf)) + +## Quick Start + +### 1. Server Mode (CPU) + +```bash +# Copy and configure environment +cp .env.example .env + +# Edit .env and set your model path +# LLAMA_CPP_MODEL_PATH=/models/your-model.gguf + +# Place your GGUF model in a directory, then update docker-compose.yaml +# to mount it, e.g.: +# volumes: +# - ./models:/models + +# Start the server +docker compose --profile server up -d + +# Test the server (OpenAI-compatible API) +curl http://localhost:8080/v1/models + +# Chat completion request +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' +``` + +### 2. Server Mode with NVIDIA GPU + +```bash +# Edit .env +# Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU + +# Start GPU-accelerated server +docker compose --profile cuda up -d + +# The server will automatically use NVIDIA GPU +``` + +### 3. Server Mode with AMD GPU + +```bash +# Edit .env +# Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU + +# Start GPU-accelerated server +docker compose --profile rocm up -d + +# The server will automatically use AMD GPU +``` + +### 4. CLI Mode + +```bash +# Edit .env and configure model path and prompt + +# Run CLI +docker compose --profile cli up + +# For interactive mode, use: +docker compose run --rm llama-cpp-cli \ + -m /models/your-model.gguf \ + -p "Your prompt here" \ + -n 512 +``` + +### 5. Full Toolkit (Model Conversion) + +```bash +# Start the full container +docker compose --profile full up -d + +# Execute commands inside the container +docker compose exec llama-cpp-full bash + +# Inside container, you can use conversion tools +# Example: Convert a Hugging Face model +# python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf +``` + +## Configuration + +### Environment Variables + +Key environment variables (see [.env.example](.env.example) for all options): + +| Variable | Description | Default | +| -------------------------------- | ------------------------------------------------------------- | -------------------- | +| `LLAMA_CPP_SERVER_VARIANT` | Server image variant (server, server-cuda, server-rocm, etc.) 
| `server` | +| `LLAMA_CPP_MODEL_PATH` | Model file path inside container | `/models/model.gguf` | +| `LLAMA_CPP_CONTEXT_SIZE` | Context window size in tokens | `512` | +| `LLAMA_CPP_GPU_LAYERS` | Number of layers to offload to GPU (0=CPU only, 99=all) | `0` | +| `LLAMA_CPP_SERVER_PORT_OVERRIDE` | Server port on host | `8080` | +| `LLAMA_CPP_SERVER_MEMORY_LIMIT` | Memory limit for server | `8G` | + +### Available Profiles + +- `server`: CPU-only server +- `cuda`: NVIDIA GPU server (requires nvidia-container-toolkit) +- `rocm`: AMD GPU server (requires ROCm) +- `cli`: Command-line interface +- `full`: Full toolkit with model conversion tools +- `gpu`: Generic GPU profile (includes cuda and rocm) + +### Image Variants + +Each variant comes in multiple flavors: + +- **server**: Only `llama-server` executable (API server) +- **light**: Only `llama-cli` and `llama-completion` executables +- **full**: Complete toolkit including model conversion tools + +Backend options: + +- Base (CPU) +- `-cuda` (NVIDIA GPU) +- `-rocm` (AMD GPU) +- `-musa` (Moore Threads GPU) +- `-intel` (Intel GPU with SYCL) +- `-vulkan` (Vulkan GPU) + +## Server API + +The server provides an OpenAI-compatible API: + +- `GET /health` - Health check +- `GET /v1/models` - List available models +- `POST /v1/chat/completions` - Chat completion +- `POST /v1/completions` - Text completion +- `POST /v1/embeddings` - Generate embeddings + +See the [llama.cpp server documentation](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) for full API details. + +## Model Sources + +Download GGUF models from: + +- [Hugging Face GGUF Models](https://huggingface.co/models?library=gguf) +- [TheBloke's GGUF Collection](https://huggingface.co/TheBloke) +- Convert your own models using the full toolkit + +Popular quantization formats: + +- `Q4_K_M`: Good balance of quality and size (recommended) +- `Q5_K_M`: Higher quality, larger size +- `Q8_0`: Very high quality, large size +- `Q2_K`: Smallest size, lower quality + +## Resource Requirements + +Minimum requirements by model size: + +| Model Size | RAM (CPU) | VRAM (GPU) | Context Size | +| ---------- | --------- | ---------- | ------------ | +| 7B Q4_K_M | 6GB | 4GB | 2048 | +| 13B Q4_K_M | 10GB | 8GB | 2048 | +| 34B Q4_K_M | 24GB | 20GB | 2048 | +| 70B Q4_K_M | 48GB | 40GB | 2048 | + +Larger context sizes require proportionally more memory. + +## Performance Tuning + +For CPU inference: + +- Increase `LLAMA_CPP_SERVER_CPU_LIMIT` for more cores +- Optimize threads with `-t` flag (default: auto) + +For GPU inference: + +- Set `LLAMA_CPP_GPU_LAYERS=99` to offload all layers +- Increase context size for longer conversations +- Monitor GPU memory usage + +## Security Notes + +- The server binds to `0.0.0.0` by default - ensure proper network security +- No authentication is enabled by default +- Consider using a reverse proxy (nginx, Caddy) for production deployments +- Limit resource usage to prevent system exhaustion + +## Troubleshooting + +### Out of Memory + +- Reduce `LLAMA_CPP_CONTEXT_SIZE` +- Use a smaller quantized model (e.g., Q4 instead of Q8) +- Reduce `LLAMA_CPP_GPU_LAYERS` if using GPU + +### GPU Not Detected + +**NVIDIA**: Verify nvidia-container-toolkit is installed: + +```bash +docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi +``` + +**AMD**: Ensure ROCm drivers and `/dev/kfd`, `/dev/dri` are accessible. 
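+
+If the toolkit check above passes but you still suspect the model is running on the CPU, the server's startup log is the most direct signal: llama.cpp reports how many layers were placed on the GPU while loading the model. A rough sketch (the exact log wording differs between llama.cpp versions, so treat the grep pattern as a loose filter rather than an exact match):
+
+```bash
+# Show recent startup output from the CUDA server and keep only lines that
+# mention the GPU backend or layer offloading; adjust the service name for rocm.
+docker compose --profile cuda logs --tail=200 llama-cpp-server-cuda \
+  | grep -iE "cuda|offload|gpu"
+```
+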
+ +### Slow Inference + +- Check CPU/GPU utilization +- Increase resource limits in `.env` +- For GPU: Verify all layers are offloaded (`LLAMA_CPP_GPU_LAYERS=99`) + +## Documentation + +- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp) +- [Docker Documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md) +- [Server API Docs](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) + +## License + +llama.cpp is released under the MIT License. See the [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) file for details. diff --git a/src/llama.cpp/README.zh.md b/src/llama.cpp/README.zh.md new file mode 100644 index 0000000..baf37d0 --- /dev/null +++ b/src/llama.cpp/README.zh.md @@ -0,0 +1,244 @@ +# llama.cpp + +[English Documentation](README.md) + +[llama.cpp](https://github.com/ggml-org/llama.cpp) 是一个高性能的 C/C++ 实现的大语言模型推理引擎,支持多种硬件加速器。 + +## 功能特性 + +- **高速推理**:优化的 C/C++ 实现,提供高效的 LLM 推理 +- **多种后端**:支持 CPU、CUDA(NVIDIA)、ROCm(AMD)、MUSA(摩尔线程)、Intel GPU、Vulkan +- **OpenAI 兼容 API**:服务器模式提供 OpenAI 兼容的 REST API +- **CLI 支持**:交互式命令行界面,方便快速测试 +- **模型转换**:完整工具包包含模型转换和量化工具 +- **GGUF 格式**:支持高效的 GGUF 模型格式 +- **跨平台**:支持 Linux(x86-64、ARM64、s390x)、Windows、macOS + +## 前置要求 + +- 已安装 Docker 和 Docker Compose +- 至少 4GB 内存(推荐 8GB 以上) +- GPU 版本需要: + - **CUDA**:NVIDIA GPU 及 [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) + - **ROCm**:AMD GPU 及相应的 ROCm 驱动 + - **MUSA**:摩尔线程 GPU 及 mt-container-toolkit +- GGUF 格式的模型文件(例如从 [Hugging Face](https://huggingface.co/models?library=gguf) 下载) + +## 快速开始 + +### 1. 服务器模式(CPU) + +```bash +# 复制并配置环境变量 +cp .env.example .env + +# 编辑 .env 并设置模型路径 +# LLAMA_CPP_MODEL_PATH=/models/your-model.gguf + +# 将 GGUF 模型放在目录中,然后更新 docker-compose.yaml 挂载,例如: +# volumes: +# - ./models:/models + +# 启动服务器 +docker compose --profile server up -d + +# 测试服务器(OpenAI 兼容 API) +curl http://localhost:8080/v1/models + +# 聊天补全请求 +curl http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "user", "content": "你好!"} + ] + }' +``` + +### 2. 服务器模式(NVIDIA GPU) + +```bash +# 编辑 .env +# 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU + +# 启动 GPU 加速服务器 +docker compose --profile cuda up -d + +# 服务器将自动使用 NVIDIA GPU +``` + +### 3. 服务器模式(AMD GPU) + +```bash +# 编辑 .env +# 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU + +# 启动 GPU 加速服务器 +docker compose --profile rocm up -d + +# 服务器将自动使用 AMD GPU +``` + +### 4. CLI 模式 + +```bash +# 编辑 .env 并配置模型路径和提示词 + +# 运行 CLI +docker compose --profile cli up + +# 交互模式: +docker compose run --rm llama-cpp-cli \ + -m /models/your-model.gguf \ + -p "你的提示词" \ + -n 512 +``` + +### 5. 
完整工具包(模型转换) + +```bash +# 启动完整容器 +docker compose --profile full up -d + +# 在容器内执行命令 +docker compose exec llama-cpp-full bash + +# 在容器内可以使用转换工具 +# 示例:转换 Hugging Face 模型 +# python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf +``` + +## 配置说明 + +### 环境变量 + +主要环境变量(完整选项请查看 [.env.example](.env.example)): + +| 变量 | 说明 | 默认值 | +| -------------------------------- | ----------------------------------------------------- | -------------------- | +| `LLAMA_CPP_SERVER_VARIANT` | 服务器镜像变体(server、server-cuda、server-rocm 等) | `server` | +| `LLAMA_CPP_MODEL_PATH` | 容器内模型文件路径 | `/models/model.gguf` | +| `LLAMA_CPP_CONTEXT_SIZE` | 上下文窗口大小(token 数) | `512` | +| `LLAMA_CPP_GPU_LAYERS` | 卸载到 GPU 的层数(0=仅 CPU,99=全部) | `0` | +| `LLAMA_CPP_SERVER_PORT_OVERRIDE` | 主机端口 | `8080` | +| `LLAMA_CPP_SERVER_MEMORY_LIMIT` | 服务器内存限制 | `8G` | + +### 可用配置文件 + +- `server`:仅 CPU 服务器 +- `cuda`:NVIDIA GPU 服务器(需要 nvidia-container-toolkit) +- `rocm`:AMD GPU 服务器(需要 ROCm) +- `cli`:命令行界面 +- `full`:包含模型转换工具的完整工具包 +- `gpu`:通用 GPU 配置(包括 cuda 和 rocm) + +### 镜像变体 + +每个变体都有多种类型: + +- **server**:仅包含 `llama-server` 可执行文件(API 服务器) +- **light**:仅包含 `llama-cli` 和 `llama-completion` 可执行文件 +- **full**:完整工具包,包括模型转换工具 + +后端选项: + +- 基础版(CPU) +- `-cuda`(NVIDIA GPU) +- `-rocm`(AMD GPU) +- `-musa`(摩尔线程 GPU) +- `-intel`(Intel GPU,支持 SYCL) +- `-vulkan`(Vulkan GPU) + +## 服务器 API + +服务器提供 OpenAI 兼容的 API: + +- `GET /health` - 健康检查 +- `GET /v1/models` - 列出可用模型 +- `POST /v1/chat/completions` - 聊天补全 +- `POST /v1/completions` - 文本补全 +- `POST /v1/embeddings` - 生成嵌入向量 + +完整 API 详情请参阅 [llama.cpp 服务器文档](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md)。 + +## 模型来源 + +下载 GGUF 模型: + +- [Hugging Face GGUF 模型](https://huggingface.co/models?library=gguf) +- [TheBloke 的 GGUF 合集](https://huggingface.co/TheBloke) +- 使用完整工具包转换您自己的模型 + +常用量化格式: + +- `Q4_K_M`:质量和大小的良好平衡(推荐) +- `Q5_K_M`:更高质量,更大体积 +- `Q8_0`:非常高的质量,大体积 +- `Q2_K`:最小体积,较低质量 + +## 资源需求 + +按模型大小的最低要求: + +| 模型大小 | 内存(CPU) | 显存(GPU) | 上下文大小 | +| ---------- | ----------- | ----------- | ---------- | +| 7B Q4_K_M | 6GB | 4GB | 2048 | +| 13B Q4_K_M | 10GB | 8GB | 2048 | +| 34B Q4_K_M | 24GB | 20GB | 2048 | +| 70B Q4_K_M | 48GB | 40GB | 2048 | + +更大的上下文大小需要成比例的更多内存。 + +## 性能调优 + +CPU 推理: + +- 增加 `LLAMA_CPP_SERVER_CPU_LIMIT` 以使用更多核心 +- 使用 `-t` 参数优化线程数(默认:自动) + +GPU 推理: + +- 设置 `LLAMA_CPP_GPU_LAYERS=99` 卸载所有层 +- 增加上下文大小以支持更长对话 +- 监控 GPU 内存使用 + +## 安全注意事项 + +- 服务器默认绑定到 `0.0.0.0` - 请确保网络安全 +- 默认未启用身份验证 +- 生产环境建议使用反向代理(nginx、Caddy) +- 限制资源使用以防止系统资源耗尽 + +## 故障排除 + +### 内存不足 + +- 减小 `LLAMA_CPP_CONTEXT_SIZE` +- 使用更小的量化模型(例如 Q4 而不是 Q8) +- 减少 `LLAMA_CPP_GPU_LAYERS`(如果使用 GPU) + +### GPU 未检测到 + +**NVIDIA**:验证 nvidia-container-toolkit 是否已安装: + +```bash +docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi +``` + +**AMD**:确保 ROCm 驱动已安装且 `/dev/kfd`、`/dev/dri` 可访问。 + +### 推理速度慢 + +- 检查 CPU/GPU 利用率 +- 增加 `.env` 中的资源限制 +- GPU:验证所有层都已卸载(`LLAMA_CPP_GPU_LAYERS=99`) + +## 文档 + +- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp) +- [Docker 文档](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md) +- [服务器 API 文档](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) + +## 许可证 + +llama.cpp 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) 文件。 diff --git a/src/llama.cpp/docker-compose.yaml b/src/llama.cpp/docker-compose.yaml new file mode 100644 index 0000000..564d6b1 --- /dev/null +++ b/src/llama.cpp/docker-compose.yaml @@ -0,0 +1,210 @@ +# Docker Compose configuration for llama.cpp +# 
https://github.com/ggml-org/llama.cpp +# LLM inference in C/C++ with support for various hardware accelerators + +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + # llama.cpp server - OpenAI-compatible API server + # Variant: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU) + llama-cpp-server: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_SERVER_VARIANT:-server} + ports: + - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080" + volumes: + - llama_cpp_models:/models + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "--port" + - "8080" + - "--host" + - "0.0.0.0" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + - "--n-gpu-layers" + - "${LLAMA_CPP_GPU_LAYERS:-0}" + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + [ + "CMD", + "wget", + "--quiet", + "--tries=1", + "--spider", + "http://localhost:8080/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G} + profiles: + - server + + # llama.cpp server with NVIDIA GPU support + llama-cpp-server-cuda: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-cuda + ports: + - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080" + volumes: + - llama_cpp_models:/models + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "--port" + - "8080" + - "--host" + - "0.0.0.0" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + - "--n-gpu-layers" + - "${LLAMA_CPP_GPU_LAYERS:-99}" + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + [ + "CMD", + "wget", + "--quiet", + "--tries=1", + "--spider", + "http://localhost:8080/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G} + devices: + - driver: nvidia + count: ${LLAMA_CPP_GPU_COUNT:-1} + capabilities: [gpu] + profiles: + - gpu + - cuda + + # llama.cpp server with AMD ROCm GPU support + llama-cpp-server-rocm: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-rocm + ports: + - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080" + volumes: + - llama_cpp_models:/models + devices: + - /dev/kfd + - /dev/dri + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "--port" + - "8080" + - "--host" + - "0.0.0.0" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + - "--n-gpu-layers" + - "${LLAMA_CPP_GPU_LAYERS:-99}" + environment: + - TZ=${TZ:-UTC} + healthcheck: + test: + [ + "CMD", + "wget", + "--quiet", + "--tries=1", + "--spider", + "http://localhost:8080/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G} + reservations: + cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0} + memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G} + profiles: + - gpu + - rocm + + # llama.cpp CLI (light) - Interactive command-line interface + llama-cpp-cli: + <<: *defaults + image: 
${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_CLI_VARIANT:-light} + volumes: + - llama_cpp_models:/models + entrypoint: /app/llama-cli + command: + - "-m" + - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}" + - "-p" + - "${LLAMA_CPP_PROMPT:-Hello, how are you?}" + - "-n" + - "${LLAMA_CPP_CONTEXT_SIZE:-512}" + environment: + - TZ=${TZ:-UTC} + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_CLI_CPU_LIMIT:-2.0} + memory: ${LLAMA_CPP_CLI_MEMORY_LIMIT:-4G} + reservations: + cpus: ${LLAMA_CPP_CLI_CPU_RESERVATION:-1.0} + memory: ${LLAMA_CPP_CLI_MEMORY_RESERVATION:-2G} + profiles: + - cli + + # llama.cpp full - Complete toolkit including model conversion tools + llama-cpp-full: + <<: *defaults + image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_FULL_VARIANT:-full} + volumes: + - llama_cpp_models:/models + command: ["sleep", "infinity"] + environment: + - TZ=${TZ:-UTC} + deploy: + resources: + limits: + cpus: ${LLAMA_CPP_FULL_CPU_LIMIT:-2.0} + memory: ${LLAMA_CPP_FULL_MEMORY_LIMIT:-4G} + reservations: + cpus: ${LLAMA_CPP_FULL_CPU_RESERVATION:-1.0} + memory: ${LLAMA_CPP_FULL_MEMORY_RESERVATION:-2G} + profiles: + - full + +volumes: + llama_cpp_models:
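+
+# NOTE: models live in the named volume above by default. As mentioned in the
+# README's Quick Start, an alternative is to bind-mount a local directory with
+# your GGUF files instead, e.g. replace "llama_cpp_models:/models" in a
+# service's volumes list with a host path:
+#
+#   volumes:
+#     - ./models:/models
+#
+# ("./models" is only an example path; point it at wherever your models are stored.)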