feat: Add Chinese documentation and Docker Compose configurations for DeepTutor and llama.cpp
- Created README.md and README.zh.md for DeepTutor with comprehensive features, installation steps, and usage instructions.
- Added docker-compose.yaml for DeepTutor to define services, environment variables, and resource limits.
- Introduced .env.example for llama.cpp with configuration options for server settings and resource management.
- Added README.md and README.zh.md for llama.cpp detailing features, prerequisites, quick start guides, and API documentation.
- Implemented docker-compose.yaml for llama.cpp to support various server configurations (CPU, CUDA, ROCm) and CLI usage.
@@ -34,6 +34,7 @@ These services require building custom Docker images from source.
 | [Clash](./src/clash) | 1.18.0 |
 | [ClickHouse](./src/clickhouse) | 24.11.1 |
 | [Conductor](./src/conductor) | latest |
+| [DeepTutor](./apps/deeptutor) | latest |
 | [Dify](./apps/dify) | 0.18.2 |
 | [DNSMasq](./src/dnsmasq) | 2.91 |
 | [Dockge](./src/dockge) | 1 |
@@ -72,6 +73,7 @@ These services require building custom Docker images from source.
 | [LibreOffice](./src/libreoffice) | latest |
 | [libSQL Server](./src/libsql) | latest |
 | [LiteLLM](./src/litellm) | main-stable |
+| [llama.cpp](./src/llama.cpp) | server |
 | [LMDeploy](./src/lmdeploy) | v0.11.1 |
 | [Logstash](./src/logstash) | 8.16.1 |
 | [MariaDB Galera Cluster](./src/mariadb-galera) | 11.7.2 |
@@ -34,6 +34,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件,
 | [Clash](./src/clash) | 1.18.0 |
 | [ClickHouse](./src/clickhouse) | 24.11.1 |
 | [Conductor](./src/conductor) | latest |
+| [DeepTutor](./apps/deeptutor) | latest |
 | [Dify](./apps/dify) | 0.18.2 |
 | [DNSMasq](./src/dnsmasq) | 2.91 |
 | [Dockge](./src/dockge) | 1 |
@@ -72,6 +73,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件,
 | [LibreOffice](./src/libreoffice) | latest |
 | [libSQL Server](./src/libsql) | latest |
 | [LiteLLM](./src/litellm) | main-stable |
+| [llama.cpp](./src/llama.cpp) | server |
 | [LMDeploy](./src/lmdeploy) | v0.11.1 |
 | [Logstash](./src/logstash) | 8.16.1 |
 | [MariaDB Galera Cluster](./src/mariadb-galera) | 11.7.2 |
apps/deeptutor/.env.example (new file, 97 lines)
@@ -0,0 +1,97 @@

# DeepTutor Configuration
# Copy this file to .env and fill in your API keys

#! ==================================================
#! General Settings
#! ==================================================

# Timezone (default: UTC)
TZ=UTC

# User and Group ID for file permissions (default: 1000)
# Adjust if your host user has a different UID/GID
PUID=1000
PGID=1000

# Global registry prefix (optional)
# Example: registry.example.com/ or leave empty for Docker Hub/GHCR
GLOBAL_REGISTRY=

#! ==================================================
#! DeepTutor Version
#! ==================================================

# Image version (default: latest)
# Available tags: latest, v0.5.x
# See: https://github.com/HKUDS/DeepTutor/pkgs/container/deeptutor
DEEPTUTOR_VERSION=latest

#! ==================================================
#! Port Configuration
#! ==================================================

# Backend port (internal: 8001)
BACKEND_PORT=8001
# Host port override for backend
DEEPTUTOR_BACKEND_PORT_OVERRIDE=8001

# Frontend port (internal: 3782)
FRONTEND_PORT=3782
# Host port override for frontend
DEEPTUTOR_FRONTEND_PORT_OVERRIDE=3782

#! ==================================================
#! API Base URLs
#! ==================================================

# Internal API base URL (used by frontend to communicate with backend)
NEXT_PUBLIC_API_BASE=http://localhost:8001

# External API base URL (for cloud deployment, set to your public URL)
# Example: https://your-server.com:8001
# For local deployment, use the same as NEXT_PUBLIC_API_BASE
NEXT_PUBLIC_API_BASE_EXTERNAL=http://localhost:8001

#! ==================================================
#! LLM API Keys (Required)
#! ==================================================

# OpenAI API Key (Required)
# Get from: https://platform.openai.com/api-keys
OPENAI_API_KEY=sk-your-openai-api-key-here

# OpenAI Base URL (default: https://api.openai.com/v1)
# For OpenAI-compatible APIs (e.g., Azure OpenAI, custom endpoints)
OPENAI_BASE_URL=https://api.openai.com/v1

# Default LLM Model (default: gpt-4o)
# Options: gpt-4o, gpt-4-turbo, gpt-4, gpt-3.5-turbo, etc.
DEFAULT_MODEL=gpt-4o

#! ==================================================
#! Additional LLM API Keys (Optional)
#! ==================================================

# Anthropic API Key (Optional, for Claude models)
# Get from: https://console.anthropic.com/
ANTHROPIC_API_KEY=

# Perplexity API Key (Optional, for web search)
# Get from: https://www.perplexity.ai/settings/api
PERPLEXITY_API_KEY=

# DashScope API Key (Optional, for Alibaba Cloud models)
# Get from: https://dashscope.console.aliyun.com/
DASHSCOPE_API_KEY=

#! ==================================================
#! Resource Limits
#! ==================================================

# CPU limits (default: 4.00 cores limit, 1.00 cores reservation)
DEEPTUTOR_CPU_LIMIT=4.00
DEEPTUTOR_CPU_RESERVATION=1.00

# Memory limits (default: 8G limit, 2G reservation)
DEEPTUTOR_MEMORY_LIMIT=8G
DEEPTUTOR_MEMORY_RESERVATION=2G

apps/deeptutor/README.md (new file, 248 lines)
@@ -0,0 +1,248 @@

# DeepTutor

[中文说明](README.zh.md) | English

## Overview

DeepTutor is an AI-powered personalized learning assistant that transforms any document into an interactive learning experience with multi-agent intelligence. It helps you solve problems, generate questions, conduct research, collaborate on writing, organize notes, and guides you through learning paths.

**Project:** <https://github.com/HKUDS/DeepTutor>
**License:** Apache-2.0
**Documentation:** <https://hkuds.github.io/DeepTutor/>

## Features

- **Problem Solving** — Detailed step-by-step solutions with visual diagrams
- **Question Generation** — Adaptive questions based on your knowledge level
- **Research Assistant** — Deep research with multi-agent collaboration
- **Co-Writer** — Interactive idea generation and writing assistance
- **Smart Notebook** — Organize and retrieve learning materials efficiently
- **Guided Learning** — Personalized learning paths and progress tracking
- **Multi-Agent System** — Specialized agents for different learning tasks
- **RAG Integration** — LightRAG and RAG-Anything for knowledge retrieval
- **Code Execution** — Built-in code playground for practice

## Quick Start

### Prerequisites

- Docker and Docker Compose
- OpenAI API key (required)
- Optional: Anthropic, Perplexity, or DashScope API keys

### Installation

1. **Clone this repository**

```bash
git clone <your-compose-anything-repo>
cd apps/deeptutor
```

2. **Configure environment**

```bash
cp .env.example .env
# Edit .env and add your API keys
```

**Required configuration:**
- `OPENAI_API_KEY` — Your OpenAI API key

**Optional configuration:**
- `ANTHROPIC_API_KEY` — For Claude models
- `PERPLEXITY_API_KEY` — For web search
- `DASHSCOPE_API_KEY` — For Alibaba Cloud models
- Adjust ports if needed (default: 8001 for backend, 3782 for frontend)
- Set `NEXT_PUBLIC_API_BASE_EXTERNAL` for cloud deployments

3. **Optional: Custom agent configuration**

Create a `config/agents.yaml` file to customize agent behaviors (see [documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for details).

4. **Start the service**

```bash
docker compose up -d
```

First run takes approximately 30-60 seconds to initialize.

5. **Access the application**

- **Frontend:** <http://localhost:3782>
- **Backend API:** <http://localhost:8001>
- **API Documentation:** <http://localhost:8001/docs>
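
As a quick sanity check, the services can be probed from the host before opening the UI (a minimal sketch; the `/health` endpoint is the same one used by the compose healthcheck, and `curl` is assumed to be available on the host):

```bash
# Confirm backend and frontend are reachable.
curl -f http://localhost:8001/health   # backend health endpoint (used by the compose healthcheck)
curl -I http://localhost:3782          # frontend should answer with an HTTP status line
```
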
## Usage

### Create Knowledge Base

1. Navigate to <http://localhost:3782/knowledge>
2. Click "New Knowledge Base"
3. Upload documents (supports PDF, DOCX, TXT, Markdown, HTML, etc.)
4. Wait for processing to complete

### Learning Modes

- **Solve** — Get step-by-step solutions to problems
- **Question** — Generate practice questions based on your materials
- **Research** — Deep research with multi-agent collaboration
- **Co-Writer** — Interactive writing and idea generation
- **Notebook** — Organize and manage your learning materials
- **Guide** — Follow personalized learning paths

### Advanced Features

- **Code Execution** — Practice coding directly in the interface
- **Visual Diagrams** — Automatic diagram generation for complex concepts
- **Export** — Download your work as PDF or Markdown
- **Multi-language** — Support for multiple languages

## Configuration

### Environment Variables

Key environment variables (see [.env.example](.env.example) for all options):

| Variable                  | Default    | Description                |
| ------------------------- | ---------- | -------------------------- |
| `OPENAI_API_KEY`          | (required) | Your OpenAI API key        |
| `DEFAULT_MODEL`           | `gpt-4o`   | Default LLM model          |
| `BACKEND_PORT`            | `8001`     | Backend server port        |
| `FRONTEND_PORT`           | `3782`     | Frontend application port  |
| `DEEPTUTOR_CPU_LIMIT`     | `4.00`     | CPU limit (cores)          |
| `DEEPTUTOR_MEMORY_LIMIT`  | `8G`       | Memory limit               |

### Ports

- **8001** — Backend API server
- **3782** — Frontend web interface

### Volumes

- `deeptutor_data` — User data, knowledge bases, and learning materials
- `./config` — Custom agent configurations (optional)
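
Since user data lives in the `deeptutor_data` named volume, a simple backup can be taken with a throwaway container (a sketch; Compose usually prefixes the volume name with the project directory, so check `docker volume ls` for the exact name):

```bash
# Archive the contents of the data volume into the current directory.
docker run --rm \
  -v deeptutor_data:/data:ro \
  -v "$(pwd)":/backup \
  alpine tar czf /backup/deeptutor_data-backup.tgz -C /data .
```
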
## Resource Requirements

**Minimum:**

- CPU: 1 core
- Memory: 2GB
- Disk: 2GB + space for knowledge bases

**Recommended:**

- CPU: 4 cores
- Memory: 8GB
- Disk: 10GB+

## Supported Models

DeepTutor supports multiple LLM providers:

- **OpenAI** — GPT-4, GPT-4 Turbo, GPT-3.5 Turbo
- **Anthropic** — Claude 3 (Opus, Sonnet, Haiku)
- **Perplexity** — For web search integration
- **DashScope** — Alibaba Cloud models
- **OpenAI-compatible APIs** — Any API compatible with OpenAI format

## Troubleshooting

### Backend fails to start

- Verify `OPENAI_API_KEY` is set correctly in `.env`
- Check logs: `docker compose logs -f`
- Ensure ports 8001 and 3782 are not in use
- Verify sufficient disk space for volumes

### Frontend cannot connect to backend

- Confirm backend is running: visit <http://localhost:8001/docs>
- For cloud deployments, set `NEXT_PUBLIC_API_BASE_EXTERNAL` to your public URL
- Check firewall settings

### Knowledge base processing fails

- Ensure sufficient memory (recommended 8GB+)
- Check document format is supported
- Review logs for specific errors

### API rate limits

- Monitor your API usage on provider dashboards
- Consider upgrading your API plan
- Use different models for different tasks

## Security Notes

- **API Keys** — Keep your API keys secure, never commit them to version control
- **Network Exposure** — For production deployments, use HTTPS and proper authentication
- **Data Privacy** — User data is stored in Docker volumes; ensure proper backup and security
- **Resource Limits** — Set appropriate CPU and memory limits to prevent resource exhaustion

## Updates

To update to the latest version:

```bash
# Pull the latest image
docker compose pull

# Recreate containers
docker compose up -d
```

To update to a specific version, edit `DEEPTUTOR_VERSION` in `.env` and run:

```bash
docker compose up -d
```

## Advanced Usage

### Custom Agent Configuration

Create `config/agents.yaml` to customize agent behaviors:

```yaml
agents:
  solver:
    model: gpt-4o
    temperature: 0.7
  researcher:
    model: gpt-4-turbo
    max_tokens: 4000
```

See [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html) for detailed configuration options.

### Cloud Deployment

For cloud deployment, additional configuration is needed:

1. Set public URL in `.env`:

```env
NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001
```

2. Configure reverse proxy (nginx/Caddy) for HTTPS
3. Ensure proper firewall rules
4. Consider using environment-specific secrets management
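
A minimal sketch of the reverse-proxy step, assuming Caddy on the same host and placeholder domain names (adjust to your own setup; nginx works equally well):

```bash
# Write a minimal Caddyfile that terminates TLS and proxies to DeepTutor.
# "your-domain.com" / "api.your-domain.com" are placeholders; Caddy obtains certificates automatically.
cat > Caddyfile <<'EOF'
your-domain.com {
    reverse_proxy localhost:3782
}

api.your-domain.com {
    reverse_proxy localhost:8001
}
EOF
caddy run --config Caddyfile
```
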
### Using Different Embedding Models

DeepTutor uses `text-embedding-3-large` by default. To use different embedding models, refer to the [official documentation](https://hkuds.github.io/DeepTutor/guide/config.html).

## Links

- **GitHub:** <https://github.com/HKUDS/DeepTutor>
- **Documentation:** <https://hkuds.github.io/DeepTutor/>
- **Issues:** <https://github.com/HKUDS/DeepTutor/issues>
- **Discussions:** <https://github.com/HKUDS/DeepTutor/discussions>

## License

DeepTutor is licensed under the Apache-2.0 License. See the [official repository](https://github.com/HKUDS/DeepTutor) for details.

apps/deeptutor/README.zh.md (new file, 248 lines)
@@ -0,0 +1,248 @@

# DeepTutor

中文说明 | [English](README.md)

## 概述

DeepTutor 是一个 AI 驱动的个性化学习助手,通过多智能体系统将任何文档转化为交互式学习体验。它可以帮助您解决问题、生成题目、进行研究、协作写作、整理笔记,并引导您完成学习路径。

**项目地址:** <https://github.com/HKUDS/DeepTutor>
**许可证:** Apache-2.0
**文档:** <https://hkuds.github.io/DeepTutor/>

## 功能特性

- **问题求解** — 提供详细的分步解决方案和可视化图表
- **题目生成** — 根据您的知识水平生成自适应题目
- **研究助手** — 通过多智能体协作进行深度研究
- **协作写作** — 交互式创意生成和写作辅助
- **智能笔记** — 高效组织和检索学习材料
- **引导学习** — 个性化学习路径和进度跟踪
- **多智能体系统** — 针对不同学习任务的专业智能体
- **RAG 集成** — 使用 LightRAG 和 RAG-Anything 进行知识检索
- **代码执行** — 内置代码练习环境

## 快速开始

### 前置要求

- Docker 和 Docker Compose
- OpenAI API 密钥(必需)
- 可选:Anthropic、Perplexity 或 DashScope API 密钥

### 安装步骤

1. **克隆仓库**

```bash
git clone <your-compose-anything-repo>
cd apps/deeptutor
```

2. **配置环境变量**

```bash
cp .env.example .env
# 编辑 .env 文件并添加您的 API 密钥
```

**必需配置:**
- `OPENAI_API_KEY` — 您的 OpenAI API 密钥

**可选配置:**
- `ANTHROPIC_API_KEY` — 用于 Claude 模型
- `PERPLEXITY_API_KEY` — 用于网络搜索
- `DASHSCOPE_API_KEY` — 用于阿里云模型
- 如需调整端口(默认:后端 8001,前端 3782)
- 云端部署时设置 `NEXT_PUBLIC_API_BASE_EXTERNAL`

3. **可选:自定义智能体配置**

创建 `config/agents.yaml` 文件以自定义智能体行为(详见[文档](https://hkuds.github.io/DeepTutor/guide/config.html))。

4. **启动服务**

```bash
docker compose up -d
```

首次运行需要约 30-60 秒初始化。

5. **访问应用**

- **前端界面:** <http://localhost:3782>
- **后端 API:** <http://localhost:8001>
- **API 文档:** <http://localhost:8001/docs>
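
作为补充示例(假设主机上可以使用 `curl`;`/health` 端点与 docker-compose.yaml 中的 healthcheck 一致),可以这样快速确认服务是否就绪:

```bash
# 确认后端与前端均可访问
curl -f http://localhost:8001/health   # 后端健康检查端点
curl -I http://localhost:3782          # 前端应返回 HTTP 状态行
```
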
## 使用方法

### 创建知识库

1. 访问 <http://localhost:3782/knowledge>
2. 点击"新建知识库"
3. 上传文档(支持 PDF、DOCX、TXT、Markdown、HTML 等)
4. 等待处理完成

### 学习模式

- **求解(Solve)** — 获取问题的分步解决方案
- **题目(Question)** — 基于学习材料生成练习题
- **研究(Research)** — 通过多智能体协作进行深度研究
- **协作写作(Co-Writer)** — 交互式写作和创意生成
- **笔记(Notebook)** — 组织和管理学习材料
- **引导(Guide)** — 遵循个性化学习路径

### 高级功能

- **代码执行** — 在界面中直接练习编码
- **可视化图表** — 为复杂概念自动生成图表
- **导出** — 将您的工作下载为 PDF 或 Markdown
- **多语言支持** — 支持多种语言

## 配置说明

### 环境变量

主要环境变量(所有选项见 [.env.example](.env.example)):

| 变量                      | 默认值   | 描述                 |
| ------------------------- | -------- | -------------------- |
| `OPENAI_API_KEY`          | (必需)   | 您的 OpenAI API 密钥 |
| `DEFAULT_MODEL`           | `gpt-4o` | 默认 LLM 模型        |
| `BACKEND_PORT`            | `8001`   | 后端服务器端口       |
| `FRONTEND_PORT`           | `3782`   | 前端应用端口         |
| `DEEPTUTOR_CPU_LIMIT`     | `4.00`   | CPU 限制(核心数)   |
| `DEEPTUTOR_MEMORY_LIMIT`  | `8G`     | 内存限制             |

### 端口说明

- **8001** — 后端 API 服务器
- **3782** — 前端 Web 界面

### 数据卷

- `deeptutor_data` — 用户数据、知识库和学习材料
- `./config` — 自定义智能体配置(可选)

## 资源要求

**最低配置:**

- CPU:1 核心
- 内存:2GB
- 磁盘:2GB + 知识库所需空间

**推荐配置:**

- CPU:4 核心
- 内存:8GB
- 磁盘:10GB+

## 支持的模型

DeepTutor 支持多个 LLM 提供商:

- **OpenAI** — GPT-4、GPT-4 Turbo、GPT-3.5 Turbo
- **Anthropic** — Claude 3(Opus、Sonnet、Haiku)
- **Perplexity** — 用于网络搜索集成
- **DashScope** — 阿里云模型
- **OpenAI 兼容 API** — 任何与 OpenAI 格式兼容的 API

## 故障排查

### 后端启动失败

- 验证 `.env` 中的 `OPENAI_API_KEY` 是否正确设置
- 查看日志:`docker compose logs -f`
- 确保端口 8001 和 3782 未被占用
- 验证数据卷有足够的磁盘空间

### 前端无法连接后端

- 确认后端正在运行:访问 <http://localhost:8001/docs>
- 云端部署时,将 `NEXT_PUBLIC_API_BASE_EXTERNAL` 设置为您的公网 URL
- 检查防火墙设置

### 知识库处理失败

- 确保有足够的内存(推荐 8GB+)
- 检查文档格式是否支持
- 查看日志了解具体错误

### API 速率限制

- 在提供商控制台监控 API 使用情况
- 考虑升级 API 计划
- 为不同任务使用不同模型

## 安全提示

- **API 密钥** — 妥善保管您的 API 密钥,切勿提交到版本控制系统
- **网络暴露** — 生产环境部署时,使用 HTTPS 和适当的身份验证
- **数据隐私** — 用户数据存储在 Docker 卷中,请确保适当的备份和安全措施
- **资源限制** — 设置合适的 CPU 和内存限制以防止资源耗尽

## 更新

更新到最新版本:

```bash
# 拉取最新镜像
docker compose pull

# 重新创建容器
docker compose up -d
```

更新到特定版本,编辑 `.env` 中的 `DEEPTUTOR_VERSION` 并运行:

```bash
docker compose up -d
```

## 高级用法

### 自定义智能体配置

创建 `config/agents.yaml` 以自定义智能体行为:

```yaml
agents:
  solver:
    model: gpt-4o
    temperature: 0.7
  researcher:
    model: gpt-4-turbo
    max_tokens: 4000
```

详细配置选项请参见[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。

### 云端部署

云端部署需要额外配置:

1. 在 `.env` 中设置公网 URL:

```env
NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-domain.com:8001
```

2. 配置反向代理(nginx/Caddy)以支持 HTTPS
3. 确保适当的防火墙规则
4. 考虑使用特定环境的密钥管理

### 使用不同的嵌入模型

DeepTutor 默认使用 `text-embedding-3-large`。要使用不同的嵌入模型,请参考[官方文档](https://hkuds.github.io/DeepTutor/guide/config.html)。

## 相关链接

- **GitHub:** <https://github.com/HKUDS/DeepTutor>
- **文档:** <https://hkuds.github.io/DeepTutor/>
- **问题反馈:** <https://github.com/HKUDS/DeepTutor/issues>
- **讨论区:** <https://github.com/HKUDS/DeepTutor/discussions>

## 许可证

DeepTutor 使用 Apache-2.0 许可证。详情请参见[官方仓库](https://github.com/HKUDS/DeepTutor)。

apps/deeptutor/docker-compose.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@

# DeepTutor: AI-Powered Personalized Learning Assistant
# https://github.com/HKUDS/DeepTutor
# Transform any document into an interactive learning experience with multi-agent intelligence

x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: 100m
      max-file: "3"

services:
  deeptutor:
    <<: *defaults
    image: ${GLOBAL_REGISTRY:-ghcr.io}/hkuds/deeptutor:${DEEPTUTOR_VERSION:-latest}
    ports:
      - "${DEEPTUTOR_BACKEND_PORT_OVERRIDE:-8001}:${BACKEND_PORT:-8001}"
      - "${DEEPTUTOR_FRONTEND_PORT_OVERRIDE:-3782}:${FRONTEND_PORT:-3782}"
    volumes:
      - deeptutor_data:/app/data
      - ./config:/app/config:ro
    environment:
      - TZ=${TZ:-UTC}
      # Backend port
      - BACKEND_PORT=${BACKEND_PORT:-8001}
      # Frontend port
      - FRONTEND_PORT=${FRONTEND_PORT:-3782}
      # API base URLs
      - NEXT_PUBLIC_API_BASE=${NEXT_PUBLIC_API_BASE:-http://localhost:8001}
      - NEXT_PUBLIC_API_BASE_EXTERNAL=${NEXT_PUBLIC_API_BASE_EXTERNAL:-http://localhost:8001}
      # LLM API Keys
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - OPENAI_BASE_URL=${OPENAI_BASE_URL:-https://api.openai.com/v1}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - PERPLEXITY_API_KEY=${PERPLEXITY_API_KEY:-}
      - DASHSCOPE_API_KEY=${DASHSCOPE_API_KEY:-}
      # Default LLM model
      - DEFAULT_MODEL=${DEFAULT_MODEL:-gpt-4o}
      # User ID and Group ID for permission management
      - PUID=${PUID:-1000}
      - PGID=${PGID:-1000}
    healthcheck:
      # CMD-SHELL is required for the "|| exit 1" fallback; exec-form CMD would
      # pass "||" to curl as a literal argument instead of running a shell.
      test: ["CMD-SHELL", "curl -f http://localhost:${BACKEND_PORT:-8001}/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          cpus: ${DEEPTUTOR_CPU_LIMIT:-4.00}
          memory: ${DEEPTUTOR_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${DEEPTUTOR_CPU_RESERVATION:-1.00}
          memory: ${DEEPTUTOR_MEMORY_RESERVATION:-2G}

volumes:
  deeptutor_data:
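
Before the first start it can be worth confirming that the `.env` values interpolate as intended (a small, optional check using standard Compose commands):

```bash
# Render the effective configuration with variables substituted, then start the stack.
docker compose config
docker compose up -d
```
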
src/llama.cpp/.env.example (new file, 106 lines)
@@ -0,0 +1,106 @@

# =============================================================================
# llama.cpp Configuration
# https://github.com/ggml-org/llama.cpp
# LLM inference in C/C++ with support for various hardware accelerators
# =============================================================================

# -----------------------------------------------------------------------------
# General Settings
# -----------------------------------------------------------------------------

# Timezone for the container (default: UTC)
TZ=UTC

# Global registry prefix (optional)
# Example: docker.io/, ghcr.io/, registry.example.com/
GHCR_REGISTRY=ghcr.io/

# -----------------------------------------------------------------------------
# Server Configuration
# -----------------------------------------------------------------------------

# Server image variant
# Options: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU),
#          server-musa (Moore Threads GPU), server-intel (Intel GPU),
#          server-vulkan (Vulkan GPU)
LLAMA_CPP_SERVER_VARIANT=server

# Server port override (default: 8080)
LLAMA_CPP_SERVER_PORT_OVERRIDE=8080

# Model path inside the container
# You need to mount your model file to this path
# Example: /models/llama-2-7b-chat.Q4_K_M.gguf
LLAMA_CPP_MODEL_PATH=/models/model.gguf

# Context size (number of tokens)
# Larger values allow for more context but require more memory
# Default: 512, Common values: 512, 2048, 4096, 8192, 16384, 32768
LLAMA_CPP_CONTEXT_SIZE=512

# Number of GPU layers to offload
# 0 = CPU only, 99 = all layers on GPU (for GPU variants)
# For CPU variant, keep this at 0
LLAMA_CPP_GPU_LAYERS=0

# Number of GPUs to use (for CUDA variant)
LLAMA_CPP_GPU_COUNT=1

# Server CPU limit (in cores)
LLAMA_CPP_SERVER_CPU_LIMIT=4.0

# Server CPU reservation (in cores)
LLAMA_CPP_SERVER_CPU_RESERVATION=2.0

# Server memory limit
LLAMA_CPP_SERVER_MEMORY_LIMIT=8G

# Server memory reservation
LLAMA_CPP_SERVER_MEMORY_RESERVATION=4G

# -----------------------------------------------------------------------------
# CLI Configuration (Light variant)
# -----------------------------------------------------------------------------

# CLI image variant
# Options: light (CPU), light-cuda (NVIDIA GPU), light-rocm (AMD GPU),
#          light-musa (Moore Threads GPU), light-intel (Intel GPU),
#          light-vulkan (Vulkan GPU)
LLAMA_CPP_CLI_VARIANT=light

# Default prompt for CLI mode
LLAMA_CPP_PROMPT=Hello, how are you?

# CLI CPU limit (in cores)
LLAMA_CPP_CLI_CPU_LIMIT=2.0

# CLI CPU reservation (in cores)
LLAMA_CPP_CLI_CPU_RESERVATION=1.0

# CLI memory limit
LLAMA_CPP_CLI_MEMORY_LIMIT=4G

# CLI memory reservation
LLAMA_CPP_CLI_MEMORY_RESERVATION=2G

# -----------------------------------------------------------------------------
# Full Toolkit Configuration
# -----------------------------------------------------------------------------

# Full image variant (includes model conversion tools)
# Options: full (CPU), full-cuda (NVIDIA GPU), full-rocm (AMD GPU),
#          full-musa (Moore Threads GPU), full-intel (Intel GPU),
#          full-vulkan (Vulkan GPU)
LLAMA_CPP_FULL_VARIANT=full

# Full CPU limit (in cores)
LLAMA_CPP_FULL_CPU_LIMIT=2.0

# Full CPU reservation (in cores)
LLAMA_CPP_FULL_CPU_RESERVATION=1.0

# Full memory limit
LLAMA_CPP_FULL_MEMORY_LIMIT=4G

# Full memory reservation
LLAMA_CPP_FULL_MEMORY_RESERVATION=2G

src/llama.cpp/README.md (new file, 245 lines)
@@ -0,0 +1,245 @@

# llama.cpp

[中文文档](README.zh.md)

[llama.cpp](https://github.com/ggml-org/llama.cpp) is a high-performance C/C++ implementation for LLM inference with support for various hardware accelerators.

## Features

- **Fast Inference**: Optimized C/C++ implementation for efficient LLM inference
- **Multiple Backends**: CPU, CUDA (NVIDIA), ROCm (AMD), MUSA (Moore Threads), Intel GPU, Vulkan
- **OpenAI-compatible API**: Server mode with OpenAI-compatible REST API
- **CLI Support**: Interactive command-line interface for quick testing
- **Model Conversion**: Full toolkit includes tools to convert and quantize models
- **GGUF Format**: Support for the efficient GGUF model format
- **Cross-platform**: Linux (x86-64, ARM64, s390x), Windows, macOS

## Prerequisites

- Docker and Docker Compose installed
- At least 4GB of RAM (8GB+ recommended)
- For GPU variants:
  - **CUDA**: NVIDIA GPU with [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
  - **ROCm**: AMD GPU with proper ROCm drivers
  - **MUSA**: Moore Threads GPU with mt-container-toolkit
- GGUF format model file (e.g., from [Hugging Face](https://huggingface.co/models?library=gguf))

## Quick Start

### 1. Server Mode (CPU)

```bash
# Copy and configure environment
cp .env.example .env

# Edit .env and set your model path
# LLAMA_CPP_MODEL_PATH=/models/your-model.gguf

# Place your GGUF model in a directory, then update docker-compose.yaml
# to mount it, e.g.:
# volumes:
#   - ./models:/models

# Start the server
docker compose --profile server up -d

# Test the server (OpenAI-compatible API)
curl http://localhost:8080/v1/models

# Chat completion request
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
```

### 2. Server Mode with NVIDIA GPU

```bash
# Edit .env
# Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU

# Start GPU-accelerated server
docker compose --profile cuda up -d

# The server will automatically use NVIDIA GPU
```

### 3. Server Mode with AMD GPU

```bash
# Edit .env
# Set LLAMA_CPP_GPU_LAYERS=99 to offload all layers to GPU

# Start GPU-accelerated server
docker compose --profile rocm up -d

# The server will automatically use AMD GPU
```

### 4. CLI Mode

```bash
# Edit .env and configure model path and prompt

# Run CLI
docker compose --profile cli up

# For interactive mode, use:
docker compose run --rm llama-cpp-cli \
  -m /models/your-model.gguf \
  -p "Your prompt here" \
  -n 512
```

### 5. Full Toolkit (Model Conversion)

```bash
# Start the full container
docker compose --profile full up -d

# Execute commands inside the container
docker compose exec llama-cpp-full bash

# Inside container, you can use conversion tools
# Example: Convert a Hugging Face model
# python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf
```

## Configuration

### Environment Variables

Key environment variables (see [.env.example](.env.example) for all options):

| Variable                         | Description                                                    | Default              |
| -------------------------------- | -------------------------------------------------------------- | -------------------- |
| `LLAMA_CPP_SERVER_VARIANT`       | Server image variant (server, server-cuda, server-rocm, etc.)  | `server`             |
| `LLAMA_CPP_MODEL_PATH`           | Model file path inside container                                | `/models/model.gguf` |
| `LLAMA_CPP_CONTEXT_SIZE`         | Context window size in tokens                                   | `512`                |
| `LLAMA_CPP_GPU_LAYERS`           | Number of layers to offload to GPU (0=CPU only, 99=all)         | `0`                  |
| `LLAMA_CPP_SERVER_PORT_OVERRIDE` | Server port on host                                             | `8080`               |
| `LLAMA_CPP_SERVER_MEMORY_LIMIT`  | Memory limit for server                                         | `8G`                 |

### Available Profiles

- `server`: CPU-only server
- `cuda`: NVIDIA GPU server (requires nvidia-container-toolkit)
- `rocm`: AMD GPU server (requires ROCm)
- `cli`: Command-line interface
- `full`: Full toolkit with model conversion tools
- `gpu`: Generic GPU profile (includes cuda and rocm)

### Image Variants

Each variant comes in multiple flavors:

- **server**: Only `llama-server` executable (API server)
- **light**: Only `llama-cli` and `llama-completion` executables
- **full**: Complete toolkit including model conversion tools

Backend options:

- Base (CPU)
- `-cuda` (NVIDIA GPU)
- `-rocm` (AMD GPU)
- `-musa` (Moore Threads GPU)
- `-intel` (Intel GPU with SYCL)
- `-vulkan` (Vulkan GPU)

## Server API

The server provides an OpenAI-compatible API:

- `GET /health` - Health check
- `GET /v1/models` - List available models
- `POST /v1/chat/completions` - Chat completion
- `POST /v1/completions` - Text completion
- `POST /v1/embeddings` - Generate embeddings
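
For example, the embeddings endpoint can be exercised with an OpenAI-style request body once the server is running (a minimal sketch; the `model` field is a placeholder since the server serves whichever model was loaded, and depending on the llama.cpp version the server may need to be started with the `--embeddings` flag for this endpoint to be active):

```bash
# Minimal embeddings request against the OpenAI-compatible endpoint.
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "input": "llama.cpp runs LLM inference in C/C++",
    "model": "default"
  }'
```
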
See the [llama.cpp server documentation](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md) for full API details.

## Model Sources

Download GGUF models from:

- [Hugging Face GGUF Models](https://huggingface.co/models?library=gguf)
- [TheBloke's GGUF Collection](https://huggingface.co/TheBloke)
- Convert your own models using the full toolkit

Popular quantization formats:

- `Q4_K_M`: Good balance of quality and size (recommended)
- `Q5_K_M`: Higher quality, larger size
- `Q8_0`: Very high quality, large size
- `Q2_K`: Smallest size, lower quality
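
As a sketch of the download step (the repository and file names below are placeholders, not a recommendation), GGUF files hosted on Hugging Face can be fetched directly into the directory you mount at `/models`:

```bash
# Hypothetical repository and file names - substitute a real GGUF model of your choice.
mkdir -p ./models
wget -O ./models/model.gguf \
  "https://huggingface.co/<org>/<repo>-GGUF/resolve/main/<model>.Q4_K_M.gguf"
```

The resulting path should match `LLAMA_CPP_MODEL_PATH` once the host directory is mounted to `/models`, as shown in the Quick Start.
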
## Resource Requirements

Minimum requirements by model size:

| Model Size | RAM (CPU) | VRAM (GPU) | Context Size |
| ---------- | --------- | ---------- | ------------ |
| 7B Q4_K_M  | 6GB       | 4GB        | 2048         |
| 13B Q4_K_M | 10GB      | 8GB        | 2048         |
| 34B Q4_K_M | 24GB      | 20GB       | 2048         |
| 70B Q4_K_M | 48GB      | 40GB       | 2048         |

Larger context sizes require proportionally more memory.

## Performance Tuning

For CPU inference:

- Increase `LLAMA_CPP_SERVER_CPU_LIMIT` for more cores
- Optimize threads with `-t` flag (default: auto)

For GPU inference:

- Set `LLAMA_CPP_GPU_LAYERS=99` to offload all layers
- Increase context size for longer conversations
- Monitor GPU memory usage
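
For instance, a thread count can be passed on an ad-hoc CLI run to compare CPU settings (a sketch reusing the `llama-cpp-cli` service from docker-compose.yaml; the model path, prompt, token count, and thread count are examples):

```bash
# Run the CLI service with an explicit thread count (-t) for a quick comparison.
docker compose run --rm llama-cpp-cli \
  -m /models/your-model.gguf \
  -p "Explain GGUF quantization in one sentence." \
  -n 128 \
  -t 8
```
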
## Security Notes

- The server binds to `0.0.0.0` by default - ensure proper network security
- No authentication is enabled by default
- Consider using a reverse proxy (nginx, Caddy) for production deployments
- Limit resource usage to prevent system exhaustion

## Troubleshooting

### Out of Memory

- Reduce `LLAMA_CPP_CONTEXT_SIZE`
- Use a smaller quantized model (e.g., Q4 instead of Q8)
- Reduce `LLAMA_CPP_GPU_LAYERS` if using GPU

### GPU Not Detected

**NVIDIA**: Verify nvidia-container-toolkit is installed:

```bash
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
```

**AMD**: Ensure ROCm drivers and `/dev/kfd`, `/dev/dri` are accessible.

### Slow Inference

- Check CPU/GPU utilization
- Increase resource limits in `.env`
- For GPU: Verify all layers are offloaded (`LLAMA_CPP_GPU_LAYERS=99`)

## Documentation

- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp)
- [Docker Documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md)
- [Server API Docs](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md)

## License

llama.cpp is released under the MIT License. See the [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) file for details.

src/llama.cpp/README.zh.md (new file, 244 lines)
@@ -0,0 +1,244 @@

# llama.cpp

[English Documentation](README.md)

[llama.cpp](https://github.com/ggml-org/llama.cpp) 是一个高性能的 C/C++ 实现的大语言模型推理引擎,支持多种硬件加速器。

## 功能特性

- **高速推理**:优化的 C/C++ 实现,提供高效的 LLM 推理
- **多种后端**:支持 CPU、CUDA(NVIDIA)、ROCm(AMD)、MUSA(摩尔线程)、Intel GPU、Vulkan
- **OpenAI 兼容 API**:服务器模式提供 OpenAI 兼容的 REST API
- **CLI 支持**:交互式命令行界面,方便快速测试
- **模型转换**:完整工具包包含模型转换和量化工具
- **GGUF 格式**:支持高效的 GGUF 模型格式
- **跨平台**:支持 Linux(x86-64、ARM64、s390x)、Windows、macOS

## 前置要求

- 已安装 Docker 和 Docker Compose
- 至少 4GB 内存(推荐 8GB 以上)
- GPU 版本需要:
  - **CUDA**:NVIDIA GPU 及 [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit)
  - **ROCm**:AMD GPU 及相应的 ROCm 驱动
  - **MUSA**:摩尔线程 GPU 及 mt-container-toolkit
- GGUF 格式的模型文件(例如从 [Hugging Face](https://huggingface.co/models?library=gguf) 下载)

## 快速开始

### 1. 服务器模式(CPU)

```bash
# 复制并配置环境变量
cp .env.example .env

# 编辑 .env 并设置模型路径
# LLAMA_CPP_MODEL_PATH=/models/your-model.gguf

# 将 GGUF 模型放在目录中,然后更新 docker-compose.yaml 挂载,例如:
# volumes:
#   - ./models:/models

# 启动服务器
docker compose --profile server up -d

# 测试服务器(OpenAI 兼容 API)
curl http://localhost:8080/v1/models

# 聊天补全请求
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "你好!"}
    ]
  }'
```

### 2. 服务器模式(NVIDIA GPU)

```bash
# 编辑 .env
# 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU

# 启动 GPU 加速服务器
docker compose --profile cuda up -d

# 服务器将自动使用 NVIDIA GPU
```

### 3. 服务器模式(AMD GPU)

```bash
# 编辑 .env
# 设置 LLAMA_CPP_GPU_LAYERS=99 将所有层卸载到 GPU

# 启动 GPU 加速服务器
docker compose --profile rocm up -d

# 服务器将自动使用 AMD GPU
```

### 4. CLI 模式

```bash
# 编辑 .env 并配置模型路径和提示词

# 运行 CLI
docker compose --profile cli up

# 交互模式:
docker compose run --rm llama-cpp-cli \
  -m /models/your-model.gguf \
  -p "你的提示词" \
  -n 512
```

### 5. 完整工具包(模型转换)

```bash
# 启动完整容器
docker compose --profile full up -d

# 在容器内执行命令
docker compose exec llama-cpp-full bash

# 在容器内可以使用转换工具
# 示例:转换 Hugging Face 模型
# python3 convert_hf_to_gguf.py /models/source-model --outfile /models/output.gguf
```

## 配置说明

### 环境变量

主要环境变量(完整选项请查看 [.env.example](.env.example)):

| 变量                             | 说明                                                   | 默认值               |
| -------------------------------- | ------------------------------------------------------ | -------------------- |
| `LLAMA_CPP_SERVER_VARIANT`       | 服务器镜像变体(server、server-cuda、server-rocm 等)  | `server`             |
| `LLAMA_CPP_MODEL_PATH`           | 容器内模型文件路径                                     | `/models/model.gguf` |
| `LLAMA_CPP_CONTEXT_SIZE`         | 上下文窗口大小(token 数)                             | `512`                |
| `LLAMA_CPP_GPU_LAYERS`           | 卸载到 GPU 的层数(0=仅 CPU,99=全部)                 | `0`                  |
| `LLAMA_CPP_SERVER_PORT_OVERRIDE` | 主机端口                                               | `8080`               |
| `LLAMA_CPP_SERVER_MEMORY_LIMIT`  | 服务器内存限制                                         | `8G`                 |

### 可用配置文件

- `server`:仅 CPU 服务器
- `cuda`:NVIDIA GPU 服务器(需要 nvidia-container-toolkit)
- `rocm`:AMD GPU 服务器(需要 ROCm)
- `cli`:命令行界面
- `full`:包含模型转换工具的完整工具包
- `gpu`:通用 GPU 配置(包括 cuda 和 rocm)

### 镜像变体

每个变体都有多种类型:

- **server**:仅包含 `llama-server` 可执行文件(API 服务器)
- **light**:仅包含 `llama-cli` 和 `llama-completion` 可执行文件
- **full**:完整工具包,包括模型转换工具

后端选项:

- 基础版(CPU)
- `-cuda`(NVIDIA GPU)
- `-rocm`(AMD GPU)
- `-musa`(摩尔线程 GPU)
- `-intel`(Intel GPU,支持 SYCL)
- `-vulkan`(Vulkan GPU)

## 服务器 API

服务器提供 OpenAI 兼容的 API:

- `GET /health` - 健康检查
- `GET /v1/models` - 列出可用模型
- `POST /v1/chat/completions` - 聊天补全
- `POST /v1/completions` - 文本补全
- `POST /v1/embeddings` - 生成嵌入向量
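
例如,服务器启动后可以这样快速测试 embeddings 端点(请求体仿照 OpenAI embeddings API,`model` 字段为占位值;部分 llama.cpp 版本需要在启动服务器时加 `--embeddings` 参数才会开启该端点):

```bash
# 最小化的 embeddings 请求示例
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"input": "你好,llama.cpp", "model": "default"}'
```
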
完整 API 详情请参阅 [llama.cpp 服务器文档](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md)。

## 模型来源

下载 GGUF 模型:

- [Hugging Face GGUF 模型](https://huggingface.co/models?library=gguf)
- [TheBloke 的 GGUF 合集](https://huggingface.co/TheBloke)
- 使用完整工具包转换您自己的模型

常用量化格式:

- `Q4_K_M`:质量和大小的良好平衡(推荐)
- `Q5_K_M`:更高质量,更大体积
- `Q8_0`:非常高的质量,大体积
- `Q2_K`:最小体积,较低质量

## 资源需求

按模型大小的最低要求:

| 模型大小   | 内存(CPU) | 显存(GPU) | 上下文大小 |
| ---------- | ----------- | ----------- | ---------- |
| 7B Q4_K_M  | 6GB         | 4GB         | 2048       |
| 13B Q4_K_M | 10GB        | 8GB         | 2048       |
| 34B Q4_K_M | 24GB        | 20GB        | 2048       |
| 70B Q4_K_M | 48GB        | 40GB        | 2048       |

更大的上下文大小需要成比例的更多内存。

## 性能调优

CPU 推理:

- 增加 `LLAMA_CPP_SERVER_CPU_LIMIT` 以使用更多核心
- 使用 `-t` 参数优化线程数(默认:自动)

GPU 推理:

- 设置 `LLAMA_CPP_GPU_LAYERS=99` 卸载所有层
- 增加上下文大小以支持更长对话
- 监控 GPU 内存使用

## 安全注意事项

- 服务器默认绑定到 `0.0.0.0` - 请确保网络安全
- 默认未启用身份验证
- 生产环境建议使用反向代理(nginx、Caddy)
- 限制资源使用以防止系统资源耗尽

## 故障排除

### 内存不足

- 减小 `LLAMA_CPP_CONTEXT_SIZE`
- 使用更小的量化模型(例如 Q4 而不是 Q8)
- 减少 `LLAMA_CPP_GPU_LAYERS`(如果使用 GPU)

### GPU 未检测到

**NVIDIA**:验证 nvidia-container-toolkit 是否已安装:

```bash
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
```

**AMD**:确保 ROCm 驱动已安装且 `/dev/kfd`、`/dev/dri` 可访问。

### 推理速度慢

- 检查 CPU/GPU 利用率
- 增加 `.env` 中的资源限制
- GPU:验证所有层都已卸载(`LLAMA_CPP_GPU_LAYERS=99`)

## 文档

- [llama.cpp GitHub](https://github.com/ggml-org/llama.cpp)
- [Docker 文档](https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md)
- [服务器 API 文档](https://github.com/ggml-org/llama.cpp/blob/master/examples/server/README.md)

## 许可证

llama.cpp 使用 MIT 许可证发布。详情请参阅 [LICENSE](https://github.com/ggml-org/llama.cpp/blob/master/LICENSE) 文件。

src/llama.cpp/docker-compose.yaml (new file, 210 lines)
@@ -0,0 +1,210 @@

# Docker Compose configuration for llama.cpp
# https://github.com/ggml-org/llama.cpp
# LLM inference in C/C++ with support for various hardware accelerators

x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: 100m
      max-file: "3"

services:
  # llama.cpp server - OpenAI-compatible API server
  # Variant: server (CPU), server-cuda (NVIDIA GPU), server-rocm (AMD GPU)
  llama-cpp-server:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_SERVER_VARIANT:-server}
    ports:
      - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - llama_cpp_models:/models
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "--port"
      - "8080"
      - "--host"
      - "0.0.0.0"
      # "-c" sets the context window size (LLAMA_CPP_CONTEXT_SIZE);
      # "-n" would set the number of tokens to predict instead.
      - "-c"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
      - "--n-gpu-layers"
      - "${LLAMA_CPP_GPU_LAYERS:-0}"
    environment:
      - TZ=${TZ:-UTC}
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G}
    profiles:
      - server

  # llama.cpp server with NVIDIA GPU support
  llama-cpp-server-cuda:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-cuda
    ports:
      - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - llama_cpp_models:/models
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "--port"
      - "8080"
      - "--host"
      - "0.0.0.0"
      - "-c"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
      - "--n-gpu-layers"
      - "${LLAMA_CPP_GPU_LAYERS:-99}"
    environment:
      - TZ=${TZ:-UTC}
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G}
          devices:
            - driver: nvidia
              count: ${LLAMA_CPP_GPU_COUNT:-1}
              capabilities: [gpu]
    profiles:
      - gpu
      - cuda

  # llama.cpp server with AMD ROCm GPU support
  llama-cpp-server-rocm:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:server-rocm
    ports:
      - "${LLAMA_CPP_SERVER_PORT_OVERRIDE:-8080}:8080"
    volumes:
      - llama_cpp_models:/models
    devices:
      - /dev/kfd
      - /dev/dri
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "--port"
      - "8080"
      - "--host"
      - "0.0.0.0"
      - "-c"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
      - "--n-gpu-layers"
      - "${LLAMA_CPP_GPU_LAYERS:-99}"
    environment:
      - TZ=${TZ:-UTC}
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_SERVER_CPU_LIMIT:-4.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_LIMIT:-8G}
        reservations:
          cpus: ${LLAMA_CPP_SERVER_CPU_RESERVATION:-2.0}
          memory: ${LLAMA_CPP_SERVER_MEMORY_RESERVATION:-4G}
    profiles:
      - gpu
      - rocm

  # llama.cpp CLI (light) - Interactive command-line interface
  llama-cpp-cli:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_CLI_VARIANT:-light}
    volumes:
      - llama_cpp_models:/models
    entrypoint: /app/llama-cli
    command:
      - "-m"
      - "${LLAMA_CPP_MODEL_PATH:-/models/model.gguf}"
      - "-p"
      - "${LLAMA_CPP_PROMPT:-Hello, how are you?}"
      - "-n"
      - "${LLAMA_CPP_CONTEXT_SIZE:-512}"
    environment:
      - TZ=${TZ:-UTC}
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_CLI_CPU_LIMIT:-2.0}
          memory: ${LLAMA_CPP_CLI_MEMORY_LIMIT:-4G}
        reservations:
          cpus: ${LLAMA_CPP_CLI_CPU_RESERVATION:-1.0}
          memory: ${LLAMA_CPP_CLI_MEMORY_RESERVATION:-2G}
    profiles:
      - cli

  # llama.cpp full - Complete toolkit including model conversion tools
  llama-cpp-full:
    <<: *defaults
    image: ${GHCR_REGISTRY:-ghcr.io/}ggml-org/llama.cpp:${LLAMA_CPP_FULL_VARIANT:-full}
    volumes:
      - llama_cpp_models:/models
    command: ["sleep", "infinity"]
    environment:
      - TZ=${TZ:-UTC}
    deploy:
      resources:
        limits:
          cpus: ${LLAMA_CPP_FULL_CPU_LIMIT:-2.0}
          memory: ${LLAMA_CPP_FULL_MEMORY_LIMIT:-4G}
        reservations:
          cpus: ${LLAMA_CPP_FULL_CPU_RESERVATION:-1.0}
          memory: ${LLAMA_CPP_FULL_MEMORY_RESERVATION:-2G}
    profiles:
      - full

volumes:
  llama_cpp_models: