feat: add TurboOCR
This commit is contained in:
@@ -0,0 +1,59 @@
|
||||
# Tag of the TurboOCR container image to run
# Available tags are listed at https://github.com/aiptimizer/TurboOCR/releases
TURBOOCR_VERSION="v2.1.1"

# Language bundle; an empty value selects latin (the English default)
# One of: latin, chinese, greek, eslav, arabic, korean, thai
TURBOOCR_LANG=""

# Only meaningful with TURBOOCR_LANG=chinese: 1 selects the 84MB PP-OCRv5 server rec
# in place of the 16MB mobile rec (more accurate, uses more VRAM)
TURBOOCR_SERVER=""

# Number of concurrent GPU pipelines (~1.4 GB VRAM per pipeline); empty = auto-detect
TURBOOCR_PIPELINE_POOL_SIZE=""

# 1 = do not load the PP-DocLayoutV3 layout detection model (frees ~300-500 MB VRAM)
TURBOOCR_DISABLE_LAYOUT=0

# PDF extraction mode used when a request does not specify one
#   ocr           - render each page and run full OCR (safest, immune to text-layer attacks)
#   geometric     - read the PDFium text layer only (~10x faster, but trusts PDF content)
#   auto          - use the text layer per page when present, otherwise OCR
#   auto_verified - run OCR and cross-check it against the text layer
TURBOOCR_PDF_MODE="ocr"

# 1 = skip the angle classifier (saves ~0.4ms of latency)
TURBOOCR_DISABLE_ANGLE_CLS=0

# Maximum detection input size, in pixels
TURBOOCR_DET_MAX_SIDE=960

# Parallelism of PDF rendering
TURBOOCR_PDF_DAEMONS=16
TURBOOCR_PDF_WORKERS=4

# Upper bound on the number of pages accepted in one PDF request
TURBOOCR_MAX_PDF_PAGES=2000

# Logging verbosity: debug / info / warn / error
TURBOOCR_LOG_LEVEL="info"

# Logging output format: json (structured) / text (human-readable)
TURBOOCR_LOG_FORMAT="json"

# Ports published on the host
TURBOOCR_HTTP_PORT_OVERRIDE=8000
TURBOOCR_GRPC_PORT_OVERRIDE=50051

# CPU and memory limits / reservations for the container
TURBOOCR_CPU_LIMIT=8.0
TURBOOCR_MEMORY_LIMIT=12G
TURBOOCR_CPU_RESERVATION=2.0
TURBOOCR_MEMORY_RESERVATION=4G

# How many NVIDIA GPUs to reserve
TURBOOCR_GPU_COUNT=1

# Size of the container's shared memory
TURBOOCR_SHM_SIZE=2g
|
||||
@@ -0,0 +1,119 @@
|
||||
# TurboOCR
|
||||
|
||||
[English](./README.md) | [中文](./README.zh.md)
|
||||
|
||||
This service deploys [TurboOCR](https://github.com/aiptimizer/TurboOCR), a GPU-accelerated OCR server built on C++ / CUDA / TensorRT / PP-OCRv5. It exposes both an HTTP API and a gRPC API from a single binary that share the same GPU pipeline pool, with Prometheus metrics built in.
|
||||
|
||||
## Services
|
||||
|
||||
- `turboocr`: TurboOCR HTTP (port 8000) + gRPC (port 50051) inference server
|
||||
|
||||
## Requirements
|
||||
|
||||
- Linux host with NVIDIA driver 595 or newer
|
||||
- Turing or newer GPU (RTX 20-series / GTX 16-series and up)
|
||||
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed and configured for Docker
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable Name | Description | Default Value |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------- | ------------- |
|
||||
| `TURBOOCR_VERSION` | TurboOCR image version | `v2.1.1` |
|
||||
| `TURBOOCR_LANG` | Language bundle: `latin`, `chinese`, `greek`, `eslav`, `arabic`, `korean`, `thai` | `""` (latin) |
|
||||
| `TURBOOCR_SERVER` | With `chinese`, set to `1` for the 84 MB server rec | `""` |
|
||||
| `TURBOOCR_PIPELINE_POOL_SIZE` | Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto | `""` |
|
||||
| `TURBOOCR_DISABLE_LAYOUT` | Disable layout detection model (saves ~300-500 MB VRAM) | `0` |
|
||||
| `TURBOOCR_PDF_MODE` | Default PDF mode: `ocr` / `geometric` / `auto` / `auto_verified` | `ocr` |
|
||||
| `TURBOOCR_DISABLE_ANGLE_CLS` | Skip angle classifier (~0.4 ms savings) | `0` |
|
||||
| `TURBOOCR_DET_MAX_SIDE` | Max detection input size in pixels | `960` |
|
||||
| `TURBOOCR_PDF_DAEMONS` | PDF render daemons | `16` |
|
||||
| `TURBOOCR_PDF_WORKERS` | PDF worker threads | `4` |
|
||||
| `TURBOOCR_MAX_PDF_PAGES` | Maximum pages per PDF request | `2000` |
|
||||
| `TURBOOCR_LOG_LEVEL` | Log level: `debug` / `info` / `warn` / `error` | `info` |
|
||||
| `TURBOOCR_LOG_FORMAT` | Log format: `json` / `text` | `json` |
|
||||
| `TURBOOCR_HTTP_PORT_OVERRIDE` | Host port for HTTP API | `8000` |
|
||||
| `TURBOOCR_GRPC_PORT_OVERRIDE` | Host port for gRPC API | `50051` |
|
||||
| `TURBOOCR_CPU_LIMIT` | CPU limit | `8.0` |
|
||||
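| `TURBOOCR_MEMORY_LIMIT` | Memory limit | `12G` |
| `TURBOOCR_CPU_RESERVATION` | CPU reservation | `2.0` |
| `TURBOOCR_MEMORY_RESERVATION` | Memory reservation | `4G` |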
|
||||
| `TURBOOCR_GPU_COUNT` | Number of NVIDIA GPUs to reserve | `1` |
|
||||
| `TURBOOCR_SHM_SIZE` | Shared memory size | `2g` |
|
||||
|
||||
Copy `.env.example` to `.env` and override only the variables you need to change.
|
||||
|
||||
## Volumes
|
||||
|
||||
- `turboocr_trt_cache`: Caches TensorRT engines built from ONNX on first start. Must be a **named** volume — a bind-mount of an empty host directory would shadow the baked-in language bundles and the server would fail to load models.
|
||||
|
||||
## Usage
|
||||
|
||||
### Start TurboOCR
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
The first start builds TensorRT engines from ONNX. Build time depends on your GPU: roughly 5 minutes on high-end desktop GPUs and 20–30 minutes on laptop GPUs. While compilation is in progress the container reports `starting`: health checks that fail during the 30-minute `start_period` are not counted, so it is marked `unhealthy` only if the build outlasts that window. Once the build finishes, the server starts and the container transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
|
||||
|
||||
### Endpoints
|
||||
|
||||
- HTTP API: <http://localhost:8000>
|
||||
- gRPC API: `localhost:50051`
|
||||
- Health: <http://localhost:8000/health>
|
||||
- Readiness: <http://localhost:8000/health/ready>
|
||||
- Metrics (Prometheus): <http://localhost:8000/metrics>
|
||||
|
||||
### Test the API
|
||||
|
||||
```bash
|
||||
# Image — raw bytes (fastest path)
|
||||
curl -X POST http://localhost:8000/ocr/raw \
|
||||
--data-binary @document.png \
|
||||
-H "Content-Type: image/png"
|
||||
|
||||
# Image — base64 JSON
|
||||
curl -X POST http://localhost:8000/ocr \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"image":"'$(base64 -w0 document.png)'"}'
|
||||
|
||||
# PDF — raw bytes
|
||||
curl -X POST http://localhost:8000/ocr/pdf \
|
||||
--data-binary @document.pdf
|
||||
|
||||
# PDF with layout detection enabled and a per-request mode override (auto)
|
||||
curl -X POST "http://localhost:8000/ocr/pdf?layout=1&mode=auto" \
|
||||
--data-binary @document.pdf
|
||||
```
|
||||
|
||||
> **Important:** Use HTTP keep-alive. Sending many short-lived connections (e.g. one `curl` per request in a loop) can overwhelm the server. Standard HTTP client libraries (`requests.Session`, `aiohttp`, Go `http.Client`, etc.) reuse connections by default.
|
||||
|
||||
### Switching Languages
|
||||
|
||||
Edit `.env` and restart:
|
||||
|
||||
```bash
|
||||
TURBOOCR_LANG=chinese
|
||||
TURBOOCR_SERVER=1 # optional: use the 84 MB Chinese server rec
|
||||
```
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
All language bundles are baked into the image at build time (SHA256-verified from the pinned PP-OCRv5 release). No runtime downloads.
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
- **GPU pipelines** — set `TURBOOCR_PIPELINE_POOL_SIZE` based on available VRAM (~1.4 GB each)
|
||||
- **Layout overhead** — `?layout=1` reduces throughput by ~20%; set `TURBOOCR_DISABLE_LAYOUT=1` to skip loading the model entirely
|
||||
- **Shared memory** — increase `TURBOOCR_SHM_SIZE` if you process very large PDFs
|
||||
|
||||
## Security Notes
|
||||
|
||||
- The API has no authentication by default. Put a reverse proxy (nginx, Caddy) in front for production.
|
||||
- The default PDF mode is `ocr`, which only trusts pixel data and is safe for untrusted PDF uploads.
|
||||
- Do **not** set `TURBOOCR_PDF_MODE` to `geometric` or `auto` globally if you accept PDFs from untrusted sources — a malicious PDF can embed invisible text or remap glyphs to inject arbitrary strings into the text layer.
|
||||
- Use `auto_verified` for higher accuracy on trusted documents; it cross-checks the native text layer against OCR results.
|
||||
|
||||
## License
|
||||
|
||||
TurboOCR is licensed under the MIT License. See the [TurboOCR GitHub repository](https://github.com/aiptimizer/TurboOCR) for details.
|
||||
@@ -0,0 +1,119 @@
|
||||
# TurboOCR
|
||||
|
||||
[English](./README.md) | [中文](./README.zh.md)
|
||||
|
||||
此服务用于部署 [TurboOCR](https://github.com/aiptimizer/TurboOCR),一个基于 C++ / CUDA / TensorRT / PP-OCRv5 的 GPU 加速 OCR 服务器。单一二进制同时提供 HTTP 与 gRPC 两套接口,共享同一个 GPU 流水线池,并内置 Prometheus 指标。
|
||||
|
||||
## 服务
|
||||
|
||||
- `turboocr`:TurboOCR HTTP(端口 8000)+ gRPC(端口 50051)推理服务
|
||||
|
||||
## 运行要求
|
||||
|
||||
- Linux 主机,NVIDIA 驱动 595 或更高版本
|
||||
- Turing 及以上架构 GPU(RTX 20 系列 / GTX 16 系列及更新)
|
||||
- 已安装并配置好 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
|
||||
|
||||
## 环境变量
|
||||
|
||||
| 变量名 | 说明 | 默认值 |
|
||||
| ----------------------------- | ------------------------------------------------------------------------ | ------------- |
|
||||
| `TURBOOCR_VERSION` | TurboOCR 镜像版本 | `v2.1.1` |
|
||||
| `TURBOOCR_LANG` | 语言包:`latin`、`chinese`、`greek`、`eslav`、`arabic`、`korean`、`thai` | `""`(latin) |
|
||||
| `TURBOOCR_SERVER` | 当 `chinese` 时,设为 `1` 使用 84 MB 服务端识别模型 | `""` |
|
||||
| `TURBOOCR_PIPELINE_POOL_SIZE` | 并发 GPU 流水线数(每条约 1.4 GB 显存),留空则自动 | `""` |
|
||||
| `TURBOOCR_DISABLE_LAYOUT` | 禁用版面检测模型(节省约 300-500 MB 显存) | `0` |
|
||||
| `TURBOOCR_PDF_MODE` | PDF 默认模式:`ocr` / `geometric` / `auto` / `auto_verified` | `ocr` |
|
||||
| `TURBOOCR_DISABLE_ANGLE_CLS` | 跳过方向分类器(约节省 0.4 ms) | `0` |
|
||||
| `TURBOOCR_DET_MAX_SIDE` | 检测输入最大尺寸(像素) | `960` |
|
||||
| `TURBOOCR_PDF_DAEMONS` | PDF 渲染守护进程数 | `16` |
|
||||
| `TURBOOCR_PDF_WORKERS` | PDF 工作线程数 | `4` |
|
||||
| `TURBOOCR_MAX_PDF_PAGES` | 单次 PDF 请求最大页数 | `2000` |
|
||||
| `TURBOOCR_LOG_LEVEL` | 日志级别:`debug` / `info` / `warn` / `error` | `info` |
|
||||
| `TURBOOCR_LOG_FORMAT` | 日志格式:`json` / `text` | `json` |
|
||||
| `TURBOOCR_HTTP_PORT_OVERRIDE` | HTTP API 主机端口 | `8000` |
|
||||
| `TURBOOCR_GRPC_PORT_OVERRIDE` | gRPC API 主机端口 | `50051` |
|
||||
| `TURBOOCR_CPU_LIMIT` | CPU 限制 | `8.0` |
|
||||
| `TURBOOCR_MEMORY_LIMIT` | 内存限制 | `12G` |
| `TURBOOCR_CPU_RESERVATION` | CPU 预留 | `2.0` |
| `TURBOOCR_MEMORY_RESERVATION` | 内存预留 | `4G` |
|
||||
| `TURBOOCR_GPU_COUNT` | 预留的 NVIDIA GPU 数量 | `1` |
|
||||
| `TURBOOCR_SHM_SIZE` | 共享内存大小 | `2g` |
|
||||
|
||||
复制 `.env.example` 为 `.env`,仅覆盖你需要修改的变量。
|
||||
|
||||
## 卷
|
||||
|
||||
- `turboocr_trt_cache`:缓存首次启动时由 ONNX 构建出的 TensorRT 引擎。必须使用**命名卷**,如果绑定挂载一个空的主机目录,会覆盖镜像内置的语言包,导致服务无法加载模型。
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 启动 TurboOCR
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
首次启动需要从 ONNX 构建 TensorRT 引擎,耗时因 GPU 而异:高端桌面 GPU 约 5 分钟,笔记本 GPU 约 20–30 分钟。编译期间容器会显示 `starting`:在 30 分钟的 `start_period` 内,失败的健康检查不计入重试次数,只有构建耗时超过该时间窗口时才会被标记为 `unhealthy`。构建完成后服务会自动启动并切换为 `healthy`。后续重启会复用缓存的引擎,几乎瞬间完成。
|
||||
|
||||
### 访问端点
|
||||
|
||||
- HTTP API:<http://localhost:8000>
|
||||
- gRPC API:`localhost:50051`
|
||||
- 健康检查:<http://localhost:8000/health>
|
||||
- 就绪检查:<http://localhost:8000/health/ready>
|
||||
- Prometheus 指标:<http://localhost:8000/metrics>
|
||||
|
||||
### 测试 API
|
||||
|
||||
```bash
|
||||
# 图片 —— 原始字节(最快路径)
|
||||
curl -X POST http://localhost:8000/ocr/raw \
|
||||
--data-binary @document.png \
|
||||
-H "Content-Type: image/png"
|
||||
|
||||
# 图片 —— base64 JSON
|
||||
curl -X POST http://localhost:8000/ocr \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"image":"'$(base64 -w0 document.png)'"}'
|
||||
|
||||
# PDF —— 原始字节
|
||||
curl -X POST http://localhost:8000/ocr/pdf \
|
||||
--data-binary @document.pdf
|
||||
|
||||
# PDF 启用版面检测,并在单次请求中指定模式(auto)
|
||||
curl -X POST "http://localhost:8000/ocr/pdf?layout=1&mode=auto" \
|
||||
--data-binary @document.pdf
|
||||
```
|
||||
|
||||
> **重要提示**:请使用 HTTP keep-alive。如果在循环中频繁建立短连接(例如每次请求一个 `curl`),可能会压垮服务。标准 HTTP 客户端库(`requests.Session`、`aiohttp`、Go `http.Client` 等)默认会复用连接。
|
||||
|
||||
### 切换语言
|
||||
|
||||
修改 `.env` 后重启:
|
||||
|
||||
```bash
|
||||
TURBOOCR_LANG=chinese
|
||||
TURBOOCR_SERVER=1 # 可选:使用 84 MB 的中文服务端识别模型
|
||||
```
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
所有语言包都在构建镜像时打包进来(基于固定版本的 PP-OCRv5 发布,并校验 SHA256),运行时无需联网下载。
|
||||
|
||||
## 性能调优
|
||||
|
||||
- **GPU 流水线**:根据显存大小设置 `TURBOOCR_PIPELINE_POOL_SIZE`(每条约 1.4 GB)
|
||||
- **版面开销**:`?layout=1` 会使吞吐下降约 20%;设置 `TURBOOCR_DISABLE_LAYOUT=1` 可完全跳过模型加载
|
||||
- **共享内存**:处理超大 PDF 时可增加 `TURBOOCR_SHM_SIZE`
|
||||
|
||||
## 安全说明
|
||||
|
||||
- API 默认无身份认证。生产环境请在前面套一层反向代理(nginx、Caddy 等)。
|
||||
- PDF 默认模式为 `ocr`,只信任像素数据,可安全处理不可信来源的 PDF 上传。
|
||||
- 如果你的服务接收不可信来源的 PDF,**不要**将 `TURBOOCR_PDF_MODE` 全局设为 `geometric` 或 `auto`:恶意 PDF 可以嵌入隐形文字、重映射 ToUnicode 字符或在文本层注入任意字符串。
|
||||
- 在可信文档场景下可使用 `auto_verified` 模式,会先做 OCR,再用文本层与之对照校验。
|
||||
|
||||
## 许可证
|
||||
|
||||
TurboOCR 采用 MIT 许可证。详情请参见 [TurboOCR GitHub 仓库](https://github.com/aiptimizer/TurboOCR)。
|
||||
@@ -0,0 +1,71 @@
|
||||
# Docker Compose definition for the TurboOCR server.
# A single `turboocr` service publishes the HTTP API (container port 8000) and
# the gRPC API (container port 50051) and reserves NVIDIA GPU(s). Every tunable
# is read from a TURBOOCR_* variable with a default, so the file works without
# a .env file.

# Restart policy and log rotation merged into the service below.
x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: 100m
      max-file: '3'

services:
  turboocr:
    <<: *defaults
    image: ${GLOBAL_REGISTRY:-ghcr.io/}aiptimizer/turboocr:${TURBOOCR_VERSION:-v2.1.1}
    ports:
      - '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000'
      - '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051'
    volumes:
      # Named volume caches the TensorRT engines built from ONNX on first start
      # (the build takes roughly 5-30 minutes depending on the GPU).
      # Must be a named volume - bind-mounting an empty host dir would shadow the
      # baked-in language bundles and prevent the server from loading models.
      - turboocr_trt_cache:/home/ocr/.cache/turbo-ocr
    environment:
      - TZ=${TZ:-UTC}
      # Language bundle: latin (default), chinese, greek, eslav, arabic, korean, thai
      - OCR_LANG=${TURBOOCR_LANG:-}
      # Set to 1 with OCR_LANG=chinese to use the 84MB server rec instead of 16MB mobile
      - OCR_SERVER=${TURBOOCR_SERVER:-}
      # Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto
      - PIPELINE_POOL_SIZE=${TURBOOCR_PIPELINE_POOL_SIZE:-}
      # Set to 1 to disable PP-DocLayoutV3 layout detection (saves ~300-500 MB VRAM)
      - DISABLE_LAYOUT=${TURBOOCR_DISABLE_LAYOUT:-0}
      # Default PDF mode: ocr (safest) / geometric / auto / auto_verified
      - ENABLE_PDF_MODE=${TURBOOCR_PDF_MODE:-ocr}
      # Skip angle classifier (~0.4ms savings)
      - DISABLE_ANGLE_CLS=${TURBOOCR_DISABLE_ANGLE_CLS:-0}
      # Max detection input size
      - DET_MAX_SIDE=${TURBOOCR_DET_MAX_SIDE:-960}
      # PDF render parallelism
      - PDF_DAEMONS=${TURBOOCR_PDF_DAEMONS:-16}
      - PDF_WORKERS=${TURBOOCR_PDF_WORKERS:-4}
      # Maximum pages per PDF request
      - MAX_PDF_PAGES=${TURBOOCR_MAX_PDF_PAGES:-2000}
      # Log level: debug / info / warn / error
      - LOG_LEVEL=${TURBOOCR_LOG_LEVEL:-info}
      # Log format: json (structured) / text (human-readable)
      - LOG_FORMAT=${TURBOOCR_LOG_FORMAT:-json}
    healthcheck:
      test: [CMD, curl, -fsS, 'http://localhost:8000/health']
      interval: 30s
      timeout: 10s
      retries: 5
      # First start builds TensorRT engines from ONNX. Build time varies by GPU:
      # ~5 min on high-end desktop GPUs, 20-30 min on laptop GPUs. Probes that
      # fail inside start_period do not count toward retries, so the container
      # stays in "starting" during compilation and becomes "healthy" once done;
      # it is marked "unhealthy" only if the build outlasts this window.
      # Subsequent restarts reuse the cached engines and start in seconds.
      start_period: 30m
    deploy:
      resources:
        limits:
          cpus: ${TURBOOCR_CPU_LIMIT:-8.0}
          memory: ${TURBOOCR_MEMORY_LIMIT:-12G}
        reservations:
          cpus: ${TURBOOCR_CPU_RESERVATION:-2.0}
          memory: ${TURBOOCR_MEMORY_RESERVATION:-4G}
          devices:
            - driver: nvidia
              count: ${TURBOOCR_GPU_COUNT:-1}
              capabilities: [gpu]
    shm_size: ${TURBOOCR_SHM_SIZE:-2g}

volumes:
  turboocr_trt_cache:
|
||||
Reference in New Issue
Block a user