feat: add TurboOCR

2026-04-28 10:05:39 +08:00
parent 3483dd80f0
commit ce16588916
25 changed files with 1460 additions and 12 deletions
@@ -0,0 +1,59 @@
+# TurboOCR image tag to pull
+# See https://github.com/aiptimizer/TurboOCR/releases for available tags
+TURBOOCR_VERSION="v2.1.1"
+
+# Language bundle (leave empty for the default: latin / English)
+# Supported: latin, chinese, greek, eslav, arabic, korean, thai
+TURBOOCR_LANG=""
+
+# Only with TURBOOCR_LANG=chinese: set to 1 to use the 84MB PP-OCRv5 server
+# recognition model instead of the 16MB mobile one (higher accuracy, more VRAM)
+TURBOOCR_SERVER=""
+
+# Number of concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto-detect
+TURBOOCR_PIPELINE_POOL_SIZE=""
+
+# Disable the PP-DocLayoutV3 layout detection model (1 = disable, saves ~300-500 MB VRAM)
+TURBOOCR_DISABLE_LAYOUT=0
+
+# Default PDF extraction mode
+# ocr           - render + full OCR (safest, immune to text-layer attacks)
+# geometric     - PDFium text layer only (~10x faster, but trusts PDF content)
+# auto          - per-page text layer if available, else OCR (also trusts PDF content)
+# auto_verified - OCR + cross-check against text layer
+TURBOOCR_PDF_MODE="ocr"
+
+# Skip the angle classifier (1 = skip, saves ~0.4ms of latency)
+TURBOOCR_DISABLE_ANGLE_CLS=0
+
+# Max detection input size in pixels
+TURBOOCR_DET_MAX_SIDE=960
+
+# PDF render parallelism: number of render daemons and of worker threads
+TURBOOCR_PDF_DAEMONS=16
+TURBOOCR_PDF_WORKERS=4
+
+# Maximum pages allowed per PDF request
+TURBOOCR_MAX_PDF_PAGES=2000
+
+# Log level: debug / info / warn / error
+TURBOOCR_LOG_LEVEL="info"
+
+# Log format: json (structured) / text (human-readable)
+TURBOOCR_LOG_FORMAT="json"
+
+# Host ports published for the HTTP API (container port 8000) and gRPC API (container port 50051)
+TURBOOCR_HTTP_PORT_OVERRIDE=8000
+TURBOOCR_GRPC_PORT_OVERRIDE=50051
+
+# CPU / memory limits (upper bound) and reservations for the container
+TURBOOCR_CPU_LIMIT=8.0
+TURBOOCR_MEMORY_LIMIT=12G
+TURBOOCR_CPU_RESERVATION=2.0
+TURBOOCR_MEMORY_RESERVATION=4G
+
+# Number of NVIDIA GPUs to reserve
+TURBOOCR_GPU_COUNT=1
+
+# Shared memory size for the container (increase for very large PDFs)
+TURBOOCR_SHM_SIZE=2g
@@ -0,0 +1,119 @@
+# TurboOCR
+
+[English](./README.md) | [中文](./README.zh.md)
+
+This service deploys [TurboOCR](https://github.com/aiptimizer/TurboOCR), a GPU-accelerated OCR server built on C++ / CUDA / TensorRT / PP-OCRv5. It exposes both an HTTP API and a gRPC API from a single binary that share the same GPU pipeline pool, with Prometheus metrics built in.
+
+## Services
+
+- `turboocr`: TurboOCR HTTP (port 8000) + gRPC (port 50051) inference server
+
+## Requirements
+
+- Linux host with NVIDIA driver 595 or newer
+- Turing or newer GPU (RTX 20-series / GTX 16-series and up)
+- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) installed and configured for Docker
+
+## Environment Variables
+
+| Variable Name                 | Description                                                                       | Default Value |
+| ----------------------------- | --------------------------------------------------------------------------------- | ------------- |
+| `TURBOOCR_VERSION`            | TurboOCR image version                                                            | `v2.1.1`      |
+| `TURBOOCR_LANG`               | Language bundle: `latin`, `chinese`, `greek`, `eslav`, `arabic`, `korean`, `thai` | `""` (latin)  |
+| `TURBOOCR_SERVER`             | With `chinese`, set to `1` for the 84 MB server rec                               | `""`          |
+| `TURBOOCR_PIPELINE_POOL_SIZE` | Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto                        | `""`          |
+| `TURBOOCR_DISABLE_LAYOUT`     | Disable layout detection model (saves ~300-500 MB VRAM)                           | `0`           |
+| `TURBOOCR_PDF_MODE`           | Default PDF mode: `ocr` / `geometric` / `auto` / `auto_verified`                  | `ocr`         |
+| `TURBOOCR_DISABLE_ANGLE_CLS`  | Skip angle classifier (~0.4 ms savings)                                           | `0`           |
+| `TURBOOCR_DET_MAX_SIDE`       | Max detection input size in pixels                                                | `960`         |
+| `TURBOOCR_PDF_DAEMONS`        | PDF render daemons                                                                | `16`          |
+| `TURBOOCR_PDF_WORKERS`        | PDF worker threads                                                                | `4`           |
+| `TURBOOCR_MAX_PDF_PAGES`      | Maximum pages per PDF request                                                     | `2000`        |
+| `TURBOOCR_LOG_LEVEL`          | Log level: `debug` / `info` / `warn` / `error`                                    | `info`        |
+| `TURBOOCR_LOG_FORMAT`         | Log format: `json` / `text`                                                       | `json`        |
+| `TURBOOCR_HTTP_PORT_OVERRIDE` | Host port for HTTP API                                                            | `8000`        |
+| `TURBOOCR_GRPC_PORT_OVERRIDE` | Host port for gRPC API                                                            | `50051`       |
+| `TURBOOCR_CPU_LIMIT`          | CPU limit                                                                         | `8.0`         |
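+| `TURBOOCR_MEMORY_LIMIT`       | Memory limit                                                                      | `12G`         |
+| `TURBOOCR_CPU_RESERVATION`    | CPU reservation                                                                   | `2.0`         |
+| `TURBOOCR_MEMORY_RESERVATION` | Memory reservation                                                                | `4G`          |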
+| `TURBOOCR_GPU_COUNT`          | Number of NVIDIA GPUs to reserve                                                  | `1`           |
+| `TURBOOCR_SHM_SIZE`           | Shared memory size                                                                | `2g`          |
+
+Copy `.env.example` to `.env` and override only the variables you need to change.
+
+## Volumes
+
+- `turboocr_trt_cache`: Caches TensorRT engines built from ONNX on first start. Must be a **named** volume — a bind-mount of an empty host directory would shadow the baked-in language bundles and the server would fail to load models.
+
+## Usage
+
+### Start TurboOCR
+
+```bash
+docker compose up -d
+```
+
+The first start builds TensorRT engines from ONNX. Build time depends on your GPU: roughly 5 minutes on high-end desktop GPUs and 20–30 minutes on laptop GPUs. While compilation is in progress the container's health status stays `starting` (the healthcheck has a 30-minute start period); if the build runs longer than that, it may temporarily report `unhealthy` — this is normal. Once the build finishes the server starts and the container transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
+
+### Endpoints
+
+- HTTP API: <http://localhost:8000>
+- gRPC API: `localhost:50051`
+- Health: <http://localhost:8000/health>
+- Readiness: <http://localhost:8000/health/ready>
+- Metrics (Prometheus): <http://localhost:8000/metrics>
+
+### Test the API
+
+```bash
+# Image — raw bytes (fastest path)
+curl -X POST http://localhost:8000/ocr/raw \
+  --data-binary @document.png \
+  -H "Content-Type: image/png"
+
+# Image — base64 JSON
+curl -X POST http://localhost:8000/ocr \
+  -H "Content-Type: application/json" \
+  -d '{"image":"'$(base64 -w0 document.png)'"}'
+
+# PDF — raw bytes
+curl -X POST http://localhost:8000/ocr/pdf \
+  --data-binary @document.pdf
+
+# PDF with layout detection and a per-request mode override (use mode=auto only for trusted PDFs)
+curl -X POST "http://localhost:8000/ocr/pdf?layout=1&mode=auto" \
+  --data-binary @document.pdf
+```
+
+> **Important:** Use HTTP keep-alive. Opening many short-lived connections (e.g. one `curl` per request in a loop) can overwhelm the server. Standard HTTP client libraries (`requests.Session`, `aiohttp`, Go `http.Client`, etc.) reuse connections by default.
+
+### Switching Languages
+
+Edit `.env` and restart:
+
+```bash
+TURBOOCR_LANG=chinese
+TURBOOCR_SERVER=1   # optional: use the 84 MB Chinese server rec
+```
+
+```bash
+docker compose up -d
+```
+
+All language bundles are baked into the image at build time (SHA256-verified from the pinned PP-OCRv5 release). No runtime downloads.
+
+## Performance Tuning
+
+- **GPU pipelines** — set `TURBOOCR_PIPELINE_POOL_SIZE` based on available VRAM (~1.4 GB each)
+- **Layout overhead** — `?layout=1` reduces throughput by ~20%; set `TURBOOCR_DISABLE_LAYOUT=1` to skip loading the model entirely
+- **Shared memory** — increase `TURBOOCR_SHM_SIZE` if you process very large PDFs
+
+## Security Notes
+
+- The API has no authentication by default, and the published ports listen on all host interfaces. For production, put a reverse proxy (nginx, Caddy) that enforces authentication in front of it.
+- The default PDF mode is `ocr`, which only trusts pixel data and is safe for untrusted PDF uploads.
+- Do **not** set `TURBOOCR_PDF_MODE` to `geometric` or `auto` globally if you accept PDFs from untrusted sources — a malicious PDF can embed invisible text or remap glyphs to inject arbitrary strings into the text layer.
+- Use `auto_verified` for higher accuracy on trusted documents; it cross-checks the native text layer against OCR results.
+
+## License
+
+TurboOCR is licensed under the MIT License. See the [TurboOCR GitHub repository](https://github.com/aiptimizer/TurboOCR) for details.
@@ -0,0 +1,119 @@
+# TurboOCR
+
+[English](./README.md) | [中文](./README.zh.md)
+
+此服务用于部署 [TurboOCR](https://github.com/aiptimizer/TurboOCR)，一个基于 C++ / CUDA / TensorRT / PP-OCRv5 的 GPU 加速 OCR 服务器。单一二进制同时提供 HTTP 与 gRPC 两套接口，共享同一个 GPU 流水线池，并内置 Prometheus 指标。
+
+## 服务
+
+- `turboocr`：TurboOCR HTTP（端口 8000）+ gRPC（端口 50051）推理服务
+
+## 运行要求
+
+- Linux 主机，NVIDIA 驱动 595 或更高版本
+- Turing 及以上架构 GPU（RTX 20 系列 / GTX 16 系列及更新）
+- 已安装并配置好 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
+
+## 环境变量
+
+| 变量名                        | 说明                                                                     | 默认值        |
+| ----------------------------- | ------------------------------------------------------------------------ | ------------- |
+| `TURBOOCR_VERSION`            | TurboOCR 镜像版本                                                        | `v2.1.1`      |
+| `TURBOOCR_LANG`               | 语言包：`latin`、`chinese`、`greek`、`eslav`、`arabic`、`korean`、`thai` | `""`（latin） |
+| `TURBOOCR_SERVER`             | 当 `chinese` 时，设为 `1` 使用 84 MB 服务端识别模型                      | `""`          |
+| `TURBOOCR_PIPELINE_POOL_SIZE` | 并发 GPU 流水线数（每条约 1.4 GB 显存），留空则自动                      | `""`          |
+| `TURBOOCR_DISABLE_LAYOUT`     | 禁用版面检测模型（节省约 300-500 MB 显存）                               | `0`           |
+| `TURBOOCR_PDF_MODE`           | PDF 默认模式：`ocr` / `geometric` / `auto` / `auto_verified`             | `ocr`         |
+| `TURBOOCR_DISABLE_ANGLE_CLS`  | 跳过方向分类器（约节省 0.4 ms）                                          | `0`           |
+| `TURBOOCR_DET_MAX_SIDE`       | 检测输入最大尺寸（像素）                                                 | `960`         |
+| `TURBOOCR_PDF_DAEMONS`        | PDF 渲染守护进程数                                                       | `16`          |
+| `TURBOOCR_PDF_WORKERS`        | PDF 工作线程数                                                           | `4`           |
+| `TURBOOCR_MAX_PDF_PAGES`      | 单次 PDF 请求最大页数                                                    | `2000`        |
+| `TURBOOCR_LOG_LEVEL`          | 日志级别：`debug` / `info` / `warn` / `error`                            | `info`        |
+| `TURBOOCR_LOG_FORMAT`         | 日志格式：`json` / `text`                                                | `json`        |
+| `TURBOOCR_HTTP_PORT_OVERRIDE` | HTTP API 主机端口                                                        | `8000`        |
+| `TURBOOCR_GRPC_PORT_OVERRIDE` | gRPC API 主机端口                                                        | `50051`       |
+| `TURBOOCR_CPU_LIMIT`          | CPU 限制                                                                 | `8.0`         |
+| `TURBOOCR_MEMORY_LIMIT`       | 内存限制                                                                 | `12G`         |
+| `TURBOOCR_CPU_RESERVATION`    | CPU 预留                                                                 | `2.0`         |
+| `TURBOOCR_MEMORY_RESERVATION` | 内存预留                                                                 | `4G`          |
+| `TURBOOCR_GPU_COUNT`          | 预留的 NVIDIA GPU 数量                                                   | `1`           |
+| `TURBOOCR_SHM_SIZE`           | 共享内存大小                                                             | `2g`          |
+
+复制 `.env.example` 为 `.env`，仅覆盖你需要修改的变量。
+
+## 卷
+
+- `turboocr_trt_cache`：缓存首次启动时由 ONNX 构建出的 TensorRT 引擎。必须使用**命名卷**，如果绑定挂载一个空的主机目录，会覆盖镜像内置的语言包，导致服务无法加载模型。
+
+## 使用方法
+
+### 启动 TurboOCR
+
+```bash
+docker compose up -d
+```
+
+首次启动需要从 ONNX 构建 TensorRT 引擎，耗时因 GPU 而异：高端桌面 GPU 约 5 分钟，笔记本 GPU 约 20–30 分钟。编译期间容器的健康状态会保持为 `starting`（健康检查设置了 30 分钟的启动宽限期）；如果构建超过该时间，可能会暂时显示 `unhealthy`，这属于正常现象——构建完成后服务会自动启动并切换为 `healthy`。后续重启会复用缓存的引擎，几乎瞬间完成。
+
+### 访问端点
+
+- HTTP API：<http://localhost:8000>
+- gRPC API：`localhost:50051`
+- 健康检查：<http://localhost:8000/health>
+- 就绪检查：<http://localhost:8000/health/ready>
+- Prometheus 指标：<http://localhost:8000/metrics>
+
+### 测试 API
+
+```bash
+# 图片 —— 原始字节（最快路径）
+curl -X POST http://localhost:8000/ocr/raw \
+  --data-binary @document.png \
+  -H "Content-Type: image/png"
+
+# 图片 —— base64 JSON
+curl -X POST http://localhost:8000/ocr \
+  -H "Content-Type: application/json" \
+  -d '{"image":"'$(base64 -w0 document.png)'"}'
+
+# PDF —— 原始字节
+curl -X POST http://localhost:8000/ocr/pdf \
+  --data-binary @document.pdf
+
+# PDF 启用版面检测，并按请求指定模式（mode=auto 仅用于可信 PDF）
+curl -X POST "http://localhost:8000/ocr/pdf?layout=1&mode=auto" \
+  --data-binary @document.pdf
+```
+
+> **重要提示**：请使用 HTTP keep-alive。如果在循环中频繁建立短连接（例如每次请求一个 `curl`），可能会压垮服务。标准 HTTP 客户端库（`requests.Session`、`aiohttp`、Go `http.Client` 等）默认会复用连接。
+
+### 切换语言
+
+修改 `.env` 后重启：
+
+```bash
+TURBOOCR_LANG=chinese
+TURBOOCR_SERVER=1   # 可选：使用 84 MB 的中文服务端识别模型
+```
+
+```bash
+docker compose up -d
+```
+
+所有语言包都在构建镜像时打包进来（基于固定版本的 PP-OCRv5 发布，并校验 SHA256），运行时无需联网下载。
+
+## 性能调优
+
+- **GPU 流水线**：根据显存大小设置 `TURBOOCR_PIPELINE_POOL_SIZE`（每条约 1.4 GB）
+- **版面开销**：`?layout=1` 会使吞吐下降约 20%；设置 `TURBOOCR_DISABLE_LAYOUT=1` 可完全跳过模型加载
+- **共享内存**：处理超大 PDF 时可增加 `TURBOOCR_SHM_SIZE`
+
+## 安全说明
+
+- API 默认无身份认证，且默认发布的端口会监听主机的所有网络接口。生产环境请在前面部署带身份认证的反向代理（nginx、Caddy 等）。
+- PDF 默认模式为 `ocr`，只信任像素数据，可安全处理不可信来源的 PDF 上传。
+- 如果你的服务接收不可信来源的 PDF，**不要**将 `TURBOOCR_PDF_MODE` 全局设为 `geometric` 或 `auto`：恶意 PDF 可以嵌入隐形文字、重映射 ToUnicode 字符或在文本层注入任意字符串。
+- 在可信文档场景下可使用 `auto_verified` 模式，会先做 OCR，再用文本层与之对照校验。
+
+## 许可证
+
+TurboOCR 采用 MIT 许可证。详情请参见 [TurboOCR GitHub 仓库](https://github.com/aiptimizer/TurboOCR)。
@@ -0,0 +1,71 @@
+# Shared service defaults, merged into a service with `<<: *defaults`:
+# restart unless explicitly stopped, and rotate json-file logs
+# (at most 3 files of up to 100m each).
+x-defaults: &defaults
+  restart: unless-stopped
+  logging:
+    driver: json-file
+    options:
+      max-size: 100m
+      max-file: '3'
+
+services:
+  # TurboOCR inference server: HTTP API on container port 8000, gRPC API on 50051.
+  turboocr:
+    <<: *defaults
+    # GLOBAL_REGISTRY is an optional registry prefix; it is concatenated directly
+    # with the image path, so it must include the trailing slash (default: ghcr.io/).
+    image: ${GLOBAL_REGISTRY:-ghcr.io/}aiptimizer/turboocr:${TURBOOCR_VERSION:-v2.1.1}
+    ports:
+      # host:container - only the host side is configurable.
+      - '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000'
+      - '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051'
+    volumes:
+      # Named volume caches the TensorRT engines built from ONNX on first start
+      # (roughly 5-30 min depending on the GPU) so that later starts are fast.
+      # Must be a named volume - bind-mounting an empty host dir would shadow the
+      # baked-in language bundles and prevent the server from loading models.
+      - turboocr_trt_cache:/home/ocr/.cache/turbo-ocr
+    environment:
+      - TZ=${TZ:-UTC}
+      # Language bundle: latin (default), chinese, greek, eslav, arabic, korean, thai
+      - OCR_LANG=${TURBOOCR_LANG:-}
+      # Set to 1 with OCR_LANG=chinese to use the 84MB server recognition model
+      # instead of the 16MB mobile one
+      - OCR_SERVER=${TURBOOCR_SERVER:-}
+      # Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto
+      - PIPELINE_POOL_SIZE=${TURBOOCR_PIPELINE_POOL_SIZE:-}
+      # Set to 1 to disable PP-DocLayoutV3 layout detection (saves ~300-500 MB VRAM)
+      - DISABLE_LAYOUT=${TURBOOCR_DISABLE_LAYOUT:-0}
+      # Default PDF mode: ocr (safest) / geometric / auto / auto_verified
+      - ENABLE_PDF_MODE=${TURBOOCR_PDF_MODE:-ocr}
+      # Set to 1 to skip the angle classifier (~0.4ms savings)
+      - DISABLE_ANGLE_CLS=${TURBOOCR_DISABLE_ANGLE_CLS:-0}
+      # Max detection input size in pixels
+      - DET_MAX_SIDE=${TURBOOCR_DET_MAX_SIDE:-960}
+      # PDF render parallelism (render daemons / worker threads)
+      - PDF_DAEMONS=${TURBOOCR_PDF_DAEMONS:-16}
+      - PDF_WORKERS=${TURBOOCR_PDF_WORKERS:-4}
+      # Maximum pages per PDF request
+      - MAX_PDF_PAGES=${TURBOOCR_MAX_PDF_PAGES:-2000}
+      # Log level: debug / info / warn / error
+      - LOG_LEVEL=${TURBOOCR_LOG_LEVEL:-info}
+      # Log format: json (structured) / text (human-readable)
+      - LOG_FORMAT=${TURBOOCR_LOG_FORMAT:-json}
+    healthcheck:
+      # Probes the HTTP health endpoint from inside the container.
+      test: [CMD, curl, -fsS, 'http://localhost:8000/health']
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # First start builds TensorRT engines from ONNX. Build time varies by GPU:
+      # ~5 min on high-end desktop GPUs, 20-30 min on laptop GPUs. Probe failures
+      # during start_period do not count toward `retries`, so the container stays
+      # in "starting" while engines compile; it can only turn "unhealthy" if the
+      # build outlasts this window, and becomes healthy once the server is up.
+      # Subsequent restarts reuse the cached engines and start in seconds.
+      start_period: 30m
+    deploy:
+      resources:
+        limits:
+          cpus: ${TURBOOCR_CPU_LIMIT:-8.0}
+          memory: ${TURBOOCR_MEMORY_LIMIT:-12G}
+        reservations:
+          cpus: ${TURBOOCR_CPU_RESERVATION:-2.0}
+          memory: ${TURBOOCR_MEMORY_RESERVATION:-4G}
+          # Reserve NVIDIA GPU(s) for the container.
+          devices:
+            - driver: nvidia
+              count: ${TURBOOCR_GPU_COUNT:-1}
+              capabilities: [gpu]
+    # Shared memory size for the container.
+    shm_size: ${TURBOOCR_SHM_SIZE:-2g}
+
+volumes:
+  # Named volume mounted by the turboocr service at /home/ocr/.cache/turbo-ocr
+  # to keep the built TensorRT engines across container restarts.
+  turboocr_trt_cache: