From 5f8503df42ddd4ce68381a83caac3fad12f3c9f4 Mon Sep 17 00:00:00 2001
From: Sun-ZhenXing <1006925066@qq.com>
Date: Wed, 29 Apr 2026 11:54:59 +0800
Subject: [PATCH] feat: add build turboocr

---
 README.md                           |   1 +
 README.zh.md                        |   1 +
 builds/turboocr/.env.example        |  73 ++++++++++++++++
 builds/turboocr/Dockerfile.cpu      | 104 +++++++++++++++++++++++
 builds/turboocr/Dockerfile.cuda12   | 118 ++++++++++++++++++++++++++
 builds/turboocr/README.md           | 127 ++++++++++++++++++++++++++++
 builds/turboocr/README.zh.md        | 127 ++++++++++++++++++++++++++++
 builds/turboocr/docker-compose.yaml | 110 ++++++++++++++++++++++++
 src/turboocr/README.md              |   4 +-
 src/turboocr/README.zh.md           |   4 +-
 src/turboocr/docker-compose.yaml    |  14 +--
 11 files changed, 675 insertions(+), 8 deletions(-)
 create mode 100644 builds/turboocr/.env.example
 create mode 100644 builds/turboocr/Dockerfile.cpu
 create mode 100644 builds/turboocr/Dockerfile.cuda12
 create mode 100644 builds/turboocr/README.md
 create mode 100644 builds/turboocr/README.zh.md
 create mode 100644 builds/turboocr/docker-compose.yaml

diff --git a/README.md b/README.md
index b457ceb..79d906b 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,7 @@ These services require building custom Docker images from source.
 | [Multica](./builds/multica)                 | v0.1.32 |
 | [OpenFang](./builds/openfang)               | 0.1.0   |
 | [Paperclip](./builds/paperclip)             | main    |
+| [TurboOCR](./builds/turboocr)               | v2.1.1  |
 
 ## Supported Services
 
diff --git a/README.zh.md b/README.zh.md
index fbf1f2c..990bfac 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -42,6 +42,7 @@ docker compose exec redis redis-cli ping
 | [Multica](./builds/multica)                 | v0.1.32 |
 | [OpenFang](./builds/openfang)               | 0.1.0   |
 | [Paperclip](./builds/paperclip)             | main    |
+| [TurboOCR](./builds/turboocr)               | v2.1.1  |
 
 ## 已经支持的服务
 
diff --git a/builds/turboocr/.env.example b/builds/turboocr/.env.example
new file mode 100644
index 0000000..d42f753
--- /dev/null
+++ b/builds/turboocr/.env.example
@@ -0,0 +1,73 @@
+# Source build configuration — TurboOCR Git tag to clone and build (also used as the image tag)
+TURBOOCR_VERSION=v2.1.1
+
+# Registry mirror prefix prepended to the CPU build's base image — leave empty for direct pull.
+# China users: set to "docker.m.daocloud.io/" to proxy Docker Hub via DaoCloud.
+# Example: TURBOOCR_DOCKER_MIRROR=docker.m.daocloud.io/
+TURBOOCR_DOCKER_MIRROR=
+
+# NGC mirror prefix prepended to the CUDA 12.x build's nvcr.io base image — leave empty for direct pull.
+# Standard Docker Hub mirrors (e.g. DaoCloud) do NOT proxy nvcr.io.
+# Set this only if you have a dedicated NGC pull-through proxy.
+TURBOOCR_NGC_MIRROR=
+
+# Network configuration — host ports published for the HTTP and gRPC APIs
+TURBOOCR_HTTP_PORT_OVERRIDE=8000
+TURBOOCR_GRPC_PORT_OVERRIDE=50051
+
+# Language bundle: latin (default when empty), chinese, greek, eslav, arabic, korean, thai
+TURBOOCR_LANG=
+# Set to 1 together with TURBOOCR_LANG=chinese to use the 84 MB server rec model
+TURBOOCR_SERVER=
+
+# GPU pipeline pool — number of concurrent inference pipelines (~1.4 GB VRAM each).
+# Leave empty to let the server choose automatically based on available VRAM.
+# Ignored in CPU mode.
+TURBOOCR_PIPELINE_POOL_SIZE=
+
+# Set to 1 to skip loading the PP-DocLayoutV3 layout detection model.
+# Saves ~300-500 MB VRAM and cuts first-start compilation time by ~28 min on laptop GPUs.
+# Only do this if you do not need the ?layout=1 PDF endpoint.
+TURBOOCR_DISABLE_LAYOUT=0
+
+# Default PDF parsing mode (container variable ENABLE_PDF_MODE): ocr (safest) / geometric / auto / auto_verified
+TURBOOCR_PDF_MODE=ocr
+
+# Set to 1 to skip the angle classifier (~0.4 ms savings per image)
+TURBOOCR_DISABLE_ANGLE_CLS=0
+
+# Maximum detection input dimension in pixels
+TURBOOCR_DET_MAX_SIDE=960
+
+# PDF render parallelism (passed to the container as PDF_DAEMONS / PDF_WORKERS)
+TURBOOCR_PDF_DAEMONS=16
+TURBOOCR_PDF_WORKERS=4
+
+# Maximum pages accepted in a single PDF request
+TURBOOCR_MAX_PDF_PAGES=2000
+
+# Server log level: debug / info / warn / error
+TURBOOCR_LOG_LEVEL=info
+
+# Server log format: json (structured) / text (human-readable)
+TURBOOCR_LOG_FORMAT=json
+
+# Resources — memory/shm defaults differ per variant (GPU 12G/4G/2g to cover first-start
+# TRT engine builds, CPU 4G/1G/512m). Uncomment only to override the compose defaults.
+TURBOOCR_CPU_LIMIT=8.0
+# TURBOOCR_MEMORY_LIMIT=12G
+TURBOOCR_CPU_RESERVATION=2.0
+# TURBOOCR_MEMORY_RESERVATION=4G
+
+# Number of NVIDIA GPUs to reserve (GPU variant only)
+TURBOOCR_GPU_COUNT=1
+
+# Shared memory — fastpdf2png uses /dev/shm for inter-process PDF page transfers
+# TURBOOCR_SHM_SIZE=2g
+
+# Container log rotation (json-file driver): max size per file and number of files kept
+TURBOOCR_LOG_MAX_SIZE=100m
+TURBOOCR_LOG_MAX_FILE=3
+
+# Timezone
+TZ=UTC
diff --git a/builds/turboocr/Dockerfile.cpu b/builds/turboocr/Dockerfile.cpu
new file mode 100644
index 0000000..1d11f2d
--- /dev/null
+++ b/builds/turboocr/Dockerfile.cpu
@@ -0,0 +1,104 @@
+# ============================================================
+# TurboOCR — CPU-only build (ONNX Runtime backend, no GPU required)
+# Base image: ubuntu:24.04
+#
+# Produces: /app/build_cpu/paddle_cpu_server (HTTP + gRPC server)
+#
+# Image size: ~500 MB (vs ~10 GB for the GPU image).
+# No TRT compilation on first start — ONNX Runtime is used directly.
+# Startup is fast (~30 s) and requires no NVIDIA driver.
+#
+# Build:  docker build -f Dockerfile.cpu -t turboocr-cpu .
+# ============================================================
+
+ARG TURBOOCR_VERSION=v2.1.1
+ARG ORT_VERSION=1.22.0
+# Registry mirror prefix prepended to the base image name in FROM — leave empty for direct pull.
+# China users: set to "docker.m.daocloud.io/" to proxy Docker Hub via DaoCloud.
+ARG DOCKER_MIRROR=
+
+FROM ${DOCKER_MIRROR}ubuntu:24.04
+
+# ARGs declared before FROM are only usable in FROM; re-declare them to use them in this stage
+ARG TURBOOCR_VERSION
+ARG ORT_VERSION
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Toolchain, libraries and runtime helpers installed from apt (kept in alphabetical order)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    cmake \
+    curl \
+    g++ \
+    git \
+    gosu \
+    libc-ares-dev \
+    libgrpc++-dev \
+    libjsoncpp-dev \
+    libopencv-dev \
+    libprotobuf-dev \
+    libssl-dev \
+    make \
+    nginx \
+    pkg-config \
+    protobuf-compiler \
+    protobuf-compiler-grpc \
+    uuid-dev \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build and install the Drogon HTTP framework (async, epoll-based) from the v1.9.12 tag
+RUN git clone --depth 1 --branch v1.9.12 \
+        https://github.com/drogonframework/drogon.git /tmp/drogon && \
+    git -C /tmp/drogon submodule update --init && \
+    mkdir /tmp/drogon/build && cd /tmp/drogon/build && \
+    cmake .. -DBUILD_EXAMPLES=OFF -DBUILD_CTL=OFF -DBUILD_ORM=OFF \
+             -DBUILD_POSTGRESQL=OFF -DBUILD_MYSQL=OFF -DBUILD_SQLITE=OFF \
+             -DBUILD_REDIS=OFF -DBUILD_TESTING=OFF && \
+    make -j"$(nproc)" install && \
+    rm -rf /tmp/drogon
+
+# Download the ONNX Runtime C++ SDK release; install its headers and shared libraries under /usr/local
+RUN ort="onnxruntime-linux-x64-${ORT_VERSION}" && cd /tmp && \
+    wget -q "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/${ort}.tgz" && \
+    tar -xzf "${ort}.tgz" && \
+    cp -r "${ort}/include/"* /usr/local/include/ && \
+    cp "${ort}/lib/libonnxruntime.so"* /usr/local/lib/ && \
+    ldconfig && rm -rf /tmp/onnxruntime*
+
+# Clone TurboOCR at the pinned release tag (shallow clone into /app)
+RUN git clone --depth 1 --branch "${TURBOOCR_VERSION}" \
+    https://github.com/aiptimizer/TurboOCR.git /app
+
+WORKDIR /app
+
+# Install fastpdf2png (PDF renderer — PDFium vendored in third_party/). The vendored libpdfium is
+# copied first so the installer does not need network access; copying bin/libpdfium.so is best-effort.
+RUN cp third_party/pdfium/lib/libpdfium.so /usr/lib/ && ldconfig && \
+    bash scripts/install_fastpdf2png.sh && \
+    { cp bin/libpdfium.so /usr/lib/ 2>/dev/null || true; } && \
+    ldconfig
+
+# Build the CPU-only server (ONNX Runtime backend); CMake model fetching is off (FETCH_MODELS=OFF)
+RUN mkdir -p build_cpu && cd build_cpu && \
+    cmake .. -DUSE_CPU_ONLY=ON -DFETCH_MODELS=OFF && \
+    make -j$(nproc)
+
+# Create the non-root user `ocr`, make the entrypoint executable, and symlink /app/models/rec into the cache dir.
+RUN useradd -m -s /bin/bash ocr \
+    && chmod +x /app/scripts/entrypoint.sh \
+    && mkdir -p /home/ocr/.cache/turbo-ocr/models/rec /app/models \
+    && ln -s /home/ocr/.cache/turbo-ocr/models/rec /app/models/rec
+
+# Fetch all PP-OCRv5 language bundles (SHA256-verified from pinned GitHub Release), then chown /app and the cache to ocr
+ARG OCR_INCLUDE_SERVER=1
+ENV OCR_INCLUDE_SERVER=${OCR_INCLUDE_SERVER}
+RUN bash scripts/fetch_release_models.sh \
+    && chown -R ocr:ocr /app /home/ocr/.cache
+
+EXPOSE 8000 50051
+
+ENTRYPOINT ["/app/scripts/entrypoint.sh"]
+CMD ["./build_cpu/paddle_cpu_server"]
diff --git a/builds/turboocr/Dockerfile.cuda12 b/builds/turboocr/Dockerfile.cuda12
new file mode 100644
index 0000000..a4fa92c
--- /dev/null
+++ b/builds/turboocr/Dockerfile.cuda12
@@ -0,0 +1,118 @@
+# ============================================================
+# TurboOCR — CUDA 12.x build (TensorRT 10.8 / CUDA 12.7)
+# Base image: nvcr.io/nvidia/tensorrt:24.12-py3
+#
+# Supported compute capabilities (NVIDIA GPU reference):
+#   https://developer.nvidia.com/cuda-gpus
+#   7.5  Turing     — GTX 16xx / RTX 20xx
+#   8.0  Ampere     — A100, RTX 30xx server-class
+#   8.6  Ampere     — RTX 30xx desktop / laptop
+#   8.9  Ada        — RTX 40xx
+#
+# Blackwell (CC 12.0) requires CUDA 13.x.
+# For that, use the upstream docker/Dockerfile.gpu (tensorrt:26.03-py3).
+#
+# Build:  docker build -f Dockerfile.cuda12 -t turboocr-cuda12 .
+# ============================================================
+
+ARG TURBOOCR_VERSION=v2.1.1
+ARG CMAKE_VERSION=3.31.6
+ARG ORT_VERSION=1.22.0
+# NGC registry mirror prefix prepended to the base image in FROM — leave empty for direct pull from nvcr.io.
+# Note: standard Docker Hub mirrors (e.g. DaoCloud) do NOT proxy nvcr.io.
+# Set this only if you have a dedicated NGC mirror or a pull-through proxy.
+ARG NGC_MIRROR=
+
+FROM ${NGC_MIRROR}nvcr.io/nvidia/tensorrt:24.12-py3
+
+# ARGs declared before FROM are only usable in FROM; re-declare them to use them in this stage
+ARG TURBOOCR_VERSION
+ARG CMAKE_VERSION
+ARG ORT_VERSION
+
+# Install build dependencies; DEBIAN_FRONTEND is set for this command only so apt never prompts (e.g. tzdata)
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    pkg-config \
+    libopencv-dev \
+    nginx \
+    gosu \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler-grpc \
+    libjsoncpp-dev \
+    uuid-dev \
+    zlib1g-dev \
+    libssl-dev \
+    libc-ares-dev \
+    git \
+    wget \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build and install the Drogon HTTP framework (async, epoll-based) from the v1.9.12 tag
+RUN git clone --depth 1 --branch v1.9.12 \
+        https://github.com/drogonframework/drogon.git /tmp/drogon && \
+    git -C /tmp/drogon submodule update --init && \
+    mkdir /tmp/drogon/build && cd /tmp/drogon/build && \
+    cmake .. -DBUILD_EXAMPLES=OFF -DBUILD_CTL=OFF -DBUILD_ORM=OFF \
+             -DBUILD_POSTGRESQL=OFF -DBUILD_MYSQL=OFF -DBUILD_SQLITE=OFF \
+             -DBUILD_REDIS=OFF -DBUILD_TESTING=OFF && \
+    make -j"$(nproc)" install && \
+    rm -rf /tmp/drogon
+
+# Install the pinned Kitware CMake release into /usr/local (the base image may ship an older version)
+RUN pkg="cmake-${CMAKE_VERSION}-linux-x86_64" && cd /tmp && \
+    wget -q "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${pkg}.tar.gz" && \
+    tar -xzf "${pkg}.tar.gz" && \
+    cp -r "${pkg}/bin/"* /usr/local/bin/ && \
+    cp -r "${pkg}/share/"* /usr/local/share/ && \
+    rm -rf /tmp/cmake*
+
+# Download the ONNX Runtime C++ SDK (used by the CPU inference fallback path) and install it under /usr/local
+RUN ort="onnxruntime-linux-x64-${ORT_VERSION}" && cd /tmp && \
+    wget -q "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/${ort}.tgz" && \
+    tar -xzf "${ort}.tgz" && \
+    cp -r "${ort}/include/"* /usr/local/include/ && \
+    cp "${ort}/lib/libonnxruntime.so"* /usr/local/lib/ && \
+    ldconfig && rm -rf /tmp/onnxruntime*
+
+# Clone TurboOCR at the pinned release tag (shallow clone into /app)
+RUN git clone --depth 1 --branch "${TURBOOCR_VERSION}" \
+    https://github.com/aiptimizer/TurboOCR.git /app
+
+WORKDIR /app
+
+# Install fastpdf2png (PDF renderer — PDFium vendored in third_party/), then register bin/libpdfium.so with ldconfig
+RUN bash scripts/install_fastpdf2png.sh && \
+    cp bin/libpdfium.so /usr/lib/ && ldconfig
+
+# Build GPU mode.
+# - CMAKE_CUDA_ARCHITECTURES: 75;80;86;89 (CC 7.5-8.9) covers Turing through Ada Lovelace under CUDA 12.x.
+#   CC 12.0 (Blackwell) is excluded — it requires CUDA 13.x.
+# - TENSORRT_DIR: /usr/local/tensorrt is the cmake default and matches the 24.12-py3
+#   base image layout. No override needed (upstream 26.03 uses /usr/lib/x86_64-linux-gnu).
+# - FETCH_MODELS=OFF: models are fetched in a separate layer below for better caching.
+RUN mkdir -p build && cd build && \
+    cmake .. \
+      -DFETCH_MODELS=OFF \
+      -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89" \
+    && make -j$(nproc)
+
+# Create the non-root user `ocr`, make the entrypoint executable, and symlink /app/models/rec into the cache dir.
+# TRT engines built at first start persist when a named volume (turboocr_build_cache in compose) is mounted at /home/ocr/.cache/turbo-ocr
+RUN useradd -m -s /bin/bash ocr \
+    && chmod +x /app/scripts/entrypoint.sh \
+    && mkdir -p /home/ocr/.cache/turbo-ocr/models/rec /app/models \
+    && ln -s /home/ocr/.cache/turbo-ocr/models/rec /app/models/rec
+
+# Fetch all PP-OCRv5 language bundles (SHA256-verified from pinned GitHub Release), then chown /app and the cache to ocr
+ARG OCR_INCLUDE_SERVER=1
+ENV OCR_INCLUDE_SERVER=${OCR_INCLUDE_SERVER}
+RUN bash scripts/fetch_release_models.sh \
+    && chown -R ocr:ocr /app /home/ocr/.cache
+
+EXPOSE 8000 50051
+
+ENTRYPOINT ["/app/scripts/entrypoint.sh"]
+CMD ["./build/paddle_highspeed_cpp"]
diff --git a/builds/turboocr/README.md b/builds/turboocr/README.md
new file mode 100644
index 0000000..dfe4697
--- /dev/null
+++ b/builds/turboocr/README.md
@@ -0,0 +1,127 @@
+# TurboOCR — Custom Builds
+
+[中文文档](README.zh.md)
+
+This directory builds [TurboOCR](https://github.com/aiptimizer/TurboOCR) from source for two targets that are not covered by the upstream pre-built images:
+
+| Variant | Dockerfile | Profile | Base image |
+| ------- | ---------- | ------- | ---------- |
+| **CUDA 12.x** | `Dockerfile.cuda12` | `gpu` | `nvcr.io/nvidia/tensorrt:24.12-py3` (TRT 10.8 / CUDA 12.7) |
+| **CPU-only** | `Dockerfile.cpu` | `cpu` | `ubuntu:24.04` (ONNX Runtime) |
+
+The upstream pre-built image targets CUDA 13.x (Blackwell / CC 12.0). Use this directory if your GPU is on CUDA 12.x (Turing through Ada Lovelace, CC 7.5–8.9) or if you have no GPU at all.
+
+## Quick Start
+
+1. Copy the example environment file:
+
+   ```bash
+   cp .env.example .env
+   ```
+
+2. Build and start the variant you need:
+
+   **CUDA 12.x (GPU — Turing through Ada Lovelace):**
+
+   ```bash
+   docker compose --profile gpu up -d --build
+   ```
+
+   **CPU-only (no GPU required):**
+
+   ```bash
+   docker compose --profile cpu up -d --build
+   ```
+
+3. Access the API at <http://localhost:8000>.
+
+> **Note:** The first build compiles Drogon and TurboOCR from source, which takes 10–30 minutes depending on your CPU core count. Subsequent builds use the Docker layer cache and are fast.
+
+## First-Start Behavior
+
+### GPU variant
+
+On the very first container start, TensorRT compiles 4 ONNX models into engine files. Measured times on an RTX 3070 Laptop:
+
+| Engine | Time |
+| ------ | ---- |
+| det | ~5 min |
+| rec | ~30 min |
+| cls | ~4 min |
+| layout | ~28 min |
+| **Total** | **~67–90 min** |
+
+High-end desktop GPUs finish in ~15 minutes. While the engines compile, health checks fail and the container reports `starting` (or `unhealthy` if compilation outlasts the 120-minute `start_period`) — this is expected. Once all engines are ready the server starts and the status transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
+
+> **Tip:** Set `TURBOOCR_DISABLE_LAYOUT=1` to skip the layout detection engine (~28 min savings on laptop GPUs). Use this only if you do not need the `?layout=1` PDF endpoint.
+
+### CPU variant
+
+No TRT compilation occurs. ONNX Runtime loads the models directly at startup. The container is typically `healthy` within 60 seconds.
+
+## Default Ports
+
+| Port | Protocol | Description |
+| ---- | -------- | ----------- |
+| 8000 | HTTP | OCR REST API + health/metrics |
+| 50051 | gRPC | OCR gRPC API |
+
+## Important Environment Variables
+
+| Variable | Description | Default |
+| -------- | ----------- | ------- |
+| `TURBOOCR_VERSION` | Git tag used for the source build | `v2.1.1` |
+| `TURBOOCR_HTTP_PORT_OVERRIDE` | Host port for the HTTP API | `8000` |
+| `TURBOOCR_GRPC_PORT_OVERRIDE` | Host port for the gRPC API | `50051` |
+| `TURBOOCR_LANG` | Language bundle: `latin`, `chinese`, `greek`, `eslav`, `arabic`, `korean`, `thai` | `""` (latin) |
+| `TURBOOCR_SERVER` | With `chinese`, set to `1` for the 84 MB server rec model | `""` |
+| `TURBOOCR_PIPELINE_POOL_SIZE` | Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto | `""` |
+| `TURBOOCR_DISABLE_LAYOUT` | Disable layout detection model (saves ~300–500 MB VRAM) | `0` |
+| `TURBOOCR_PDF_MODE` | PDF parsing mode: `ocr` / `geometric` / `auto` / `auto_verified` | `ocr` |
+| `TURBOOCR_CPU_LIMIT` | CPU core limit (both variants) | `8.0` |
+| `TURBOOCR_MEMORY_LIMIT` | Memory limit — `12G` for GPU, `4G` for CPU | variant default |
+| `TURBOOCR_GPU_COUNT` | NVIDIA GPUs to reserve (GPU variant only) | `1` |
+| `TURBOOCR_SHM_SIZE` | Shared memory for fastpdf2png — `2g` for GPU, `512m` for CPU | variant default |
+| `TZ` | Container timezone | `UTC` |
+
+## Storage
+
+- `turboocr_build_cache` — named volume at `/home/ocr/.cache/turbo-ocr`. Stores TRT engine files (GPU) or the model cache directory (CPU). Must be a named volume — a bind-mount of an empty host directory would shadow the baked-in language bundles and the server would fail to load models.
+
+## Supported GPU Architectures (CUDA 12.x variant)
+
+| Compute Capability | Architecture | GPUs |
+| ------------------ | ------------ | ---- |
+| 7.5 | Turing | GTX 16xx, RTX 20xx |
+| 8.0 | Ampere | A100, RTX 30xx (server) |
+| 8.6 | Ampere | RTX 30xx (desktop / laptop) |
+| 8.9 | Ada Lovelace | RTX 40xx |
+
+Blackwell (CC 12.0, RTX 50xx) requires CUDA 13.x — use the upstream pre-built image from `src/turboocr` instead.
+
+## Notes
+
+- Both Dockerfiles build TurboOCR from source via `git clone` inside the image. A working internet connection is required at build time.
+- The CUDA 12.x Dockerfile overrides `CMAKE_CUDA_ARCHITECTURES` to `75;80;86;89`, removing CC 12.0 which is not supported by CUDA 12.x.
+- TensorRT 10.8 is located at `/usr/local/tensorrt` in the `24.12-py3` base image, which matches the CMake default. No `-DTENSORRT_DIR` override is needed.
+- The CPU variant uses ONNX Runtime 1.22.0 and produces a `paddle_cpu_server` binary with both HTTP and gRPC interfaces.
+
+## Endpoints
+
+- HTTP API: <http://localhost:8000>
+- gRPC API: `localhost:50051`
+- Health: <http://localhost:8000/health>
+- Readiness: <http://localhost:8000/health/ready>
+- Metrics (Prometheus): <http://localhost:8000/metrics>
+
+## Security Notes
+
+- The API has no authentication by default. Put a reverse proxy (nginx, Caddy) in front for production.
+- The default PDF mode is `ocr`, which only trusts pixel data and is safe for untrusted PDF uploads.
+- Do **not** set `TURBOOCR_PDF_MODE` to `geometric` or `auto` globally if you accept PDFs from untrusted sources.
+
+## References
+
+- [TurboOCR Repository](https://github.com/aiptimizer/TurboOCR)
+- [NVIDIA TensorRT Container Releases](https://docs.nvidia.com/deeplearning/tensorrt/container-release-notes/)
+- [NVIDIA CUDA GPU Compute Capability Table](https://developer.nvidia.com/cuda-gpus)
diff --git a/builds/turboocr/README.zh.md b/builds/turboocr/README.zh.md
new file mode 100644
index 0000000..4dd48a3
--- /dev/null
+++ b/builds/turboocr/README.zh.md
@@ -0,0 +1,127 @@
+# TurboOCR — 自定义构建
+
+[English](README.md)
+
+此目录从源码构建 [TurboOCR](https://github.com/aiptimizer/TurboOCR)，覆盖上游预构建镜像未提供的两个目标：
+
+| 变体 | Dockerfile | Profile | 基础镜像 |
+| ---- | ---------- | ------- | -------- |
+| **CUDA 12.x** | `Dockerfile.cuda12` | `gpu` | `nvcr.io/nvidia/tensorrt:24.12-py3`（TRT 10.8 / CUDA 12.7） |
+| **纯 CPU** | `Dockerfile.cpu` | `cpu` | `ubuntu:24.04`（ONNX Runtime） |
+
+上游预构建镜像针对 CUDA 13.x（Blackwell / CC 12.0）。如果你的 GPU 属于 CUDA 12.x 范围（Turing 到 Ada Lovelace，CC 7.5–8.9），或者没有 GPU，请使用本目录。
+
+## 快速开始
+
+1. 复制示例环境文件：
+
+   ```bash
+   cp .env.example .env
+   ```
+
+2. 按需构建并启动对应变体：
+
+   **CUDA 12.x（GPU — Turing 到 Ada Lovelace）：**
+
+   ```bash
+   docker compose --profile gpu up -d --build
+   ```
+
+   **纯 CPU（无需 GPU）：**
+
+   ```bash
+   docker compose --profile cpu up -d --build
+   ```
+
+3. 访问 API：<http://localhost:8000>。
+
+> **说明：** 首次构建需要从源码编译 Drogon 和 TurboOCR，耗时约 10–30 分钟，具体取决于 CPU 核心数。后续构建会复用 Docker 层缓存，速度很快。
+
+## 首次启动说明
+
+### GPU 变体
+
+容器首次启动时，TensorRT 会将 4 个 ONNX 模型编译为引擎文件。在 RTX 3070 Laptop 上的实测耗时：
+
+| 引擎 | 耗时 |
+| ---- | ---- |
+| det | 约 5 分钟 |
+| rec | 约 30 分钟 |
+| cls | 约 4 分钟 |
+| layout | 约 28 分钟 |
+| **合计** | **约 67–90 分钟** |
+
+高端桌面 GPU 约 15 分钟完成。编译期间健康检查会失败，容器状态显示为 `starting`（若编译时间超过 120 分钟的 `start_period`，则显示 `unhealthy`），这属于正常现象——所有引擎构建完成后服务启动，状态切换为 `healthy`。后续重启会复用缓存引擎，几乎瞬间完成。
+
+> **提示：** 设置 `TURBOOCR_DISABLE_LAYOUT=1` 可跳过版面检测引擎的编译（笔记本 GPU 约节省 28 分钟）。仅在不需要 `?layout=1` PDF 端点时使用此选项。
+
+### CPU 变体
+
+无 TRT 编译过程。ONNX Runtime 在启动时直接加载模型，通常在 60 秒内变为 `healthy`。
+
+## 默认端口
+
+| 端口 | 协议 | 说明 |
+| ---- | ---- | ---- |
+| 8000 | HTTP | OCR REST API + 健康检查/指标 |
+| 50051 | gRPC | OCR gRPC API |
+
+## 主要环境变量
+
+| 变量名 | 说明 | 默认值 |
+| ------ | ---- | ------ |
+| `TURBOOCR_VERSION` | 构建所用的 Git 标签 | `v2.1.1` |
+| `TURBOOCR_HTTP_PORT_OVERRIDE` | HTTP API 主机端口 | `8000` |
+| `TURBOOCR_GRPC_PORT_OVERRIDE` | gRPC API 主机端口 | `50051` |
+| `TURBOOCR_LANG` | 语言包：`latin`、`chinese`、`greek`、`eslav`、`arabic`、`korean`、`thai` | `""`（latin） |
+| `TURBOOCR_SERVER` | 当使用 `chinese` 时，设为 `1` 启用 84 MB 服务端识别模型 | `""` |
+| `TURBOOCR_PIPELINE_POOL_SIZE` | 并发 GPU 流水线数（每条约 1.4 GB 显存），留空则自动 | `""` |
+| `TURBOOCR_DISABLE_LAYOUT` | 禁用版面检测模型（节省约 300–500 MB 显存） | `0` |
+| `TURBOOCR_PDF_MODE` | PDF 解析模式：`ocr` / `geometric` / `auto` / `auto_verified` | `ocr` |
+| `TURBOOCR_CPU_LIMIT` | CPU 核心限制（两个变体通用） | `8.0` |
+| `TURBOOCR_MEMORY_LIMIT` | 内存限制——GPU 变体 `12G`，CPU 变体 `4G` | 变体默认值 |
+| `TURBOOCR_GPU_COUNT` | 预留的 NVIDIA GPU 数量（仅 GPU 变体） | `1` |
+| `TURBOOCR_SHM_SIZE` | fastpdf2png 共享内存——GPU 变体 `2g`，CPU 变体 `512m` | 变体默认值 |
+| `TZ` | 容器时区 | `UTC` |
+
+## 存储
+
+- `turboocr_build_cache`——命名卷，挂载于 `/home/ocr/.cache/turbo-ocr`。用于存储 TRT 引擎文件（GPU 变体）或模型缓存目录（CPU 变体）。必须使用**命名卷**——绑定挂载空主机目录会遮蔽镜像内置语言包，导致服务无法加载模型。
+
+## 支持的 GPU 架构（CUDA 12.x 变体）
+
+| 算力版本 | 架构 | GPU 型号 |
+| -------- | ---- | -------- |
+| 7.5 | Turing | GTX 16xx、RTX 20xx |
+| 8.0 | Ampere | A100、RTX 30xx（服务器） |
+| 8.6 | Ampere | RTX 30xx（桌面/笔记本） |
+| 8.9 | Ada Lovelace | RTX 40xx |
+
+Blackwell（CC 12.0，RTX 50xx）需要 CUDA 13.x——请改用 `src/turboocr` 中的上游预构建镜像。
+
+## 说明
+
+- 两个 Dockerfile 均在镜像内通过 `git clone` 从源码构建 TurboOCR，构建时需要可访问互联网。
+- CUDA 12.x Dockerfile 将 `CMAKE_CUDA_ARCHITECTURES` 设置为 `75;80;86;89`，去除了 CUDA 12.x 不支持的 CC 12.0。
+- TensorRT 10.8 在 `24.12-py3` 基础镜像中位于 `/usr/local/tensorrt`，与 CMake 默认值一致，无需额外的 `-DTENSORRT_DIR` 参数。
+- CPU 变体使用 ONNX Runtime 1.22.0，生成同时支持 HTTP 和 gRPC 接口的 `paddle_cpu_server` 二进制文件。
+
+## 访问端点
+
+- HTTP API：<http://localhost:8000>
+- gRPC API：`localhost:50051`
+- 健康检查：<http://localhost:8000/health>
+- 就绪检查：<http://localhost:8000/health/ready>
+- Prometheus 指标：<http://localhost:8000/metrics>
+
+## 安全说明
+
+- API 默认无身份认证。生产环境请在前面套一层反向代理（nginx、Caddy 等）。
+- PDF 默认模式为 `ocr`，只信任像素数据，可安全处理不可信来源的 PDF 上传。
+- 如果你的服务接收不可信来源的 PDF，**不要**将 `TURBOOCR_PDF_MODE` 全局设为 `geometric` 或 `auto`。
+
+## 参考链接
+
+- [TurboOCR 仓库](https://github.com/aiptimizer/TurboOCR)
+- [NVIDIA TensorRT 容器发布说明](https://docs.nvidia.com/deeplearning/tensorrt/container-release-notes/)
+- [NVIDIA CUDA GPU 算力版本对照表](https://developer.nvidia.com/cuda-gpus)
diff --git a/builds/turboocr/docker-compose.yaml b/builds/turboocr/docker-compose.yaml
new file mode 100644
index 0000000..6f32be5
--- /dev/null
+++ b/builds/turboocr/docker-compose.yaml
@@ -0,0 +1,110 @@
+x-defaults: &defaults
+  logging:
+    driver: json-file
+    options:
+      max-file: '${TURBOOCR_LOG_MAX_FILE:-3}'
+      max-size: '${TURBOOCR_LOG_MAX_SIZE:-100m}'
+  restart: unless-stopped
+
+x-turboocr-common: &turboocr-common
+  <<: *defaults
+  ports:
+    - '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000'
+    - '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051'
+  volumes:
+    # Named volume shared by both variants: persists TRT engines (GPU) or the model cache (CPU).
+    # Must be a named volume — bind-mounting an empty host dir shadows the
+    # baked-in language bundles and prevents the server from loading models.
+    - turboocr_build_cache:/home/ocr/.cache/turbo-ocr
+  environment:
+    - TZ=${TZ:-UTC}
+    # Language bundle: latin (default when empty), chinese, greek, eslav, arabic, korean, thai
+    - OCR_LANG=${TURBOOCR_LANG:-}
+    # Set to 1 together with OCR_LANG=chinese to use the 84 MB server rec model
+    - OCR_SERVER=${TURBOOCR_SERVER:-}
+    # Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto; ignored in CPU mode
+    - PIPELINE_POOL_SIZE=${TURBOOCR_PIPELINE_POOL_SIZE:-}
+    # Set to 1 to disable PP-DocLayoutV3 layout detection (saves ~300-500 MB VRAM)
+    - DISABLE_LAYOUT=${TURBOOCR_DISABLE_LAYOUT:-0}
+    # Default PDF mode, taken from TURBOOCR_PDF_MODE: ocr (safest) / geometric / auto / auto_verified
+    - ENABLE_PDF_MODE=${TURBOOCR_PDF_MODE:-ocr}
+    # Set to 1 to skip the angle classifier (~0.4 ms savings per image)
+    - DISABLE_ANGLE_CLS=${TURBOOCR_DISABLE_ANGLE_CLS:-0}
+    # Max detection input size in pixels
+    - DET_MAX_SIDE=${TURBOOCR_DET_MAX_SIDE:-960}
+    # PDF render parallelism
+    - PDF_DAEMONS=${TURBOOCR_PDF_DAEMONS:-16}
+    - PDF_WORKERS=${TURBOOCR_PDF_WORKERS:-4}
+    # Maximum pages accepted in a single PDF request
+    - MAX_PDF_PAGES=${TURBOOCR_MAX_PDF_PAGES:-2000}
+    # Log level: debug / info / warn / error
+    - LOG_LEVEL=${TURBOOCR_LOG_LEVEL:-info}
+    # Log format: json (structured) / text (human-readable)
+    - LOG_FORMAT=${TURBOOCR_LOG_FORMAT:-json}
+
+services:
+  turboocr-cuda12:
+    <<: *turboocr-common
+    profiles: ['gpu']
+    image: '${GLOBAL_REGISTRY:-}alexsuntop/turboocr-cuda12:${TURBOOCR_VERSION:-v2.1.1}'
+    build:
+      context: .
+      dockerfile: Dockerfile.cuda12
+      args:
+        NGC_MIRROR: '${TURBOOCR_NGC_MIRROR:-}'
+        TURBOOCR_VERSION: '${TURBOOCR_VERSION:-v2.1.1}'
+    shm_size: '${TURBOOCR_SHM_SIZE:-2g}'
+    healthcheck:
+      test: ['CMD', 'curl', '-fsS', 'http://localhost:8000/health']
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # The first start compiles 4 TensorRT engines from ONNX. On an RTX 3070
+      # Laptop this was measured at det (~5 min) + rec (~30 min) + cls (~4 min)
+      # + layout (~28 min), ~67-90 min overall; high-end desktop GPUs need
+      # ~15 min. TURBOOCR_DISABLE_LAYOUT=1 skips the layout engine (~28 min).
+      # Later restarts reuse the cached engines and come up in seconds.
+      start_period: 120m
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: '${TURBOOCR_GPU_COUNT:-1}'
+              capabilities: ['gpu']
+          cpus: '${TURBOOCR_CPU_RESERVATION:-2.0}'
+          memory: '${TURBOOCR_MEMORY_RESERVATION:-4G}'
+        limits:
+          cpus: '${TURBOOCR_CPU_LIMIT:-8.0}'
+          memory: '${TURBOOCR_MEMORY_LIMIT:-12G}'
+
+  turboocr-cpu:
+    <<: *turboocr-common
+    profiles: ['cpu']
+    image: '${GLOBAL_REGISTRY:-}alexsuntop/turboocr-cpu:${TURBOOCR_VERSION:-v2.1.1}'
+    build:
+      context: .
+      dockerfile: Dockerfile.cpu
+      args:
+        DOCKER_MIRROR: '${TURBOOCR_DOCKER_MIRROR:-}'
+        TURBOOCR_VERSION: '${TURBOOCR_VERSION:-v2.1.1}'
+    shm_size: '${TURBOOCR_SHM_SIZE:-512m}'
+    healthcheck:
+      test: ['CMD', 'curl', '-fsS', 'http://localhost:8000/health']
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      # The CPU build runs on ONNX Runtime, so nothing is compiled at first
+      # start; startup is expected to take under 60 s on most hardware.
+      start_period: 2m
+    deploy:
+      resources:
+        reservations:
+          cpus: '${TURBOOCR_CPU_RESERVATION:-2.0}'
+          memory: '${TURBOOCR_MEMORY_RESERVATION:-1G}'
+        limits:
+          cpus: '${TURBOOCR_CPU_LIMIT:-8.0}'
+          memory: '${TURBOOCR_MEMORY_LIMIT:-4G}'
+
+volumes:
+  turboocr_build_cache:
diff --git a/src/turboocr/README.md b/src/turboocr/README.md
index 36fb2db..ffa00fe 100644
--- a/src/turboocr/README.md
+++ b/src/turboocr/README.md
@@ -52,7 +52,9 @@ Copy `.env.example` to `.env` and override only the variables you need to change
 docker compose up -d
 ```
 
-The first start builds TensorRT engines from ONNX. Build time depends on your GPU: roughly 5 minutes on high-end desktop GPUs and 20–30 minutes on laptop GPUs. The container may report `unhealthy` while compilation is in progress — this is normal. Once the build finishes the server starts and the container transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
+The first start builds 4 TensorRT engines from ONNX. Measured build times on an RTX 3070 Laptop: det (~5 min) + rec (~30 min) + cls (~4 min) + layout (~28 min) = **~67–90 minutes total**. High-end desktop GPUs finish in ~15 minutes. While compilation is in progress, health checks fail and the container reports `starting` (or `unhealthy` if compilation outlasts the 120-minute `start_period`) — this is expected. Once all engines are built the server starts and the container transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
+
+> **Tip — faster first boot:** Set `TURBOOCR_DISABLE_LAYOUT=1` to skip the layout detection engine (~28 min on laptop GPUs). Only do this if you don't need the `?layout=1` PDF endpoint.
 
 ### Endpoints
 
diff --git a/src/turboocr/README.zh.md b/src/turboocr/README.zh.md
index 0f7eed9..ee0cff4 100644
--- a/src/turboocr/README.zh.md
+++ b/src/turboocr/README.zh.md
@@ -52,7 +52,9 @@
 docker compose up -d
 ```
 
-首次启动需要从 ONNX 构建 TensorRT 引擎，耗时因 GPU 而异：高端桌面 GPU 约 5 分钟，笔记本 GPU 约 20–30 分钟。编译期间容器可能显示 `unhealthy`，这属于正常现象——构建完成后服务会自动启动并切换为 `healthy`。后续重启会复用缓存的引擎，几乎瞬间完成。
+首次启动需要编译 4 个 TensorRT 引擎。在 RTX 3070 Laptop 上的实测耗时：det（约 5 分钟）+ rec（约 30 分钟）+ cls（约 4 分钟）+ layout（约 28 分钟）= **总计约 67–90 分钟**。高端桌面 GPU 约 15 分钟完成。编译期间健康检查会失败，容器状态显示为 `starting`（若编译时间超过 120 分钟的 `start_period`，则显示 `unhealthy`），这属于正常现象——所有引擎构建完成后服务会自动启动并切换为 `healthy`。后续重启会复用缓存的引擎，几乎瞬间完成。
+
+> **提示——加快首次启动**：设置 `TURBOOCR_DISABLE_LAYOUT=1` 可跳过版面检测引擎的编译（笔记本 GPU 约节省 28 分钟）。仅在不需要 `?layout=1` PDF 端点时使用此选项。
 
 ### 访问端点
 
diff --git a/src/turboocr/docker-compose.yaml b/src/turboocr/docker-compose.yaml
index aa34680..18a9d7b 100644
--- a/src/turboocr/docker-compose.yaml
+++ b/src/turboocr/docker-compose.yaml
@@ -14,7 +14,7 @@ services:
       - '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000'
       - '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051'
     volumes:
-      # Named volume caches TensorRT engines built from ONNX on first start (~90s).
+      # Named volume caches TensorRT engines built from ONNX on first start.
       # Must be a named volume - bind-mounting an empty host dir would shadow the
       # baked-in language bundles and prevent the server from loading models.
       - turboocr_trt_cache:/home/ocr/.cache/turbo-ocr
@@ -48,11 +48,13 @@ services:
       interval: 30s
       timeout: 10s
       retries: 5
-      # First start builds TensorRT engines from ONNX. Build time varies by GPU:
-      # ~5 min on high-end desktop GPUs, 20-30 min on laptop GPUs. The container
-      # may show "unhealthy" during compilation but will become healthy once done.
-      # Subsequent restarts reuse the cached engines and start in seconds.
-      start_period: 30m
+      # First start builds 4 TensorRT engines from ONNX. Measured build times:
+      #   det (~5 min) + rec (~30 min) + cls (~4 min) + layout (~28 min) ≈ 67-90 min
+      # on an RTX 3070 Laptop. High-end desktop GPUs finish in ~15 min.
+      # Set TURBOOCR_DISABLE_LAYOUT=1 to skip the layout engine and cut ~28 min.
+      # The container reports "starting" (or "unhealthy" past start_period) while building.
+      # Subsequent restarts reuse cached engines and start in seconds.
+      start_period: 120m
     deploy:
       resources:
         limits: