feat: add build turboocr

This commit is contained in:
Sun-ZhenXing
2026-04-29 11:54:59 +08:00
parent ce16588916
commit 5f8503df42
11 changed files with 675 additions and 8 deletions
+1
View File
@@ -42,6 +42,7 @@ These services require building custom Docker images from source.
| [Multica](./builds/multica) | v0.1.32 | | [Multica](./builds/multica) | v0.1.32 |
| [OpenFang](./builds/openfang) | 0.1.0 | | [OpenFang](./builds/openfang) | 0.1.0 |
| [Paperclip](./builds/paperclip) | main | | [Paperclip](./builds/paperclip) | main |
| [TurboOCR](./builds/turboocr) | v2.1.1 |
## Supported Services ## Supported Services
+1
View File
@@ -42,6 +42,7 @@ docker compose exec redis redis-cli ping
| [Multica](./builds/multica) | v0.1.32 | | [Multica](./builds/multica) | v0.1.32 |
| [OpenFang](./builds/openfang) | 0.1.0 | | [OpenFang](./builds/openfang) | 0.1.0 |
| [Paperclip](./builds/paperclip) | main | | [Paperclip](./builds/paperclip) | main |
| [TurboOCR](./builds/turboocr) | v2.1.1 |
## 已经支持的服务 ## 已经支持的服务
+73
View File
@@ -0,0 +1,73 @@
# ------------------------------------------------------------
# TurboOCR source-build configuration (.env.example)
# Copy this file to `.env` and override only what you need.
# ------------------------------------------------------------
# Source build configuration
TURBOOCR_VERSION=v2.1.1
# Registry mirror prefix for docker build — leave empty for direct pull.
# China users: set to "docker.m.daocloud.io/" to proxy Docker Hub via DaoCloud.
# Example: TURBOOCR_DOCKER_MIRROR=docker.m.daocloud.io/
TURBOOCR_DOCKER_MIRROR=
# NGC (nvcr.io) mirror prefix for the CUDA 12.x GPU build — leave empty for direct pull.
# Standard Docker Hub mirrors (e.g. DaoCloud) do NOT proxy nvcr.io.
# Set this only if you have a dedicated NGC pull-through proxy.
TURBOOCR_NGC_MIRROR=
# Network configuration (host-side ports published by docker compose)
TURBOOCR_HTTP_PORT_OVERRIDE=8000
TURBOOCR_GRPC_PORT_OVERRIDE=50051
# Language bundle: latin (default), chinese, greek, eslav, arabic, korean, thai
TURBOOCR_LANG=
# Set to 1 with TURBOOCR_LANG=chinese to use the 84 MB server rec model
TURBOOCR_SERVER=
# GPU pipeline pool — number of concurrent inference pipelines (~1.4 GB VRAM each).
# Leave empty to let the server choose automatically based on available VRAM.
# Ignored in CPU mode.
TURBOOCR_PIPELINE_POOL_SIZE=
# Set to 1 to skip loading the PP-DocLayoutV3 layout detection model.
# Saves ~300-500 MB VRAM and cuts first-start compilation time by ~28 min on laptop GPUs.
# Only do this if you do not need the ?layout=1 PDF endpoint.
TURBOOCR_DISABLE_LAYOUT=0
# Default PDF parsing mode: ocr (safest) / geometric / auto / auto_verified
TURBOOCR_PDF_MODE=ocr
# Set to 1 to skip the angle classifier (~0.4 ms savings per image)
TURBOOCR_DISABLE_ANGLE_CLS=0
# Maximum detection input dimension in pixels
TURBOOCR_DET_MAX_SIDE=960
# PDF render parallelism
TURBOOCR_PDF_DAEMONS=16
TURBOOCR_PDF_WORKERS=4
# Maximum pages accepted in a single PDF request
TURBOOCR_MAX_PDF_PAGES=2000
# Log level: debug / info / warn / error
TURBOOCR_LOG_LEVEL=info
# Log format: json (structured) / text (human-readable)
TURBOOCR_LOG_FORMAT=json
# Resources — GPU variant (profile: gpu)
# First-start builds TRT engines; 12 G covers the GPU + engine compilation headroom.
TURBOOCR_CPU_LIMIT=8.0
TURBOOCR_MEMORY_LIMIT=12G
TURBOOCR_CPU_RESERVATION=2.0
TURBOOCR_MEMORY_RESERVATION=4G
# Number of NVIDIA GPUs to reserve (GPU variant only)
TURBOOCR_GPU_COUNT=1
# Shared memory — fastpdf2png uses /dev/shm for inter-process PDF page transfers
TURBOOCR_SHM_SIZE=2g
# Logging (json-file driver rotation)
TURBOOCR_LOG_MAX_SIZE=100m
TURBOOCR_LOG_MAX_FILE=3
# Timezone
TZ=UTC
+104
View File
@@ -0,0 +1,104 @@
# ============================================================
# TurboOCR — CPU-only build (ONNX Runtime backend, no GPU required)
# Base image: ubuntu:24.04
#
# Produces: /app/build_cpu/paddle_cpu_server (HTTP + gRPC server)
#
# Image size: ~500 MB (vs ~10 GB for the GPU image).
# No TRT compilation on first start — ONNX Runtime is used directly.
# Startup is fast (~30 s) and requires no NVIDIA driver.
#
# Build: docker build -f Dockerfile.cpu -t turboocr-cpu .
# ============================================================
ARG TURBOOCR_VERSION=v2.1.1
ARG ORT_VERSION=1.22.0

# Registry mirror prefix — leave empty for direct pull.
# China users: set to "docker.m.daocloud.io/" to proxy Docker Hub via DaoCloud.
ARG DOCKER_MIRROR=

FROM ${DOCKER_MIRROR}ubuntu:24.04

# Re-declare ARGs after FROM so they remain in scope
ARG TURBOOCR_VERSION
ARG ORT_VERSION

# Build-time only (ARG, not ENV): suppresses debconf prompts during apt-get
# without leaking DEBIAN_FRONTEND into the runtime environment of the image.
ARG DEBIAN_FRONTEND=noninteractive

# Install build dependencies (one package per line, sorted for diffability)
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        cmake \
        curl \
        g++ \
        git \
        gosu \
        libc-ares-dev \
        libgrpc++-dev \
        libjsoncpp-dev \
        libopencv-dev \
        libprotobuf-dev \
        libssl-dev \
        make \
        nginx \
        pkg-config \
        protobuf-compiler \
        protobuf-compiler-grpc \
        uuid-dev \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install the Drogon HTTP framework (async, epoll-based); DB/ORM backends
# are disabled — the server only needs the HTTP/gRPC-facing parts.
RUN cd /tmp && \
    git clone --depth 1 --branch v1.9.12 https://github.com/drogonframework/drogon.git && \
    cd drogon && git submodule update --init && \
    mkdir build && cd build && \
    cmake .. -DBUILD_EXAMPLES=OFF -DBUILD_CTL=OFF -DBUILD_ORM=OFF \
        -DBUILD_POSTGRESQL=OFF -DBUILD_MYSQL=OFF -DBUILD_SQLITE=OFF \
        -DBUILD_REDIS=OFF -DBUILD_TESTING=OFF && \
    make -j$(nproc) && make install && \
    rm -rf /tmp/drogon

# Install the ONNX Runtime C++ SDK.
# NOTE(review): the release tarball is fetched without checksum verification;
# the pinned ORT_VERSION mitigates drift, but consider ADD --checksum=sha256:…
RUN cd /tmp && \
    wget -q "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
    tar xzf "onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
    cp -r "onnxruntime-linux-x64-${ORT_VERSION}/include/"* /usr/local/include/ && \
    cp "onnxruntime-linux-x64-${ORT_VERSION}/lib/libonnxruntime.so"* /usr/local/lib/ && \
    ldconfig && rm -rf /tmp/onnxruntime*

# Clone TurboOCR at the pinned release tag
RUN git clone --depth 1 --branch "${TURBOOCR_VERSION}" \
    https://github.com/aiptimizer/TurboOCR.git /app

WORKDIR /app

# Install fastpdf2png (PDF renderer — PDFium vendored in third_party/).
# Copy vendored libpdfium first so the installer does not need network access;
# the post-install copy is best-effort in case the script relocates the lib.
RUN cp third_party/pdfium/lib/libpdfium.so /usr/lib/ && ldconfig && \
    bash scripts/install_fastpdf2png.sh && \
    { cp bin/libpdfium.so /usr/lib/ 2>/dev/null || true; } && \
    ldconfig

# Build CPU-only mode with the ONNX Runtime backend.
# FETCH_MODELS=OFF: models are fetched in a separate layer below for caching.
RUN mkdir -p build_cpu && cd build_cpu && \
    cmake .. -DUSE_CPU_ONLY=ON -DFETCH_MODELS=OFF && \
    make -j$(nproc)

# Create non-root user and redirect /app/models/rec into the named cache volume.
RUN useradd -m -s /bin/bash ocr \
    && chmod +x /app/scripts/entrypoint.sh \
    && mkdir -p /home/ocr/.cache/turbo-ocr/models/rec /app/models \
    && ln -s /home/ocr/.cache/turbo-ocr/models/rec /app/models/rec

# Fetch all PP-OCRv5 language bundles (SHA256-verified from pinned GitHub Release)
ARG OCR_INCLUDE_SERVER=1
ENV OCR_INCLUDE_SERVER=${OCR_INCLUDE_SERVER}
RUN bash scripts/fetch_release_models.sh \
    && chown -R ocr:ocr /app /home/ocr/.cache

EXPOSE 8000 50051

# No USER directive: the entrypoint starts as root and (given gosu is
# installed) presumably drops privileges to `ocr` itself — TODO confirm
# against scripts/entrypoint.sh upstream.
ENTRYPOINT ["/app/scripts/entrypoint.sh"]
CMD ["./build_cpu/paddle_cpu_server"]
+118
View File
@@ -0,0 +1,118 @@
# ============================================================
# TurboOCR — CUDA 12.x build (TensorRT 10.8 / CUDA 12.7)
# Base image: nvcr.io/nvidia/tensorrt:24.12-py3
#
# Supported compute capabilities (NVIDIA GPU reference):
# https://developer.nvidia.com/cuda-gpus
#   7.5 Turing — GTX 16xx / RTX 20xx
#   8.0 Ampere — A100, RTX 30xx server-class
#   8.6 Ampere — RTX 30xx desktop / laptop
#   8.9 Ada    — RTX 40xx
#
# Blackwell (CC 12.0) requires CUDA 13.x.
# For that, use the upstream docker/Dockerfile.gpu (tensorrt:26.03-py3).
#
# Build: docker build -f Dockerfile.cuda12 -t turboocr-cuda12 .
# ============================================================
ARG TURBOOCR_VERSION=v2.1.1
ARG CMAKE_VERSION=3.31.6
ARG ORT_VERSION=1.22.0

# NGC registry mirror prefix — leave empty for direct pull from nvcr.io.
# Note: standard Docker Hub mirrors (e.g. DaoCloud) do NOT proxy nvcr.io.
# Set this only if you have a dedicated NGC mirror or a pull-through proxy.
ARG NGC_MIRROR=

FROM ${NGC_MIRROR}nvcr.io/nvidia/tensorrt:24.12-py3

# Re-declare ARGs after FROM so they remain in scope
ARG TURBOOCR_VERSION
ARG CMAKE_VERSION
ARG ORT_VERSION

# Build-time only (ARG, not ENV): without this, packages such as tzdata can
# block the build on an interactive debconf prompt. Using ARG keeps
# DEBIAN_FRONTEND out of the runtime environment of the image.
ARG DEBIAN_FRONTEND=noninteractive

# Install build dependencies (one package per line, sorted for diffability)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gosu \
        libc-ares-dev \
        libgrpc++-dev \
        libjsoncpp-dev \
        libopencv-dev \
        libprotobuf-dev \
        libssl-dev \
        nginx \
        pkg-config \
        protobuf-compiler-grpc \
        uuid-dev \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install the Drogon HTTP framework (async, epoll-based); DB/ORM backends
# are disabled — the server only needs the HTTP/gRPC-facing parts.
RUN cd /tmp && \
    git clone --depth 1 --branch v1.9.12 https://github.com/drogonframework/drogon.git && \
    cd drogon && git submodule update --init && \
    mkdir build && cd build && \
    cmake .. -DBUILD_EXAMPLES=OFF -DBUILD_CTL=OFF -DBUILD_ORM=OFF \
        -DBUILD_POSTGRESQL=OFF -DBUILD_MYSQL=OFF -DBUILD_SQLITE=OFF \
        -DBUILD_REDIS=OFF -DBUILD_TESTING=OFF && \
    make -j$(nproc) && make install && \
    rm -rf /tmp/drogon

# Upgrade CMake (the base image may ship an older version).
# Drogon above builds fine with the base CMake; TurboOCR below needs the
# newer one, so the upgrade sits between the two builds.
RUN cd /tmp && \
    wget -q "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" && \
    tar xzf "cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" && \
    cp -r "cmake-${CMAKE_VERSION}-linux-x86_64/bin/"* /usr/local/bin/ && \
    cp -r "cmake-${CMAKE_VERSION}-linux-x86_64/share/"* /usr/local/share/ && \
    rm -rf /tmp/cmake*

# Install the ONNX Runtime C++ SDK (used by the CPU inference fallback path).
# NOTE(review): the release tarball is fetched without checksum verification;
# the pinned ORT_VERSION mitigates drift, but consider ADD --checksum=sha256:…
RUN cd /tmp && \
    wget -q "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
    tar xzf "onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
    cp -r "onnxruntime-linux-x64-${ORT_VERSION}/include/"* /usr/local/include/ && \
    cp "onnxruntime-linux-x64-${ORT_VERSION}/lib/libonnxruntime.so"* /usr/local/lib/ && \
    ldconfig && rm -rf /tmp/onnxruntime*

# Clone TurboOCR at the pinned release tag
RUN git clone --depth 1 --branch "${TURBOOCR_VERSION}" \
    https://github.com/aiptimizer/TurboOCR.git /app

WORKDIR /app

# Install fastpdf2png (PDF renderer — PDFium vendored in third_party/)
RUN bash scripts/install_fastpdf2png.sh && \
    cp bin/libpdfium.so /usr/lib/ && ldconfig

# Build GPU mode.
# - CUDA_ARCHITECTURES: 7.5-8.9 covers Turing through Ada Lovelace under CUDA 12.x.
#   CC 12.0 (Blackwell) is excluded — it requires CUDA 13.x.
# - TENSORRT_DIR: /usr/local/tensorrt is the cmake default and matches the 24.12-py3
#   base image layout. No override needed (upstream 26.03 uses /usr/lib/x86_64-linux-gnu).
# - FETCH_MODELS=OFF: models are fetched in a separate layer below for better caching.
RUN mkdir -p build && cd build && \
    cmake .. \
        -DFETCH_MODELS=OFF \
        -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89" \
    && make -j$(nproc)

# Create non-root user and redirect /app/models/rec into the named cache volume.
# TRT engines built at first start are persisted via: -v turboocr_cache:/home/ocr/.cache/turbo-ocr
RUN useradd -m -s /bin/bash ocr \
    && chmod +x /app/scripts/entrypoint.sh \
    && mkdir -p /home/ocr/.cache/turbo-ocr/models/rec /app/models \
    && ln -s /home/ocr/.cache/turbo-ocr/models/rec /app/models/rec

# Fetch all PP-OCRv5 language bundles (SHA256-verified from pinned GitHub Release)
ARG OCR_INCLUDE_SERVER=1
ENV OCR_INCLUDE_SERVER=${OCR_INCLUDE_SERVER}
RUN bash scripts/fetch_release_models.sh \
    && chown -R ocr:ocr /app /home/ocr/.cache

EXPOSE 8000 50051

# No USER directive: the entrypoint starts as root and (given gosu is
# installed) presumably drops privileges to `ocr` itself — TODO confirm
# against scripts/entrypoint.sh upstream.
ENTRYPOINT ["/app/scripts/entrypoint.sh"]
CMD ["./build/paddle_highspeed_cpp"]
+127
View File
@@ -0,0 +1,127 @@
# TurboOCR — Custom Builds
[中文文档](README.zh.md)
This directory builds [TurboOCR](https://github.com/aiptimizer/TurboOCR) from source for two targets that are not covered by the upstream pre-built images:
| Variant | Dockerfile | Profile | Base image |
| ------- | ---------- | ------- | ---------- |
| **CUDA 12.x** | `Dockerfile.cuda12` | `gpu` | `nvcr.io/nvidia/tensorrt:24.12-py3` (TRT 10.8 / CUDA 12.7) |
| **CPU-only** | `Dockerfile.cpu` | `cpu` | `ubuntu:24.04` (ONNX Runtime) |
The upstream pre-built image targets CUDA 13.x (Blackwell / CC 12.0). Use this directory if your GPU is on CUDA 12.x (Turing through Ada Lovelace, CC 7.5–8.9) or if you have no GPU at all.
## Quick Start
1. Copy the example environment file:
```bash
cp .env.example .env
```
2. Build and start the variant you need:
**CUDA 12.x (GPU — Turing through Ada Lovelace):**
```bash
docker compose --profile gpu up -d --build
```
**CPU-only (no GPU required):**
```bash
docker compose --profile cpu up -d --build
```
3. Access the API at <http://localhost:8000>.
> **Note:** The first build compiles Drogon and TurboOCR from source, which takes 10–30 minutes depending on your CPU core count. Subsequent builds use the Docker layer cache and are fast.
## First-Start Behavior
### GPU variant
On the very first container start, TensorRT compiles 4 ONNX models into engine files. Measured times on an RTX 3070 Laptop:
| Engine | Time |
| ------ | ---- |
| det | ~5 min |
| rec | ~30 min |
| cls | ~4 min |
| layout | ~28 min |
| **Total** | **~67–90 min** |
High-end desktop GPUs finish in ~15 minutes. The container shows `unhealthy` during compilation — this is expected. Once all engines are ready the server starts and the status transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
> **Tip:** Set `TURBOOCR_DISABLE_LAYOUT=1` to skip the layout detection engine (~28 min savings on laptop GPUs). Use this only if you do not need the `?layout=1` PDF endpoint.
### CPU variant
No TRT compilation occurs. ONNX Runtime loads the models directly at startup. The container is typically `healthy` within 60 seconds.
## Default Ports
| Port | Protocol | Description |
| ---- | -------- | ----------- |
| 8000 | HTTP | OCR REST API + health/metrics |
| 50051 | gRPC | OCR gRPC API |
## Important Environment Variables
| Variable | Description | Default |
| -------- | ----------- | ------- |
| `TURBOOCR_VERSION` | Git tag used for the source build | `v2.1.1` |
| `TURBOOCR_HTTP_PORT_OVERRIDE` | Host port for the HTTP API | `8000` |
| `TURBOOCR_GRPC_PORT_OVERRIDE` | Host port for the gRPC API | `50051` |
| `TURBOOCR_LANG` | Language bundle: `latin`, `chinese`, `greek`, `eslav`, `arabic`, `korean`, `thai` | `""` (latin) |
| `TURBOOCR_SERVER` | With `chinese`, set to `1` for the 84 MB server rec model | `""` |
| `TURBOOCR_PIPELINE_POOL_SIZE` | Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto | `""` |
| `TURBOOCR_DISABLE_LAYOUT` | Disable layout detection model (saves ~300–500 MB VRAM) | `0` |
| `TURBOOCR_PDF_MODE` | PDF parsing mode: `ocr` / `geometric` / `auto` / `auto_verified` | `ocr` |
| `TURBOOCR_CPU_LIMIT` | CPU core limit (both variants) | `8.0` |
| `TURBOOCR_MEMORY_LIMIT` | Memory limit — `12G` for GPU, `4G` for CPU | variant default |
| `TURBOOCR_GPU_COUNT` | NVIDIA GPUs to reserve (GPU variant only) | `1` |
| `TURBOOCR_SHM_SIZE` | Shared memory for fastpdf2png — `2g` for GPU, `512m` for CPU | variant default |
| `TZ` | Container timezone | `UTC` |
## Storage
- `turboocr_build_cache` — named volume at `/home/ocr/.cache/turbo-ocr`. Stores TRT engine files (GPU) or the model cache directory (CPU). Must be a named volume — a bind-mount of an empty host directory would shadow the baked-in language bundles and the server would fail to load models.
## Supported GPU Architectures (CUDA 12.x variant)
| Compute Capability | Architecture | GPUs |
| ------------------ | ------------ | ---- |
| 7.5 | Turing | GTX 16xx, RTX 20xx |
| 8.0 | Ampere | A100, RTX 30xx (server) |
| 8.6 | Ampere | RTX 30xx (desktop / laptop) |
| 8.9 | Ada Lovelace | RTX 40xx |
Blackwell (CC 12.0, RTX 50xx) requires CUDA 13.x — use the upstream pre-built image from `src/turboocr` instead.
## Notes
- Both Dockerfiles build TurboOCR from source via `git clone` inside the image. A working internet connection is required at build time.
- The CUDA 12.x Dockerfile overrides `CMAKE_CUDA_ARCHITECTURES` to `75;80;86;89`, removing CC 12.0 which is not supported by CUDA 12.x.
- TensorRT 10.8 is located at `/usr/local/tensorrt` in the `24.12-py3` base image, which matches the CMake default. No `-DTENSORRT_DIR` override is needed.
- The CPU variant uses ONNX Runtime 1.22.0 and produces a `paddle_cpu_server` binary with both HTTP and gRPC interfaces.
## Endpoints
- HTTP API: <http://localhost:8000>
- gRPC API: `localhost:50051`
- Health: <http://localhost:8000/health>
- Readiness: <http://localhost:8000/health/ready>
- Metrics (Prometheus): <http://localhost:8000/metrics>
## Security Notes
- The API has no authentication by default. Put a reverse proxy (nginx, Caddy) in front for production.
- The default PDF mode is `ocr`, which only trusts pixel data and is safe for untrusted PDF uploads.
- Do **not** set `TURBOOCR_PDF_MODE` to `geometric` or `auto` globally if you accept PDFs from untrusted sources.
## References
- [TurboOCR Repository](https://github.com/aiptimizer/TurboOCR)
- [NVIDIA TensorRT Container Releases](https://docs.nvidia.com/deeplearning/tensorrt/container-release-notes/)
- [NVIDIA CUDA GPU Compute Capability Table](https://developer.nvidia.com/cuda-gpus)
+127
View File
@@ -0,0 +1,127 @@
# TurboOCR — 自定义构建
[English](README.md)
此目录从源码构建 [TurboOCR](https://github.com/aiptimizer/TurboOCR),覆盖上游预构建镜像未提供的两个目标:
| 变体 | Dockerfile | Profile | 基础镜像 |
| ---- | ---------- | ------- | -------- |
| **CUDA 12.x** | `Dockerfile.cuda12` | `gpu` | `nvcr.io/nvidia/tensorrt:24.12-py3`(TRT 10.8 / CUDA 12.7) |
| **纯 CPU** | `Dockerfile.cpu` | `cpu` | `ubuntu:24.04`(ONNX Runtime) |
上游预构建镜像针对 CUDA 13.x(Blackwell / CC 12.0)。如果你的 GPU 属于 CUDA 12.x 范围(Turing 到 Ada Lovelace,CC 7.5–8.9),或者没有 GPU,请使用本目录。
## 快速开始
1. 复制示例环境文件:
```bash
cp .env.example .env
```
2. 按需构建并启动对应变体:
**CUDA 12.x(GPU — Turing 到 Ada Lovelace):**
```bash
docker compose --profile gpu up -d --build
```
**纯 CPU(无需 GPU):**
```bash
docker compose --profile cpu up -d --build
```
3. 访问 API:<http://localhost:8000>。
> **说明:** 首次构建需要从源码编译 Drogon 和 TurboOCR,耗时约 10–30 分钟,具体取决于 CPU 核心数。后续构建会复用 Docker 层缓存,速度很快。
## 首次启动说明
### GPU 变体
容器首次启动时,TensorRT 会将 4 个 ONNX 模型编译为引擎文件。在 RTX 3070 Laptop 上的实测耗时:
| 引擎 | 耗时 |
| ---- | ---- |
| det | 约 5 分钟 |
| rec | 约 30 分钟 |
| cls | 约 4 分钟 |
| layout | 约 28 分钟 |
| **合计** | **约 67–90 分钟** |
高端桌面 GPU 约 15 分钟完成。编译期间容器显示 `unhealthy` 属于正常现象——所有引擎构建完成后服务启动,状态切换为 `healthy`。后续重启会复用缓存引擎,几乎瞬间完成。
> **提示:** 设置 `TURBOOCR_DISABLE_LAYOUT=1` 可跳过版面检测引擎的编译(笔记本 GPU 约节省 28 分钟)。仅在不需要 `?layout=1` PDF 端点时使用此选项。
### CPU 变体
无 TRT 编译过程。ONNX Runtime 在启动时直接加载模型,通常在 60 秒内变为 `healthy`。
## 默认端口
| 端口 | 协议 | 说明 |
| ---- | ---- | ---- |
| 8000 | HTTP | OCR REST API + 健康检查/指标 |
| 50051 | gRPC | OCR gRPC API |
## 主要环境变量
| 变量名 | 说明 | 默认值 |
| ------ | ---- | ------ |
| `TURBOOCR_VERSION` | 构建所用的 Git 标签 | `v2.1.1` |
| `TURBOOCR_HTTP_PORT_OVERRIDE` | HTTP API 主机端口 | `8000` |
| `TURBOOCR_GRPC_PORT_OVERRIDE` | gRPC API 主机端口 | `50051` |
| `TURBOOCR_LANG` | 语言包:`latin`、`chinese`、`greek`、`eslav`、`arabic`、`korean`、`thai` | `""`(latin) |
| `TURBOOCR_SERVER` | 当使用 `chinese` 时,设为 `1` 启用 84 MB 服务端识别模型 | `""` |
| `TURBOOCR_PIPELINE_POOL_SIZE` | 并发 GPU 流水线数(每条约 1.4 GB 显存),留空则自动 | `""` |
| `TURBOOCR_DISABLE_LAYOUT` | 禁用版面检测模型(节省约 300–500 MB 显存) | `0` |
| `TURBOOCR_PDF_MODE` | PDF 解析模式:`ocr` / `geometric` / `auto` / `auto_verified` | `ocr` |
| `TURBOOCR_CPU_LIMIT` | CPU 核心限制(两个变体通用) | `8.0` |
| `TURBOOCR_MEMORY_LIMIT` | 内存限制——GPU 变体 `12G`CPU 变体 `4G` | 变体默认值 |
| `TURBOOCR_GPU_COUNT` | 预留的 NVIDIA GPU 数量(仅 GPU 变体) | `1` |
| `TURBOOCR_SHM_SIZE` | fastpdf2png 共享内存——GPU 变体 `2g`CPU 变体 `512m` | 变体默认值 |
| `TZ` | 容器时区 | `UTC` |
## 存储
- `turboocr_build_cache`——命名卷,挂载于 `/home/ocr/.cache/turbo-ocr`。用于存储 TRT 引擎文件(GPU 变体)或模型缓存目录(CPU 变体)。必须使用**命名卷**——绑定挂载空主机目录会遮蔽镜像内置语言包,导致服务无法加载模型。
## 支持的 GPU 架构(CUDA 12.x 变体)
| 算力版本 | 架构 | GPU 型号 |
| -------- | ---- | -------- |
| 7.5 | Turing | GTX 16xx、RTX 20xx |
| 8.0 | Ampere | A100、RTX 30xx(服务器) |
| 8.6 | Ampere | RTX 30xx(桌面/笔记本) |
| 8.9 | Ada Lovelace | RTX 40xx |
Blackwell(CC 12.0,RTX 50xx)需要 CUDA 13.x——请改用 `src/turboocr` 中的上游预构建镜像。
## 说明
- 两个 Dockerfile 均在镜像内通过 `git clone` 从源码构建 TurboOCR,构建时需要可访问互联网。
- CUDA 12.x Dockerfile 将 `CMAKE_CUDA_ARCHITECTURES` 设置为 `75;80;86;89`,去除了 CUDA 12.x 不支持的 CC 12.0。
- TensorRT 10.8 在 `24.12-py3` 基础镜像中位于 `/usr/local/tensorrt`,与 CMake 默认值一致,无需额外的 `-DTENSORRT_DIR` 参数。
- CPU 变体使用 ONNX Runtime 1.22.0,生成同时支持 HTTP 和 gRPC 接口的 `paddle_cpu_server` 二进制文件。
## 访问端点
- HTTP API<http://localhost:8000>
- gRPC API`localhost:50051`
- 健康检查:<http://localhost:8000/health>
- 就绪检查:<http://localhost:8000/health/ready>
- Prometheus 指标:<http://localhost:8000/metrics>
## 安全说明
- API 默认无身份认证。生产环境请在前面套一层反向代理(nginx、Caddy 等)。
- PDF 默认模式为 `ocr`,只信任像素数据,可安全处理不可信来源的 PDF 上传。
- 如果你的服务接收不可信来源的 PDF,**不要**将 `TURBOOCR_PDF_MODE` 全局设为 `geometric` 或 `auto`。
## 参考链接
- [TurboOCR 仓库](https://github.com/aiptimizer/TurboOCR)
- [NVIDIA TensorRT 容器发布说明](https://docs.nvidia.com/deeplearning/tensorrt/container-release-notes/)
- [NVIDIA CUDA GPU 算力版本对照表](https://developer.nvidia.com/cuda-gpus)
+110
View File
@@ -0,0 +1,110 @@
# Compose file for the TurboOCR source builds. Exactly one of the two
# services runs at a time, selected by profile:
#   docker compose --profile gpu up -d --build   (CUDA 12.x variant)
#   docker compose --profile cpu up -d --build   (CPU-only variant)
# Shared settings live in the x-* anchors and are merged into both services.
x-defaults: &defaults
  restart: unless-stopped
  logging:
    driver: json-file
    options:
      max-size: ${TURBOOCR_LOG_MAX_SIZE:-100m}
      max-file: '${TURBOOCR_LOG_MAX_FILE:-3}'

# Settings common to both TurboOCR variants: published ports, the model/engine
# cache volume, and the runtime environment passed through from .env.
x-turboocr-common: &turboocr-common
  <<: *defaults
  ports:
    - '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000'
    - '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051'
  volumes:
    # Named volume persists TRT engines (GPU) or ONNX model cache (CPU).
    # Must be a named volume — bind-mounting an empty host dir shadows the
    # baked-in language bundles and prevents the server from loading models.
    - turboocr_build_cache:/home/ocr/.cache/turbo-ocr
  environment:
    - TZ=${TZ:-UTC}
    # Language bundle: latin (default), chinese, greek, eslav, arabic, korean, thai
    - OCR_LANG=${TURBOOCR_LANG:-}
    # Set to 1 with OCR_LANG=chinese to use the 84 MB server rec model
    - OCR_SERVER=${TURBOOCR_SERVER:-}
    # Concurrent GPU pipelines (~1.4 GB VRAM each); empty = auto; ignored in CPU mode
    - PIPELINE_POOL_SIZE=${TURBOOCR_PIPELINE_POOL_SIZE:-}
    # Set to 1 to disable PP-DocLayoutV3 layout detection (saves ~300-500 MB VRAM)
    - DISABLE_LAYOUT=${TURBOOCR_DISABLE_LAYOUT:-0}
    # Default PDF mode: ocr (safest) / geometric / auto / auto_verified
    - ENABLE_PDF_MODE=${TURBOOCR_PDF_MODE:-ocr}
    # Skip angle classifier (~0.4 ms savings)
    - DISABLE_ANGLE_CLS=${TURBOOCR_DISABLE_ANGLE_CLS:-0}
    # Max detection input size in pixels
    - DET_MAX_SIDE=${TURBOOCR_DET_MAX_SIDE:-960}
    # PDF render parallelism
    - PDF_DAEMONS=${TURBOOCR_PDF_DAEMONS:-16}
    - PDF_WORKERS=${TURBOOCR_PDF_WORKERS:-4}
    # Maximum pages per PDF request
    - MAX_PDF_PAGES=${TURBOOCR_MAX_PDF_PAGES:-2000}
    # Log level: debug / info / warn / error
    - LOG_LEVEL=${TURBOOCR_LOG_LEVEL:-info}
    # Log format: json (structured) / text (human-readable)
    - LOG_FORMAT=${TURBOOCR_LOG_FORMAT:-json}

services:
  turboocr-cuda12:
    <<: *turboocr-common
    profiles: [gpu]
    build:
      context: .
      dockerfile: Dockerfile.cuda12
      args:
        TURBOOCR_VERSION: ${TURBOOCR_VERSION:-v2.1.1}
        NGC_MIRROR: ${TURBOOCR_NGC_MIRROR:-}
    image: ${GLOBAL_REGISTRY:-}alexsuntop/turboocr-cuda12:${TURBOOCR_VERSION:-v2.1.1}
    healthcheck:
      test: [CMD, curl, -fsS, 'http://localhost:8000/health']
      interval: 30s
      timeout: 10s
      retries: 5
      # First start builds 4 TensorRT engines from ONNX. Measured times on an
      # RTX 3070 Laptop: det (~5 min) + rec (~30 min) + cls (~4 min) +
      # layout (~28 min) = ~67-90 min. High-end desktop GPUs finish in ~15 min.
      # Set TURBOOCR_DISABLE_LAYOUT=1 to skip layout and save ~28 min.
      # Subsequent restarts reuse the cached engines and start in seconds.
      start_period: 120m
    deploy:
      resources:
        limits:
          cpus: ${TURBOOCR_CPU_LIMIT:-8.0}
          memory: ${TURBOOCR_MEMORY_LIMIT:-12G}
        reservations:
          cpus: ${TURBOOCR_CPU_RESERVATION:-2.0}
          memory: ${TURBOOCR_MEMORY_RESERVATION:-4G}
          devices:
            - driver: nvidia
              count: ${TURBOOCR_GPU_COUNT:-1}
              capabilities: [gpu]
    # fastpdf2png uses /dev/shm for inter-process PDF page transfers
    shm_size: ${TURBOOCR_SHM_SIZE:-2g}

  turboocr-cpu:
    <<: *turboocr-common
    profiles: [cpu]
    build:
      context: .
      dockerfile: Dockerfile.cpu
      args:
        TURBOOCR_VERSION: ${TURBOOCR_VERSION:-v2.1.1}
        DOCKER_MIRROR: ${TURBOOCR_DOCKER_MIRROR:-}
    image: ${GLOBAL_REGISTRY:-}alexsuntop/turboocr-cpu:${TURBOOCR_VERSION:-v2.1.1}
    healthcheck:
      test: [CMD, curl, -fsS, 'http://localhost:8000/health']
      interval: 30s
      timeout: 10s
      retries: 5
      # CPU mode uses ONNX Runtime directly — no TRT compilation on first start.
      # Expect startup in under 60 s on most hardware.
      start_period: 2m
    deploy:
      resources:
        limits:
          cpus: ${TURBOOCR_CPU_LIMIT:-8.0}
          memory: ${TURBOOCR_MEMORY_LIMIT:-4G}
        reservations:
          cpus: ${TURBOOCR_CPU_RESERVATION:-2.0}
          memory: ${TURBOOCR_MEMORY_RESERVATION:-1G}
    shm_size: ${TURBOOCR_SHM_SIZE:-512m}

volumes:
  turboocr_build_cache:
+3 -1
View File
@@ -52,7 +52,9 @@ Copy `.env.example` to `.env` and override only the variables you need to change
docker compose up -d docker compose up -d
``` ```
The first start builds TensorRT engines from ONNX. Build time depends on your GPU: roughly 5 minutes on high-end desktop GPUs and 20–30 minutes on laptop GPUs. The container may report `unhealthy` while compilation is in progress — this is normal. Once the build finishes the server starts and the container transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds. The first start builds 4 TensorRT engines from ONNX. Measured build times on an RTX 3070 Laptop: det (~5 min) + rec (~30 min) + cls (~4 min) + layout (~28 min) = **~67–90 minutes total**. High-end desktop GPUs finish in ~15 minutes. The container reports `unhealthy` while compilation is in progress — this is expected. Once all engines are built the server starts and the container transitions to `healthy`. Subsequent restarts reuse the cached engines and start in seconds.
> **Tip — faster first boot:** Set `TURBOOCR_DISABLE_LAYOUT=1` to skip the layout detection engine (~28 min on laptop GPUs). Only do this if you don't need the `?layout=1` PDF endpoint.
### Endpoints ### Endpoints
+3 -1
View File
@@ -52,7 +52,9 @@
docker compose up -d docker compose up -d
``` ```
首次启动需要从 ONNX 构建 TensorRT 引擎,耗时因 GPU 而异:高端桌面 GPU 约 5 分钟,笔记本 GPU 约 20–30 分钟。编译期间容器可能显示 `unhealthy`,这属于正常现象——构建完成后服务会自动启动并切换为 `healthy`。后续重启会复用缓存的引擎,几乎瞬间完成。 首次启动需要编译 4 个 TensorRT 引擎。在 RTX 3070 Laptop 上的实测耗时:det(约 5 分钟)+ rec(约 30 分钟)+ cls(约 4 分钟)+ layout(约 28 分钟)= **总计约 67–90 分钟**。高端桌面 GPU 约 15 分钟完成。编译期间容器显示 `unhealthy` 属于正常现象——所有引擎构建完成后服务会自动启动并切换为 `healthy`。后续重启会复用缓存的引擎,几乎瞬间完成。
> **提示——加快首次启动**:设置 `TURBOOCR_DISABLE_LAYOUT=1` 可跳过版面检测引擎的编译(笔记本 GPU 约节省 28 分钟)。仅在不需要 `?layout=1` PDF 端点时使用此选项。
### 访问端点 ### 访问端点
+8 -6
View File
@@ -14,7 +14,7 @@ services:
- '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000' - '${TURBOOCR_HTTP_PORT_OVERRIDE:-8000}:8000'
- '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051' - '${TURBOOCR_GRPC_PORT_OVERRIDE:-50051}:50051'
volumes: volumes:
# Named volume caches TensorRT engines built from ONNX on first start (~90s). # Named volume caches TensorRT engines built from ONNX on first start.
# Must be a named volume - bind-mounting an empty host dir would shadow the # Must be a named volume - bind-mounting an empty host dir would shadow the
# baked-in language bundles and prevent the server from loading models. # baked-in language bundles and prevent the server from loading models.
- turboocr_trt_cache:/home/ocr/.cache/turbo-ocr - turboocr_trt_cache:/home/ocr/.cache/turbo-ocr
@@ -48,11 +48,13 @@ services:
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 5 retries: 5
# First start builds TensorRT engines from ONNX. Build time varies by GPU: # First start builds 4 TensorRT engines from ONNX. Measured build times:
# ~5 min on high-end desktop GPUs, 20-30 min on laptop GPUs. The container # det (~5 min) + rec (~30 min) + cls (~4 min) + layout (~28 min) ≈ 67-90 min
# may show "unhealthy" during compilation but will become healthy once done. # on an RTX 3070 Laptop. High-end desktop GPUs finish in ~15 min.
# Subsequent restarts reuse the cached engines and start in seconds. # Set TURBOOCR_DISABLE_LAYOUT=1 to skip the layout engine and cut ~28 min.
start_period: 30m # The container shows "unhealthy" while building but recovers once done.
# Subsequent restarts reuse cached engines and start in seconds.
start_period: 120m
deploy: deploy:
resources: resources:
limits: limits: