From 51fd7ea08bd8e29f6f308e08828e16ff6164dbbb Mon Sep 17 00:00:00 2001 From: Sun-ZhenXing <1006925066@qq.com> Date: Sun, 11 Jan 2026 23:42:34 +0800 Subject: [PATCH] feat: add more otel services --- README.md | 9 +- README.zh.md | 9 +- src/bifrost-gateway/.env.example | 2 +- src/bifrost-gateway/README.md | 2 +- src/bifrost-gateway/README.zh.md | 2 +- src/bifrost-gateway/docker-compose.yaml | 2 +- src/loki/.env.example | 19 ++ src/loki/README.md | 144 ++++++++++++++ src/loki/README.zh.md | 144 ++++++++++++++ src/loki/docker-compose.yaml | 38 ++++ src/loki/loki-config.yaml | 50 +++++ src/otel-collector/.env.example | 62 ++++++ src/otel-collector/README.md | 247 ++++++++++++++++++++++++ src/otel-collector/README.zh.md | 247 ++++++++++++++++++++++++ src/otel-collector/docker-compose.yaml | 58 ++++++ src/phoenix/.env.example | 13 +- src/phoenix/README.md | 59 ++++-- src/phoenix/README.zh.md | 59 ++++-- src/phoenix/docker-compose.yaml | 72 +++++-- src/signoz/.env.example | 151 +++++++++++++++ src/signoz/README.md | 148 ++++++++++++++ src/signoz/README.zh.md | 148 ++++++++++++++ src/signoz/docker-compose.yaml | 202 +++++++++++++++++++ src/tempo/.env.example | 25 +++ src/tempo/README.md | 211 ++++++++++++++++++++ src/tempo/README.zh.md | 211 ++++++++++++++++++++ src/tempo/docker-compose.yaml | 44 +++++ src/tempo/tempo-config.yaml | 50 +++++ 28 files changed, 2358 insertions(+), 70 deletions(-) create mode 100644 src/loki/.env.example create mode 100644 src/loki/README.md create mode 100644 src/loki/README.zh.md create mode 100644 src/loki/docker-compose.yaml create mode 100644 src/loki/loki-config.yaml create mode 100644 src/otel-collector/.env.example create mode 100644 src/otel-collector/README.md create mode 100644 src/otel-collector/README.zh.md create mode 100644 src/otel-collector/docker-compose.yaml create mode 100644 src/signoz/.env.example create mode 100644 src/signoz/README.md create mode 100644 src/signoz/README.zh.md create mode 100644 src/signoz/docker-compose.yaml create mode 100644 src/tempo/.env.example create mode 100644 src/tempo/README.md create mode 100644 src/tempo/README.zh.md create mode 100644 src/tempo/docker-compose.yaml create mode 100644 src/tempo/tempo-config.yaml diff --git a/README.md b/README.md index 74a6ad6..47a0f00 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ These services require building custom Docker images from source. | [goose](./builds/goose) | 1.18.0 | | [IOPaint](./builds/io-paint) | 1.6.0 | | [K3s inside DinD](./builds/k3s-inside-dind) | 0.2.2 | -| [KrunVM DinD](./builds/krunvm) | 0.1.0 | | [MinerU vLLM](./builds/mineru) | 2.7.1 | ## Supported Services @@ -27,7 +26,7 @@ These services require building custom Docker images from source. | [Apache Kafka](./src/kafka) | 7.8.0 | | [Apache Pulsar](./src/pulsar) | 4.0.7 | | [Apache RocketMQ](./src/rocketmq) | 5.3.1 | -| [Bifrost Gateway](./src/bifrost-gateway) | v1.3.59 | +| [Bifrost Gateway](./src/bifrost-gateway) | v1.3.63 | | [Bolt.diy](./apps/bolt-diy) | latest | | [Budibase](./src/budibase) | 3.23.0 | | [Bytebot](./src/bytebot) | edge | @@ -51,6 +50,8 @@ These services require building custom Docker images from source. 
| [GitLab](./src/gitlab) | 17.10.4-ce.0 | | [GPUStack](./src/gpustack) | v0.5.3 | | [Grafana](./src/grafana) | 12.1.1 | +| [Grafana Loki](./src/loki) | 3.3.2 | +| [Grafana Tempo](./src/tempo) | 2.7.2 | | [Halo](./src/halo) | 2.21.9 | | [Harbor](./src/harbor) | v2.12.0 | | [HashiCorp Consul](./src/consul) | 1.20.3 | @@ -88,13 +89,14 @@ These services require building custom Docker images from source. | [Odoo](./src/odoo) | 19.0 | | [Ollama](./src/ollama) | 0.12.0 | | [Open WebUI](./src/open-webui) | main | -| [Phoenix (Arize)](./src/phoenix) | 12.27.0-nonroot | +| [Phoenix (Arize)](./src/phoenix) | 12.28.1-nonroot | | [Pingora Proxy Manager](./src/pingora-proxy-manager) | v1.0.3 | | [Open WebUI Rust](./src/open-webui-rust) | latest | | [OpenCoze](./apps/opencoze) | See Docs | | [OpenCut](./src/opencut) | latest | | [OpenList](./src/openlist) | latest | | [OpenSearch](./src/opensearch) | 2.19.0 | +| [OpenTelemetry Collector](./src/otel-collector) | 0.115.1 | | [PocketBase](./src/pocketbase) | 0.30.0 | | [Podman](./src/podman) | v5.7.1 | | [Portainer](./src/portainer) | 2.27.3-alpine | @@ -112,6 +114,7 @@ These services require building custom Docker images from source. | [Restate Cluster](./src/restate-cluster) | 1.5.3 | | [Restate](./src/restate) | 1.5.3 | | [SearXNG](./src/searxng) | 2025.1.20-1ce14ef99 | +| [SigNoz](./src/signoz) | 0.55.0 | | [Sim](./apps/sim) | latest | | [Stable Diffusion WebUI](./apps/stable-diffusion-webui-docker) | latest | | [Stirling-PDF](./apps/stirling-pdf) | latest | diff --git a/README.zh.md b/README.zh.md index 6237f51..81fa074 100644 --- a/README.zh.md +++ b/README.zh.md @@ -12,7 +12,6 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [goose](./builds/goose) | 1.18.0 | | [IOPaint](./builds/io-paint) | 1.6.0 | | [K3s inside DinD](./builds/k3s-inside-dind) | 0.2.2 | -| [KrunVM DinD](./builds/krunvm) | 0.1.0 | | [MinerU vLLM](./builds/mineru) | 2.7.1 | ## 已经支持的服务 @@ -27,7 +26,7 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [Apache Kafka](./src/kafka) | 7.8.0 | | [Apache Pulsar](./src/pulsar) | 4.0.7 | | [Apache RocketMQ](./src/rocketmq) | 5.3.1 | -| [Bifrost Gateway](./src/bifrost-gateway) | v1.3.59 | +| [Bifrost Gateway](./src/bifrost-gateway) | v1.3.63 | | [Bolt.diy](./apps/bolt-diy) | latest | | [Budibase](./src/budibase) | 3.23.0 | | [Bytebot](./src/bytebot) | edge | @@ -51,6 +50,8 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [GitLab](./src/gitlab) | 17.10.4-ce.0 | | [GPUStack](./src/gpustack) | v0.5.3 | | [Grafana](./src/grafana) | 12.1.1 | +| [Grafana Loki](./src/loki) | 3.3.2 | +| [Grafana Tempo](./src/tempo) | 2.7.2 | | [Halo](./src/halo) | 2.21.9 | | [Harbor](./src/harbor) | v2.12.0 | | [HashiCorp Consul](./src/consul) | 1.20.3 | @@ -88,13 +89,14 @@ Compose Anything 通过提供一组高质量的 Docker Compose 配置文件, | [Odoo](./src/odoo) | 19.0 | | [Ollama](./src/ollama) | 0.12.0 | | [Open WebUI](./src/open-webui) | main | -| [Phoenix (Arize)](./src/phoenix) | 12.27.0-nonroot | +| [Phoenix (Arize)](./src/phoenix) | 12.28.1-nonroot | | [Pingora Proxy Manager](./src/pingora-proxy-manager) | v1.0.3 | | [Open WebUI Rust](./src/open-webui-rust) | latest | | [OpenCoze](./apps/opencoze) | See Docs | | [OpenCut](./src/opencut) | latest | | [OpenList](./src/openlist) | latest | | [OpenSearch](./src/opensearch) | 2.19.0 | +| [OpenTelemetry Collector](./src/otel-collector) | 0.115.1 | | [PocketBase](./src/pocketbase) | 0.30.0 | | [Podman](./src/podman) | v5.7.1 | | [Portainer](./src/portainer) | 2.27.3-alpine | @@ -112,6 +114,7 @@ Compose Anything 通过提供一组高质量的 
Docker Compose 配置文件, | [Restate Cluster](./src/restate-cluster) | 1.5.3 | | [Restate](./src/restate) | 1.5.3 | | [SearXNG](./src/searxng) | 2025.1.20-1ce14ef99 | +| [SigNoz](./src/signoz) | 0.55.0 | | [Sim](./apps/sim) | latest | | [Stable Diffusion WebUI](./apps/stable-diffusion-webui-docker) | latest | | [Stirling-PDF](./apps/stirling-pdf) | latest | diff --git a/src/bifrost-gateway/.env.example b/src/bifrost-gateway/.env.example index 847ed1c..a80ecc5 100644 --- a/src/bifrost-gateway/.env.example +++ b/src/bifrost-gateway/.env.example @@ -1,5 +1,5 @@ # Bifrost Gateway Version -BIFROST_VERSION=v1.3.59 +BIFROST_VERSION=v1.3.63 # Port to bind to on the host machine BIFROST_PORT=28080 diff --git a/src/bifrost-gateway/README.md b/src/bifrost-gateway/README.md index c566cf6..04bdda5 100644 --- a/src/bifrost-gateway/README.md +++ b/src/bifrost-gateway/README.md @@ -12,7 +12,7 @@ Bifrost is a lightweight, high-performance LLM gateway that supports multiple mo ## Configuration -- `BIFROST_VERSION`: The version of the Bifrost image, default is `v1.3.59`. +- `BIFROST_VERSION`: The version of the Bifrost image, default is `v1.3.63`. - `BIFROST_PORT`: The port for the Bifrost service, default is `28080`. ### Telemetry diff --git a/src/bifrost-gateway/README.zh.md b/src/bifrost-gateway/README.zh.md index 8aaecf6..405f549 100644 --- a/src/bifrost-gateway/README.zh.md +++ b/src/bifrost-gateway/README.zh.md @@ -12,7 +12,7 @@ Bifrost 是一个轻量级、高性能的 LLM 网关,支持多种模型和提 ## 配置 -- `BIFROST_VERSION`: Bifrost 镜像的版本,默认为 `v1.3.59`。 +- `BIFROST_VERSION`: Bifrost 镜像的版本,默认为 `v1.3.63`。 - `BIFROST_PORT`: Bifrost 服务的端口,默认为 `28080`。 ### 遥测 (Telemetry) diff --git a/src/bifrost-gateway/docker-compose.yaml b/src/bifrost-gateway/docker-compose.yaml index 98a3d35..1320b40 100644 --- a/src/bifrost-gateway/docker-compose.yaml +++ b/src/bifrost-gateway/docker-compose.yaml @@ -9,7 +9,7 @@ x-defaults: &defaults services: bifrost: <<: *defaults - image: ${GLOBAL_REGISTRY:-}maximhq/bifrost:${BIFROST_VERSION:-v1.3.59} + image: ${GLOBAL_REGISTRY:-}maximhq/bifrost:${BIFROST_VERSION:-v1.3.63} volumes: - bifrost_data:/app/data ports: diff --git a/src/loki/.env.example b/src/loki/.env.example new file mode 100644 index 0000000..d416bf7 --- /dev/null +++ b/src/loki/.env.example @@ -0,0 +1,19 @@ +# Global Registry (optional) +# GLOBAL_REGISTRY=registry.example.com/ + +# Loki Version +LOKI_VERSION=3.3.2 + +# Port Override +LOKI_PORT_OVERRIDE=3100 + +# Timezone +TZ=UTC + +# Resource Limits +LOKI_CPU_LIMIT=1.0 +LOKI_MEMORY_LIMIT=1G + +# Resource Reservations +LOKI_CPU_RESERVATION=0.25 +LOKI_MEMORY_RESERVATION=256M diff --git a/src/loki/README.md b/src/loki/README.md new file mode 100644 index 0000000..9f572a6 --- /dev/null +++ b/src/loki/README.md @@ -0,0 +1,144 @@ +# Grafana Loki + +[中文文档](README.zh.md) + +Grafana Loki is a horizontally scalable, highly available, multi-tenant log aggregation system inspired by Prometheus. It is designed to be very cost effective and easy to operate. Unlike other logging systems, Loki does not index the contents of the logs, but rather a set of labels for each log stream. 
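+Because only labels are indexed, a query first selects streams by their labels and only then filters or aggregates the log lines themselves. As a small illustrative sketch (the `job="test"` label reuses the stream pushed in the HTTP API example later in this README), typical LogQL queries might look like:
+
+```logql
+{job="test"} |= "error"
+sum(rate({job="test"} |= "error" [5m]))
+```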
+
+## Features
+
+- **Cost-effective**: Only indexes metadata instead of full text, significantly reducing storage costs
+- **LogQL**: Powerful query language similar to PromQL for filtering and aggregating logs
+- **Multi-tenancy**: Built-in support for multi-tenant deployments
+- **Grafana Integration**: Native integration with Grafana for visualization
+- **Scalable**: Can scale horizontally to handle large volumes of logs
+
+## Quick Start
+
+1. Copy the example environment file:
+
+   ```bash
+   cp .env.example .env
+   ```
+
+2. Start the service:
+
+   ```bash
+   docker compose up -d
+   ```
+
+3. Verify the service is running:
+
+   ```bash
+   docker compose ps
+   curl http://localhost:3100/ready
+   ```
+
+## Configuration
+
+### Environment Variables
+
+| Variable                  | Default | Description        |
+| ------------------------- | ------- | ------------------ |
+| `LOKI_VERSION`            | `3.3.2` | Loki version       |
+| `LOKI_PORT_OVERRIDE`      | `3100`  | HTTP API port      |
+| `TZ`                      | `UTC`   | Timezone           |
+| `LOKI_CPU_LIMIT`          | `1.0`   | CPU limit          |
+| `LOKI_MEMORY_LIMIT`       | `1G`    | Memory limit       |
+| `LOKI_CPU_RESERVATION`    | `0.25`  | CPU reservation    |
+| `LOKI_MEMORY_RESERVATION` | `256M`  | Memory reservation |
+
+### Default Configuration
+
+The service includes a basic configuration file (`loki-config.yaml`) that:
+
+- Disables authentication (suitable for development/testing)
+- Uses local filesystem storage
+- Configures a single replica (monolithic mode)
+- Sets up basic caching for query performance
+
+For production deployments, you should customize the configuration based on your requirements.
+
+## Integration with Grafana
+
+1. Add Loki as a data source in Grafana:
+   - URL: `http://loki:3100` (if running in the same Docker network)
+   - Or: `http://localhost:3100` (from host machine)
+
+2. Create dashboards and explore logs using LogQL queries
+
+## Sending Logs to Loki
+
+### Using Promtail
+
+Promtail is the recommended agent for shipping logs to Loki:
+
+```yaml
+services:
+  promtail:
+    image: grafana/promtail:3.3.2
+    volumes:
+      - /var/log:/var/log:ro
+      - ./promtail-config.yaml:/etc/promtail/config.yaml
+    command: -config.file=/etc/promtail/config.yaml
+```
+
+### Using Docker Driver
+
+You can configure Docker to send logs directly to Loki (this requires installing the Loki Docker logging driver plugin on the host):
+
+```yaml
+logging:
+  driver: loki
+  options:
+    loki-url: "http://localhost:3100/loki/api/v1/push"
+    loki-batch-size: "400"
+```
+
+### Using HTTP API
+
+Send logs directly via HTTP POST (double quotes are needed so the shell expands `$(date +%s)` into a nanosecond timestamp):
+
+```bash
+curl -H "Content-Type: application/json" -XPOST -s "http://localhost:3100/loki/api/v1/push" --data-raw \
+  "{\"streams\": [{ \"stream\": { \"job\": \"test\" }, \"values\": [ [ \"$(date +%s)000000000\", \"test log message\" ] ] }]}"
+```
+
+## Storage
+
+Logs and metadata are stored in a Docker volume named `loki_data`.
+
+## Health Check
+
+The service includes a health check that monitors the `/ready` endpoint every 30 seconds.
+
+## Resource Requirements
+
+- **Minimum**: 256MB RAM, 0.25 CPU
+- **Recommended**: 1GB RAM, 1 CPU (for moderate log volumes)
+- **Production**: Scale based on log ingestion rate and retention period
+
+## Security Considerations
+
+The default configuration:
+
+- Runs as non-root user (UID:GID 10001:10001)
+- Disables authentication (suitable for development only)
+- Uses filesystem storage (not suitable for distributed deployments)
+
+For production:
+
+- Enable authentication and multi-tenancy
+- Use object storage (S3, GCS, Azure Blob, etc.)
+- Implement proper network security and access controls +- Configure retention policies to manage storage costs + +## Documentation + +- [Official Documentation](https://grafana.com/docs/loki/latest/) +- [LogQL Query Language](https://grafana.com/docs/loki/latest/query/) +- [Best Practices](https://grafana.com/docs/loki/latest/operations/best-practices/) +- [GitHub Repository](https://github.com/grafana/loki) + +## License + +Loki is licensed under the [AGPLv3 License](https://github.com/grafana/loki/blob/main/LICENSE). diff --git a/src/loki/README.zh.md b/src/loki/README.zh.md new file mode 100644 index 0000000..1279bed --- /dev/null +++ b/src/loki/README.zh.md @@ -0,0 +1,144 @@ +# Grafana Loki + +[English Documentation](README.md) + +Grafana Loki 是一个受 Prometheus 启发的水平可扩展、高可用、多租户日志聚合系统。它被设计为非常高效且易于操作。与其他日志系统不同,Loki 不索引日志的内容,而是为每个日志流索引一组标签。 + +## 特性 + +- **成本效益**:仅索引元数据而非全文,显著降低存储成本 +- **LogQL**:类似于 PromQL 的强大查询语言,用于过滤和聚合日志 +- **多租户**:内置对多租户部署的支持 +- **Grafana 集成**:与 Grafana 原生集成进行可视化 +- **可扩展**:可以水平扩展以处理大量日志 + +## 快速开始 + +1. 复制示例环境文件: + + ```bash + cp .env.example .env + ``` + +2. 启动服务: + + ```bash + docker compose up -d + ``` + +3. 验证服务正在运行: + + ```bash + docker compose ps + curl http://localhost:3100/ready + ``` + +## 配置 + +### 环境变量 + +| 变量 | 默认值 | 描述 | +| ------------------------- | ------- | ------------- | +| `LOKI_VERSION` | `3.3.2` | Loki 版本 | +| `LOKI_PORT_OVERRIDE` | `3100` | HTTP API 端口 | +| `TZ` | `UTC` | 时区 | +| `LOKI_CPU_LIMIT` | `1.0` | CPU 限制 | +| `LOKI_MEMORY_LIMIT` | `1G` | 内存限制 | +| `LOKI_CPU_RESERVATION` | `0.25` | CPU 预留 | +| `LOKI_MEMORY_RESERVATION` | `256M` | 内存预留 | + +### 默认配置 + +该服务包含一个基本配置文件(`loki-config.yaml`),该文件: + +- 禁用身份验证(适用于开发/测试) +- 使用本地文件系统存储 +- 配置单副本(单体模式) +- 设置基本缓存以提高查询性能 + +对于生产部署,您应该根据需求自定义配置。 + +## 与 Grafana 集成 + +1. 在 Grafana 中添加 Loki 作为数据源: + - URL:`http://loki:3100`(如果在同一 Docker 网络中运行) + - 或者:`http://localhost:3100`(从主机访问) + +2. 
使用 LogQL 查询创建仪表板和探索日志 + +## 向 Loki 发送日志 + +### 使用 Promtail + +Promtail 是向 Loki 发送日志的推荐代理: + +```yaml +services: + promtail: + image: grafana/promtail:3.3.2 + volumes: + - /var/log:/var/log:ro + - ./promtail-config.yaml:/etc/promtail/config.yaml + command: -config.file=/etc/promtail/config.yaml +``` + +### 使用 Docker 驱动 + +您可以配置 Docker 直接向 Loki 发送日志: + +```yaml +logging: + driver: loki + options: + loki-url: "http://localhost:3100/loki/api/v1/push" + loki-batch-size: "400" +``` + +### 使用 HTTP API + +通过 HTTP POST 直接发送日志: + +```bash +curl -H "Content-Type: application/json" -XPOST -s "http://localhost:3100/loki/api/v1/push" --data-raw \ + '{"streams": [{ "stream": { "job": "test" }, "values": [ [ "$(date +%s)000000000", "test log message" ] ] }]}' +``` + +## 存储 + +日志和元数据存储在名为 `loki_data` 的 Docker 卷中。 + +## 健康检查 + +该服务包含健康检查,每 30 秒监控一次 `/ready` 端点。 + +## 资源需求 + +- **最低要求**:256MB RAM,0.25 CPU +- **推荐配置**:1GB RAM,1 CPU(用于中等日志量) +- **生产环境**:根据日志摄入速率和保留期限进行扩展 + +## 安全注意事项 + +默认配置: + +- 以非 root 用户运行(UID:GID 10001:10001) +- 禁用身份验证(仅适用于开发环境) +- 使用文件系统存储(不适用于分布式部署) + +对于生产环境: + +- 启用身份验证和多租户 +- 使用对象存储(S3、GCS、Azure Blob 等) +- 实施适当的网络安全和访问控制 +- 配置保留策略以管理存储成本 + +## 文档 + +- [官方文档](https://grafana.com/docs/loki/latest/) +- [LogQL 查询语言](https://grafana.com/docs/loki/latest/query/) +- [最佳实践](https://grafana.com/docs/loki/latest/operations/best-practices/) +- [GitHub 仓库](https://github.com/grafana/loki) + +## 许可证 + +Loki 使用 [AGPLv3 许可证](https://github.com/grafana/loki/blob/main/LICENSE)。 diff --git a/src/loki/docker-compose.yaml b/src/loki/docker-compose.yaml new file mode 100644 index 0000000..dbaef79 --- /dev/null +++ b/src/loki/docker-compose.yaml @@ -0,0 +1,38 @@ +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + loki: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}grafana/loki:${LOKI_VERSION:-3.3.2} + ports: + - "${LOKI_PORT_OVERRIDE:-3100}:3100" + volumes: + - loki_data:/loki + - ./loki-config.yaml:/etc/loki/local-config.yaml:ro + environment: + - TZ=${TZ:-UTC} + command: -config.file=/etc/loki/local-config.yaml + user: "10001:10001" # Loki user + deploy: + resources: + limits: + cpus: ${LOKI_CPU_LIMIT:-1.0} + memory: ${LOKI_MEMORY_LIMIT:-1G} + reservations: + cpus: ${LOKI_CPU_RESERVATION:-0.25} + memory: ${LOKI_MEMORY_RESERVATION:-256M} + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + +volumes: + loki_data: diff --git a/src/loki/loki-config.yaml b/src/loki/loki-config.yaml new file mode 100644 index 0000000..6006988 --- /dev/null +++ b/src/loki/loki-config.yaml @@ -0,0 +1,50 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://localhost:9093 + +# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration +# analytics to Grafana Labs. 
These statistics are sent to https://stats.grafana.org/ +# +# Statistics help us better understand how Loki is used, and they show us performance +# levels for most users. This helps us prioritize features and documentation. +# For more information on what's sent, look at +# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go +# Refer to the buildReport method to see what goes into a report. +# +# If you would like to disable reporting, uncomment the following lines: +#analytics: +# reporting_enabled: false diff --git a/src/otel-collector/.env.example b/src/otel-collector/.env.example new file mode 100644 index 0000000..d7d7a19 --- /dev/null +++ b/src/otel-collector/.env.example @@ -0,0 +1,62 @@ +# Global registry prefix (optional, e.g., docker.io/, ghcr.io/) +# Leave empty to pull from Docker Hub by default +GLOBAL_REGISTRY= + +# Timezone setting for all containers +TZ=UTC + +# ============================================ +# OpenTelemetry Collector Version +# ============================================ + +# OTel Collector Contrib version +OTEL_COLLECTOR_VERSION=0.115.1 + +# ============================================ +# Port Configuration +# ============================================ + +# OTLP gRPC receiver port (default: 4317) +OTEL_COLLECTOR_OTLP_GRPC_PORT_OVERRIDE=4317 + +# OTLP HTTP receiver port (default: 4318) +OTEL_COLLECTOR_OTLP_HTTP_PORT_OVERRIDE=4318 + +# Jaeger gRPC receiver port (default: 14250) +OTEL_COLLECTOR_JAEGER_GRPC_PORT_OVERRIDE=14250 + +# Jaeger Thrift HTTP receiver port (default: 14268) +OTEL_COLLECTOR_JAEGER_THRIFT_HTTP_PORT_OVERRIDE=14268 + +# Zipkin receiver port (default: 9411) +OTEL_COLLECTOR_ZIPKIN_PORT_OVERRIDE=9411 + +# Prometheus metrics port (default: 8888) +OTEL_COLLECTOR_PROMETHEUS_PORT_OVERRIDE=8888 + +# Health check extension port (default: 13133) +OTEL_COLLECTOR_HEALTH_CHECK_PORT_OVERRIDE=13133 + +# ============================================ +# Collector Configuration +# ============================================ + +# Go memory limit (e.g., 1536MiB, leave empty for automatic) +# Recommended to set to ~80% of container memory limit +OTEL_COLLECTOR_GOMEMLIMIT= + +# ============================================ +# Resource Limits +# ============================================ + +# CPU limit for OTel Collector +OTEL_COLLECTOR_CPU_LIMIT=1.0 + +# Memory limit for OTel Collector +OTEL_COLLECTOR_MEMORY_LIMIT=2G + +# CPU reservation for OTel Collector +OTEL_COLLECTOR_CPU_RESERVATION=0.25 + +# Memory reservation for OTel Collector +OTEL_COLLECTOR_MEMORY_RESERVATION=512M diff --git a/src/otel-collector/README.md b/src/otel-collector/README.md new file mode 100644 index 0000000..1baac83 --- /dev/null +++ b/src/otel-collector/README.md @@ -0,0 +1,247 @@ +# OpenTelemetry Collector + +[English](README.md) | [中文](README.zh.md) + +OpenTelemetry Collector is a vendor-agnostic service for receiving, processing, and exporting telemetry data (traces, metrics, and logs). It supports multiple protocols and can send data to various backends. + +## Features + +- **Protocol Support**: OTLP, Jaeger, Zipkin, Prometheus, and more +- **Flexible Processing**: Filter, transform, and enrich telemetry data +- **Multiple Exporters**: Send data to various observability backends +- **High Performance**: Efficient data processing with low overhead +- **Extensible**: Rich ecosystem of receivers, processors, and exporters +- **Vendor Neutral**: Works with any OpenTelemetry-compatible backend + +## Quick Start + +1. 
Copy the environment file and adjust if needed: + + ```bash + cp .env.example .env + ``` + +2. Create a configuration file `config.yaml`: + + ```bash + # See example below or use the provided template + ``` + +3. Start the collector: + + ```bash + docker compose up -d + ``` + +## Default Ports + +| Protocol | Port | Description | +| ------------------ | ----- | -------------------------------- | +| OTLP gRPC | 4317 | OpenTelemetry Protocol over gRPC | +| OTLP HTTP | 4318 | OpenTelemetry Protocol over HTTP | +| Jaeger gRPC | 14250 | Jaeger gRPC receiver | +| Jaeger Thrift HTTP | 14268 | Jaeger Thrift HTTP receiver | +| Zipkin | 9411 | Zipkin HTTP receiver | +| Prometheus | 8888 | Internal metrics endpoint | +| Health Check | 13133 | Health check endpoint | + +## Configuration + +### Environment Variables + +Key environment variables (see `.env.example` for complete list): + +- `OTEL_COLLECTOR_VERSION`: Collector version (default: 0.115.1) +- `OTEL_COLLECTOR_OTLP_GRPC_PORT_OVERRIDE`: OTLP gRPC port (default: 4317) +- `OTEL_COLLECTOR_OTLP_HTTP_PORT_OVERRIDE`: OTLP HTTP port (default: 4318) +- `OTEL_COLLECTOR_GOMEMLIMIT`: Go memory limit for the collector + +### Configuration File + +Create a `config.yaml` file to define the collector pipeline. Here's a minimal example: + +```yaml +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 10s + send_batch_size: 1024 + +exporters: + logging: + loglevel: info + + # Example: Export to Jaeger + # otlp/jaeger: + # endpoint: jaeger:4317 + # tls: + # insecure: true + + # Example: Export to Prometheus + # prometheusremotewrite: + # endpoint: http://prometheus:9090/api/v1/write + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + +service: + extensions: [health_check, pprof] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [logging] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [logging] + logs: + receivers: [otlp] + processors: [batch] + exporters: [logging] +``` + +For production use, configure appropriate exporters for your observability backend (e.g., Jaeger, Prometheus, Grafana Tempo, DataDog, etc.). 
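+As a sketch only (the `tempo:4317` endpoint is an assumed hostname for a Tempo container on the same Docker network, not something provisioned by this compose file), swapping the console-logging exporter for an OTLP exporter and adding a `memory_limiter` processor might look like:
+
+```yaml
+processors:
+  memory_limiter:
+    check_interval: 1s
+    limit_percentage: 80        # cap collector memory at ~80% of what is available
+    spike_limit_percentage: 25
+  batch:
+    timeout: 10s
+
+exporters:
+  otlp/tempo:
+    endpoint: tempo:4317        # assumed Tempo OTLP gRPC endpoint
+    tls:
+      insecure: true            # enable TLS and auth for real deployments
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, batch]
+      exporters: [otlp/tempo]
+```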
+
+### Common Receivers
+
+- **OTLP**: Native OpenTelemetry protocol (gRPC and HTTP)
+- **Jaeger**: Jaeger native formats
+- **Zipkin**: Zipkin JSON format
+- **Prometheus**: Prometheus scraping
+- **Kafka**: Receive from Kafka topics
+
+### Common Processors
+
+- **Batch**: Batch telemetry data before export
+- **Memory Limiter**: Prevent out-of-memory situations
+- **Resource Detection**: Automatically detect resource attributes
+- **Attributes**: Modify attributes on telemetry data
+- **Sampling**: Sample traces based on various strategies
+
+### Common Exporters
+
+- **OTLP**: Send to OTLP-compatible backends
+- **Jaeger**: Export to Jaeger (recent Collector releases do this via the OTLP exporter, since Jaeger ingests OTLP natively)
+- **Zipkin**: Export to Zipkin
+- **Prometheus**: Expose metrics for Prometheus scraping
+- **Prometheus Remote Write**: Push metrics to Prometheus
+- **Logging**: Log telemetry to console (for debugging; newer releases name this exporter `debug`)
+
+## Sending Data to the Collector
+
+### Using OpenTelemetry SDKs
+
+Configure your application to send data to the collector:
+
+**Environment Variables**:
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+```
+
+**Python Example**:
+
+```python
+from opentelemetry import trace
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+provider = TracerProvider()
+processor = BatchSpanProcessor(OTLPSpanExporter(endpoint="localhost:4317", insecure=True))
+provider.add_span_processor(processor)
+trace.set_tracer_provider(provider)
+```
+
+**Node.js Example**:
+
+```javascript
+const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
+const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
+const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
+
+const provider = new NodeTracerProvider();
+const exporter = new OTLPTraceExporter({
+  url: 'http://localhost:4317',
+});
+provider.addSpanProcessor(new BatchSpanProcessor(exporter));
+provider.register();
+```
+
+## Resource Requirements
+
+Minimum recommended resources:
+
+- **CPU**: 0.5 cores
+- **Memory**: 1GB RAM
+
+For high-throughput environments, increase resources accordingly.
+
+## Data Persistence
+
+The collector itself is stateless. Data persistence depends on the configured exporters and backend systems.
+
+## Security Considerations
+
+- Configure TLS for production deployments
+- Use authentication when available (e.g., API keys, mTLS)
+- Restrict network access to necessary ports
+- Consider using the `memory_limiter` processor to prevent OOM
+- Review and minimize exposed ports
+- Use secrets management for sensitive configuration
+
+## Healthchecks
+
+The collector exposes a health check endpoint on port 13133:
+
+- `http://localhost:13133/` - Overall health status
+
+## Monitoring the Collector
+
+The collector exposes its own metrics on the Prometheus metrics endpoint (default port 8888):
+
+- `http://localhost:8888/metrics`
+
+Key metrics to monitor:
+
+- `otelcol_receiver_accepted_spans`: Number of spans accepted
+- `otelcol_receiver_refused_spans`: Number of spans refused
+- `otelcol_exporter_sent_spans`: Number of spans sent
+- `otelcol_processor_batch_batch_send_size`: Batch sizes
+
+## Troubleshooting
+
+1. **Data not being received**: Check receiver configuration and port bindings
+2. **High memory usage**: Configure `memory_limiter` processor
+3. **Slow processing**: Adjust batch processor settings
+4. 
**Export failures**: Check exporter configuration and backend connectivity + +## Performance Tuning + +- Use the `batch` processor to improve efficiency +- Configure `memory_limiter` to prevent OOM +- Adjust queue sizes based on throughput requirements +- Use sampling processors for high-volume trace data +- Monitor collector metrics for bottlenecks + +## License + +OpenTelemetry Collector is licensed under the Apache 2.0 License. See the [official repository](https://github.com/open-telemetry/opentelemetry-collector) for more details. + +## References + +- [Official Documentation](https://opentelemetry.io/docs/collector/) +- [GitHub Repository](https://github.com/open-telemetry/opentelemetry-collector) +- [Collector Contrib Repository](https://github.com/open-telemetry/opentelemetry-collector-contrib) +- [Configuration Reference](https://opentelemetry.io/docs/collector/configuration/) diff --git a/src/otel-collector/README.zh.md b/src/otel-collector/README.zh.md new file mode 100644 index 0000000..eee8a30 --- /dev/null +++ b/src/otel-collector/README.zh.md @@ -0,0 +1,247 @@ +# OpenTelemetry Collector + +[English](README.md) | [中文](README.zh.md) + +OpenTelemetry Collector 是一个与供应商无关的服务,用于接收、处理和导出遥测数据(追踪、指标和日志)。它支持多种协议,可以将数据发送到各种后端系统。 + +## 功能特性 + +- **协议支持**:OTLP、Jaeger、Zipkin、Prometheus 等 +- **灵活处理**:过滤、转换和丰富遥测数据 +- **多种导出器**:将数据发送到各种可观测性后端 +- **高性能**:高效的数据处理,开销低 +- **可扩展**:丰富的接收器、处理器和导出器生态系统 +- **供应商中立**:与任何兼容 OpenTelemetry 的后端配合使用 + +## 快速开始 + +1. 复制环境文件并根据需要调整: + + ```bash + cp .env.example .env + ``` + +2. 创建配置文件 `config.yaml`: + + ```bash + # 参见下面的示例或使用提供的模板 + ``` + +3. 启动收集器: + + ```bash + docker compose up -d + ``` + +## 默认端口 + +| 协议 | 端口 | 描述 | +| ------------------ | ----- | ------------------------------- | +| OTLP gRPC | 4317 | 基于 gRPC 的 OpenTelemetry 协议 | +| OTLP HTTP | 4318 | 基于 HTTP 的 OpenTelemetry 协议 | +| Jaeger gRPC | 14250 | Jaeger gRPC 接收器 | +| Jaeger Thrift HTTP | 14268 | Jaeger Thrift HTTP 接收器 | +| Zipkin | 9411 | Zipkin HTTP 接收器 | +| Prometheus | 8888 | 内部指标端点 | +| Health Check | 13133 | 健康检查端点 | + +## 配置说明 + +### 环境变量 + +主要环境变量(完整列表请查看 `.env.example`): + +- `OTEL_COLLECTOR_VERSION`:收集器版本(默认:0.115.1) +- `OTEL_COLLECTOR_OTLP_GRPC_PORT_OVERRIDE`:OTLP gRPC 端口(默认:4317) +- `OTEL_COLLECTOR_OTLP_HTTP_PORT_OVERRIDE`:OTLP HTTP 端口(默认:4318) +- `OTEL_COLLECTOR_GOMEMLIMIT`:收集器的 Go 内存限制 + +### 配置文件 + +创建 `config.yaml` 文件来定义收集器管道。以下是一个最小示例: + +```yaml +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 10s + send_batch_size: 1024 + +exporters: + logging: + loglevel: info + + # 示例:导出到 Jaeger + # otlp/jaeger: + # endpoint: jaeger:4317 + # tls: + # insecure: true + + # 示例:导出到 Prometheus + # prometheusremotewrite: + # endpoint: http://prometheus:9090/api/v1/write + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + +service: + extensions: [health_check, pprof] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [logging] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [logging] + logs: + receivers: [otlp] + processors: [batch] + exporters: [logging] +``` + +对于生产环境,请为您的可观测性后端配置适当的导出器(例如 Jaeger、Prometheus、Grafana Tempo、DataDog 等)。 + +### 常用接收器 + +- **OTLP**:原生 OpenTelemetry 协议(gRPC 和 HTTP) +- **Jaeger**:Jaeger 原生格式 +- **Zipkin**:Zipkin JSON 格式 +- **Prometheus**:Prometheus 抓取 +- **Kafka**:从 Kafka 主题接收 + +### 常用处理器 + +- **Batch**:在导出前批处理遥测数据 +- **Memory Limiter**:防止内存溢出情况 +- **Resource Detection**:自动检测资源属性 +- 
**Attributes**:修改遥测数据的属性 +- **Sampling**:基于各种策略对追踪进行采样 + +### 常用导出器 + +- **OTLP**:发送到兼容 OTLP 的后端 +- **Jaeger**:导出到 Jaeger +- **Zipkin**:导出到 Zipkin +- **Prometheus**:公开指标供 Prometheus 抓取 +- **Prometheus Remote Write**:推送指标到 Prometheus +- **Logging**:将遥测数据记录到控制台(用于调试) + +## 向收集器发送数据 + +### 使用 OpenTelemetry SDK + +配置您的应用程序将数据发送到收集器: + +**环境变量**: + +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +export OTEL_EXPORTER_OTLP_PROTOCOL=grpc +``` + +**Python 示例**: + +```python +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +provider = TracerProvider() +processor = BatchSpanProcessor(OTLPSpanExporter(endpoint="localhost:4317", insecure=True)) +provider.add_span_processor(processor) +trace.set_tracer_provider(provider) +``` + +**Node.js 示例**: + +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); + +const provider = new NodeTracerProvider(); +const exporter = new OTLPTraceExporter({ + url: 'http://localhost:4317', +}); +provider.addSpanProcessor(new BatchSpanProcessor(exporter)); +provider.register(); +``` + +## 资源需求 + +最低推荐资源: + +- **CPU**:0.5 核 +- **内存**:1GB RAM + +对于高吞吐量环境,请相应增加资源。 + +## 数据持久化 + +收集器本身是无状态的。数据持久化取决于配置的导出器和后端系统。 + +## 安全注意事项 + +- 生产部署时配置 TLS +- 在可用时使用身份验证(例如 API 密钥、mTLS) +- 限制对必要端口的网络访问 +- 考虑使用 `memory_limiter` 处理器防止 OOM +- 检查并最小化暴露的端口 +- 对敏感配置使用密钥管理 + +## 健康检查 + +收集器在端口 13133 上公开健康检查端点: + +- `http://localhost:13133/` - 整体健康状态 + +## 监控收集器 + +收集器在 Prometheus 指标端点(默认端口 8888)上公开自己的指标: + +- `http://localhost:8888/metrics` + +需要监控的关键指标: + +- `otelcol_receiver_accepted_spans`:接受的 span 数量 +- `otelcol_receiver_refused_spans`:拒绝的 span 数量 +- `otelcol_exporter_sent_spans`:发送的 span 数量 +- `otelcol_processor_batch_batch_send_size`:批处理大小 + +## 故障排查 + +1. **未接收到数据**:检查接收器配置和端口绑定 +2. **内存使用过高**:配置 `memory_limiter` 处理器 +3. **处理速度慢**:调整批处理器设置 +4. 
**导出失败**:检查导出器配置和后端连接性 + +## 性能调优 + +- 使用 `batch` 处理器提高效率 +- 配置 `memory_limiter` 防止 OOM +- 根据吞吐量要求调整队列大小 +- 对高容量追踪数据使用采样处理器 +- 监控收集器指标以发现瓶颈 + +## 许可证 + +OpenTelemetry Collector 采用 Apache 2.0 许可证。详情请参阅 [官方仓库](https://github.com/open-telemetry/opentelemetry-collector)。 + +## 参考资料 + +- [官方文档](https://opentelemetry.io/docs/collector/) +- [GitHub 仓库](https://github.com/open-telemetry/opentelemetry-collector) +- [Collector Contrib 仓库](https://github.com/open-telemetry/opentelemetry-collector-contrib) +- [配置参考](https://opentelemetry.io/docs/collector/configuration/) diff --git a/src/otel-collector/docker-compose.yaml b/src/otel-collector/docker-compose.yaml new file mode 100644 index 0000000..a773d2f --- /dev/null +++ b/src/otel-collector/docker-compose.yaml @@ -0,0 +1,58 @@ +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + otel-collector: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}otel/opentelemetry-collector-contrib:${OTEL_COLLECTOR_VERSION:-0.115.1} + command: + - "--config=/etc/otelcol-contrib/config.yaml" + volumes: + - ./config.yaml:/etc/otelcol-contrib/config.yaml:ro + ports: + # OTLP gRPC receiver + - "${OTEL_COLLECTOR_OTLP_GRPC_PORT_OVERRIDE:-4317}:4317" + # OTLP HTTP receiver + - "${OTEL_COLLECTOR_OTLP_HTTP_PORT_OVERRIDE:-4318}:4318" + # Jaeger gRPC receiver + - "${OTEL_COLLECTOR_JAEGER_GRPC_PORT_OVERRIDE:-14250}:14250" + # Jaeger Thrift HTTP receiver + - "${OTEL_COLLECTOR_JAEGER_THRIFT_HTTP_PORT_OVERRIDE:-14268}:14268" + # Zipkin receiver + - "${OTEL_COLLECTOR_ZIPKIN_PORT_OVERRIDE:-9411}:9411" + # Prometheus metrics + - "${OTEL_COLLECTOR_PROMETHEUS_PORT_OVERRIDE:-8888}:8888" + # Health check extension + - "${OTEL_COLLECTOR_HEALTH_CHECK_PORT_OVERRIDE:-13133}:13133" + environment: + - TZ=${TZ:-UTC} + # Optional: Configure memory limit for the collector + - GOMEMLIMIT=${OTEL_COLLECTOR_GOMEMLIMIT:-} + user: "10001:10001" # Non-root user + deploy: + resources: + limits: + cpus: ${OTEL_COLLECTOR_CPU_LIMIT:-1.0} + memory: ${OTEL_COLLECTOR_MEMORY_LIMIT:-2G} + reservations: + cpus: ${OTEL_COLLECTOR_CPU_RESERVATION:-0.25} + memory: ${OTEL_COLLECTOR_MEMORY_RESERVATION:-512M} + healthcheck: + test: + [ + "CMD", + "wget", + "--no-verbose", + "--tries=1", + "--spider", + "http://localhost:13133/", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s diff --git a/src/phoenix/.env.example b/src/phoenix/.env.example index ded75c8..5f35e39 100644 --- a/src/phoenix/.env.example +++ b/src/phoenix/.env.example @@ -1,18 +1,23 @@ +# Compose Profiles +# Toggle between 'sqlite' (default) and 'postgres' (or 'pg') +COMPOSE_PROFILES=sqlite + # Phoenix version -PHOENIX_VERSION=12.27.0-nonroot +PHOENIX_VERSION=12.28.1-nonroot # Timezone TZ=UTC # Phoenix ports -PHOENIX_PORT_OVERRIDE=6006 # UI and OTLP HTTP collector -PHOENIX_GRPC_PORT_OVERRIDE=4317 # OTLP gRPC collector +PHOENIX_PORT_OVERRIDE=6006 # UI and OTLP HTTP collector +PHOENIX_GRPC_PORT_OVERRIDE=4317 # OTLP gRPC collector +PHOENIX_PROMETHEUS_PORT_OVERRIDE=9090 # Prometheus metrics (optional) # Phoenix configuration PHOENIX_ENABLE_PROMETHEUS=false PHOENIX_SECRET= # Optional: Set for authentication, generate with: openssl rand -base64 32 -# PostgreSQL configuration +# PostgreSQL configuration (only used when COMPOSE_PROFILES=postgres) POSTGRES_VERSION=17.2-alpine3.21 POSTGRES_USER=postgres POSTGRES_PASSWORD=postgres diff --git a/src/phoenix/README.md b/src/phoenix/README.md index 2a39653..ba740f1 100644 --- a/src/phoenix/README.md +++ 
b/src/phoenix/README.md @@ -6,8 +6,18 @@ Arize Phoenix is an open-source AI observability platform for LLM applications. ## Services -- `phoenix`: The main Phoenix application server with UI and OpenTelemetry collectors. -- `phoenix-db`: PostgreSQL database for persistent storage. +- `phoenix`: The main Phoenix application server (SQLite version). +- `phoenix-pg`: The Phoenix application server configured for PostgreSQL (requires `postgres` profile). +- `phoenix-db`: PostgreSQL database for persistent storage (requires `postgres` profile). + +## Profiles + +This project supports two modes of operation via Docker Compose profiles: + +1. **sqlite** (Default): Uses SQLite for storage. Simple and good for local development. + Set `COMPOSE_PROFILES=sqlite` in `.env`. +2. **postgres** (or **pg**): Uses PostgreSQL for storage. Recommended for production. + Set `COMPOSE_PROFILES=postgres` in `.env`. ## Ports @@ -15,24 +25,28 @@ Arize Phoenix is an open-source AI observability platform for LLM applications. | ---- | -------- | ----------------------------------------- | | 6006 | HTTP | UI and OTLP HTTP collector (`/v1/traces`) | | 4317 | gRPC | OTLP gRPC collector | +| 9090 | HTTP | Prometheus metrics (optional) | ## Environment Variables -| Variable Name | Description | Default Value | -| -------------------------- | ------------------------------------- | ----------------- | -| PHOENIX_VERSION | Phoenix image version | `12.27.0-nonroot` | -| PHOENIX_PORT_OVERRIDE | Host port for Phoenix UI and HTTP API | `6006` | -| PHOENIX_GRPC_PORT_OVERRIDE | Host port for OTLP gRPC collector | `4317` | -| PHOENIX_ENABLE_PROMETHEUS | Enable Prometheus metrics endpoint | `false` | -| PHOENIX_SECRET | Secret for authentication (optional) | `""` | -| POSTGRES_VERSION | PostgreSQL image version | `17.2-alpine3.21` | -| POSTGRES_USER | PostgreSQL username | `postgres` | -| POSTGRES_PASSWORD | PostgreSQL password | `postgres` | -| POSTGRES_DB | PostgreSQL database name | `phoenix` | +| Variable Name | Description | Default Value | +| -------------------------------- | ---------------------------------------- | ----------------- | +| COMPOSE_PROFILES | Active profiles (`sqlite` or `postgres`) | `sqlite` | +| PHOENIX_VERSION | Phoenix image version | `12.28.1-nonroot` | +| PHOENIX_PORT_OVERRIDE | Host port for Phoenix UI and HTTP API | `6006` | +| PHOENIX_GRPC_PORT_OVERRIDE | Host port for OTLP gRPC collector | `4317` | +| PHOENIX_PROMETHEUS_PORT_OVERRIDE | Host port for Prometheus metrics | `9090` | +| PHOENIX_ENABLE_PROMETHEUS | Enable Prometheus metrics endpoint | `false` | +| PHOENIX_SECRET | Secret for authentication (optional) | `""` | +| POSTGRES_VERSION | PostgreSQL image version | `17.2-alpine3.21` | +| POSTGRES_USER | PostgreSQL username | `postgres` | +| POSTGRES_PASSWORD | PostgreSQL password | `postgres` | +| POSTGRES_DB | PostgreSQL database name | `phoenix` | ## Volumes -- `phoenix_db_data`: PostgreSQL data volume for persistent storage. +- `phoenix_data`: Data volume for SQLite mode (mounted to `/data`). +- `phoenix_db_data`: Data volume for PostgreSQL mode. ## Getting Started @@ -42,11 +56,20 @@ Arize Phoenix is an open-source AI observability platform for LLM applications. cp .env.example .env ``` -2. (Optional) For production, set a secure password and secret: +2. Select your deployment mode by editing `.env` (default is `sqlite`). 
- ```bash - # Generate a secret for authentication - openssl rand -base64 32 + **For SQLite (Default):** + Ensure `.env` contains: + + ```dotenv + COMPOSE_PROFILES=sqlite + ``` + + **For PostgreSQL:** + Change `.env` to: + + ```dotenv + COMPOSE_PROFILES=postgres ``` 3. Start the services: diff --git a/src/phoenix/README.zh.md b/src/phoenix/README.zh.md index c56aeb3..48cd7bf 100644 --- a/src/phoenix/README.zh.md +++ b/src/phoenix/README.zh.md @@ -6,8 +6,18 @@ Arize Phoenix 是一个开源的 AI 可观测性平台,专为 LLM 应用设计 ## 服务 -- `phoenix`:Phoenix 主应用服务器,包含 UI 和 OpenTelemetry 采集器。 -- `phoenix-db`:用于持久化存储的 PostgreSQL 数据库。 +- `phoenix`:Phoenix 主应用服务器(SQLite 版本)。 +- `phoenix-pg`:配置为使用 PostgreSQL 的 Phoenix 应用服务器(需要 `postgres` 配置文件)。 +- `phoenix-db`:用于持久化存储的 PostgreSQL 数据库(需要 `postgres` 配置文件)。 + +## 配置文件 (Profiles) + +本项目支持通过 Docker Compose 配置文件使用两种运行模式: + +1. **sqlite**(默认):使用 SQLite 存储。简单易用,适合本地开发。 + 在 `.env` 中设置 `COMPOSE_PROFILES=sqlite`。 +2. **postgres**(或 **pg**):使用 PostgreSQL 存储。推荐用于生产环境。 + 在 `.env` 中设置 `COMPOSE_PROFILES=postgres`。 ## 端口 @@ -15,24 +25,28 @@ Arize Phoenix 是一个开源的 AI 可观测性平台,专为 LLM 应用设计 | ---- | ---- | -------------------------------------- | | 6006 | HTTP | UI 和 OTLP HTTP 采集器(`/v1/traces`) | | 4317 | gRPC | OTLP gRPC 采集器 | +| 9090 | HTTP | Prometheus 指标(可选) | ## 环境变量 -| 变量名 | 描述 | 默认值 | -| -------------------------- | --------------------------------- | ----------------- | -| PHOENIX_VERSION | Phoenix 镜像版本 | `12.27.0-nonroot` | -| PHOENIX_PORT_OVERRIDE | Phoenix UI 和 HTTP API 的主机端口 | `6006` | -| PHOENIX_GRPC_PORT_OVERRIDE | OTLP gRPC 采集器的主机端口 | `4317` | -| PHOENIX_ENABLE_PROMETHEUS | 启用 Prometheus 指标端点 | `false` | -| PHOENIX_SECRET | 认证密钥(可选) | `""` | -| POSTGRES_VERSION | PostgreSQL 镜像版本 | `17.2-alpine3.21` | -| POSTGRES_USER | PostgreSQL 用户名 | `postgres` | -| POSTGRES_PASSWORD | PostgreSQL 密码 | `postgres` | -| POSTGRES_DB | PostgreSQL 数据库名 | `phoenix` | +| 变量名 | 描述 | 默认值 | +| -------------------------------- | ---------------------------------------- | ----------------- | +| COMPOSE_PROFILES | 激活的配置文件(`sqlite` 或 `postgres`) | `sqlite` | +| PHOENIX_VERSION | Phoenix 镜像版本 | `12.28.1-nonroot` | +| PHOENIX_PORT_OVERRIDE | Phoenix UI 和 HTTP API 的主机端口 | `6006` | +| PHOENIX_GRPC_PORT_OVERRIDE | OTLP gRPC 采集器的主机端口 | `4317` | +| PHOENIX_PROMETHEUS_PORT_OVERRIDE | Prometheus 指标的主机端口 | `9090` | +| PHOENIX_ENABLE_PROMETHEUS | 启用 Prometheus 指标端点 | `false` | +| PHOENIX_SECRET | 认证密钥(可选) | `""` | +| POSTGRES_VERSION | PostgreSQL 镜像版本 | `17.2-alpine3.21` | +| POSTGRES_USER | PostgreSQL 用户名 | `postgres` | +| POSTGRES_PASSWORD | PostgreSQL 密码 | `postgres` | +| POSTGRES_DB | PostgreSQL 数据库名 | `phoenix` | ## 数据卷 -- `phoenix_db_data`:PostgreSQL 数据卷,用于持久化存储。 +- `phoenix_data`:SQLite 模式的数据卷(挂载到 `/data`)。 +- `phoenix_db_data`:PostgreSQL 模式的数据卷。 ## 快速开始 @@ -42,11 +56,20 @@ Arize Phoenix 是一个开源的 AI 可观测性平台,专为 LLM 应用设计 cp .env.example .env ``` -2. (可选)生产环境下,请设置安全的密码和密钥: +2. 通过编辑 `.env` 选择部署模式(默认为 `sqlite`)。 - ```bash - # 生成认证密钥 - openssl rand -base64 32 + **使用 SQLite(默认):** + 确保 `.env` 包含: + + ```dotenv + COMPOSE_PROFILES=sqlite + ``` + + **使用 PostgreSQL:** + 将 `.env` 修改为: + + ```dotenv + COMPOSE_PROFILES=postgres ``` 3. 
启动服务: diff --git a/src/phoenix/docker-compose.yaml b/src/phoenix/docker-compose.yaml index 60f4cad..820e375 100644 --- a/src/phoenix/docker-compose.yaml +++ b/src/phoenix/docker-compose.yaml @@ -9,38 +9,69 @@ x-defaults: &defaults max-size: 100m max-file: "3" +x-phoenix-common: &phoenix-common + <<: *defaults + image: ${GLOBAL_REGISTRY:-}arizephoenix/phoenix:${PHOENIX_VERSION:-12.28.1-nonroot} + ports: + - "${PHOENIX_PORT_OVERRIDE:-6006}:6006" # UI and OTLP HTTP collector + - "${PHOENIX_GRPC_PORT_OVERRIDE:-4317}:4317" # OTLP gRPC collector + - "${PHOENIX_PROMETHEUS_PORT_OVERRIDE:-9090}:9090" # Prometheus metrics + environment: + - TZ=${TZ:-UTC} + - PHOENIX_ENABLE_PROMETHEUS=${PHOENIX_ENABLE_PROMETHEUS:-false} + - PHOENIX_SECRET=${PHOENIX_SECRET:-} + healthcheck: + test: + [ + "CMD", + "python3", + "-c", + "import httpx;r=httpx.get('http://localhost:6006/healthz').raise_for_status()", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + deploy: + resources: + limits: + cpus: ${PHOENIX_CPU_LIMIT:-2.0} + memory: ${PHOENIX_MEMORY_LIMIT:-2G} + reservations: + cpus: ${PHOENIX_CPU_RESERVATION:-0.5} + memory: ${PHOENIX_MEMORY_RESERVATION:-512M} + services: + # Default SQLite configuration phoenix: - <<: *defaults - image: ${GLOBAL_REGISTRY:-}arizephoenix/phoenix:${PHOENIX_VERSION:-12.27.0-nonroot} - ports: - - "${PHOENIX_PORT_OVERRIDE:-6006}:6006" # UI and OTLP HTTP collector - - "${PHOENIX_GRPC_PORT_OVERRIDE:-4317}:4317" # OTLP gRPC collector + <<: *phoenix-common + profiles: + - sqlite + - ${COMPOSE_PROFILES:-} environment: - TZ=${TZ:-UTC} - - PHOENIX_SQL_DATABASE_URL=postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@phoenix-db:5432/${POSTGRES_DB:-phoenix} - PHOENIX_ENABLE_PROMETHEUS=${PHOENIX_ENABLE_PROMETHEUS:-false} - PHOENIX_SECRET=${PHOENIX_SECRET:-} + - PHOENIX_WORKING_DIR=/data + volumes: + - phoenix_data:/data + + # PostgreSQL configuration + phoenix-pg: + <<: *phoenix-common + profiles: ["postgres", "pg"] + environment: + - TZ=${TZ:-UTC} + - PHOENIX_ENABLE_PROMETHEUS=${PHOENIX_ENABLE_PROMETHEUS:-false} + - PHOENIX_SECRET=${PHOENIX_SECRET:-} + - PHOENIX_SQL_DATABASE_URL=postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@phoenix-db:5432/${POSTGRES_DB:-phoenix} depends_on: phoenix-db: condition: service_healthy - healthcheck: - test: ["CMD", "python3", "-c", "import httpx;r=httpx.get('http://localhost:6006/healthz').raise_for_status()"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 30s - deploy: - resources: - limits: - cpus: ${PHOENIX_CPU_LIMIT:-2.0} - memory: ${PHOENIX_MEMORY_LIMIT:-2G} - reservations: - cpus: ${PHOENIX_CPU_RESERVATION:-0.5} - memory: ${PHOENIX_MEMORY_RESERVATION:-512M} phoenix-db: <<: *defaults + profiles: ["postgres", "pg"] image: ${GLOBAL_REGISTRY:-}postgres:${POSTGRES_VERSION:-17.2-alpine3.21} environment: - TZ=${TZ:-UTC} @@ -65,4 +96,5 @@ services: memory: ${PHOENIX_DB_MEMORY_RESERVATION:-256M} volumes: + phoenix_data: phoenix_db_data: diff --git a/src/signoz/.env.example b/src/signoz/.env.example new file mode 100644 index 0000000..4edd17c --- /dev/null +++ b/src/signoz/.env.example @@ -0,0 +1,151 @@ +# Global registry prefix (optional, e.g., docker.io/, ghcr.io/) +# Leave empty to pull from Docker Hub by default +GLOBAL_REGISTRY= + +# Timezone setting for all containers +TZ=UTC + +# ============================================ +# SigNoz Version Configuration +# ============================================ + +# SigNoz ClickHouse version +SIGNOZ_CLICKHOUSE_VERSION=24.11.1-alpine + +# SigNoz OTel 
Collector version +SIGNOZ_OTEL_COLLECTOR_VERSION=0.102.8 + +# SigNoz Query Service version +SIGNOZ_QUERY_SERVICE_VERSION=0.55.0 + +# SigNoz Frontend version +SIGNOZ_FRONTEND_VERSION=0.55.0 + +# SigNoz Alert Manager version +SIGNOZ_ALERTMANAGER_VERSION=0.23.5 + +# ============================================ +# Port Configuration +# ============================================ + +# SigNoz Frontend UI port (default: 3301) +SIGNOZ_PORT_OVERRIDE=3301 + +# OTel Collector OTLP gRPC port (default: 4317) +SIGNOZ_OTEL_GRPC_PORT_OVERRIDE=4317 + +# OTel Collector OTLP HTTP port (default: 4318) +SIGNOZ_OTEL_HTTP_PORT_OVERRIDE=4318 + +# ============================================ +# ClickHouse Configuration +# ============================================ + +# ClickHouse database name +SIGNOZ_CLICKHOUSE_DB=signoz + +# ClickHouse connection URL +SIGNOZ_CLICKHOUSE_URL=tcp://clickhouse:9000/?database=signoz + +# ============================================ +# Query Service Configuration +# ============================================ + +# Storage type (clickhouse) +SIGNOZ_STORAGE=clickhouse + +# Go debug settings +SIGNOZ_GODEBUG=netdns=go + +# Telemetry enabled (true/false) +SIGNOZ_TELEMETRY_ENABLED=true + +# Deployment type +SIGNOZ_DEPLOYMENT_TYPE=docker-standalone-amd + +# ============================================ +# OTel Collector Configuration +# ============================================ + +# OTel resource attributes +SIGNOZ_OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host + +# ============================================ +# Resource Limits - ClickHouse +# ============================================ + +# CPU limit for ClickHouse +SIGNOZ_CLICKHOUSE_CPU_LIMIT=2.0 + +# Memory limit for ClickHouse +SIGNOZ_CLICKHOUSE_MEMORY_LIMIT=4G + +# CPU reservation for ClickHouse +SIGNOZ_CLICKHOUSE_CPU_RESERVATION=0.5 + +# Memory reservation for ClickHouse +SIGNOZ_CLICKHOUSE_MEMORY_RESERVATION=1G + +# ============================================ +# Resource Limits - OTel Collector +# ============================================ + +# CPU limit for OTel Collector +SIGNOZ_OTEL_COLLECTOR_CPU_LIMIT=1.0 + +# Memory limit for OTel Collector +SIGNOZ_OTEL_COLLECTOR_MEMORY_LIMIT=2G + +# CPU reservation for OTel Collector +SIGNOZ_OTEL_COLLECTOR_CPU_RESERVATION=0.25 + +# Memory reservation for OTel Collector +SIGNOZ_OTEL_COLLECTOR_MEMORY_RESERVATION=512M + +# ============================================ +# Resource Limits - Query Service +# ============================================ + +# CPU limit for Query Service +SIGNOZ_QUERY_SERVICE_CPU_LIMIT=1.0 + +# Memory limit for Query Service +SIGNOZ_QUERY_SERVICE_MEMORY_LIMIT=1G + +# CPU reservation for Query Service +SIGNOZ_QUERY_SERVICE_CPU_RESERVATION=0.25 + +# Memory reservation for Query Service +SIGNOZ_QUERY_SERVICE_MEMORY_RESERVATION=256M + +# ============================================ +# Resource Limits - Frontend +# ============================================ + +# CPU limit for Frontend +SIGNOZ_FRONTEND_CPU_LIMIT=0.5 + +# Memory limit for Frontend +SIGNOZ_FRONTEND_MEMORY_LIMIT=512M + +# CPU reservation for Frontend +SIGNOZ_FRONTEND_CPU_RESERVATION=0.1 + +# Memory reservation for Frontend +SIGNOZ_FRONTEND_MEMORY_RESERVATION=128M + +# ============================================ +# Resource Limits - Alert Manager +# ============================================ + +# CPU limit for Alert Manager +SIGNOZ_ALERTMANAGER_CPU_LIMIT=0.5 + +# Memory limit for Alert Manager +SIGNOZ_ALERTMANAGER_MEMORY_LIMIT=512M + +# CPU reservation for Alert Manager 
+SIGNOZ_ALERTMANAGER_CPU_RESERVATION=0.1 + +# Memory reservation for Alert Manager +SIGNOZ_ALERTMANAGER_MEMORY_RESERVATION=128M diff --git a/src/signoz/README.md b/src/signoz/README.md new file mode 100644 index 0000000..193f4d1 --- /dev/null +++ b/src/signoz/README.md @@ -0,0 +1,148 @@ +# SigNoz + +[English](README.md) | [中文](README.zh.md) + +SigNoz is an open-source observability platform that provides monitoring and troubleshooting capabilities for distributed applications. It offers traces, metrics, and logs in a single platform, similar to DataDog or New Relic. + +## Features + +- **Distributed Tracing**: Track requests across microservices +- **Metrics Monitoring**: Collect and visualize application and infrastructure metrics +- **Log Management**: Centralized log aggregation and analysis +- **Service Maps**: Visualize service dependencies and performance +- **Alerts**: Configure alerts based on metrics and traces +- **OpenTelemetry Native**: Built on top of OpenTelemetry standards + +## Quick Start + +1. Copy the environment file and adjust if needed: + + ```bash + cp .env.example .env + ``` + +2. Create required configuration files: + + ```bash + mkdir -p query-service frontend + # Download or create configuration files as needed + ``` + +3. Start the services: + + ```bash + docker compose up -d + ``` + +4. Access SigNoz UI at `http://localhost:3301` + +## Default Ports + +| Service | Port | Description | +| --------------------- | ---- | -------------------- | +| Frontend UI | 3301 | SigNoz web interface | +| OTel Collector (gRPC) | 4317 | OTLP gRPC receiver | +| OTel Collector (HTTP) | 4318 | OTLP HTTP receiver | + +## Configuration + +### Environment Variables + +Key environment variables (see `.env.example` for complete list): + +- `SIGNOZ_PORT_OVERRIDE`: Frontend UI port (default: 3301) +- `SIGNOZ_OTEL_GRPC_PORT_OVERRIDE`: OTLP gRPC receiver port (default: 4317) +- `SIGNOZ_OTEL_HTTP_PORT_OVERRIDE`: OTLP HTTP receiver port (default: 4318) +- `SIGNOZ_CLICKHOUSE_VERSION`: ClickHouse version +- `SIGNOZ_QUERY_SERVICE_VERSION`: Query service version +- `SIGNOZ_FRONTEND_VERSION`: Frontend version + +### Required Configuration Files + +This setup requires several configuration files: + +1. **clickhouse-config.xml**: ClickHouse server configuration +2. **clickhouse-users.xml**: ClickHouse user configuration +3. **otel-collector-config.yaml**: OTel Collector pipeline configuration +4. **query-service/prometheus.yml**: Query service Prometheus configuration +5. **frontend/nginx-config.conf**: Nginx configuration for frontend + +You can obtain these files from the [official SigNoz repository](https://github.com/SigNoz/signoz/tree/main/deploy/docker/clickhouse-setup). + +### Sending Telemetry Data + +To send telemetry data to SigNoz, configure your application to use OpenTelemetry with the following endpoints: + +- **gRPC**: `localhost:4317` +- **HTTP**: `localhost:4318` + +Example for Node.js: + +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); + +const exporter = new OTLPTraceExporter({ + url: 'http://localhost:4317', +}); +``` + +## Architecture + +SigNoz consists of the following components: + +1. **ClickHouse**: Time-series database for storing traces, metrics, and logs +2. **OTel Collector**: Receives, processes, and exports telemetry data +3. **Query Service**: Queries data from ClickHouse +4. **Frontend**: Web UI for visualization and analysis +5. 
**Alert Manager**: Manages and sends alerts + +## Resource Requirements + +Minimum recommended resources: + +- **CPU**: 4 cores +- **Memory**: 8GB RAM +- **Storage**: 20GB for data + +## Data Persistence + +Data is persisted in Docker volumes: + +- `clickhouse_data`: ClickHouse database files +- `signoz_data`: SigNoz application data +- `alertmanager_data`: Alert manager data + +## Security Considerations + +- Change default credentials if applicable +- Use environment variables for sensitive configuration +- Consider using secrets management for production deployments +- Restrict network access to necessary ports only +- Enable authentication for production use + +## Healthchecks + +All services include healthchecks to ensure proper startup and dependency management: + +- ClickHouse: HTTP health endpoint +- OTel Collector: HTTP health endpoint +- Query Service: HTTP health endpoint +- Frontend: HTTP health endpoint +- Alert Manager: HTTP health endpoint + +## Troubleshooting + +1. **Services not starting**: Check logs with `docker compose logs` +2. **No data visible**: Verify OTel Collector configuration and application instrumentation +3. **High memory usage**: Adjust ClickHouse memory limits or data retention policies + +## License + +SigNoz is licensed under the MIT License. See the [official repository](https://github.com/SigNoz/signoz) for more details. + +## References + +- [Official Documentation](https://signoz.io/docs/) +- [GitHub Repository](https://github.com/SigNoz/signoz) +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) diff --git a/src/signoz/README.zh.md b/src/signoz/README.zh.md new file mode 100644 index 0000000..7231335 --- /dev/null +++ b/src/signoz/README.zh.md @@ -0,0 +1,148 @@ +# SigNoz + +[English](README.md) | [中文](README.zh.md) + +SigNoz 是一个开源的可观测性平台,为分布式应用程序提供监控和故障排查能力。它在单一平台中提供追踪、指标和日志功能,类似于 DataDog 或 New Relic。 + +## 功能特性 + +- **分布式追踪**:跨微服务追踪请求 +- **指标监控**:收集和可视化应用程序及基础设施指标 +- **日志管理**:集中式日志聚合和分析 +- **服务地图**:可视化服务依赖关系和性能 +- **告警**:基于指标和追踪配置告警 +- **OpenTelemetry 原生**:构建在 OpenTelemetry 标准之上 + +## 快速开始 + +1. 复制环境文件并根据需要调整: + + ```bash + cp .env.example .env + ``` + +2. 创建所需的配置文件: + + ```bash + mkdir -p query-service frontend + # 根据需要下载或创建配置文件 + ``` + +3. 启动服务: + + ```bash + docker compose up -d + ``` + +4. 访问 SigNoz UI:`http://localhost:3301` + +## 默认端口 + +| 服务 | 端口 | 描述 | +| --------------------- | ---- | ---------------- | +| Frontend UI | 3301 | SigNoz Web 界面 | +| OTel Collector (gRPC) | 4317 | OTLP gRPC 接收器 | +| OTel Collector (HTTP) | 4318 | OTLP HTTP 接收器 | + +## 配置说明 + +### 环境变量 + +主要环境变量(完整列表请查看 `.env.example`): + +- `SIGNOZ_PORT_OVERRIDE`:前端 UI 端口(默认:3301) +- `SIGNOZ_OTEL_GRPC_PORT_OVERRIDE`:OTLP gRPC 接收器端口(默认:4317) +- `SIGNOZ_OTEL_HTTP_PORT_OVERRIDE`:OTLP HTTP 接收器端口(默认:4318) +- `SIGNOZ_CLICKHOUSE_VERSION`:ClickHouse 版本 +- `SIGNOZ_QUERY_SERVICE_VERSION`:查询服务版本 +- `SIGNOZ_FRONTEND_VERSION`:前端版本 + +### 必需的配置文件 + +此设置需要以下配置文件: + +1. **clickhouse-config.xml**:ClickHouse 服务器配置 +2. **clickhouse-users.xml**:ClickHouse 用户配置 +3. **otel-collector-config.yaml**:OTel Collector 管道配置 +4. **query-service/prometheus.yml**:查询服务 Prometheus 配置 +5. 
**frontend/nginx-config.conf**:前端 Nginx 配置 + +您可以从 [SigNoz 官方仓库](https://github.com/SigNoz/signoz/tree/main/deploy/docker/clickhouse-setup) 获取这些文件。 + +### 发送遥测数据 + +要向 SigNoz 发送遥测数据,请配置您的应用程序使用 OpenTelemetry 并使用以下端点: + +- **gRPC**:`localhost:4317` +- **HTTP**:`localhost:4318` + +Node.js 示例: + +```javascript +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc'); + +const exporter = new OTLPTraceExporter({ + url: 'http://localhost:4317', +}); +``` + +## 架构说明 + +SigNoz 由以下组件组成: + +1. **ClickHouse**:用于存储追踪、指标和日志的时序数据库 +2. **OTel Collector**:接收、处理和导出遥测数据 +3. **Query Service**:从 ClickHouse 查询数据 +4. **Frontend**:用于可视化和分析的 Web UI +5. **Alert Manager**:管理和发送告警 + +## 资源需求 + +最低推荐资源: + +- **CPU**:4 核 +- **内存**:8GB RAM +- **存储**:20GB 数据存储空间 + +## 数据持久化 + +数据持久化在 Docker 卷中: + +- `clickhouse_data`:ClickHouse 数据库文件 +- `signoz_data`:SigNoz 应用程序数据 +- `alertmanager_data`:告警管理器数据 + +## 安全注意事项 + +- 如适用,请更改默认凭据 +- 使用环境变量配置敏感信息 +- 生产环境部署时考虑使用密钥管理 +- 仅限制必要端口的网络访问 +- 生产环境请启用身份验证 + +## 健康检查 + +所有服务都包含健康检查以确保正确启动和依赖管理: + +- ClickHouse:HTTP 健康端点 +- OTel Collector:HTTP 健康端点 +- Query Service:HTTP 健康端点 +- Frontend:HTTP 健康端点 +- Alert Manager:HTTP 健康端点 + +## 故障排查 + +1. **服务未启动**:使用 `docker compose logs` 检查日志 +2. **无数据显示**:验证 OTel Collector 配置和应用程序仪器化 +3. **内存使用过高**:调整 ClickHouse 内存限制或数据保留策略 + +## 许可证 + +SigNoz 采用 MIT 许可证。详情请参阅 [官方仓库](https://github.com/SigNoz/signoz)。 + +## 参考资料 + +- [官方文档](https://signoz.io/docs/) +- [GitHub 仓库](https://github.com/SigNoz/signoz) +- [OpenTelemetry 文档](https://opentelemetry.io/docs/) diff --git a/src/signoz/docker-compose.yaml b/src/signoz/docker-compose.yaml new file mode 100644 index 0000000..b988d1a --- /dev/null +++ b/src/signoz/docker-compose.yaml @@ -0,0 +1,202 @@ +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +x-clickhouse-defaults: &clickhouse-defaults + restart: on-failure + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "localhost:8123/ping"] + interval: 30s + timeout: 5s + retries: 3 + deploy: + resources: + limits: + cpus: ${SIGNOZ_CLICKHOUSE_CPU_LIMIT:-2.0} + memory: ${SIGNOZ_CLICKHOUSE_MEMORY_LIMIT:-4G} + reservations: + cpus: ${SIGNOZ_CLICKHOUSE_CPU_RESERVATION:-0.5} + memory: ${SIGNOZ_CLICKHOUSE_MEMORY_RESERVATION:-1G} + +services: + # ClickHouse for storing traces, metrics and logs + clickhouse: + <<: *clickhouse-defaults + image: ${GLOBAL_REGISTRY:-}clickhouse/clickhouse-server:${SIGNOZ_CLICKHOUSE_VERSION:-24.11.1-alpine} + user: "101:101" # ClickHouse user + volumes: + - clickhouse_data:/var/lib/clickhouse + - ./clickhouse-config.xml:/etc/clickhouse-server/config.d/logging.xml:ro + - ./clickhouse-users.xml:/etc/clickhouse-server/users.d/logging.xml:ro + environment: + - TZ=${TZ:-UTC} + - CLICKHOUSE_DB=${SIGNOZ_CLICKHOUSE_DB:-signoz} + + # OTel Collector for receiving telemetry data + otel-collector: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}signoz/signoz-otel-collector:${SIGNOZ_OTEL_COLLECTOR_VERSION:-0.102.8} + command: + - "--config=/etc/otel-collector-config.yaml" + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro + environment: + - TZ=${TZ:-UTC} + - OTEL_RESOURCE_ATTRIBUTES=${SIGNOZ_OTEL_RESOURCE_ATTRIBUTES:-host.name=signoz-host} + ports: + - "${SIGNOZ_OTEL_GRPC_PORT_OVERRIDE:-4317}:4317" # OTLP gRPC receiver + - "${SIGNOZ_OTEL_HTTP_PORT_OVERRIDE:-4318}:4318" # OTLP HTTP 
receiver + depends_on: + clickhouse: + condition: service_healthy + deploy: + resources: + limits: + cpus: ${SIGNOZ_OTEL_COLLECTOR_CPU_LIMIT:-1.0} + memory: ${SIGNOZ_OTEL_COLLECTOR_MEMORY_LIMIT:-2G} + reservations: + cpus: ${SIGNOZ_OTEL_COLLECTOR_CPU_RESERVATION:-0.25} + memory: ${SIGNOZ_OTEL_COLLECTOR_MEMORY_RESERVATION:-512M} + healthcheck: + test: + [ + "CMD", + "wget", + "--no-verbose", + "--tries=1", + "--spider", + "http://localhost:13133/", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + + # Query Service for querying data + query-service: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}signoz/query-service:${SIGNOZ_QUERY_SERVICE_VERSION:-0.55.0} + command: + - "-config=/root/config/prometheus.yml" + volumes: + - ./query-service/prometheus.yml:/root/config/prometheus.yml:ro + - signoz_data:/var/lib/signoz + environment: + - TZ=${TZ:-UTC} + - ClickHouseUrl=${SIGNOZ_CLICKHOUSE_URL:-tcp://clickhouse:9000/?database=signoz} + - STORAGE=${SIGNOZ_STORAGE:-clickhouse} + - GODEBUG=${SIGNOZ_GODEBUG:-netdns=go} + - TELEMETRY_ENABLED=${SIGNOZ_TELEMETRY_ENABLED:-true} + - DEPLOYMENT_TYPE=${SIGNOZ_DEPLOYMENT_TYPE:-docker-standalone-amd} + depends_on: + clickhouse: + condition: service_healthy + deploy: + resources: + limits: + cpus: ${SIGNOZ_QUERY_SERVICE_CPU_LIMIT:-1.0} + memory: ${SIGNOZ_QUERY_SERVICE_MEMORY_LIMIT:-1G} + reservations: + cpus: ${SIGNOZ_QUERY_SERVICE_CPU_RESERVATION:-0.25} + memory: ${SIGNOZ_QUERY_SERVICE_MEMORY_RESERVATION:-256M} + healthcheck: + test: + [ + "CMD", + "wget", + "--no-verbose", + "--tries=1", + "--spider", + "http://localhost:8080/api/v1/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + + # Frontend for the UI + frontend: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}signoz/frontend:${SIGNOZ_FRONTEND_VERSION:-0.55.0} + ports: + - "${SIGNOZ_PORT_OVERRIDE:-3301}:3301" + volumes: + - ./frontend/nginx-config.conf:/etc/nginx/conf.d/default.conf:ro + environment: + - TZ=${TZ:-UTC} + depends_on: + query-service: + condition: service_healthy + deploy: + resources: + limits: + cpus: ${SIGNOZ_FRONTEND_CPU_LIMIT:-0.5} + memory: ${SIGNOZ_FRONTEND_MEMORY_LIMIT:-512M} + reservations: + cpus: ${SIGNOZ_FRONTEND_CPU_RESERVATION:-0.1} + memory: ${SIGNOZ_FRONTEND_MEMORY_RESERVATION:-128M} + healthcheck: + test: + [ + "CMD", + "wget", + "--no-verbose", + "--tries=1", + "--spider", + "http://localhost:3301/api/v1/health", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + + # Alert Manager for managing alerts + alertmanager: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}signoz/alertmanager:${SIGNOZ_ALERTMANAGER_VERSION:-0.23.5} + command: + - --queryService.url=http://query-service:8080 + - --storage.path=/data + volumes: + - alertmanager_data:/data + environment: + - TZ=${TZ:-UTC} + depends_on: + query-service: + condition: service_healthy + deploy: + resources: + limits: + cpus: ${SIGNOZ_ALERTMANAGER_CPU_LIMIT:-0.5} + memory: ${SIGNOZ_ALERTMANAGER_MEMORY_LIMIT:-512M} + reservations: + cpus: ${SIGNOZ_ALERTMANAGER_CPU_RESERVATION:-0.1} + memory: ${SIGNOZ_ALERTMANAGER_MEMORY_RESERVATION:-128M} + healthcheck: + test: + [ + "CMD", + "wget", + "--no-verbose", + "--tries=1", + "--spider", + "http://localhost:9093/-/healthy", + ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + +volumes: + clickhouse_data: + signoz_data: + alertmanager_data: diff --git a/src/tempo/.env.example b/src/tempo/.env.example new file mode 100644 index 0000000..44c4b40 --- /dev/null +++ b/src/tempo/.env.example @@ -0,0 +1,25 @@ +# 
Global Registry (optional) +# GLOBAL_REGISTRY=registry.example.com/ + +# Tempo Version +TEMPO_VERSION=2.7.2 + +# Port Overrides +TEMPO_HTTP_PORT_OVERRIDE=3200 +TEMPO_GRPC_PORT_OVERRIDE=9095 +TEMPO_OTLP_HTTP_PORT_OVERRIDE=4318 +TEMPO_OTLP_GRPC_PORT_OVERRIDE=4317 +TEMPO_ZIPKIN_PORT_OVERRIDE=9411 +TEMPO_JAEGER_THRIFT_HTTP_PORT_OVERRIDE=14268 +TEMPO_JAEGER_GRPC_PORT_OVERRIDE=14250 + +# Timezone +TZ=UTC + +# Resource Limits +TEMPO_CPU_LIMIT=1.0 +TEMPO_MEMORY_LIMIT=1G + +# Resource Reservations +TEMPO_CPU_RESERVATION=0.25 +TEMPO_MEMORY_RESERVATION=256M diff --git a/src/tempo/README.md b/src/tempo/README.md new file mode 100644 index 0000000..dbd3e15 --- /dev/null +++ b/src/tempo/README.md @@ -0,0 +1,211 @@ +# Grafana Tempo + +[中文文档](README.zh.md) + +Grafana Tempo is an open-source, easy-to-use, and high-scale distributed tracing backend. Tempo is cost-efficient, requiring only object storage to operate, and is deeply integrated with Grafana, Prometheus, and Loki. + +## Features + +- **Cost-effective**: Uses object storage (supports S3, GCS, Azure, filesystem) +- **Easy to operate**: No dependencies other than object storage +- **Multi-tenant**: Built-in multi-tenancy support +- **Multiple protocols**: Supports OTLP, Jaeger, and Zipkin +- **TraceQL**: Powerful query language for trace data +- **Metrics generation**: Can generate RED metrics from traces + +## Quick Start + +1. Copy the example environment file: + + ```bash + cp .env.example .env + ``` + +2. Start the service: + + ```bash + docker compose up -d + ``` + +3. Verify the service is running: + + ```bash + docker compose ps + curl http://localhost:3200/ready + ``` + +## Configuration + +### Environment Variables + +| Variable | Default | Description | +| ---------------------------------------- | ------- | ----------------------- | +| `TEMPO_VERSION` | `2.7.2` | Tempo version | +| `TEMPO_HTTP_PORT_OVERRIDE` | `3200` | HTTP API port | +| `TEMPO_GRPC_PORT_OVERRIDE` | `9095` | gRPC port | +| `TEMPO_OTLP_HTTP_PORT_OVERRIDE` | `4318` | OTLP HTTP receiver port | +| `TEMPO_OTLP_GRPC_PORT_OVERRIDE` | `4317` | OTLP gRPC receiver port | +| `TEMPO_ZIPKIN_PORT_OVERRIDE` | `9411` | Zipkin receiver port | +| `TEMPO_JAEGER_THRIFT_HTTP_PORT_OVERRIDE` | `14268` | Jaeger Thrift HTTP port | +| `TEMPO_JAEGER_GRPC_PORT_OVERRIDE` | `14250` | Jaeger gRPC port | +| `TZ` | `UTC` | Timezone | +| `TEMPO_CPU_LIMIT` | `1.0` | CPU limit | +| `TEMPO_MEMORY_LIMIT` | `1G` | Memory limit | +| `TEMPO_CPU_RESERVATION` | `0.25` | CPU reservation | +| `TEMPO_MEMORY_RESERVATION` | `256M` | Memory reservation | + +### Supported Trace Protocols + +- **OTLP** (OpenTelemetry Protocol): Port 4317 (gRPC), 4318 (HTTP) +- **Zipkin**: Port 9411 +- **Jaeger**: Port 14250 (gRPC), 14268 (Thrift HTTP) + +### Default Configuration + +The service includes a basic configuration file (`tempo-config.yaml`) that: + +- Enables all major trace receivers (OTLP, Jaeger, Zipkin) +- Uses local filesystem storage +- Configures trace retention and compaction +- Enables metrics generation from traces (requires Prometheus) + +For production deployments, you should customize the configuration based on your requirements. + +## Integration with Grafana + +1. Add Tempo as a data source in Grafana: + - URL: `http://tempo:3200` (if running in the same Docker network) + - Or: `http://localhost:3200` (from host machine) + +2. Query traces using TraceQL or trace IDs + +3. 
Enable trace-to-logs and trace-to-metrics correlation + +## Sending Traces to Tempo + +### OpenTelemetry SDK + +Configure your application to send traces to Tempo: + +```python +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +# Configure the OTLP exporter +otlp_exporter = OTLPSpanExporter( + endpoint="http://localhost:4317", + insecure=True +) + +# Set up the tracer provider +trace.set_tracer_provider(TracerProvider()) +trace.get_tracer_provider().add_span_processor( + BatchSpanProcessor(otlp_exporter) +) +``` + +### Using cURL (Testing) + +Send a test trace via HTTP: + +```bash +curl -X POST http://localhost:4318/v1/traces \ + -H "Content-Type: application/json" \ + -d '{ + "resourceSpans": [{ + "resource": { + "attributes": [{ + "key": "service.name", + "value": {"stringValue": "test-service"} + }] + }, + "scopeSpans": [{ + "spans": [{ + "traceId": "5B8EFFF798038103D269B633813FC60C", + "spanId": "EEE19B7EC3C1B174", + "name": "test-span", + "startTimeUnixNano": "1544712660000000000", + "endTimeUnixNano": "1544712661000000000", + "kind": 1 + }] + }] + }] + }' +``` + +### Jaeger Client Libraries + +Configure Jaeger clients to send to Tempo's Jaeger-compatible endpoints: + +```yaml +JAEGER_AGENT_HOST: localhost +JAEGER_AGENT_PORT: 14250 +``` + +## Storage + +Traces are stored in a Docker volume named `tempo_data`. + +## Metrics Generation + +Tempo can generate RED (Rate, Errors, Duration) metrics from traces. The default configuration attempts to send these to Prometheus at `http://prometheus:9090`. If you don't have Prometheus running, you can: + +1. Remove the `remote_write` section from `tempo-config.yaml` +2. Set up Prometheus to receive metrics from Tempo + +## Health Check + +The service includes a health check that monitors the `/ready` endpoint every 30 seconds. + +## Resource Requirements + +- **Minimum**: 256MB RAM, 0.25 CPU +- **Recommended**: 1GB RAM, 1 CPU (for moderate trace volumes) +- **Production**: Scale based on trace ingestion rate and retention period + +## Security Considerations + +The default configuration: + +- Runs as non-root user (UID:GID 10001:10001) +- Exposes multiple ports for different protocols +- Uses filesystem storage (not suitable for distributed deployments) + +For production: + +- Use object storage (S3, GCS, Azure Blob) +- Enable authentication and encryption +- Implement proper network security and access controls +- Configure appropriate retention policies +- Consider running in distributed mode for high availability + +## TraceQL Examples + +Query traces using TraceQL in Grafana: + +```traceql +# Find slow traces +{ duration > 1s } + +# Find traces with errors +{ status = error } + +# Find traces for a specific service +{ resource.service.name = "frontend" } + +# Complex query +{ resource.service.name = "frontend" && duration > 100ms && status = error } +``` + +## Documentation + +- [Official Documentation](https://grafana.com/docs/tempo/latest/) +- [TraceQL Query Language](https://grafana.com/docs/tempo/latest/traceql/) +- [Configuration Reference](https://grafana.com/docs/tempo/latest/configuration/) +- [GitHub Repository](https://github.com/grafana/tempo) + +## License + +Tempo is licensed under the [AGPLv3 License](https://github.com/grafana/tempo/blob/main/LICENSE). 
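+
+## Appendix: Provisioning Tempo as a Grafana Data Source
+
+If you manage Grafana through provisioning files, the manual steps in "Integration with Grafana" above can be automated. Below is a minimal sketch of a data source provisioning file; it assumes Grafana and Tempo share the same Docker network (so Tempo is reachable as `tempo:3200`), and the file name and mount path are only illustrative:
+
+```yaml
+# Example: provisioning/datasources/tempo.yaml (illustrative path)
+apiVersion: 1
+datasources:
+  - name: Tempo
+    type: tempo
+    access: proxy
+    url: http://tempo:3200
+    isDefault: false
+    editable: true
+```
+
+Mount the file into the Grafana container under `/etc/grafana/provisioning/datasources/` and restart Grafana; the Tempo data source is then created without manual configuration.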
diff --git a/src/tempo/README.zh.md b/src/tempo/README.zh.md new file mode 100644 index 0000000..f4ec710 --- /dev/null +++ b/src/tempo/README.zh.md @@ -0,0 +1,211 @@ +# Grafana Tempo + +[English Documentation](README.md) + +Grafana Tempo 是一个开源、易于使用且高度可扩展的分布式追踪后端。Tempo 非常经济高效,仅需对象存储即可运行,并与 Grafana、Prometheus 和 Loki 深度集成。 + +## 特性 + +- **成本效益**:使用对象存储(支持 S3、GCS、Azure、文件系统) +- **易于操作**:除对象存储外无其他依赖 +- **多租户**:内置多租户支持 +- **多种协议**:支持 OTLP、Jaeger 和 Zipkin +- **TraceQL**:用于追踪数据的强大查询语言 +- **指标生成**:可从追踪生成 RED 指标 + +## 快速开始 + +1. 复制示例环境文件: + + ```bash + cp .env.example .env + ``` + +2. 启动服务: + + ```bash + docker compose up -d + ``` + +3. 验证服务正在运行: + + ```bash + docker compose ps + curl http://localhost:3200/ready + ``` + +## 配置 + +### 环境变量 + +| 变量 | 默认值 | 描述 | +| ---------------------------------------- | ------- | ----------------------- | +| `TEMPO_VERSION` | `2.7.2` | Tempo 版本 | +| `TEMPO_HTTP_PORT_OVERRIDE` | `3200` | HTTP API 端口 | +| `TEMPO_GRPC_PORT_OVERRIDE` | `9095` | gRPC 端口 | +| `TEMPO_OTLP_HTTP_PORT_OVERRIDE` | `4318` | OTLP HTTP 接收器端口 | +| `TEMPO_OTLP_GRPC_PORT_OVERRIDE` | `4317` | OTLP gRPC 接收器端口 | +| `TEMPO_ZIPKIN_PORT_OVERRIDE` | `9411` | Zipkin 接收器端口 | +| `TEMPO_JAEGER_THRIFT_HTTP_PORT_OVERRIDE` | `14268` | Jaeger Thrift HTTP 端口 | +| `TEMPO_JAEGER_GRPC_PORT_OVERRIDE` | `14250` | Jaeger gRPC 端口 | +| `TZ` | `UTC` | 时区 | +| `TEMPO_CPU_LIMIT` | `1.0` | CPU 限制 | +| `TEMPO_MEMORY_LIMIT` | `1G` | 内存限制 | +| `TEMPO_CPU_RESERVATION` | `0.25` | CPU 预留 | +| `TEMPO_MEMORY_RESERVATION` | `256M` | 内存预留 | + +### 支持的追踪协议 + +- **OTLP**(OpenTelemetry 协议):端口 4317(gRPC),4318(HTTP) +- **Zipkin**:端口 9411 +- **Jaeger**:端口 14250(gRPC),14268(Thrift HTTP) + +### 默认配置 + +该服务包含一个基本配置文件(`tempo-config.yaml`),该文件: + +- 启用所有主要追踪接收器(OTLP、Jaeger、Zipkin) +- 使用本地文件系统存储 +- 配置追踪保留和压缩 +- 启用从追踪生成指标(需要 Prometheus) + +对于生产部署,您应该根据需求自定义配置。 + +## 与 Grafana 集成 + +1. 在 Grafana 中添加 Tempo 作为数据源: + - URL:`http://tempo:3200`(如果在同一 Docker 网络中运行) + - 或者:`http://localhost:3200`(从主机访问) + +2. 使用 TraceQL 或追踪 ID 查询追踪 + +3. 启用追踪到日志和追踪到指标的关联 + +## 向 Tempo 发送追踪 + +### OpenTelemetry SDK + +配置您的应用程序向 Tempo 发送追踪: + +```python +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +# 配置 OTLP 导出器 +otlp_exporter = OTLPSpanExporter( + endpoint="http://localhost:4317", + insecure=True +) + +# 设置追踪器提供程序 +trace.set_tracer_provider(TracerProvider()) +trace.get_tracer_provider().add_span_processor( + BatchSpanProcessor(otlp_exporter) +) +``` + +### 使用 cURL(测试) + +通过 HTTP 发送测试追踪: + +```bash +curl -X POST http://localhost:4318/v1/traces \ + -H "Content-Type: application/json" \ + -d '{ + "resourceSpans": [{ + "resource": { + "attributes": [{ + "key": "service.name", + "value": {"stringValue": "test-service"} + }] + }, + "scopeSpans": [{ + "spans": [{ + "traceId": "5B8EFFF798038103D269B633813FC60C", + "spanId": "EEE19B7EC3C1B174", + "name": "test-span", + "startTimeUnixNano": "1544712660000000000", + "endTimeUnixNano": "1544712661000000000", + "kind": 1 + }] + }] + }] + }' +``` + +### Jaeger 客户端库 + +配置 Jaeger 客户端向 Tempo 的 Jaeger 兼容端点发送: + +```yaml +JAEGER_AGENT_HOST: localhost +JAEGER_AGENT_PORT: 14250 +``` + +## 存储 + +追踪存储在名为 `tempo_data` 的 Docker 卷中。 + +## 指标生成 + +Tempo 可以从追踪生成 RED(速率、错误、持续时间)指标。默认配置尝试将这些指标发送到 `http://prometheus:9090` 的 Prometheus。如果您没有运行 Prometheus,您可以: + +1. 从 `tempo-config.yaml` 中删除 `remote_write` 部分 +2. 
设置 Prometheus 以接收来自 Tempo 的指标 + +## 健康检查 + +该服务包含健康检查,每 30 秒监控一次 `/ready` 端点。 + +## 资源需求 + +- **最低要求**:256MB RAM,0.25 CPU +- **推荐配置**:1GB RAM,1 CPU(用于中等追踪量) +- **生产环境**:根据追踪摄入速率和保留期限进行扩展 + +## 安全注意事项 + +默认配置: + +- 以非 root 用户运行(UID:GID 10001:10001) +- 为不同协议暴露多个端口 +- 使用文件系统存储(不适用于分布式部署) + +对于生产环境: + +- 使用对象存储(S3、GCS、Azure Blob) +- 启用身份验证和加密 +- 实施适当的网络安全和访问控制 +- 配置适当的保留策略 +- 考虑以分布式模式运行以实现高可用性 + +## TraceQL 示例 + +在 Grafana 中使用 TraceQL 查询追踪: + +```traceql +# 查找慢追踪 +{ duration > 1s } + +# 查找有错误的追踪 +{ status = error } + +# 查找特定服务的追踪 +{ resource.service.name = "frontend" } + +# 复杂查询 +{ resource.service.name = "frontend" && duration > 100ms && status = error } +``` + +## 文档 + +- [官方文档](https://grafana.com/docs/tempo/latest/) +- [TraceQL 查询语言](https://grafana.com/docs/tempo/latest/traceql/) +- [配置参考](https://grafana.com/docs/tempo/latest/configuration/) +- [GitHub 仓库](https://github.com/grafana/tempo) + +## 许可证 + +Tempo 使用 [AGPLv3 许可证](https://github.com/grafana/tempo/blob/main/LICENSE)。 diff --git a/src/tempo/docker-compose.yaml b/src/tempo/docker-compose.yaml new file mode 100644 index 0000000..ece854a --- /dev/null +++ b/src/tempo/docker-compose.yaml @@ -0,0 +1,44 @@ +x-defaults: &defaults + restart: unless-stopped + logging: + driver: json-file + options: + max-size: 100m + max-file: "3" + +services: + tempo: + <<: *defaults + image: ${GLOBAL_REGISTRY:-}grafana/tempo:${TEMPO_VERSION:-2.7.2} + ports: + - "${TEMPO_HTTP_PORT_OVERRIDE:-3200}:3200" # HTTP + - "${TEMPO_GRPC_PORT_OVERRIDE:-9095}:9095" # gRPC + - "${TEMPO_OTLP_HTTP_PORT_OVERRIDE:-4318}:4318" # OTLP HTTP + - "${TEMPO_OTLP_GRPC_PORT_OVERRIDE:-4317}:4317" # OTLP gRPC + - "${TEMPO_ZIPKIN_PORT_OVERRIDE:-9411}:9411" # Zipkin + - "${TEMPO_JAEGER_THRIFT_HTTP_PORT_OVERRIDE:-14268}:14268" # Jaeger Thrift HTTP + - "${TEMPO_JAEGER_GRPC_PORT_OVERRIDE:-14250}:14250" # Jaeger gRPC + volumes: + - tempo_data:/tmp/tempo + - ./tempo-config.yaml:/etc/tempo/tempo.yaml:ro + environment: + - TZ=${TZ:-UTC} + command: -config.file=/etc/tempo/tempo.yaml + user: "10001:10001" # Tempo user + deploy: + resources: + limits: + cpus: ${TEMPO_CPU_LIMIT:-1.0} + memory: ${TEMPO_MEMORY_LIMIT:-1G} + reservations: + cpus: ${TEMPO_CPU_RESERVATION:-0.25} + memory: ${TEMPO_MEMORY_RESERVATION:-256M} + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3200/ready"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s + +volumes: + tempo_data: diff --git a/src/tempo/tempo-config.yaml b/src/tempo/tempo-config.yaml new file mode 100644 index 0000000..c71c0ae --- /dev/null +++ b/src/tempo/tempo-config.yaml @@ -0,0 +1,50 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + zipkin: + endpoint: 0.0.0.0:9411 + jaeger: + protocols: + thrift_http: + endpoint: 0.0.0.0:14268 + grpc: + endpoint: 0.0.0.0:14250 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 1h + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + +storage: + trace: + backend: local + wal: + path: /tmp/tempo/wal + local: + path: /tmp/tempo/blocks + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics]
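+
+# ---------------------------------------------------------------------------
+# Production note (commented-out sketch): the README recommends object storage
+# for production instead of the local filesystem backend configured above.
+# The block below shows what an S3-compatible backend could look like; the
+# bucket, endpoint, region, and credentials are placeholders and must be
+# replaced with real values (do not commit credentials to version control).
+#
+# storage:
+#   trace:
+#     backend: s3
+#     wal:
+#       path: /tmp/tempo/wal
+#     s3:
+#       bucket: tempo-traces        # placeholder bucket name
+#       endpoint: s3.example.com    # placeholder S3-compatible endpoint
+#       region: us-east-1           # placeholder region
+#       access_key: CHANGE_ME       # placeholder credential
+#       secret_key: CHANGE_ME       # placeholder credential
+#       insecure: false
+# ---------------------------------------------------------------------------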