introduce uptime monitor for easytier public nodes (#1250)

2026-05-07 10:14:35 +00:00 · 2025-08-20 22:59:44 +08:00
parent 8f37d4ef7c
commit e6ec7f405c
61 changed files with 12122 additions and 17 deletions
@@ -0,0 +1,160 @@
+use std::{collections::HashSet, sync::Arc, time::Duration};
+
+use anyhow::Context as _;
+use tokio::time::{interval, Interval};
+use tracing::{error, info};
+
+use crate::{
+    db::{entity::shared_nodes, operations::NodeOperations, Db},
+    health_checker::HealthChecker,
+};
+
+/// HealthChecker的封装器，用于监控数据库中节点的添加和删除
+pub struct HealthCheckerManager {
+    health_checker: Arc<HealthChecker>,
+    db: Db,
+    current_nodes: Arc<tokio::sync::RwLock<HashSet<i32>>>,
+    monitor_interval: Duration,
+}
+
+impl HealthCheckerManager {
+    /// 创建新的HealthCheckerManager实例
+    pub fn new(health_checker: Arc<HealthChecker>, db: Db) -> Self {
+        Self {
+            health_checker,
+            db,
+            current_nodes: Arc::new(tokio::sync::RwLock::new(HashSet::new())),
+            monitor_interval: Duration::from_secs(1), // 默认每1秒检查一次
+        }
+    }
+
+    /// 设置监控间隔
+    pub fn with_monitor_interval(mut self, interval: Duration) -> Self {
+        self.monitor_interval = interval;
+        self
+    }
+
+    /// 启动监控任务
+    pub async fn start_monitoring(&self) -> anyhow::Result<()> {
+        // 启动定期检查任务
+        let health_checker = Arc::clone(&self.health_checker);
+        let db = self.db.clone();
+        let current_nodes = Arc::clone(&self.current_nodes);
+        let monitor_interval = self.monitor_interval;
+
+        tokio::spawn(async move {
+            let mut ticker = interval(monitor_interval);
+            loop {
+                if let Err(e) = Self::check_node_changes(&health_checker, &db, &current_nodes).await
+                {
+                    tracing::error!("Error checking node changes: {}", e);
+                }
+                ticker.tick().await;
+            }
+        });
+
+        Ok(())
+    }
+
+    /// 检查节点变化并更新监控
+    async fn check_node_changes(
+        health_checker: &Arc<HealthChecker>,
+        db: &Db,
+        current_nodes: &Arc<tokio::sync::RwLock<HashSet<i32>>>,
+    ) -> anyhow::Result<()> {
+        // 获取数据库中当前的所有节点
+        let db_nodes = NodeOperations::get_all_nodes(db)
+            .await
+            .with_context(|| "Failed to get all nodes from database")?;
+
+        let db_node_ids: HashSet<i32> = db_nodes.iter().map(|node| node.id).collect();
+
+        let mut current_nodes_guard = current_nodes.write().await;
+
+        // 检查新增的节点
+        for &node_id in &db_node_ids {
+            if !current_nodes_guard.contains(&node_id) {
+                // 新节点，添加到监控
+                if let Err(e) = health_checker.add_node(node_id).await {
+                    error!("Failed to add node {} to health checker: {}", node_id, e);
+                    continue;
+                }
+                current_nodes_guard.insert(node_id);
+                info!("Added new node {} to health monitoring", node_id);
+            } else if let Err(e) = health_checker.try_update_node(node_id).await {
+                error!("Failed to add node {} to health checker: {}", node_id, e);
+            }
+        }
+
+        // 检查删除的节点
+        let nodes_to_remove: Vec<i32> = current_nodes_guard
+            .iter()
+            .filter(|&&node_id| !db_node_ids.contains(&node_id))
+            .copied()
+            .collect();
+
+        for node_id in nodes_to_remove {
+            // 节点已删除，从监控中移除
+            if let Err(e) = health_checker.remove_node(node_id).await {
+                error!(
+                    "Failed to remove node {} from health checker: {}",
+                    node_id, e
+                );
+                continue;
+            }
+            current_nodes_guard.remove(&node_id);
+            info!("Removed node {} from health monitoring", node_id);
+        }
+
+        Ok(())
+    }
+
+    /// 手动触发节点变化检查
+    pub async fn refresh_nodes(&self) -> anyhow::Result<()> {
+        Self::check_node_changes(&self.health_checker, &self.db, &self.current_nodes).await
+    }
+
+    /// 获取当前监控的节点数量
+    pub async fn get_monitored_node_count(&self) -> usize {
+        self.current_nodes.read().await.len()
+    }
+
+    /// 获取当前监控的节点ID列表
+    pub async fn get_monitored_nodes(&self) -> Vec<i32> {
+        self.current_nodes.read().await.iter().copied().collect()
+    }
+
+    /// 获取节点的内存健康记录
+    pub fn get_node_memory_record(
+        &self,
+        node_id: i32,
+    ) -> Option<crate::health_checker::HealthyMemRecord> {
+        self.health_checker.get_node_memory_record(node_id)
+    }
+
+    /// 获取节点的健康统计信息
+    pub fn get_node_health_stats(
+        &self,
+        node_id: i32,
+        hours: u64,
+    ) -> Option<crate::db::HealthStats> {
+        self.health_checker.get_node_health_stats(node_id, hours)
+    }
+
+    /// 获取所有节点的当前健康状态
+    pub fn get_all_nodes_health_status(
+        &self,
+    ) -> Vec<(i32, crate::db::HealthStatus, Option<String>)> {
+        self.health_checker.get_all_nodes_health_status()
+    }
+
+    pub async fn test_connection(
+        &self,
+        node_info: &shared_nodes::Model,
+        max_time: Duration,
+    ) -> anyhow::Result<()> {
+        self.health_checker
+            .test_connection(node_info, max_time)
+            .await
+    }
+}