mirror of
https://github.com/EasyTier/EasyTier.git
synced 2026-05-07 18:24:36 +00:00
introduce uptime monitor for easytier public nodes (#1250)
This commit is contained in:
@@ -0,0 +1,660 @@
|
||||
use std::{
|
||||
ops::{DerefMut, Div},
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use anyhow::Context as _;
|
||||
use dashmap::DashMap;
|
||||
use easytier::{
|
||||
common::{
|
||||
config::{ConfigLoader, NetworkIdentity, PeerConfig, TomlConfigLoader},
|
||||
scoped_task::ScopedTask,
|
||||
},
|
||||
defer,
|
||||
instance_manager::NetworkInstanceManager,
|
||||
launcher::ConfigSource,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::any;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
|
||||
use crate::db::{
|
||||
entity::shared_nodes,
|
||||
operations::{HealthOperations, NodeOperations},
|
||||
Db, HealthStatus,
|
||||
};
|
||||
|
||||
/// Descriptor for a health check that targets a single node.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type — confirm whether it is still needed.
pub struct HealthCheckOneNode {
    /// Identifier of the node to check, kept as a string.
    node_id: String,
}
|
||||
|
||||
/// Width of a single ring bucket, in seconds (15 minutes).
const HEALTH_CHECK_RING_GRANULARITY_SEC: usize = 60 * 15;
/// Total time span covered by one full turn of the ring, in seconds (at most one day).
const HEALTH_CHECK_RING_MAX_DURATION_SEC: usize = 60 * 60 * 24;

// Alternative, much shorter values that were left commented out:
// const HEALTH_CHECK_RING_GRANULARITY_SEC: usize = 10;
// const HEALTH_CHECK_RING_MAX_DURATION_SEC: usize = 60;

/// Number of buckets in a ring: the covered span divided by the bucket width.
const HEALTH_CHECK_RING_SIZE: usize =
    HEALTH_CHECK_RING_MAX_DURATION_SEC / HEALTH_CHECK_RING_GRANULARITY_SEC;

/// One bucket of a counter ring.
///
/// A bucket is reused once per full turn of the ring. `round` records which
/// turn the current `counter` belongs to, so a count left over from an
/// earlier turn is never mistaken for a current one.
#[derive(Debug, Default, Clone)]
struct RingItem {
    /// Number of events counted during `round`.
    counter: u64,
    /// Index of the ring turn (timestamp divided by the ring span) that
    /// `counter` belongs to.
    round: u64,
}

impl RingItem {
    /// Returns the ring turn that `timestamp` (UNIX seconds) falls into.
    fn round_of(timestamp: u64) -> u64 {
        timestamp.div((HEALTH_CHECK_RING_GRANULARITY_SEC * HEALTH_CHECK_RING_SIZE) as u64)
    }

    /// Moves the bucket forward to the round of `timestamp`, clearing the
    /// counter when a newer round starts.
    ///
    /// A timestamp from an *older* round leaves the bucket untouched: rolling
    /// backwards would discard the newer count, which used to happen when
    /// history was replayed newest-first.
    fn try_update_round(&mut self, timestamp: u64) {
        let cur_round = Self::round_of(timestamp);
        if cur_round > self.round {
            self.round = cur_round;
            self.counter = 0;
        }
    }

    /// Counts one event observed at `timestamp` (UNIX seconds).
    ///
    /// Events that belong to a round older than the one currently stored are
    /// stale and are ignored.
    fn inc(&mut self, timestamp: u64) {
        self.try_update_round(timestamp);
        if Self::round_of(timestamp) == self.round {
            self.counter += 1;
        }
    }

    /// Returns the number of events counted in the round of `timestamp`.
    ///
    /// Asking about a newer round resets the bucket and yields 0; asking
    /// about an older round yields 0 without disturbing the stored count.
    fn get(&mut self, timestamp: u64) -> u64 {
        self.try_update_round(timestamp);
        if Self::round_of(timestamp) == self.round {
            self.counter
        } else {
            0
        }
    }
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HealthyMemRecord {
|
||||
node_id: i32,
|
||||
current_health_status: HealthStatus,
|
||||
last_error_info: Option<String>,
|
||||
last_check_time: chrono::DateTime<chrono::Utc>,
|
||||
last_response_time: Option<i32>,
|
||||
|
||||
// the current time is corresponding to the index by modulo with UNIX-timestamp.
|
||||
total_check_counter_ring: Vec<RingItem>,
|
||||
healthy_counter_ring: Vec<RingItem>,
|
||||
}
|
||||
|
||||
impl HealthyMemRecord {
|
||||
pub fn new(node_id: i32) -> Self {
|
||||
Self {
|
||||
node_id,
|
||||
current_health_status: HealthStatus::Unknown,
|
||||
last_error_info: None,
|
||||
last_check_time: chrono::Utc::now(),
|
||||
last_response_time: None,
|
||||
total_check_counter_ring: vec![Default::default(); HEALTH_CHECK_RING_SIZE],
|
||||
healthy_counter_ring: vec![Default::default(); HEALTH_CHECK_RING_SIZE],
|
||||
}
|
||||
}
|
||||
|
||||
/// 从数据库记录初始化内存记录
|
||||
pub fn from_db_records(
|
||||
node_id: i32,
|
||||
records: &[crate::db::entity::health_records::Model],
|
||||
) -> Self {
|
||||
let mut mem_record = Self::new(node_id);
|
||||
|
||||
if let Some(latest) = records.first() {
|
||||
mem_record.current_health_status = latest.get_status();
|
||||
mem_record.last_check_time = latest.checked_at.to_utc();
|
||||
mem_record.last_response_time = if latest.response_time == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(latest.response_time)
|
||||
};
|
||||
mem_record.last_error_info = if latest.error_message.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(latest.error_message.clone())
|
||||
};
|
||||
}
|
||||
|
||||
// 填充环形缓冲区
|
||||
mem_record.populate_ring_from_records(records);
|
||||
mem_record
|
||||
}
|
||||
|
||||
/// 从历史记录填充环形缓冲区
|
||||
fn populate_ring_from_records(&mut self, records: &[crate::db::entity::health_records::Model]) {
|
||||
let now = chrono::Utc::now().timestamp() as usize;
|
||||
|
||||
for record in records {
|
||||
let record_time = record.checked_at.to_utc().timestamp() as usize;
|
||||
let time_diff = now.saturating_sub(record_time);
|
||||
|
||||
// 只处理在环形缓冲区时间范围内的记录
|
||||
if time_diff < HEALTH_CHECK_RING_MAX_DURATION_SEC {
|
||||
let ring_index =
|
||||
(record_time / HEALTH_CHECK_RING_GRANULARITY_SEC) % HEALTH_CHECK_RING_SIZE;
|
||||
self.total_check_counter_ring[ring_index].inc(record_time as u64);
|
||||
|
||||
if record.get_status() == HealthStatus::Healthy {
|
||||
self.healthy_counter_ring[ring_index].inc(record_time as u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// 更新健康状态并记录到环形缓冲区
|
||||
pub fn update_health_status(
|
||||
&mut self,
|
||||
status: HealthStatus,
|
||||
response_time: Option<i32>,
|
||||
error_message: Option<String>,
|
||||
) {
|
||||
self.current_health_status = status.clone();
|
||||
self.last_check_time = chrono::Utc::now();
|
||||
self.last_response_time = response_time;
|
||||
self.last_error_info = error_message;
|
||||
|
||||
// 更新环形缓冲区
|
||||
let now = chrono::Utc::now().timestamp() as usize;
|
||||
let ring_index = (now / HEALTH_CHECK_RING_GRANULARITY_SEC) % HEALTH_CHECK_RING_SIZE;
|
||||
|
||||
self.total_check_counter_ring[ring_index].inc(now as u64);
|
||||
self.healthy_counter_ring[ring_index].try_update_round(now as u64);
|
||||
if status == HealthStatus::Healthy {
|
||||
self.healthy_counter_ring[ring_index].inc(now as u64);
|
||||
}
|
||||
}
|
||||
|
||||
/// 获取健康统计信息
|
||||
pub fn get_health_stats(&self, hours: u64) -> crate::db::HealthStats {
|
||||
let now = chrono::Utc::now().timestamp() as usize;
|
||||
|
||||
let mut total_checks = 0;
|
||||
let mut healthy_count = 0;
|
||||
|
||||
for ring_index in 0..HEALTH_CHECK_RING_SIZE {
|
||||
total_checks += self.total_check_counter_ring[ring_index].counter;
|
||||
healthy_count += self.healthy_counter_ring[ring_index].counter;
|
||||
}
|
||||
|
||||
let health_percentage = if total_checks > 0 {
|
||||
(healthy_count as f64 / total_checks as f64) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
crate::db::HealthStats {
|
||||
total_checks,
|
||||
healthy_count,
|
||||
unhealthy_count: total_checks - healthy_count,
|
||||
health_percentage,
|
||||
average_response_time: self.last_response_time.map(|rt| rt as f64),
|
||||
uptime_percentage: health_percentage,
|
||||
last_check_time: Some(self.last_check_time),
|
||||
last_status: Some(self.current_health_status.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
/// 获取当前健康状态
|
||||
pub fn get_current_health_status(&self) -> &HealthStatus {
|
||||
&self.current_health_status
|
||||
}
|
||||
|
||||
/// 获取最后检查时间
|
||||
pub fn get_last_check_time(&self) -> chrono::DateTime<chrono::Utc> {
|
||||
self.last_check_time
|
||||
}
|
||||
|
||||
/// 获取最后响应时间
|
||||
pub fn get_last_response_time(&self) -> Option<i32> {
|
||||
self.last_response_time
|
||||
}
|
||||
|
||||
/// 获取最后错误信息
|
||||
pub fn get_last_error_info(&self) -> &Option<String> {
|
||||
&self.last_error_info
|
||||
}
|
||||
|
||||
pub fn get_counter_ring(&mut self) -> (Vec<u64>, Vec<u64>) {
|
||||
let now = self.last_check_time.timestamp() as usize;
|
||||
|
||||
let mut total_ring = vec![0; HEALTH_CHECK_RING_SIZE];
|
||||
let mut healthy_ring = vec![0; HEALTH_CHECK_RING_SIZE];
|
||||
|
||||
let mut total_checks = 0;
|
||||
let mut healthy_count = 0;
|
||||
|
||||
for i in 0..HEALTH_CHECK_RING_SIZE {
|
||||
let ring_time = now - (i * HEALTH_CHECK_RING_GRANULARITY_SEC);
|
||||
let ring_index =
|
||||
ring_time.div_euclid(HEALTH_CHECK_RING_GRANULARITY_SEC) % HEALTH_CHECK_RING_SIZE;
|
||||
total_ring[i] = self.total_check_counter_ring[ring_index].get(ring_time as u64);
|
||||
healthy_ring[i] = self.healthy_counter_ring[ring_index].counter;
|
||||
}
|
||||
|
||||
(total_ring, healthy_ring)
|
||||
}
|
||||
|
||||
pub fn get_ring_granularity(&self) -> u32 {
|
||||
HEALTH_CHECK_RING_GRANULARITY_SEC as u32
|
||||
}
|
||||
}
|
||||
|
||||
pub struct HealthChecker {
|
||||
db: Db,
|
||||
instance_mgr: Arc<NetworkInstanceManager>,
|
||||
inst_id_map: DashMap<i32, uuid::Uuid>,
|
||||
node_tasks: DashMap<i32, ScopedTask<()>>,
|
||||
node_records: Arc<DashMap<i32, HealthyMemRecord>>,
|
||||
node_cfg: Arc<DashMap<i32, TomlConfigLoader>>,
|
||||
}
|
||||
|
||||
impl HealthChecker {
|
||||
pub fn new(db: Db) -> Self {
|
||||
let instance_mgr = Arc::new(NetworkInstanceManager::new());
|
||||
Self {
|
||||
db,
|
||||
instance_mgr,
|
||||
inst_id_map: DashMap::new(),
|
||||
node_tasks: DashMap::new(),
|
||||
node_records: Arc::new(DashMap::new()),
|
||||
node_cfg: Arc::new(DashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// 启动时从数据库加载所有节点的健康记录到内存
|
||||
pub async fn load_health_records_from_db(&self) -> anyhow::Result<()> {
|
||||
info!("Loading health records from database...");
|
||||
|
||||
// 获取所有活跃节点
|
||||
let nodes = NodeOperations::get_all_nodes(&self.db)
|
||||
.await
|
||||
.with_context(|| "Failed to get all nodes from database")?;
|
||||
|
||||
let from_date = chrono::Utc::now().naive_utc()
|
||||
- chrono::Duration::seconds(HEALTH_CHECK_RING_MAX_DURATION_SEC as i64);
|
||||
|
||||
for node in nodes {
|
||||
// 获取每个节点最近的健康记录(用于初始化环形缓冲区)
|
||||
let records =
|
||||
HealthOperations::get_node_health_records(&self.db, node.id, Some(from_date), None)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to get health records for node {}", node.id)
|
||||
})?;
|
||||
|
||||
// 创建内存记录
|
||||
let mem_record = HealthyMemRecord::from_db_records(node.id, &records);
|
||||
self.node_records.insert(node.id, mem_record);
|
||||
|
||||
debug!(
|
||||
"Loaded {} health records for node {} ({})",
|
||||
records.len(),
|
||||
node.id,
|
||||
node.name
|
||||
);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Loaded health records for {} nodes",
|
||||
self.node_records.len()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 获取节点的内存健康记录
|
||||
pub fn get_node_memory_record(&self, node_id: i32) -> Option<HealthyMemRecord> {
|
||||
self.node_records.get(&node_id).map(|entry| entry.clone())
|
||||
}
|
||||
|
||||
/// 获取节点的健康统计信息(从内存)
|
||||
pub fn get_node_health_stats(
|
||||
&self,
|
||||
node_id: i32,
|
||||
hours: u64,
|
||||
) -> Option<crate::db::HealthStats> {
|
||||
self.node_records
|
||||
.get(&node_id)
|
||||
.map(|record| record.get_health_stats(hours))
|
||||
}
|
||||
|
||||
/// 获取所有节点的当前健康状态(从内存)
|
||||
pub fn get_all_nodes_health_status(&self) -> Vec<(i32, HealthStatus, Option<String>)> {
|
||||
self.node_records
|
||||
.iter()
|
||||
.map(|entry| {
|
||||
let record = entry.value();
|
||||
(
|
||||
record.node_id,
|
||||
record.current_health_status.clone(),
|
||||
record.last_error_info.clone(),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub async fn try_update_node(&self, node_id: i32) -> anyhow::Result<()> {
|
||||
let old_cfg = self
|
||||
.node_cfg
|
||||
.get(&node_id)
|
||||
.ok_or_else(|| anyhow::anyhow!("old node cfg not found, node_id: {}", node_id))?
|
||||
.clone();
|
||||
let new_cfg = self.get_node_cfg(node_id, Some(old_cfg.get_id())).await?;
|
||||
|
||||
if new_cfg.dump() != old_cfg.dump() {
|
||||
self.remove_node(node_id).await?;
|
||||
self.add_node(node_id).await?;
|
||||
info!("node {} cfg updated", node_id);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_node_cfg_with_model(
|
||||
&self,
|
||||
node_info: &shared_nodes::Model,
|
||||
inst_id: Option<uuid::Uuid>,
|
||||
) -> anyhow::Result<TomlConfigLoader> {
|
||||
let cfg = TomlConfigLoader::default();
|
||||
cfg.set_peers(vec![PeerConfig {
|
||||
uri: format!(
|
||||
"{}://{}:{}",
|
||||
node_info.protocol, node_info.host, node_info.port
|
||||
)
|
||||
.parse()
|
||||
.with_context(|| "failed to parse peer uri")?,
|
||||
}]);
|
||||
|
||||
let inst_id = inst_id.unwrap_or(uuid::Uuid::new_v4());
|
||||
cfg.set_id(inst_id);
|
||||
cfg.set_network_identity(NetworkIdentity::new(
|
||||
node_info.network_name.clone(),
|
||||
node_info.network_secret.clone(),
|
||||
));
|
||||
|
||||
cfg.set_hostname(Some("HealthCheckNode".to_string()));
|
||||
|
||||
let mut flags = cfg.get_flags();
|
||||
flags.no_tun = true;
|
||||
flags.disable_p2p = true;
|
||||
flags.disable_udp_hole_punching = true;
|
||||
cfg.set_flags(flags);
|
||||
|
||||
Ok(cfg)
|
||||
}
|
||||
|
||||
pub async fn test_connection(
|
||||
&self,
|
||||
node_info: &shared_nodes::Model,
|
||||
max_time: Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
let cfg = self.get_node_cfg_with_model(node_info, None).await?;
|
||||
defer!({
|
||||
let _ = self
|
||||
.instance_mgr
|
||||
.delete_network_instance(vec![cfg.get_id()]);
|
||||
});
|
||||
self.instance_mgr
|
||||
.run_network_instance(cfg.clone(), ConfigSource::FFI)
|
||||
.with_context(|| "failed to run network instance")?;
|
||||
|
||||
let now = Instant::now();
|
||||
let mut err = None;
|
||||
while now.elapsed() < max_time {
|
||||
match Self::test_node_healthy(cfg.get_id(), self.instance_mgr.clone()).await {
|
||||
Ok(_) => {
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"test node healthy failed, node_info: {:?}, err: {}",
|
||||
node_info, e
|
||||
);
|
||||
err = Some(e);
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
Err(anyhow::anyhow!("test node healthy failed, err: {:?}", err))
|
||||
}
|
||||
|
||||
async fn get_node_cfg(
|
||||
&self,
|
||||
node_id: i32,
|
||||
inst_id: Option<uuid::Uuid>,
|
||||
) -> anyhow::Result<TomlConfigLoader> {
|
||||
let node_info = NodeOperations::get_node_by_id(&self.db, node_id)
|
||||
.await
|
||||
.with_context(|| format!("failed to get node by id: {}", node_id))?
|
||||
.ok_or_else(|| anyhow::anyhow!("node not found"))?;
|
||||
self.get_node_cfg_with_model(&node_info, inst_id).await
|
||||
}
|
||||
|
||||
pub async fn add_node(&self, node_id: i32) -> anyhow::Result<()> {
|
||||
let cfg = self.get_node_cfg(node_id, None).await?;
|
||||
info!(
|
||||
"Add node {} to health checker, cfg: {}",
|
||||
node_id,
|
||||
cfg.dump()
|
||||
);
|
||||
|
||||
self.instance_mgr
|
||||
.run_network_instance(cfg.clone(), ConfigSource::FFI)
|
||||
.with_context(|| "failed to run network instance")?;
|
||||
self.inst_id_map.insert(node_id, cfg.get_id());
|
||||
|
||||
// 初始化内存记录(如果不存在)
|
||||
if !self.node_records.contains_key(&node_id) {
|
||||
// 从数据库加载历史记录
|
||||
let from_date = chrono::Utc::now().naive_utc()
|
||||
- chrono::Duration::seconds(HEALTH_CHECK_RING_MAX_DURATION_SEC as i64);
|
||||
if let Ok(records) =
|
||||
HealthOperations::get_node_health_records(&self.db, node_id, Some(from_date), None)
|
||||
.await
|
||||
{
|
||||
let mem_record = HealthyMemRecord::from_db_records(node_id, &records);
|
||||
self.node_records.insert(node_id, mem_record);
|
||||
info!(
|
||||
"Initialized memory record for node {} with {} historical records",
|
||||
node_id,
|
||||
records.len()
|
||||
);
|
||||
} else {
|
||||
self.node_records
|
||||
.insert(node_id, HealthyMemRecord::new(node_id));
|
||||
info!("Initialized new memory record for node {}", node_id);
|
||||
}
|
||||
}
|
||||
|
||||
// 启动健康检查任务
|
||||
let task = ScopedTask::from(tokio::spawn(Self::node_health_check_task(
|
||||
node_id,
|
||||
cfg.get_id(),
|
||||
Arc::clone(&self.instance_mgr),
|
||||
self.db.clone(),
|
||||
Arc::clone(&self.node_records),
|
||||
)));
|
||||
self.node_tasks.insert(node_id, task);
|
||||
self.node_cfg.insert(node_id, cfg.clone());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn remove_node(&self, node_id: i32) -> anyhow::Result<()> {
|
||||
self.node_tasks.remove(&node_id);
|
||||
if let Some(inst_id) = self.inst_id_map.remove(&node_id) {
|
||||
let _ = self.instance_mgr.delete_network_instance(vec![inst_id.1]);
|
||||
}
|
||||
self.node_cfg.remove(&node_id);
|
||||
// 保留内存记录,不删除,以便后续查询历史数据
|
||||
info!(
|
||||
"Removed health check task for node {}, memory record retained",
|
||||
node_id
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(err, ret, skip(instance_mgr))]
|
||||
async fn test_node_healthy(
|
||||
inst_id: uuid::Uuid,
|
||||
instance_mgr: Arc<NetworkInstanceManager>,
|
||||
// return version, response time on healthy, conn_count
|
||||
) -> anyhow::Result<(String, u64, u32)> {
|
||||
let Some(instance) = instance_mgr.get_network_info(&inst_id) else {
|
||||
anyhow::bail!("healthy check node is not started");
|
||||
};
|
||||
|
||||
let running = instance.running;
|
||||
// health check node is not running, update db
|
||||
if !running {
|
||||
anyhow::bail!("healthy check node is not running");
|
||||
}
|
||||
|
||||
if let Some(err) = instance.error_msg {
|
||||
anyhow::bail!("healthy check node has error: {}", err);
|
||||
}
|
||||
|
||||
let p = instance.peer_route_pairs;
|
||||
// dst node is not online
|
||||
let Some(dst_node) = p.iter().find(|x| {
|
||||
// we disable p2p, so we only check direct connected peer
|
||||
x.route.as_ref().is_some_and(|route| {
|
||||
!route.feature_flag.unwrap().is_public_server && route.hostname != "HealthCheckNode"
|
||||
}) && x.peer.as_ref().is_some_and(|p| !p.conns.is_empty())
|
||||
}) else {
|
||||
anyhow::bail!("dst node is not online");
|
||||
};
|
||||
|
||||
let Some(route_info) = &dst_node.route else {
|
||||
anyhow::bail!("dst node route is not found");
|
||||
};
|
||||
|
||||
let Some(peer_info) = &dst_node.peer else {
|
||||
anyhow::bail!("dst node peer is not found");
|
||||
};
|
||||
|
||||
let version = route_info
|
||||
.version
|
||||
.clone()
|
||||
.split("-")
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
// 计算响应时间(这里可以根据实际需要实现)
|
||||
let response_time = peer_info
|
||||
.conns
|
||||
.iter()
|
||||
.filter_map(|x| x.stats)
|
||||
.map(|x| x.latency_us)
|
||||
.min()
|
||||
.unwrap_or(0);
|
||||
|
||||
let peer_id = peer_info.peer_id;
|
||||
|
||||
let conn_count = if let Some(summary) = instance.foreign_network_summary {
|
||||
summary
|
||||
.info_map
|
||||
.get(&peer_id)
|
||||
.map(|x| x.network_count)
|
||||
.unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
Ok((version, response_time, conn_count))
|
||||
}
|
||||
|
||||
async fn node_health_check_task(
|
||||
node_id: i32,
|
||||
inst_id: uuid::Uuid,
|
||||
instance_mgr: Arc<NetworkInstanceManager>,
|
||||
db: Db,
|
||||
node_records: Arc<DashMap<i32, HealthyMemRecord>>,
|
||||
) {
|
||||
/// 记录健康状态到数据库和内存
|
||||
async fn record_health_status(
|
||||
db: &Db,
|
||||
node_records: &Arc<DashMap<i32, HealthyMemRecord>>,
|
||||
node_id: i32,
|
||||
status: HealthStatus,
|
||||
response_time: Option<i32>,
|
||||
error_message: Option<String>,
|
||||
) {
|
||||
// 写入数据库
|
||||
if let Err(e) = HealthOperations::create_health_record(
|
||||
db,
|
||||
node_id,
|
||||
status.clone(),
|
||||
response_time,
|
||||
error_message.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
error!("Failed to create health record for node {}: {}", node_id, e);
|
||||
}
|
||||
|
||||
// 更新内存记录
|
||||
if let Some(mut record) = node_records.get_mut(&node_id) {
|
||||
record.update_health_status(status, response_time, error_message);
|
||||
} else {
|
||||
let mut new_record = HealthyMemRecord::new(node_id);
|
||||
new_record.update_health_status(status, response_time, error_message);
|
||||
node_records.insert(node_id, new_record);
|
||||
}
|
||||
}
|
||||
let mut tick = tokio::time::interval(Duration::from_secs(5));
|
||||
let mut counter: u64 = 0;
|
||||
loop {
|
||||
if counter != 0 {
|
||||
tick.tick().await;
|
||||
}
|
||||
counter += 1;
|
||||
|
||||
match Self::test_node_healthy(inst_id, instance_mgr.clone()).await {
|
||||
Ok((version, response_time, conn_count)) => {
|
||||
if let Err(e) = NodeOperations::update_node_status(
|
||||
&db,
|
||||
node_id,
|
||||
true,
|
||||
Some(conn_count as i32),
|
||||
)
|
||||
.await
|
||||
{
|
||||
error!("Failed to update node status for node {}: {}", node_id, e);
|
||||
}
|
||||
|
||||
record_health_status(
|
||||
&db,
|
||||
&node_records,
|
||||
node_id,
|
||||
HealthStatus::Healthy,
|
||||
Some(response_time as i32),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// update node version
|
||||
if let Err(e) = NodeOperations::update_node_version(&db, node_id, version).await
|
||||
{
|
||||
error!("Failed to update node version for node {}: {}", node_id, e);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if let Err(e) =
|
||||
NodeOperations::update_node_status(&db, node_id, false, None).await
|
||||
{
|
||||
error!("Failed to update node status for node {}: {}", node_id, e);
|
||||
}
|
||||
|
||||
record_health_status(
|
||||
&db,
|
||||
&node_records,
|
||||
node_id,
|
||||
HealthStatus::Unhealthy,
|
||||
None,
|
||||
Some(e.to_string()),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user