use std::{
    ops::{DerefMut, Div},
    sync::Arc,
    time::{Duration, Instant},
};

use anyhow::Context as _;
use dashmap::DashMap;
use easytier::{
    common::{
        config::{ConfigFileControl, ConfigLoader, NetworkIdentity, PeerConfig, TomlConfigLoader},
        scoped_task::ScopedTask,
    },
    defer,
    instance_manager::NetworkInstanceManager,
};
use serde::{Deserialize, Serialize};
use sqlx::any;
use tracing::{debug, error, info, instrument, warn};

use crate::db::{
    Db, HealthStatus,
    entity::shared_nodes,
    operations::{HealthOperations, NodeOperations},
};

pub struct HealthCheckOneNode {
    node_id: String,
}

const HEALTH_CHECK_RING_GRANULARITY_SEC: usize = 60 * 15; // 15 minutes
const HEALTH_CHECK_RING_MAX_DURATION_SEC: usize = 60 * 60 * 24; // at most one day

// Smaller constants, useful when debugging:
// const HEALTH_CHECK_RING_GRANULARITY_SEC: usize = 10;
// const HEALTH_CHECK_RING_MAX_DURATION_SEC: usize = 60;

const HEALTH_CHECK_RING_SIZE: usize =
    HEALTH_CHECK_RING_MAX_DURATION_SEC / HEALTH_CHECK_RING_GRANULARITY_SEC;
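// With the production constants this works out to 86_400 / 900 = 96 slots:
// one day of history at 15-minute resolution.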

/// One slot of the counter rings below. `round` stamps which wrap-around
/// epoch last wrote the slot, so stale slots are lazily reset on the next
/// access instead of being cleared eagerly.
#[derive(Debug, Default, Clone)]
struct RingItem {
    counter: u64,
    round: u64,
}

impl RingItem {
    fn try_update_round(&mut self, timestamp: u64) {
        let cur_round =
            timestamp.div((HEALTH_CHECK_RING_GRANULARITY_SEC * HEALTH_CHECK_RING_SIZE) as u64);
        if self.round != cur_round {
            self.round = cur_round;
            self.counter = 0;
        }
    }

    fn inc(&mut self, timestamp: u64) {
        self.try_update_round(timestamp);
        self.counter += 1;
    }

    fn get(&mut self, timestamp: u64) -> u64 {
        self.try_update_round(timestamp);
        self.counter
    }
}
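
// A minimal sketch of the lazy, round-based reset; the timestamps are
// illustrative, and the behavior is independent of which constant set above
// is active.
#[cfg(test)]
mod ring_item_tests {
    use super::*;

    #[test]
    fn counter_resets_when_round_changes() {
        let mut item = RingItem::default();
        let round_len = (HEALTH_CHECK_RING_GRANULARITY_SEC * HEALTH_CHECK_RING_SIZE) as u64;

        // Two increments within the same round accumulate.
        item.inc(1);
        item.inc(2);
        assert_eq!(item.get(2), 2);

        // A timestamp one full round later lazily clears the stale counter.
        assert_eq!(item.get(2 + round_len), 0);
    }
}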

#[derive(Debug, Clone)]
pub struct HealthyMemRecord {
    node_id: i32,
    current_health_status: HealthStatus,
    last_error_info: Option<String>,
    last_check_time: chrono::DateTime<chrono::Utc>,
    last_response_time: Option<i32>,

    // The current time maps to a ring index via the UNIX timestamp:
    // (timestamp / granularity) % ring size.
    total_check_counter_ring: Vec<RingItem>,
    healthy_counter_ring: Vec<RingItem>,
}
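
// Worked example with the production constants: a check at UNIX time
// t = 1_700_000_000 lands in slot (t / 900) % 96 = 88, owned by round
// t / 86_400 = 19_675; the next write to slot 88 with a different round
// resets it before counting.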

impl HealthyMemRecord {
    pub fn new(node_id: i32) -> Self {
        Self {
            node_id,
            current_health_status: HealthStatus::Unknown,
            last_error_info: None,
            last_check_time: chrono::Utc::now(),
            last_response_time: None,
            total_check_counter_ring: vec![Default::default(); HEALTH_CHECK_RING_SIZE],
            healthy_counter_ring: vec![Default::default(); HEALTH_CHECK_RING_SIZE],
        }
    }

    /// Initialize an in-memory record from database records.
    pub fn from_db_records(
        node_id: i32,
        records: &[crate::db::entity::health_records::Model],
    ) -> Self {
        let mut mem_record = Self::new(node_id);

        if let Some(latest) = records.first() {
            mem_record.current_health_status = latest.get_status();
            mem_record.last_check_time = latest.checked_at.to_utc();
            mem_record.last_response_time = if latest.response_time == 0 {
                None
            } else {
                Some(latest.response_time)
            };
            mem_record.last_error_info = if latest.error_message.is_empty() {
                None
            } else {
                Some(latest.error_message.clone())
            };
        }

        // Populate the ring buffers.
        mem_record.populate_ring_from_records(records);
        mem_record
    }

    /// Populate the ring buffers from historical records.
    fn populate_ring_from_records(&mut self, records: &[crate::db::entity::health_records::Model]) {
        let now = chrono::Utc::now().timestamp() as usize;

        for record in records {
            let record_time = record.checked_at.to_utc().timestamp() as usize;
            let time_diff = now.saturating_sub(record_time);

            // Only count records that fall within the ring buffer's time window.
            if time_diff < HEALTH_CHECK_RING_MAX_DURATION_SEC {
                let ring_index =
                    (record_time / HEALTH_CHECK_RING_GRANULARITY_SEC) % HEALTH_CHECK_RING_SIZE;
                self.total_check_counter_ring[ring_index].inc(record_time as u64);

                if record.get_status() == HealthStatus::Healthy {
                    self.healthy_counter_ring[ring_index].inc(record_time as u64);
                }
            }
        }
    }

    /// Update the health status and record it in the ring buffers.
    pub fn update_health_status(
        &mut self,
        status: HealthStatus,
        response_time: Option<i32>,
        error_message: Option<String>,
    ) {
        self.current_health_status = status.clone();
        self.last_check_time = chrono::Utc::now();
        self.last_response_time = response_time;
        self.last_error_info = error_message;

        // Update the ring buffers.
        let now = chrono::Utc::now().timestamp() as usize;
        let ring_index = (now / HEALTH_CHECK_RING_GRANULARITY_SEC) % HEALTH_CHECK_RING_SIZE;

        self.total_check_counter_ring[ring_index].inc(now as u64);
        // Refresh the healthy slot's round even on an unhealthy check, so a
        // stale healthy count from a previous round cannot survive.
        self.healthy_counter_ring[ring_index].try_update_round(now as u64);
        if status == HealthStatus::Healthy {
            self.healthy_counter_ring[ring_index].inc(now as u64);
        }
    }

    /// Get aggregated health statistics. The counters always cover the full
    /// ring window (up to one day); `hours` is currently unused.
    pub fn get_health_stats(&self, hours: u64) -> crate::db::HealthStats {
        let _ = hours;

        let mut total_checks = 0;
        let mut healthy_count = 0;

        for ring_index in 0..HEALTH_CHECK_RING_SIZE {
            total_checks += self.total_check_counter_ring[ring_index].counter;
            healthy_count += self.healthy_counter_ring[ring_index].counter;
        }

        let health_percentage = if total_checks > 0 {
            (healthy_count as f64 / total_checks as f64) * 100.0
        } else {
            0.0
        };

        crate::db::HealthStats {
            total_checks,
            healthy_count,
            unhealthy_count: total_checks - healthy_count,
            health_percentage,
            average_response_time: self.last_response_time.map(|rt| rt as f64),
            uptime_percentage: health_percentage,
            last_check_time: Some(self.last_check_time),
            last_status: Some(self.current_health_status.clone()),
        }
    }

    /// Get the current health status.
    pub fn get_current_health_status(&self) -> &HealthStatus {
        &self.current_health_status
    }

    /// Get the time of the last check.
    pub fn get_last_check_time(&self) -> chrono::DateTime<chrono::Utc> {
        self.last_check_time
    }

    /// Get the last response time.
    pub fn get_last_response_time(&self) -> Option<i32> {
        self.last_response_time
    }

    /// Get the last error info.
    pub fn get_last_error_info(&self) -> &Option<String> {
        &self.last_error_info
    }

    pub fn get_counter_ring(&mut self) -> (Vec<u64>, Vec<u64>) {
        let now = self.last_check_time.timestamp() as usize;

        let mut total_ring = vec![0; HEALTH_CHECK_RING_SIZE];
        let mut healthy_ring = vec![0; HEALTH_CHECK_RING_SIZE];

        for i in 0..HEALTH_CHECK_RING_SIZE {
            let ring_time = now - (i * HEALTH_CHECK_RING_GRANULARITY_SEC);
            let ring_index =
                ring_time.div_euclid(HEALTH_CHECK_RING_GRANULARITY_SEC) % HEALTH_CHECK_RING_SIZE;
            total_ring[i] = self.total_check_counter_ring[ring_index].get(ring_time as u64);
            // Go through `get` here as well so a stale round resets the slot,
            // matching the total ring above.
            healthy_ring[i] = self.healthy_counter_ring[ring_index].get(ring_time as u64);
        }

        (total_ring, healthy_ring)
    }

    pub fn get_ring_granularity(&self) -> u32 {
        HEALTH_CHECK_RING_GRANULARITY_SEC as u32
    }
}
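
// A small sketch of the in-memory aggregation path, assuming a fresh record
// with no database history; the two recorded checks and the asserted numbers
// are illustrative.
#[cfg(test)]
mod mem_record_tests {
    use super::*;

    #[test]
    fn stats_reflect_recorded_checks() {
        let mut record = HealthyMemRecord::new(1);
        record.update_health_status(HealthStatus::Healthy, Some(42), None);
        record.update_health_status(HealthStatus::Unhealthy, None, Some("timeout".to_string()));

        let stats = record.get_health_stats(24);
        assert_eq!(stats.total_checks, 2);
        assert_eq!(stats.healthy_count, 1);
        assert_eq!(stats.unhealthy_count, 1);
        assert!((stats.health_percentage - 50.0).abs() < 1e-9);
    }
}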

pub struct HealthChecker {
    db: Db,
    instance_mgr: Arc<NetworkInstanceManager>,
    // node id -> easytier instance id of the probe network for that node
    inst_id_map: DashMap<i32, uuid::Uuid>,
    // node id -> background health check task (stopped on drop)
    node_tasks: DashMap<i32, ScopedTask<()>>,
    // node id -> in-memory health record backing the stats queries
    node_records: Arc<DashMap<i32, HealthyMemRecord>>,
    // node id -> config the instance was started with, used for change detection
    node_cfg: Arc<DashMap<i32, TomlConfigLoader>>,
}

impl HealthChecker {
    pub fn new(db: Db) -> Self {
        let instance_mgr = Arc::new(NetworkInstanceManager::new());
        Self {
            db,
            instance_mgr,
            inst_id_map: DashMap::new(),
            node_tasks: DashMap::new(),
            node_records: Arc::new(DashMap::new()),
            node_cfg: Arc::new(DashMap::new()),
        }
    }

    /// On startup, load every node's health records from the database into memory.
    pub async fn load_health_records_from_db(&self) -> anyhow::Result<()> {
        info!("Loading health records from database...");

        // Fetch all active nodes.
        let nodes = NodeOperations::get_all_nodes(&self.db)
            .await
            .with_context(|| "Failed to get all nodes from database")?;

        let from_date = chrono::Utc::now().naive_utc()
            - chrono::Duration::seconds(HEALTH_CHECK_RING_MAX_DURATION_SEC as i64);

        for node in nodes {
            // Fetch each node's recent health records (used to initialize the ring buffers).
            let records =
                HealthOperations::get_node_health_records(&self.db, node.id, Some(from_date), None)
                    .await
                    .with_context(|| {
                        format!("Failed to get health records for node {}", node.id)
                    })?;

            // Create the in-memory record.
            let mem_record = HealthyMemRecord::from_db_records(node.id, &records);
            self.node_records.insert(node.id, mem_record);

            debug!(
                "Loaded {} health records for node {} ({})",
                records.len(),
                node.id,
                node.name
            );
        }

        info!(
            "Loaded health records for {} nodes",
            self.node_records.len()
        );
        Ok(())
    }

    /// Get a node's in-memory health record.
    pub fn get_node_memory_record(&self, node_id: i32) -> Option<HealthyMemRecord> {
        self.node_records.get(&node_id).map(|entry| entry.clone())
    }

    /// Get a node's health statistics (from memory).
    pub fn get_node_health_stats(
        &self,
        node_id: i32,
        hours: u64,
    ) -> Option<crate::db::HealthStats> {
        self.node_records
            .get(&node_id)
            .map(|record| record.get_health_stats(hours))
    }

    /// Get the current health status of all nodes (from memory).
    pub fn get_all_nodes_health_status(&self) -> Vec<(i32, HealthStatus, Option<String>)> {
        self.node_records
            .iter()
            .map(|entry| {
                let record = entry.value();
                (
                    record.node_id,
                    record.current_health_status.clone(),
                    record.last_error_info.clone(),
                )
            })
            .collect()
    }

    /// Restart a node's probe instance if its configuration changed in the database.
    pub async fn try_update_node(&self, node_id: i32) -> anyhow::Result<()> {
        let old_cfg = self
            .node_cfg
            .get(&node_id)
            .ok_or_else(|| anyhow::anyhow!("old node cfg not found, node_id: {}", node_id))?
            .clone();
        let new_cfg = self.get_node_cfg(node_id, Some(old_cfg.get_id())).await?;

        if new_cfg.dump() != old_cfg.dump() {
            self.remove_node(node_id).await?;
            self.add_node(node_id).await?;
            info!("node {} cfg updated", node_id);
        }

        Ok(())
    }

    async fn get_node_cfg_with_model(
        &self,
        node_info: &shared_nodes::Model,
        inst_id: Option<uuid::Uuid>,
    ) -> anyhow::Result<TomlConfigLoader> {
        let cfg = TomlConfigLoader::default();
        cfg.set_peers(vec![PeerConfig {
            uri: format!(
                "{}://{}:{}",
                node_info.protocol, node_info.host, node_info.port
            )
            .parse()
            .with_context(|| "failed to parse peer uri")?,
            peer_public_key: None,
        }]);

        let inst_id = inst_id.unwrap_or(uuid::Uuid::new_v4());
        cfg.set_id(inst_id);
        cfg.set_network_identity(NetworkIdentity::new(
            node_info.network_name.clone(),
            node_info.network_secret.clone(),
        ));

        cfg.set_hostname(Some("HealthCheckNode".to_string()));

        // The probe only needs a direct connection to the target node, so
        // disable the TUN device and all p2p / hole-punching machinery.
        let mut flags = cfg.get_flags();
        flags.no_tun = true;
        flags.disable_p2p = true;
        flags.disable_udp_hole_punching = true;
        flags.disable_tcp_hole_punching = true;
        cfg.set_flags(flags);

        Ok(cfg)
    }

    /// Run a one-shot connectivity test against a node, polling until it
    /// succeeds or `max_time` elapses.
    pub async fn test_connection(
        &self,
        node_info: &shared_nodes::Model,
        max_time: Duration,
    ) -> anyhow::Result<()> {
        let cfg = self.get_node_cfg_with_model(node_info, None).await?;
        defer!({
            let _ = self
                .instance_mgr
                .delete_network_instance(vec![cfg.get_id()]);
        });
        self.instance_mgr
            .run_network_instance(cfg.clone(), false, ConfigFileControl::STATIC_CONFIG)
            .with_context(|| "failed to run network instance")?;

        let now = Instant::now();
        let mut err = None;
        while now.elapsed() < max_time {
            match Self::test_node_healthy(cfg.get_id(), self.instance_mgr.clone()).await {
                Ok(_) => {
                    return Ok(());
                }
                Err(e) => {
                    warn!(
                        "test node healthy failed, node_info: {:?}, err: {}",
                        node_info, e
                    );
                    err = Some(e);
                }
            }
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
        Err(anyhow::anyhow!("test node healthy failed, err: {:?}", err))
    }
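
    // Illustrative call site (the node model and timeout are hypothetical):
    //
    //     checker.test_connection(&node_model, Duration::from_secs(10)).await?;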

    async fn get_node_cfg(
        &self,
        node_id: i32,
        inst_id: Option<uuid::Uuid>,
    ) -> anyhow::Result<TomlConfigLoader> {
        let node_info = NodeOperations::get_node_by_id(&self.db, node_id)
            .await
            .with_context(|| format!("failed to get node by id: {}", node_id))?
            .ok_or_else(|| anyhow::anyhow!("node not found"))?;
        self.get_node_cfg_with_model(&node_info, inst_id).await
    }

    pub async fn add_node(&self, node_id: i32) -> anyhow::Result<()> {
        let cfg = self.get_node_cfg(node_id, None).await?;
        info!(
            "Add node {} to health checker, cfg: {}",
            node_id,
            cfg.dump()
        );

        self.instance_mgr
            .run_network_instance(cfg.clone(), true, ConfigFileControl::STATIC_CONFIG)
            .with_context(|| "failed to run network instance")?;
        self.inst_id_map.insert(node_id, cfg.get_id());

        // Initialize the in-memory record (if it does not exist yet).
        if !self.node_records.contains_key(&node_id) {
            // Load historical records from the database.
            let from_date = chrono::Utc::now().naive_utc()
                - chrono::Duration::seconds(HEALTH_CHECK_RING_MAX_DURATION_SEC as i64);
            if let Ok(records) =
                HealthOperations::get_node_health_records(&self.db, node_id, Some(from_date), None)
                    .await
            {
                let mem_record = HealthyMemRecord::from_db_records(node_id, &records);
                self.node_records.insert(node_id, mem_record);
                info!(
                    "Initialized memory record for node {} with {} historical records",
                    node_id,
                    records.len()
                );
            } else {
                self.node_records
                    .insert(node_id, HealthyMemRecord::new(node_id));
                info!("Initialized new memory record for node {}", node_id);
            }
        }

        // Start the health check task.
        let task = ScopedTask::from(tokio::spawn(Self::node_health_check_task(
            node_id,
            cfg.get_id(),
            Arc::clone(&self.instance_mgr),
            self.db.clone(),
            Arc::clone(&self.node_records),
        )));
        self.node_tasks.insert(node_id, task);
        self.node_cfg.insert(node_id, cfg.clone());

        Ok(())
    }

    pub async fn remove_node(&self, node_id: i32) -> anyhow::Result<()> {
        self.node_tasks.remove(&node_id);
        if let Some(inst_id) = self.inst_id_map.remove(&node_id) {
            let _ = self.instance_mgr.delete_network_instance(vec![inst_id.1]);
        }
        self.node_cfg.remove(&node_id);
        // Keep the in-memory record so historical data can still be queried.
        info!(
            "Removed health check task for node {}, memory record retained",
            node_id
        );
        Ok(())
    }

    #[instrument(err, ret, skip(instance_mgr))]
    async fn test_node_healthy(
        inst_id: uuid::Uuid,
        instance_mgr: Arc<NetworkInstanceManager>,
        // on success, returns (version, response time, conn_count)
    ) -> anyhow::Result<(String, u64, u32)> {
        let Some(instance) = instance_mgr.get_network_info(&inst_id).await else {
            anyhow::bail!("healthy check node is not started");
        };

        let running = instance.running;
        // The health check node is not running; bail so the caller can
        // update the db.
        if !running {
            anyhow::bail!("healthy check node is not running");
        }

        if let Some(err) = instance.error_msg {
            anyhow::bail!("healthy check node has error: {}", err);
        }

        let p = instance.peer_route_pairs;
        // Find the destination node; if none matches, it is not online.
        let Some(dst_node) = p.iter().find(|x| {
            // p2p is disabled, so only a directly connected peer counts.
            x.route.as_ref().is_some_and(|route| {
                !route.feature_flag.unwrap().is_public_server && route.hostname != "HealthCheckNode"
            }) && x.peer.as_ref().is_some_and(|p| !p.conns.is_empty())
        }) else {
            anyhow::bail!("dst node is not online");
        };

        let Some(route_info) = &dst_node.route else {
            anyhow::bail!("dst node route is not found");
        };

        let Some(peer_info) = &dst_node.peer else {
            anyhow::bail!("dst node peer is not found");
        };

        let version = route_info
            .version
            .split('-')
            .next()
            .unwrap_or("")
            .to_string();

        // Take the smallest connection latency as the response time.
        let response_time = peer_info
            .conns
            .iter()
            .filter_map(|x| x.stats)
            .map(|x| x.latency_us)
            .min()
            .unwrap_or(0);

        let peer_id = peer_info.peer_id;

        let conn_count = if let Some(summary) = instance.foreign_network_summary {
            summary
                .info_map
                .get(&peer_id)
                .map(|x| x.network_count)
                .unwrap_or(0)
        } else {
            0
        };

        Ok((version, response_time, conn_count))
    }

    async fn node_health_check_task(
        node_id: i32,
        inst_id: uuid::Uuid,
        instance_mgr: Arc<NetworkInstanceManager>,
        db: Db,
        node_records: Arc<DashMap<i32, HealthyMemRecord>>,
    ) {
        /// Record the health status to the database and to memory.
        async fn record_health_status(
            db: &Db,
            node_records: &Arc<DashMap<i32, HealthyMemRecord>>,
            node_id: i32,
            status: HealthStatus,
            response_time: Option<i32>,
            error_message: Option<String>,
        ) {
            // Write to the database.
            if let Err(e) = HealthOperations::create_health_record(
                db,
                node_id,
                status.clone(),
                response_time,
                error_message.clone(),
            )
            .await
            {
                error!("Failed to create health record for node {}: {}", node_id, e);
            }

            // Update the in-memory record.
            if let Some(mut record) = node_records.get_mut(&node_id) {
                record.update_health_status(status, response_time, error_message);
            } else {
                let mut new_record = HealthyMemRecord::new(node_id);
                new_record.update_health_status(status, response_time, error_message);
                node_records.insert(node_id, new_record);
            }
        }

        // Check roughly every 5 seconds; the first iteration skips the tick
        // so the initial check runs immediately.
        let mut tick = tokio::time::interval(Duration::from_secs(5));
        let mut counter: u64 = 0;
        loop {
            if counter != 0 {
                tick.tick().await;
            }
            counter += 1;

            match Self::test_node_healthy(inst_id, instance_mgr.clone()).await {
                Ok((version, response_time, conn_count)) => {
                    if let Err(e) = NodeOperations::update_node_status(
                        &db,
                        node_id,
                        true,
                        Some(conn_count as i32),
                    )
                    .await
                    {
                        error!("Failed to update node status for node {}: {}", node_id, e);
                    }

                    record_health_status(
                        &db,
                        &node_records,
                        node_id,
                        HealthStatus::Healthy,
                        Some(response_time as i32),
                        None,
                    )
                    .await;

                    // Update the node version.
                    if let Err(e) = NodeOperations::update_node_version(&db, node_id, version).await
                    {
                        error!("Failed to update node version for node {}: {}", node_id, e);
                    }
                }
                Err(e) => {
                    if let Err(e) =
                        NodeOperations::update_node_status(&db, node_id, false, None).await
                    {
                        error!("Failed to update node status for node {}: {}", node_id, e);
                    }

                    record_health_status(
                        &db,
                        &node_records,
                        node_id,
                        HealthStatus::Unhealthy,
                        None,
                        Some(format!("inst id: {}, err: {}", inst_id, e)),
                    )
                    .await;
                }
            }
        }
    }
}
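
// A hedged startup sketch showing how the pieces above fit together;
// `bootstrap_health_checker` is illustrative and not part of the original
// module, but it only calls APIs defined in this file. In real use the
// checker would be kept alive rather than dropped at the end of the function.
#[allow(dead_code)]
async fn bootstrap_health_checker(db: Db) -> anyhow::Result<()> {
    let checker = HealthChecker::new(db.clone());

    // Warm the in-memory ring buffers from persisted history.
    checker.load_health_records_from_db().await?;

    // Start a probe instance and background check task for every known node.
    for node in NodeOperations::get_all_nodes(&db).await? {
        checker.add_node(node.id).await?;
    }

    Ok(())
}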