fix: clean stale foreign network state (#2197)

- clear foreign-network traffic metric peer caches on peer removal and network cleanup
- release reserved foreign-network peer IDs on handshake/add-peer error paths
- skip creating foreign-network token buckets entirely when the relay BPS limit is unlimited (u64::MAX)
- shrink relay/session maps after cleanup and remove unused peer-center global data entries
This commit is contained in:
KKRainbow
2026-05-01 23:30:51 +08:00
committed by GitHub
parent 3542e944cb
commit 1b48029bdc
7 changed files with 176 additions and 29 deletions
+47 -9
View File
@@ -35,7 +35,7 @@ pub(crate) struct PeerCenterInfoEntry {
update_time: std::time::Instant, update_time: std::time::Instant,
} }
#[derive(Default)] #[derive(Debug, Default)]
pub(crate) struct PeerCenterServerGlobalData { pub(crate) struct PeerCenterServerGlobalData {
pub(crate) global_peer_map: DashMap<SrcDstPeerPair, PeerCenterInfoEntry>, pub(crate) global_peer_map: DashMap<SrcDstPeerPair, PeerCenterInfoEntry>,
pub(crate) peer_report_time: DashMap<PeerId, std::time::Instant>, pub(crate) peer_report_time: DashMap<PeerId, std::time::Instant>,
@@ -58,27 +58,33 @@ pub(crate) fn get_global_data(node_id: PeerId) -> Arc<PeerCenterServerGlobalData
pub struct PeerCenterServer { pub struct PeerCenterServer {
// every peer has its own server, so use per-struct dash map is ok. // every peer has its own server, so use per-struct dash map is ok.
my_node_id: PeerId, my_node_id: PeerId,
data: Arc<PeerCenterServerGlobalData>,
tasks: Arc<JoinSet<()>>, tasks: Arc<JoinSet<()>>,
} }
impl PeerCenterServer { impl PeerCenterServer {
pub fn new(my_node_id: PeerId) -> Self { pub fn new(my_node_id: PeerId) -> Self {
let data = get_global_data(my_node_id);
let weak_data = Arc::downgrade(&data);
let mut tasks = JoinSet::new(); let mut tasks = JoinSet::new();
tasks.spawn(async move { tasks.spawn(async move {
loop { loop {
tokio::time::sleep(std::time::Duration::from_secs(10)).await; tokio::time::sleep(std::time::Duration::from_secs(10)).await;
PeerCenterServer::clean_outdated_peer(my_node_id).await; let Some(data) = weak_data.upgrade() else {
break;
};
PeerCenterServer::clean_outdated_peer_data(&data).await;
} }
}); });
PeerCenterServer { PeerCenterServer {
my_node_id, my_node_id,
data,
tasks: Arc::new(tasks), tasks: Arc::new(tasks),
} }
} }
async fn clean_outdated_peer(my_node_id: PeerId) { async fn clean_outdated_peer_data(data: &PeerCenterServerGlobalData) {
let data = get_global_data(my_node_id);
data.peer_report_time.retain(|_, v| { data.peer_report_time.retain(|_, v| {
std::time::Instant::now().duration_since(*v) < std::time::Duration::from_secs(180) std::time::Instant::now().duration_since(*v) < std::time::Duration::from_secs(180)
}); });
@@ -88,8 +94,7 @@ impl PeerCenterServer {
}); });
} }
fn calc_global_digest(my_node_id: PeerId) -> Digest { fn calc_global_digest_data(data: &PeerCenterServerGlobalData) -> Digest {
let data = get_global_data(my_node_id);
let mut hasher = std::collections::hash_map::DefaultHasher::new(); let mut hasher = std::collections::hash_map::DefaultHasher::new();
data.global_peer_map data.global_peer_map
.iter() .iter()
@@ -102,6 +107,18 @@ impl PeerCenterServer {
} }
} }
// Evict this node's entry from the process-wide GLOBAL_DATA map once the last
// clone of this server is dropped, so per-node global state does not leak.
impl Drop for PeerCenterServer {
fn drop(&mut self) {
// `tasks` is cloned along with the server, so a strong count above 1 means
// another clone of this PeerCenterServer is still alive — do nothing yet.
if Arc::strong_count(&self.tasks) != 1 {
return;
}
// Only remove the entry if it is the very Arc we hold (ptr_eq) and no holder
// remains besides the map itself and `self.data` (strong count <= 2).
// NOTE(review): the cleanup task briefly upgrades a Weak to this Arc every
// 10s; if that overlaps with this drop the count check fails and the entry
// is left in GLOBAL_DATA — confirm whether that window is acceptable.
GLOBAL_DATA.remove_if(&self.my_node_id, |_, data| {
Arc::ptr_eq(data, &self.data) && Arc::strong_count(data) <= 2
});
}
}
#[async_trait::async_trait] #[async_trait::async_trait]
impl PeerCenterRpc for PeerCenterServer { impl PeerCenterRpc for PeerCenterServer {
type Controller = BaseController; type Controller = BaseController;
@@ -117,7 +134,7 @@ impl PeerCenterRpc for PeerCenterServer {
tracing::debug!("receive report_peers"); tracing::debug!("receive report_peers");
let data = get_global_data(self.my_node_id); let data = &self.data;
data.peer_report_time data.peer_report_time
.insert(my_peer_id, std::time::Instant::now()); .insert(my_peer_id, std::time::Instant::now());
@@ -134,7 +151,7 @@ impl PeerCenterRpc for PeerCenterServer {
} }
data.digest data.digest
.store(PeerCenterServer::calc_global_digest(self.my_node_id)); .store(PeerCenterServer::calc_global_digest_data(data));
Ok(ReportPeersResponse::default()) Ok(ReportPeersResponse::default())
} }
@@ -147,7 +164,7 @@ impl PeerCenterRpc for PeerCenterServer {
) -> Result<GetGlobalPeerMapResponse, rpc_types::error::Error> { ) -> Result<GetGlobalPeerMapResponse, rpc_types::error::Error> {
let digest = req.digest; let digest = req.digest;
let data = get_global_data(self.my_node_id); let data = &self.data;
if digest == data.digest.load() && digest != 0 { if digest == data.digest.load() && digest != 0 {
return Ok(GetGlobalPeerMapResponse::default()); return Ok(GetGlobalPeerMapResponse::default());
} }
@@ -171,3 +188,24 @@ impl PeerCenterRpc for PeerCenterServer {
}) })
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Dropping the final clone of a `PeerCenterServer` must evict the node's
    /// entry from `GLOBAL_DATA`; dropping earlier clones must leave it intact.
    #[tokio::test]
    async fn global_data_removed_when_last_server_drops() {
        // Pick an id unlikely to collide with other tests sharing the global map,
        // and clear any leftover entry from a previous run.
        let node_id = u32::MAX - 17;
        GLOBAL_DATA.remove(&node_id);

        let first = PeerCenterServer::new(node_id);
        let second = first.clone();
        assert!(GLOBAL_DATA.contains_key(&node_id));

        // One clone is still alive: the entry must survive.
        drop(first);
        assert!(GLOBAL_DATA.contains_key(&node_id));

        // Last clone gone: the entry must be removed.
        drop(second);
        assert!(!GLOBAL_DATA.contains_key(&node_id));
    }
}
+67 -4
View File
@@ -87,7 +87,7 @@ struct ForeignNetworkEntry {
packet_recv: Mutex<Option<PacketRecvChanReceiver>>, packet_recv: Mutex<Option<PacketRecvChanReceiver>>,
bps_limiter: Arc<TokenBucket>, bps_limiter: Option<Arc<TokenBucket>>,
peer_center: Arc<PeerCenterInstance>, peer_center: Arc<PeerCenterInstance>,
@@ -191,14 +191,16 @@ impl ForeignNetworkEntry {
); );
let relay_bps_limit = global_ctx.config.get_flags().foreign_relay_bps_limit; let relay_bps_limit = global_ctx.config.get_flags().foreign_relay_bps_limit;
let bps_limiter = (relay_bps_limit != u64::MAX).then(|| {
let limiter_config = LimiterConfig { let limiter_config = LimiterConfig {
burst_rate: None, burst_rate: None,
bps: Some(relay_bps_limit), bps: Some(relay_bps_limit),
fill_duration_ms: None, fill_duration_ms: None,
}; };
let bps_limiter = global_ctx global_ctx
.token_bucket_manager() .token_bucket_manager()
.get_or_create(&network.network_name, limiter_config.into()); .get_or_create(&network.network_name, limiter_config.into())
});
let peer_center = Arc::new(PeerCenterInstance::new(Arc::new( let peer_center = Arc::new(PeerCenterInstance::new(Arc::new(
PeerMapWithPeerRpcManager { PeerMapWithPeerRpcManager {
@@ -536,7 +538,9 @@ impl ForeignNetworkEntry {
); );
continue; continue;
} }
if !bps_limiter.try_consume(len.into()) { if let Some(bps_limiter) = bps_limiter.as_ref()
&& !bps_limiter.try_consume(len.into())
{
continue; continue;
} }
} }
@@ -713,6 +717,7 @@ impl ForeignNetworkManagerData {
fn remove_network(&self, network_name: &String) { fn remove_network(&self, network_name: &String) {
let _l = self.lock.lock().unwrap(); let _l = self.lock.lock().unwrap();
if let Some(old) = self.network_peer_maps.remove(network_name) { if let Some(old) = self.network_peer_maps.remove(network_name) {
old.1.traffic_metrics.clear_peer_cache();
let to_remove_peers = old.1.peer_map.list_peers(); let to_remove_peers = old.1.peer_map.list_peers();
for p in to_remove_peers { for p in to_remove_peers {
self.peer_network_map.remove_if(&p, |_, v| { self.peer_network_map.remove_if(&p, |_, v| {
@@ -722,6 +727,9 @@ impl ForeignNetworkManagerData {
} }
} }
self.network_peer_last_update.remove(network_name); self.network_peer_last_update.remove(network_name);
shrink_dashmap(&self.peer_network_map, None);
shrink_dashmap(&self.network_peer_maps, None);
shrink_dashmap(&self.network_peer_last_update, None);
} }
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
@@ -994,12 +1002,14 @@ impl ForeignNetworkManager {
async fn start_event_handler(&self, entry: &ForeignNetworkEntry) { async fn start_event_handler(&self, entry: &ForeignNetworkEntry) {
let data = self.data.clone(); let data = self.data.clone();
let network_name = entry.network.network_name.clone(); let network_name = entry.network.network_name.clone();
let traffic_metrics = entry.traffic_metrics.clone();
let mut s = entry.global_ctx.subscribe(); let mut s = entry.global_ctx.subscribe();
self.tasks.lock().unwrap().spawn(async move { self.tasks.lock().unwrap().spawn(async move {
while let Ok(e) = s.recv().await { while let Ok(e) = s.recv().await {
match &e { match &e {
GlobalCtxEvent::PeerRemoved(peer_id) => { GlobalCtxEvent::PeerRemoved(peer_id) => {
tracing::info!(?e, "remove peer from foreign network manager"); tracing::info!(?e, "remove peer from foreign network manager");
traffic_metrics.remove_peer(*peer_id);
data.remove_peer(*peer_id, &network_name); data.remove_peer(*peer_id, &network_name);
data.network_peer_last_update data.network_peer_last_update
.insert(network_name.clone(), SystemTime::now()); .insert(network_name.clone(), SystemTime::now());
@@ -1018,6 +1028,7 @@ impl ForeignNetworkManager {
} }
// if lagged or recv done just remove the network // if lagged or recv done just remove the network
tracing::error!("global event handler at foreign network manager exit"); tracing::error!("global event handler at foreign network manager exit");
traffic_metrics.clear_peer_cache();
data.remove_network(&network_name); data.remove_network(&network_name);
}); });
} }
@@ -1604,6 +1615,58 @@ pub mod tests {
.await; .await;
} }
/// Verifies that a `PeerRemoved` event on a foreign-network entry evicts the
/// removed peer from the entry's per-peer traffic-metric cache.
#[tokio::test]
async fn foreign_network_peer_removed_clears_traffic_metric_peer_cache() {
let pm_center = create_mock_peer_manager_with_mock_stun(NatType::Unknown).await;
let pma_net1 = create_mock_peer_manager_for_foreign_network("net1").await;
connect_peer_manager(pma_net1.clone(), pm_center.clone()).await;
// Wait until the center has registered the "net1" foreign network.
wait_for_condition(
|| {
let pm_center = pm_center.clone();
async move {
pm_center
.get_foreign_network_manager()
.get_network_peer_id("net1")
.is_some()
}
},
Duration::from_secs(5),
)
.await;
let entry = pm_center
.get_foreign_network_manager()
.data
.get_network_entry("net1")
.unwrap();
// Prime the per-peer cache with a single RX record for the foreign peer.
entry
.traffic_metrics
.record_rx(pma_net1.my_peer_id(), PacketType::Data as u8, 128)
.await;
assert!(
entry
.traffic_metrics
.contains_peer_cache(pma_net1.my_peer_id())
);
// Emit the removal event; the event-handler task processes it asynchronously,
// so poll until the cache entry disappears rather than asserting immediately.
entry
.global_ctx
.issue_event(GlobalCtxEvent::PeerRemoved(pma_net1.my_peer_id()));
wait_for_condition(
|| {
let entry = entry.clone();
let peer_id = pma_net1.my_peer_id();
async move { !entry.traffic_metrics.contains_peer_cache(peer_id) }
},
Duration::from_secs(5),
)
.await;
}
#[tokio::test] #[tokio::test]
async fn foreign_network_encapsulated_forwarding_records_tx_metrics() { async fn foreign_network_encapsulated_forwarding_records_tx_metrics() {
set_global_var!(OSPF_UPDATE_MY_GLOBAL_FOREIGN_NETWORK_INTERVAL_SEC, 1); set_global_var!(OSPF_UPDATE_MY_GLOBAL_FOREIGN_NETWORK_INTERVAL_SEC, 1);
+3 -1
View File
@@ -1352,7 +1352,9 @@ impl PeerConn {
let is_foreign_network = conn_info_for_instrument.network_name let is_foreign_network = conn_info_for_instrument.network_name
!= self.global_ctx.get_network_identity().network_name; != self.global_ctx.get_network_identity().network_name;
let recv_limiter = if is_foreign_network { let recv_limiter = if is_foreign_network
&& self.global_ctx.get_flags().foreign_relay_bps_limit != u64::MAX
{
let relay_network_bps_limit = self.global_ctx.get_flags().foreign_relay_bps_limit; let relay_network_bps_limit = self.global_ctx.get_flags().foreign_relay_bps_limit;
let limiter_config = LimiterConfig { let limiter_config = LimiterConfig {
burst_rate: None, burst_rate: None,
+26 -7
View File
@@ -687,6 +687,11 @@ impl PeerManager {
Ok(()) Ok(())
} }
/// Drop the reserved peer id for `network_name` and shrink the map so its
/// backing storage does not grow monotonically as networks come and go.
fn release_reserved_peer_id(&self, network_name: &str) {
self.reserved_my_peer_id_map.remove(network_name);
shrink_dashmap(&self.reserved_my_peer_id_map, None);
}
#[tracing::instrument(ret)] #[tracing::instrument(ret)]
pub async fn add_tunnel_as_server( pub async fn add_tunnel_as_server(
&self, &self,
@@ -702,7 +707,8 @@ impl PeerManager {
tunnel, tunnel,
self.peer_session_store.clone(), self.peer_session_store.clone(),
); );
conn.do_handshake_as_server_ext(|peer, network_name:&str| { let mut reserved_peer_id_network_name = None;
let handshake_ret = conn.do_handshake_as_server_ext(|peer, network_name:&str| {
if network_name if network_name
== self.global_ctx.get_network_identity().network_name == self.global_ctx.get_network_identity().network_name
{ {
@@ -713,6 +719,7 @@ impl PeerManager {
.foreign_network_manager .foreign_network_manager
.get_network_peer_id(network_name); .get_network_peer_id(network_name);
if peer_id.is_none() { if peer_id.is_none() {
reserved_peer_id_network_name = Some(network_name.to_string());
peer_id = Some(*self.reserved_my_peer_id_map.entry(network_name.to_string()).or_insert_with(|| { peer_id = Some(*self.reserved_my_peer_id_map.entry(network_name.to_string()).or_insert_with(|| {
rand::random::<PeerId>() rand::random::<PeerId>()
}).value()); }).value());
@@ -728,7 +735,14 @@ impl PeerManager {
Ok(()) Ok(())
}) })
.await?; .await;
if let Err(err) = handshake_ret {
if let Some(network_name) = reserved_peer_id_network_name {
self.release_reserved_peer_id(&network_name);
}
return Err(err);
}
let peer_identity = conn.get_network_identity(); let peer_identity = conn.get_network_identity();
let peer_network_name = peer_identity.network_name.clone(); let peer_network_name = peer_identity.network_name.clone();
@@ -747,6 +761,7 @@ impl PeerManager {
if !is_local_network && self.global_ctx.get_flags().private_mode && !foreign_network_allowed if !is_local_network && self.global_ctx.get_flags().private_mode && !foreign_network_allowed
{ {
self.release_reserved_peer_id(&peer_network_name);
return Err(Error::SecretKeyError( return Err(Error::SecretKeyError(
"private mode is turned on, foreign network secret mismatch".to_string(), "private mode is turned on, foreign network secret mismatch".to_string(),
)); ));
@@ -754,14 +769,18 @@ impl PeerManager {
conn.set_is_hole_punched(!is_directly_connected); conn.set_is_hole_punched(!is_directly_connected);
if is_local_network { let add_peer_ret = if is_local_network {
self.add_new_peer_conn(conn).await?; self.add_new_peer_conn(conn).await
} else { } else {
self.foreign_network_manager.add_peer_conn(conn).await?; self.foreign_network_manager.add_peer_conn(conn).await
};
if let Err(err) = add_peer_ret {
self.release_reserved_peer_id(&peer_network_name);
return Err(err);
} }
self.reserved_my_peer_id_map.remove(&peer_network_name); self.release_reserved_peer_id(&peer_network_name);
shrink_dashmap(&self.reserved_my_peer_id_map, None);
tracing::info!("add tunnel as server done"); tracing::info!("add tunnel as server done");
Ok(()) Ok(())
+5 -1
View File
@@ -7,7 +7,10 @@ use anyhow::anyhow;
use dashmap::DashMap; use dashmap::DashMap;
use super::secure_datagram::{SecureDatagramDirection, SecureDatagramSession}; use super::secure_datagram::{SecureDatagramDirection, SecureDatagramSession};
use crate::{common::PeerId, tunnel::packet_def::ZCPacket}; use crate::{
common::{PeerId, shrink_dashmap},
tunnel::packet_def::ZCPacket,
};
pub struct UpsertResponderSessionReturn { pub struct UpsertResponderSessionReturn {
pub session: Arc<PeerSession>, pub session: Arc<PeerSession>,
@@ -78,6 +81,7 @@ impl PeerSessionStore {
pub fn evict_unused_sessions(&self) { pub fn evict_unused_sessions(&self) {
self.sessions self.sessions
.retain(|_key, session| Arc::strong_count(session) > 1); .retain(|_key, session| Arc::strong_count(session) > 1);
shrink_dashmap(&self.sessions, None);
} }
#[tracing::instrument(skip(self))] #[tracing::instrument(skip(self))]
+9 -1
View File
@@ -9,7 +9,7 @@ use tokio::time::{Duration, timeout};
use crate::peers::foreign_network_client::ForeignNetworkClient; use crate::peers::foreign_network_client::ForeignNetworkClient;
use crate::{ use crate::{
common::error::Error, common::error::Error,
common::{PeerId, global_ctx::ArcGlobalCtx}, common::{PeerId, global_ctx::ArcGlobalCtx, shrink_dashmap},
peers::peer_map::PeerMap, peers::peer_map::PeerMap,
peers::peer_session::{PeerSession, PeerSessionAction, PeerSessionStore, SessionKey}, peers::peer_session::{PeerSession, PeerSessionAction, PeerSessionStore, SessionKey},
peers::route_trait::NextHopPolicy, peers::route_trait::NextHopPolicy,
@@ -652,6 +652,10 @@ impl RelayPeerMap {
self.handshake_locks.remove(&peer_id); self.handshake_locks.remove(&peer_id);
self.pending_packets.remove(&peer_id); self.pending_packets.remove(&peer_id);
} }
shrink_dashmap(&self.states, None);
shrink_dashmap(&self.pending_handshakes, None);
shrink_dashmap(&self.handshake_locks, None);
shrink_dashmap(&self.pending_packets, None);
} }
pub fn has_state(&self, peer_id: PeerId) -> bool { pub fn has_state(&self, peer_id: PeerId) -> bool {
@@ -679,6 +683,10 @@ impl RelayPeerMap {
self.pending_handshakes.remove(&peer_id); self.pending_handshakes.remove(&peer_id);
self.handshake_locks.remove(&peer_id); self.handshake_locks.remove(&peer_id);
self.pending_packets.remove(&peer_id); self.pending_packets.remove(&peer_id);
shrink_dashmap(&self.states, None);
shrink_dashmap(&self.pending_handshakes, None);
shrink_dashmap(&self.handshake_locks, None);
shrink_dashmap(&self.pending_packets, None);
tracing::debug!(?peer_id, "RelayPeerMap removed peer relay state"); tracing::debug!(?peer_id, "RelayPeerMap removed peer relay state");
} }
+13
View File
@@ -201,6 +201,11 @@ impl LogicalTrafficMetrics {
self.per_peer.len() self.per_peer.len()
} }
// Test-only probe: whether a per-peer counter entry is currently cached for
// `peer_id` in this logical metric.
#[cfg(test)]
fn contains_peer_cache(&self, peer_id: PeerId) -> bool {
self.per_peer.contains_key(&peer_id)
}
fn build_peer_counters(&self, instance_id: &str) -> TrafficCounters { fn build_peer_counters(&self, instance_id: &str) -> TrafficCounters {
let instance_label = match self.label_kind { let instance_label = match self.label_kind {
InstanceLabelKind::To => LabelType::ToInstanceId(instance_id.to_string()), InstanceLabelKind::To => LabelType::ToInstanceId(instance_id.to_string()),
@@ -333,6 +338,14 @@ impl TrafficMetricRecorder {
self.rx_metrics.control.clear_peer_cache(); self.rx_metrics.control.clear_peer_cache();
} }
/// Test-only probe: true if any of the four direction/type metric caches
/// (tx/rx × data/control) still tracks `peer_id`.
#[cfg(test)]
pub(crate) fn contains_peer_cache(&self, peer_id: PeerId) -> bool {
    [
        &self.tx_metrics.data,
        &self.tx_metrics.control,
        &self.rx_metrics.data,
        &self.rx_metrics.control,
    ]
    .iter()
    .any(|metrics| metrics.contains_peer_cache(peer_id))
}
fn resolve_instance_id(&self, peer_id: PeerId) -> BoxFuture<'static, Option<String>> { fn resolve_instance_id(&self, peer_id: PeerId) -> BoxFuture<'static, Option<String>> {
(self.resolve_instance_id)(peer_id) (self.resolve_instance_id)(peer_id)
} }