fix latency first route of public server (#1129)

This commit is contained in:
Sijie.Sun
2025-07-19 18:16:53 +08:00
committed by GitHub
parent e25cd9be37
commit 85f0091056
9 changed files with 326 additions and 186 deletions
+114 -33
View File
@@ -12,17 +12,19 @@ use tokio::task::JoinSet;
use tracing::Instrument;
use crate::{
common::PeerId,
common::{global_ctx::GlobalCtx, PeerId},
peers::{
peer_manager::PeerManager,
peer_map::PeerMap,
peer_rpc::PeerRpcManager,
route_trait::{RouteCostCalculator, RouteCostCalculatorInterface},
rpc_service::PeerManagerRpcService,
},
proto::{
peer_rpc::{
GetGlobalPeerMapRequest, GetGlobalPeerMapResponse, GlobalPeerMap, PeerCenterRpc,
PeerCenterRpcClientFactory, PeerCenterRpcServer, PeerInfoForGlobalMap,
ReportPeersRequest, ReportPeersResponse,
DirectConnectedPeerInfo, GetGlobalPeerMapRequest, GetGlobalPeerMapResponse,
GlobalPeerMap, PeerCenterRpc, PeerCenterRpcClientFactory, PeerCenterRpcServer,
PeerInfoForGlobalMap, ReportPeersRequest, ReportPeersResponse,
},
rpc_types::{self, controller::BaseController},
},
@@ -30,8 +32,18 @@ use crate::{
use super::{server::PeerCenterServer, Digest, Error};
#[async_trait::async_trait]
#[auto_impl::auto_impl(&, Arc, Box)]
pub trait PeerCenterPeerManagerTrait: Send + Sync + 'static {
async fn list_peers(&self) -> PeerInfoForGlobalMap;
fn my_peer_id(&self) -> PeerId;
fn get_global_ctx(&self) -> Arc<GlobalCtx>;
fn get_rpc_mgr(&self) -> Weak<PeerRpcManager>;
async fn list_routes(&self) -> Vec<crate::proto::cli::Route>;
}
struct PeerCenterBase {
peer_mgr: Weak<PeerManager>,
peer_mgr: Arc<dyn PeerCenterPeerManagerTrait>,
my_peer_id: PeerId,
tasks: Mutex<JoinSet<()>>,
lock: Arc<Mutex<()>>,
@@ -41,7 +53,7 @@ struct PeerCenterBase {
static SERVICE_ID: u32 = 50;
struct PeridicJobCtx<T> {
peer_mgr: Weak<PeerManager>,
peer_mgr: Arc<dyn PeerCenterPeerManagerTrait>,
my_peer_id: PeerId,
center_peer: AtomicCell<PeerId>,
job_ctx: T,
@@ -49,22 +61,17 @@ struct PeridicJobCtx<T> {
impl PeerCenterBase {
pub async fn init(&self) -> Result<(), Error> {
let Some(peer_mgr) = self.peer_mgr.upgrade() else {
let Some(rpc_mgr) = self.peer_mgr.get_rpc_mgr().upgrade() else {
return Err(Error::Shutdown);
};
peer_mgr
.get_peer_rpc_mgr()
.rpc_server()
.registry()
.register(
PeerCenterRpcServer::new(PeerCenterServer::new(peer_mgr.my_peer_id())),
&peer_mgr.get_global_ctx().get_network_name(),
);
rpc_mgr.rpc_server().registry().register(
PeerCenterRpcServer::new(PeerCenterServer::new(self.peer_mgr.my_peer_id())),
&self.peer_mgr.get_global_ctx().get_network_name(),
);
Ok(())
}
async fn select_center_peer(peer_mgr: &Arc<PeerManager>) -> Option<PeerId> {
async fn select_center_peer(peer_mgr: &dyn PeerCenterPeerManagerTrait) -> Option<PeerId> {
let peers = peer_mgr.list_routes().await;
if peers.is_empty() {
return None;
@@ -109,19 +116,18 @@ impl PeerCenterBase {
job_ctx,
});
loop {
let Some(peer_mgr) = peer_mgr.upgrade() else {
tracing::error!("peer manager is shutdown, exit periodic job");
return;
};
let Some(center_peer) = Self::select_center_peer(&peer_mgr).await else {
tracing::trace!("no center peer found, sleep 1 second");
tokio::time::sleep(Duration::from_secs(1)).await;
continue;
};
let Some(rpc_mgr) = peer_mgr.get_rpc_mgr().upgrade() else {
tracing::error!("rpc manager is shutdown, exit periodic job");
return;
};
ctx.center_peer.store(center_peer.clone());
tracing::trace!(?center_peer, "run periodic job");
let rpc_mgr = peer_mgr.get_peer_rpc_mgr();
let _g = lock.lock().await;
let stub = rpc_mgr
.rpc_client()
@@ -148,10 +154,11 @@ impl PeerCenterBase {
);
}
pub fn new(peer_mgr: Arc<PeerManager>) -> Self {
pub fn new(peer_mgr: Arc<dyn PeerCenterPeerManagerTrait>) -> Self {
let my_peer_id = peer_mgr.my_peer_id();
PeerCenterBase {
peer_mgr: Arc::downgrade(&peer_mgr),
my_peer_id: peer_mgr.my_peer_id(),
peer_mgr,
my_peer_id,
tasks: Mutex::new(JoinSet::new()),
lock: Arc::new(Mutex::new(())),
}
@@ -190,7 +197,7 @@ impl PeerCenterRpc for PeerCenterInstanceService {
}
pub struct PeerCenterInstance {
peer_mgr: Arc<PeerManager>,
peer_mgr: Arc<dyn PeerCenterPeerManagerTrait>,
client: Arc<PeerCenterBase>,
global_peer_map: Arc<RwLock<GlobalPeerMap>>,
@@ -199,7 +206,7 @@ pub struct PeerCenterInstance {
}
impl PeerCenterInstance {
pub fn new(peer_mgr: Arc<PeerManager>) -> Self {
pub fn new(peer_mgr: Arc<dyn PeerCenterPeerManagerTrait>) -> Self {
PeerCenterInstance {
peer_mgr: peer_mgr.clone(),
client: Arc::new(PeerCenterBase::new(peer_mgr.clone())),
@@ -286,15 +293,14 @@ impl PeerCenterInstance {
async fn init_report_peers_job(&self) {
struct Ctx {
service: PeerManagerRpcService,
peer_mgr: Arc<dyn PeerCenterPeerManagerTrait>,
last_report_peers: Mutex<BTreeSet<PeerId>>,
last_center_peer: AtomicCell<PeerId>,
last_report_time: AtomicCell<Instant>,
}
let ctx = Arc::new(Ctx {
service: PeerManagerRpcService::new(self.peer_mgr.clone()),
peer_mgr: self.peer_mgr.clone(),
last_report_peers: Mutex::new(BTreeSet::new()),
last_center_peer: AtomicCell::new(PeerId::default()),
last_report_time: AtomicCell::new(Instant::now()),
@@ -303,7 +309,7 @@ impl PeerCenterInstance {
self.client
.init_periodic_job(ctx, |client, ctx| async move {
let my_node_id = ctx.my_peer_id;
let peers: PeerInfoForGlobalMap = ctx.job_ctx.service.list_peers().await.into();
let peers = ctx.job_ctx.peer_mgr.list_peers().await;
let peer_list = peers.direct_peers.keys().map(|k| *k).collect();
let job_ctx = &ctx.job_ctx;
@@ -373,7 +379,7 @@ impl PeerCenterInstance {
if let Some(cost) = self.directed_cost(src, dst) {
return cost;
}
self.directed_cost(dst, src).unwrap_or(100)
self.directed_cost(dst, src).unwrap_or(500)
}
fn begin_update(&mut self) {
@@ -402,6 +408,81 @@ impl PeerCenterInstance {
}
}
#[async_trait::async_trait]
impl PeerCenterPeerManagerTrait for PeerManager {
async fn list_peers(&self) -> PeerInfoForGlobalMap {
PeerManagerRpcService::list_peers(self).await.into()
}
fn my_peer_id(&self) -> PeerId {
self.get_peer_map().my_peer_id()
}
fn get_global_ctx(&self) -> Arc<GlobalCtx> {
self.get_peer_map().get_global_ctx()
}
fn get_rpc_mgr(&self) -> Weak<PeerRpcManager> {
Arc::downgrade(&self.get_peer_rpc_mgr())
}
async fn list_routes(&self) -> Vec<crate::proto::cli::Route> {
self.list_routes().await
}
}
pub struct PeerMapWithPeerRpcManager {
pub peer_map: Arc<PeerMap>,
pub rpc_mgr: Arc<PeerRpcManager>,
}
#[async_trait::async_trait]
impl PeerCenterPeerManagerTrait for PeerMapWithPeerRpcManager {
async fn list_peers(&self) -> PeerInfoForGlobalMap {
// TODO: currently latency between public server cannot be calculated because one public-server pair
// has no connection between them. (hard to get latency from peer manager because it's hard to transfrom the peer id)
// but it's fine because we don't want to too much traffic between public servers.
let peers = self.peer_map.list_peers().await;
let mut ret = PeerInfoForGlobalMap::default();
for peer in peers {
if let Some(conns) = self.peer_map.list_peer_conns(peer).await {
let Some(min_lat) = conns
.iter()
.map(|conn| conn.stats.as_ref().unwrap().latency_us)
.min()
else {
continue;
};
ret.direct_peers.insert(
peer,
DirectConnectedPeerInfo {
latency_ms: std::cmp::max(1, (min_lat as u32 / 1000) as i32),
},
);
}
}
ret
}
fn my_peer_id(&self) -> PeerId {
self.peer_map.my_peer_id()
}
fn get_global_ctx(&self) -> Arc<GlobalCtx> {
self.peer_map.get_global_ctx()
}
fn get_rpc_mgr(&self) -> Weak<PeerRpcManager> {
Arc::downgrade(&self.rpc_mgr)
}
async fn list_routes(&self) -> Vec<crate::proto::cli::Route> {
self.peer_map.list_route_infos().await
}
}
#[cfg(test)]
mod tests {
use crate::{