add stats metrics (#1207)

support new cli command `easytier-cli stats`

It's useful to find out which components are consuming bandwidth.
This commit is contained in:
Sijie.Sun
2025-08-09 00:06:35 +08:00
committed by GitHub
parent efa17a7c10
commit 8cdb27d43d
15 changed files with 1442 additions and 19 deletions
+23
View File
@@ -302,3 +302,26 @@ service PortForwardManageRpc {
rpc RemovePortForward(RemovePortForwardRequest) returns (RemovePortForwardResponse);
rpc ListPortForward(ListPortForwardRequest) returns (ListPortForwardResponse);
}
// One metric reading: a metric name, an unsigned 64-bit value, and the
// string key/value labels attached to that reading.
message MetricSnapshot {
// Name of the metric.
string name = 1;
// Value of the metric.
uint64 value = 2;
// Label key/value pairs that qualify this reading.
map<string, string> labels = 3;
}
// Request for StatsRpc.GetStats; carries no fields.
message GetStatsRequest {}
// Response for StatsRpc.GetStats.
message GetStatsResponse {
// The returned metric readings, one MetricSnapshot per entry.
repeated MetricSnapshot metrics = 1;
}
// Request for StatsRpc.GetPrometheusStats; carries no fields.
message GetPrometheusStatsRequest {}
// Response for StatsRpc.GetPrometheusStats.
message GetPrometheusStatsResponse {
// Metrics rendered as a single text payload.
// NOTE(review): presumably the Prometheus text exposition format — confirm
// against the server-side implementation.
string prometheus_text = 1;
}
// RPC service for reading metrics.
service StatsRpc {
// Returns the metrics as a list of structured MetricSnapshot entries.
rpc GetStats(GetStatsRequest) returns (GetStatsResponse);
// Returns the metrics as one text payload (see GetPrometheusStatsResponse).
rpc GetPrometheusStats(GetPrometheusStatsRequest) returns (GetPrometheusStatsResponse);
}
+16 -1
View File
@@ -9,7 +9,8 @@ use crate::{
tunnel::{packet_def::PacketType, ring::create_ring_tunnel_pair, Tunnel},
};
use super::{client::Client, server::Server};
use super::{client::Client, server::Server, service_registry::ServiceRegistry};
use crate::common::stats_manager::StatsManager;
pub struct BidirectRpcManager {
rpc_client: Client,
@@ -38,6 +39,20 @@ impl BidirectRpcManager {
}
}
/// Builds a manager whose RPC client and RPC server both share `stats_manager`.
///
/// The server is given a freshly created [`ServiceRegistry`]. No receive timeout,
/// stored error, tunnel or task set is configured, and the `running` flag starts
/// out `false`.
pub fn new_with_stats_manager(stats_manager: Arc<StatsManager>) -> Self {
    // The client takes its own handle; the server consumes the caller's handle.
    let rpc_client = Client::new_with_stats_manager(Arc::clone(&stats_manager));
    let registry = Arc::new(ServiceRegistry::new());
    let rpc_server = Server::new_with_registry_and_stats_manager(registry, stats_manager);

    Self {
        rpc_client,
        rpc_server,
        rx_timeout: None,
        error: Arc::new(Mutex::new(None)),
        tunnel: Mutex::new(None),
        running: Arc::new(AtomicBool::new(false)),
        tasks: Mutex::new(None),
    }
}
pub fn set_rx_timeout(mut self, timeout: Option<std::time::Duration>) -> Self {
self.rx_timeout = timeout;
self
+67 -3
View File
@@ -10,7 +10,10 @@ use tokio::task::JoinSet;
use tokio::time::timeout;
use tokio_stream::StreamExt;
use crate::common::PeerId;
use crate::common::{
stats_manager::{LabelSet, LabelType, MetricName, StatsManager},
PeerId,
};
use crate::defer;
use crate::proto::common::{
CompressionAlgoPb, RpcCompressionInfo, RpcDescriptor, RpcPacket, RpcRequest, RpcResponse,
@@ -66,6 +69,7 @@ pub struct Client {
inflight_requests: InflightRequestTable,
peer_info: PeerInfoTable,
tasks: Mutex<JoinSet<()>>,
stats_manager: Option<Arc<StatsManager>>,
}
impl Client {
@@ -77,6 +81,19 @@ impl Client {
inflight_requests: Arc::new(DashMap::new()),
peer_info: Arc::new(DashMap::new()),
tasks: Mutex::new(JoinSet::new()),
stats_manager: None,
}
}
/// Creates a client that holds `stats_manager` for recording RPC metrics.
///
/// A new ring tunnel pair is created; one end backs `mpsc` and the other backs
/// `transport`. The in-flight request table, peer info table and task set all
/// start out empty.
pub fn new_with_stats_manager(stats_manager: Arc<StatsManager>) -> Self {
    let (mpsc_end, transport_end) = create_ring_tunnel_pair();
    let mpsc = Mutex::new(MpscTunnel::new(mpsc_end, None));
    let transport = Mutex::new(MpscTunnel::new(transport_end, None));

    Self {
        mpsc,
        transport,
        inflight_requests: Arc::new(DashMap::new()),
        peer_info: Arc::new(DashMap::new()),
        tasks: Mutex::new(JoinSet::new()),
        stats_manager: Some(stats_manager),
    }
}
@@ -168,6 +185,7 @@ impl Client {
zc_packet_sender: MpscTunnelSender,
inflight_requests: InflightRequestTable,
peer_info: PeerInfoTable,
stats_manager: Option<Arc<StatsManager>>,
_phan: PhantomData<F>,
}
@@ -196,6 +214,7 @@ impl Client {
method: <Self::Descriptor as ServiceDescriptor>::Method,
input: bytes::Bytes,
) -> Result<bytes::Bytes> {
let start_time = std::time::Instant::now();
let transaction_id = CUR_TID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let (tx, mut rx) = mpsc::unbounded_channel();
let key = InflightRequestKey {
@@ -203,6 +222,13 @@ impl Client {
to_peer_id: self.to_peer_id,
transaction_id,
};
let desc = self.service_descriptor();
let labels = LabelSet::new()
.with_label_type(LabelType::NetworkName(self.domain_name.to_string()))
.with_label_type(LabelType::SrcPeerId(self.from_peer_id))
.with_label_type(LabelType::DstPeerId(self.to_peer_id))
.with_label_type(LabelType::ServiceName(desc.name().to_string()))
.with_label_type(LabelType::MethodName(method.name().to_string()));
defer!(self.inflight_requests.remove(&key););
self.inflight_requests.insert(
@@ -210,11 +236,16 @@ impl Client {
InflightRequest {
sender: tx,
merger: PacketMerger::new(),
start_time: std::time::Instant::now(),
start_time,
},
);
let desc = self.service_descriptor();
// Record RPC client TX stats
if let Some(ref stats_manager) = self.stats_manager {
stats_manager
.get_counter(MetricName::PeerRpcClientTx, labels.clone())
.inc();
}
let rpc_desc = RpcDescriptor {
domain_name: self.domain_name.clone(),
@@ -281,12 +312,44 @@ impl Client {
let rpc_resp = RpcResponse::decode(Bytes::from(rpc_packet.body))?;
if let Some(err) = &rpc_resp.error {
// Record RPC error stats
if let Some(ref stats_manager) = self.stats_manager {
let labels = labels
.clone()
.with_label_type(LabelType::ErrorType(format!("{:?}", err.error_kind)))
.with_label_type(LabelType::Status("error".to_string()));
stats_manager
.get_counter(MetricName::PeerRpcErrors, labels.clone())
.inc();
let duration_ms = start_time.elapsed().as_millis() as u64;
stats_manager
.get_counter(MetricName::PeerRpcDuration, labels)
.add(duration_ms);
}
return Err(err.into());
}
let raw_output = Bytes::from(rpc_resp.response.clone());
ctrl.set_raw_output(raw_output.clone());
// Record RPC client RX and duration stats
if let Some(ref stats_manager) = self.stats_manager {
let labels = labels
.clone()
.with_label_type(LabelType::Status("success".to_string()));
stats_manager
.get_counter(MetricName::PeerRpcClientRx, labels.clone())
.inc();
let duration_ms = start_time.elapsed().as_millis() as u64;
stats_manager
.get_counter(MetricName::PeerRpcDuration, labels)
.add(duration_ms);
}
Ok(raw_output)
}
}
@@ -298,6 +361,7 @@ impl Client {
zc_packet_sender: self.mpsc.lock().unwrap().get_sink(),
inflight_requests: self.inflight_requests.clone(),
peer_info: self.peer_info.clone(),
stats_manager: self.stats_manager.clone(),
_phan: PhantomData,
})
}
+72 -1
View File
@@ -10,7 +10,11 @@ use tokio::{task::JoinSet, time::timeout};
use tokio_stream::StreamExt;
use crate::{
common::{join_joinset_background, PeerId},
common::{
join_joinset_background,
stats_manager::{LabelSet, LabelType, MetricName, StatsManager},
PeerId,
},
proto::{
common::{
self, CompressionAlgoPb, RpcCompressionInfo, RpcPacket, RpcRequest, RpcResponse,
@@ -46,6 +50,7 @@ pub struct Server {
tasks: Arc<Mutex<JoinSet<()>>>,
packet_mergers: Arc<DashMap<PacketMergerKey, PacketMerger>>,
stats_manager: Option<Arc<StatsManager>>,
}
impl Server {
@@ -62,6 +67,23 @@ impl Server {
transport: Mutex::new(MpscTunnel::new(ring_b, None)),
tasks: Arc::new(Mutex::new(JoinSet::new())),
packet_mergers: Arc::new(DashMap::new()),
stats_manager: None,
}
}
/// Creates a server that dispatches through `registry` and holds `stats_manager`
/// for recording RPC metrics.
///
/// A new ring tunnel pair is created; one end is stored (wrapped in `Some`) in
/// `mpsc` and the other backs `transport`. The task set and packet merger table
/// start out empty.
pub fn new_with_registry_and_stats_manager(
    registry: Arc<ServiceRegistry>,
    stats_manager: Arc<StatsManager>,
) -> Self {
    let (mpsc_end, transport_end) = create_ring_tunnel_pair();
    let mpsc = Mutex::new(Some(MpscTunnel::new(mpsc_end, None)));
    let transport = Mutex::new(MpscTunnel::new(transport_end, None));

    Self {
        registry,
        mpsc,
        transport,
        tasks: Arc::new(Mutex::new(JoinSet::new())),
        packet_mergers: Arc::new(DashMap::new()),
        stats_manager: Some(stats_manager),
    }
}
@@ -85,6 +107,7 @@ impl Server {
let packet_merges = self.packet_mergers.clone();
let reg = self.registry.clone();
let stats_manager = self.stats_manager.clone();
let t = Arc::downgrade(&tasks);
let tunnel_info = mpsc.tunnel_info();
tasks.lock().unwrap().spawn(async move {
@@ -133,6 +156,7 @@ impl Server {
packet,
reg.clone(),
tunnel_info.clone(),
stats_manager.clone(),
));
}
Ok(None) => {}
@@ -189,12 +213,27 @@ impl Server {
packet: RpcPacket,
reg: Arc<ServiceRegistry>,
tunnel_info: Option<TunnelInfo>,
stats_manager: Option<Arc<StatsManager>>,
) {
let from_peer = packet.from_peer;
let to_peer = packet.to_peer;
let transaction_id = packet.transaction_id;
let trace_id = packet.trace_id;
let desc = packet.descriptor.clone().unwrap();
let method_name = reg.get_method_name(&desc).unwrap_or("<Nil>".to_owned());
let labels = LabelSet::new()
.with_label_type(LabelType::NetworkName(desc.domain_name.to_string()))
.with_label_type(LabelType::SrcPeerId(from_peer))
.with_label_type(LabelType::DstPeerId(to_peer))
.with_label_type(LabelType::ServiceName(desc.service_name.to_string()))
.with_label_type(LabelType::MethodName(method_name));
// Record RPC server RX stats
if let Some(ref stats_manager) = stats_manager {
stats_manager
.get_counter(MetricName::PeerRpcServerRx, labels.clone())
.inc();
}
let mut resp_msg = RpcResponse::default();
let now = std::time::Instant::now();
@@ -205,9 +244,41 @@ impl Server {
match &resp_bytes {
Ok(r) => {
resp_msg.response = r.clone().into();
// Record successful RPC server TX and duration stats
if let Some(ref stats_manager) = stats_manager {
let labels = labels
.clone()
.with_label_type(LabelType::Status("success".to_string()));
stats_manager
.get_counter(MetricName::PeerRpcServerTx, labels.clone())
.inc();
let duration_ms = now.elapsed().as_millis() as u64;
stats_manager
.get_counter(MetricName::PeerRpcDuration, labels)
.add(duration_ms);
}
}
Err(err) => {
resp_msg.error = Some(err.into());
// Record RPC server error stats
if let Some(ref stats_manager) = stats_manager {
let labels = labels
.clone()
.with_label_type(LabelType::Status("error".to_string()));
stats_manager
.get_counter(MetricName::PeerRpcErrors, labels.clone())
.inc();
let duration_ms = now.elapsed().as_millis() as u64;
stats_manager
.get_counter(MetricName::PeerRpcDuration, labels)
.add(duration_ms);
}
}
};
resp_msg.runtime_us = now.elapsed().as_micros() as u64;
@@ -78,6 +78,14 @@ impl ServiceRegistry {
self.table.insert(key, entry);
}
pub fn get_method_name(&self, rpc_desc: &RpcDescriptor) -> Option<String> {
let service_key = ServiceKey::from(rpc_desc);
let entry = self.table.get(&service_key)?;
let method_index = rpc_desc.method_index as u8;
let method_name = entry.service.get_method_name(method_index).ok()?;
Some(method_name)
}
pub fn unregister<H: Handler<Controller = RpcController>>(
&self,
h: H,
+10
View File
@@ -1,4 +1,6 @@
//! Traits for defining generic RPC handlers.
use crate::proto::rpc_types::descriptor::MethodDescriptor;
use super::{
controller::Controller,
descriptor::{self, ServiceDescriptor},
@@ -49,6 +51,8 @@ pub trait HandlerExt: Send + Sync + 'static {
method_index: u8,
input: bytes::Bytes,
) -> super::error::Result<bytes::Bytes>;
/// Returns the name of the method identified by `method_index`, or an error
/// when the index cannot be resolved to a method.
fn get_method_name(&self, method_index: u8) -> super::error::Result<String>;
}
#[async_trait::async_trait]
@@ -64,4 +68,10 @@ impl<C: Controller, T: Handler<Controller = C>> HandlerExt for T {
let method = self.get_method_from_index(method_index)?;
self.call(ctrl, method, input).await
}
/// Resolves `method_index` through `get_method_from_index` and returns the
/// resulting method's name as an owned `String`; a failed lookup is propagated
/// as the error.
fn get_method_name(&self, method_index: u8) -> super::error::Result<String> {
    Ok(self.get_method_from_index(method_index)?.name().to_string())
}
}