From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 5FF961FF13B for ; Mon, 08 Jun 2026 15:26:18 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 168D01DAAB; Mon, 8 Jun 2026 15:26:16 +0200 (CEST) From: Dominik Csapak To: pdm-devel@lists.proxmox.com Subject: [PATCH datacenter-manager v2 1/4] server: remote cache: prepare for back-off mechanism Date: Mon, 8 Jun 2026 15:25:29 +0200 Message-ID: <20260608132539.2949407-2-d.csapak@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260608132539.2949407-1-d.csapak@proxmox.com> References: <20260608132539.2949407-1-d.csapak@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL 0.049 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [connection.rs,mod.rs] Message-ID-Hash: BQJYKQ37SRXVU4YIXF7EY6RSS5WVYAKJ X-Message-ID-Hash: BQJYKQ37SRXVU4YIXF7EY6RSS5WVYAKJ X-MailFrom: d.csapak@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox Datacenter Manager development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Replace the 'reachable' field in `HostInfo` of the `RemoteMappingCache` with an `Option` instead. `None` means it's reachable and some state means it's not. The back-off state itself contains information about when it was last reachable and when it's time to try again. The current logic of the back-off state is: * Retry for 10 minutes before backing off * Start with backing off for 10 seconds. * Retrying after the current back-off time ran out adds a doubled back-off time (20s, 40s, etc.) in case the host is still offline. * do that until a maximum back-off time of 3 minutes is reached. Signed-off-by: Dominik Csapak --- .../tasks/remote_node_mapping.rs | 29 ++-- server/src/connection.rs | 21 ++- server/src/remote_cache/back_off.rs | 128 ++++++++++++++++++ server/src/remote_cache/mod.rs | 114 ++++++++++++++-- 4 files changed, 264 insertions(+), 28 deletions(-) create mode 100644 server/src/remote_cache/back_off.rs diff --git a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs index 9e2ebf15..867e01cf 100644 --- a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs +++ b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs @@ -25,7 +25,7 @@ use proxmox_section_config::typed::SectionConfigData; use pdm_api_types::remotes::{Remote, RemoteType}; -use server::remote_cache::{self, RemoteMappingCache}; +use server::remote_cache::{self, ConnectionState, RemoteMappingCache}; use server::task_utils; const CONFIG_POLL_INTERVAL: u64 = 60; @@ -188,22 +188,21 @@ impl CachingTask { log::debug!("querying remote {:?} node {:?}", remote.id, node.hostname); // if the host is new, we need to query its name - let query_result = match query_node_name(remote, &node.hostname).await { - Ok(node_name) => Some(node_name), - Err(err) => { - log::error!( - "failed to query info for remote '{}' node '{}' - {err:?}", - remote.id, - node.hostname - ); - None - } - }; + let (query_result, connection_state) = + match query_node_name(remote, &node.hostname).await { + Ok(node_name) => (Some(node_name), ConnectionState::Reachable), + Err(err) => { + log::error!( + "failed to query info for remote '{}' node '{}' - {err:?}", + remote.id, + node.hostname + ); + (None, ConnectionState::Unreachable(err.to_string())) + } + }; let mut cache = RemoteMappingCache::write()?; - if let Some(info) = cache.info_by_hostname_mut(&remote.id, &node.hostname) { - info.reachable = query_result.is_some(); - } + cache.mark_host_reachable(&remote.id, &node.hostname, connection_state); if let Some(node_name) = query_result { cache.set_node_name(&remote.id, &node.hostname, Some(node_name)); } diff --git a/server/src/connection.rs b/server/src/connection.rs index 7ad1a5b9..91a51cbf 100644 --- a/server/src/connection.rs +++ b/server/src/connection.rs @@ -25,6 +25,7 @@ use pdm_api_types::remotes::{NodeUrl, Remote, RemoteType, TlsProbeOutcome}; use pve_api_types::client::PveClientImpl; use crate::pbs_client::PbsClient; +use crate::remote_cache::ConnectionState; static INSTANCE: OnceLock> = OnceLock::new(); @@ -381,7 +382,7 @@ impl ClientFactory for DefaultClientFactory { ) -> Result, Error> { let cache = crate::remote_cache::RemoteMappingCache::get(); match cache.info_by_node_name(&remote.id, node) { - Some(info) if info.reachable => { + Some(info) if info.is_reachable() => { self.make_pve_client_with_endpoint(remote, Some(&info.hostname)) } _ => self.make_pve_client(remote), @@ -550,7 +551,11 @@ impl MultiClientState { let entry = self.get_entry(); log::error!("marking client {} as unreachable", entry.hostname); if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() { - cache.mark_host_reachable(&self.remote, &entry.hostname, false); + cache.mark_host_reachable( + &self.remote, + &entry.hostname, + ConnectionState::Unreachable("unknown error".to_string()), + ); let _ = cache.save(); } self.next(); @@ -775,7 +780,11 @@ macro_rules! try_request { log::error!("marking {hostname:?} as reachable again!"); if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() { - cache.mark_host_reachable(&$self.remote, &hostname, true); + cache.mark_host_reachable( + &$self.remote, + &hostname, + ConnectionState::Reachable, + ); let _ = cache.save(); } } @@ -797,7 +806,11 @@ macro_rules! try_request { if let Ok(result) = tokio::time::timeout($self.timeout, request).await { if result.is_ok() { if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() { - cache.mark_host_reachable(&$self.remote, &hostname, true); + cache.mark_host_reachable( + &$self.remote, + &hostname, + ConnectionState::Reachable, + ); let _ = cache.save(); } } diff --git a/server/src/remote_cache/back_off.rs b/server/src/remote_cache/back_off.rs new file mode 100644 index 00000000..b2eb7fcd --- /dev/null +++ b/server/src/remote_cache/back_off.rs @@ -0,0 +1,128 @@ +use serde::{Deserialize, Serialize}; + +/// The initial back-off time when the minimum offline time was reached. +const BACK_OFF_BASE_TIME_S: i64 = 10; +/// The maximum back-off time between retries. +const BACK_OFF_MAX_TIME_S: i64 = 3 * 60; +/// Minimum time a connection must fail before back-off kicks in. +const BACK_OFF_MIN_TIME_S: i64 = 10 * 60; + +/// Holds the current state for backing off an offline remote +#[derive(Clone, Deserialize, Serialize)] +pub struct BackOffState { + /// How often the back-off time is doubled since the start, up to a maximum of + /// [BACK_OFF_MAX_TIME_S] + back_off_doubling_count: u32, + /// The last error message we got. + last_error: String, + /// The first time it failed. + first_fail_time: i64, + /// The next time a connection can be retried. + next_try: i64, +} + +impl BackOffState { + /// Create a new back-off config when a host/remote is not reachable for the first time + pub fn new(time: i64, error: String) -> Self { + Self { + first_fail_time: time, + next_try: time, + back_off_doubling_count: 0, + last_error: error, + } + } + + /// Increases the back-off state when the remote is still unreachable. + /// Returns the next timestamp when it's allowed to retry if set. + pub fn retried(&mut self, time: i64, error: String) -> Option { + self.last_error = error; + + if self.time_to_next_try(time) > 0 { + // still in the back off period + return None; + } + + if self.back_off_doubling_count == 0 && (time - self.first_fail_time) < BACK_OFF_MIN_TIME_S + { + // failing, but did not reach the minimum fail timer yet + return None; + } + + // we're outside the back-off period, and later than the minimum + let back_off_time = (BACK_OFF_BASE_TIME_S * 2i64.pow(self.back_off_doubling_count)) + .min(BACK_OFF_MAX_TIME_S); + + if back_off_time < BACK_OFF_MAX_TIME_S { + self.back_off_doubling_count += 1; + } + + self.next_try = time + back_off_time; + Some(self.next_try) + } + + /// Returns the remaining time to try again in seconds. + /// If it is 0, we're free to try again. + pub fn time_to_next_try(&self, current_time: i64) -> u64 { + self.next_try.saturating_sub(current_time).max(0) as u64 + } + + /// Returns the last error that was set. + pub fn last_error(&self) -> String { + self.last_error.clone() + } +} + +#[cfg(test)] +mod test { + use super::{BACK_OFF_MAX_TIME_S, BACK_OFF_MIN_TIME_S, BackOffState}; + + #[test] + fn test_back_off_calculation() { + let mut time = 0; + let mut back_off = BackOffState::new(time, String::new()); + // timeout should be @ 0 seconds + + assert_eq!(back_off.time_to_next_try(time + 1), 0); + + time += BACK_OFF_MIN_TIME_S; + back_off.retried(time, String::new()); + let mut back_off_time = 10; + + assert_eq!(back_off.time_to_next_try(time + 5), 5); + assert_eq!(back_off.time_to_next_try(time + 10), 0); + assert_eq!(back_off.time_to_next_try(time + 20), 0); + + back_off.retried(time + back_off_time, String::new()); + time += back_off_time; + back_off_time *= 2; + back_off.retried(time, String::new()); + + assert_eq!(back_off.time_to_next_try(time), back_off_time as u64); + + time += back_off_time; + back_off_time *= 2; + back_off.retried(time, String::new()); + time += back_off_time; + back_off_time *= 2; + back_off.retried(time, String::new()); + time += back_off_time; + back_off_time *= 2; + back_off.retried(time, String::new()); + time += back_off_time; + back_off.retried(time, String::new()); + + // retried 7 times, back-off should have reached the max time by now + assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64); + + // simulate 12 hour outage + + let iterations = 12 * 60 * 60 / BACK_OFF_MAX_TIME_S; + for _ in 0..iterations { + time += BACK_OFF_MAX_TIME_S; + back_off.retried(time, String::new()); + } + + // timeout should still be at the maximum + assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64); + } +} diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs index 39483e9f..b6c2f48a 100644 --- a/server/src/remote_cache/mod.rs +++ b/server/src/remote_cache/mod.rs @@ -24,10 +24,14 @@ use serde::{Deserialize, Serialize}; use proxmox_product_config::replace_config; use proxmox_product_config::{ApiLockGuard, open_api_lockfile}; +use proxmox_time::epoch_i64; use pdm_api_types::remotes::RemoteType; use pdm_config::ConfigVersionCache; +mod back_off; +use back_off::BackOffState; + const CACHE_FILENAME: &str = concat!( pdm_buildcfg::PDM_CACHE_DIR_M!(), "/remote-mapping-cache.json" @@ -198,16 +202,26 @@ impl RemoteMappingCache { } /// Mark a host as reachable. - pub fn mark_host_reachable(&mut self, remote_name: &str, hostname: &str, reachable: bool) { + pub fn mark_host_reachable( + &mut self, + remote_name: &str, + hostname: &str, + connection_state: ConnectionState, + ) { if let Some(info) = self.info_by_hostname_mut(remote_name, hostname) { - info.reachable = reachable; + info.set_reachable(connection_state); } } /// Mark a host as reachable. - pub fn mark_node_reachable(&mut self, remote_name: &str, node_name: &str, reachable: bool) { + pub fn mark_node_reachable( + &mut self, + remote_name: &str, + node_name: &str, + connection_state: ConnectionState, + ) { if let Some(info) = self.info_by_node_name_mut(remote_name, node_name) { - info.reachable = reachable; + info.set_reachable(connection_state); } } @@ -222,10 +236,70 @@ impl RemoteMappingCache { /// Check if a host is reachable. pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool { self.info_by_hostname(remote, hostname) - .is_none_or(|info| info.reachable) + .is_none_or(|info| info.is_reachable()) + } + + /// Get the next time to try the host and the last error if it was not reachable. + pub fn host_time_to_next_try( + &self, + remote: &str, + hostname: &str, + current_time: i64, + ) -> Option<(u64, String)> { + self.info_by_hostname(remote, hostname) + .and_then(|info| info.back_off.as_ref()) + .map(|back_off| { + ( + back_off.time_to_next_try(current_time), + back_off.last_error(), + ) + }) + } + + /// Returns the remaining back-off time and optionally the last error we got. + pub fn remote_time_to_next_try( + &self, + remote: &str, + current_time: i64, + ) -> Option<(u64, String)> { + match self.remotes.get(remote) { + Some(remote) => { + let mut time = u64::MAX; + let mut err = String::new(); + for info in remote.hosts.values() { + if let Some(back_off) = &info.back_off { + let node_time = back_off.time_to_next_try(current_time); + // use the least time from the hosts + if node_time < time { + time = node_time; + err = back_off.last_error(); + } + } else { + // we found a node that is reachable, return immediately + return None; + } + } + + if time == u64::MAX { + // we had no node information so we're allowed to try + return None; + } + + Some((time, err)) + } + None => None, // no info about remote, are we allowed to try + } } } +/// If a remote is reachable or not +pub enum ConnectionState { + /// The host/remote/etc. is reachable + Reachable, + /// The remote/host/etc. is not reachable. Contains the error. + Unreachable(String), +} + /// An entry for a remote in a [`RemoteMappingCache`]. #[derive(Clone, Deserialize, Serialize)] pub struct RemoteMapping { @@ -271,9 +345,9 @@ pub struct HostInfo { /// This is the cluster side node name, if we know it. node_name: Option, - /// This means we were able to reach the node. - /// When a client fails to connect it may update this to mark it as unreachable. - pub reachable: bool, + /// Per host back off config + #[serde(default, skip_serializing_if = "Option::is_none")] + back_off: Option, } impl HostInfo { @@ -281,11 +355,33 @@ impl HostInfo { Self { hostname, node_name: None, - reachable: true, + back_off: None, } } pub fn node_name(&self) -> Option<&str> { self.node_name.as_deref() } + + /// Sets the host's reachable status. + /// Returns the next timestamp when it's allowed to retry if set. + pub fn set_reachable(&mut self, connection_state: ConnectionState) -> Option { + match connection_state { + ConnectionState::Reachable => { + self.back_off = None; + } + ConnectionState::Unreachable(err) => { + let time = epoch_i64(); + match &mut self.back_off { + Some(back_off) => return back_off.retried(time, err), + None => self.back_off = Some(BackOffState::new(time, err)), + } + } + } + None + } + + pub fn is_reachable(&self) -> bool { + self.back_off.is_none() + } } -- 2.47.3