From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 482191FF14C for ; Fri, 29 May 2026 15:31:01 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 26231BE32; Fri, 29 May 2026 15:31:01 +0200 (CEST) From: Dominik Csapak To: pdm-devel@lists.proxmox.com Subject: [PATCH datacenter-manager 2/4] server: remote cache: prepare for back-off mechanism Date: Fri, 29 May 2026 15:30:18 +0200 Message-ID: <20260529133026.3149896-3-d.csapak@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260529133026.3149896-1-d.csapak@proxmox.com> References: <20260529133026.3149896-1-d.csapak@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL 0.049 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [mod.rs] Message-ID-Hash: 7JY2SWSGAVFKNAYKBZ4F6MTBU4N4GSQR X-Message-ID-Hash: 7JY2SWSGAVFKNAYKBZ4F6MTBU4N4GSQR X-MailFrom: d.csapak@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox Datacenter Manager development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: this introduces a new field for the RemoteMappingCache that contains the current status of a 'BackOffState'. This is intended to mark remotes as unreachable when the connection to them fails and only to retry if enough time elapsed. This is to prevent sending numerous connections out to a remote that is known to not be reachable. The back-off timeout is increased exponentially from 10 seconds up to 600 seconds, so at most it takes 10 minutes for a remote to be reachable again if it was offline for a prolonged period of time. Signed-off-by: Dominik Csapak --- Note that this now takes up to 10 minutes for pdm to mark a remote as reachable again, since it won't retry sooner. We could combat that by e.g. retrying every 10th connection, even if the back-off timeout has not run out yet. (probably has to be scaled by the nodes and tasks we are running?). Another possibility would be to have either a special API call to force refresh it, but my guess is that most users would just abuse that button? I'm very open for other ideas on how to improve this, maybe it's just a matter of finetuning the back-off scale and maximum to get a well working system. server/src/remote_cache/mod.rs | 139 +++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs index 39483e9f..d50a324c 100644 --- a/server/src/remote_cache/mod.rs +++ b/server/src/remote_cache/mod.rs @@ -24,6 +24,7 @@ use serde::{Deserialize, Serialize}; use proxmox_product_config::replace_config; use proxmox_product_config::{ApiLockGuard, open_api_lockfile}; +use proxmox_time::epoch_i64; use pdm_api_types::remotes::RemoteType; use pdm_config::ConfigVersionCache; @@ -224,6 +225,84 @@ impl RemoteMappingCache { self.info_by_hostname(remote, hostname) .is_none_or(|info| info.reachable) } + + /// Mark a remote as reachable. + pub fn mark_remote_reachable(&mut self, remote: &str, reachable: RemoteState) { + if let Some(remote) = self.remotes.get_mut(remote) { + remote.set_reachable(reachable); + } + } + + /// Returns the remaining backoff time and optionally the last error we got. + pub fn remote_remaining_backoff_time( + &self, + remote: &str, + current_time: i64, + ) -> (i64, Option) { + match self.remotes.get(remote) { + Some(remote) => match &remote.backoff { + Some(backoff) => ( + backoff.remaining_backoff_time(current_time), + Some(backoff.last_error.clone()), + ), + None => (0, None), + }, + None => (0, None), // no info about remote, are we allowed to try? + } + } +} + +/// The initial backoff time for a remote in case it's offline. +const BACK_OFF_BASE_TIME_S: i64 = 10; +/// The maximum back-off time between retries. +const BACK_OFF_MAX_TIME_S: i64 = 600; + +/// Holds the current state for backing off an offline remote +#[derive(Clone, Deserialize, Serialize)] +struct BackOffState { + /// The last time a connection was tried + last_try: i64, + /// How often the back-off time is doubled since the start, up to a maximum of + /// [BACK_OFF_MAX_TIME_S] + back_off_doubling_count: u32, + /// The last error message we got: + last_error: String, +} + +impl BackOffState { + /// Creates a new backoff state + fn new(time: i64, error: String) -> Self { + Self { + last_try: time, + back_off_doubling_count: 0, + last_error: error, + } + } + + /// Increases the backoff state when the remote is still unreachable. + fn increase(&mut self, time: i64, error: String) { + if self.remaining_backoff_time(time) < BACK_OFF_MAX_TIME_S { + self.back_off_doubling_count += 1; + } + self.last_try = time; + self.last_error = error; + } + + /// Returns the remaining time to try again in seconds. + /// If negative, we're free to try again. + fn remaining_backoff_time(&self, current_time: i64) -> i64 { + let back_off_time = (BACK_OFF_BASE_TIME_S * 2i64.pow(self.back_off_doubling_count)) + .min(BACK_OFF_MAX_TIME_S); + self.last_try + back_off_time - current_time + } +} + +/// If a remote is reachable or not +pub enum RemoteState { + /// The remote is reachable + Reachable, + /// The remote is not reachable. Contains the error. + Unreachable(String), } /// An entry for a remote in a [`RemoteMappingCache`]. @@ -237,6 +316,9 @@ pub struct RemoteMapping { /// Maps a node name to a hostname, for where we have that info. pub node_to_host: HashMap, + + /// internal backoff state, only controlled via [Self::set_reachable] + backoff: Option, } impl RemoteMapping { @@ -245,6 +327,7 @@ impl RemoteMapping { ty, hosts: HashMap::new(), node_to_host: HashMap::new(), + backoff: None, } } @@ -260,6 +343,22 @@ impl RemoteMapping { } } } + + /// Sets the remote to reachable or unreachable + pub fn set_reachable(&mut self, reachable: RemoteState) { + match reachable { + RemoteState::Reachable => { + self.backoff = None; + } + RemoteState::Unreachable(err) => { + let time = epoch_i64(); + match &mut self.backoff { + Some(backoff) => backoff.increase(time, err), + None => self.backoff = Some(BackOffState::new(time, err)), + } + } + } + } } /// All the data we keep cached for nodes found in [`RemoteMapping`]. @@ -289,3 +388,43 @@ impl HostInfo { self.node_name.as_deref() } } + +#[cfg(test)] +mod test { + use crate::remote_cache::BackOffState; + + #[test] + fn test_back_off_calculation() { + let mut backoff = BackOffState::new(0, String::new()); + // timeout should be @ 10 seconds + + assert_eq!(backoff.remaining_backoff_time(1), 9); + assert_eq!(backoff.remaining_backoff_time(5), 5); + assert_eq!(backoff.remaining_backoff_time(9), 1); + assert_eq!(backoff.remaining_backoff_time(10), 0); + assert_eq!(backoff.remaining_backoff_time(15), -5); + + backoff.increase(0, String::new()); + // timeout should be now @ 2*10 seconds + + assert_eq!(backoff.remaining_backoff_time(10), 10); + assert_eq!(backoff.remaining_backoff_time(15), 5); + assert_eq!(backoff.remaining_backoff_time(20), 0); + assert_eq!(backoff.remaining_backoff_time(30), -10); + + backoff.increase(0, String::new()); + backoff.increase(0, String::new()); + backoff.increase(0, String::new()); + + // timeout should be now at 2^4 * 10 seconds + assert_eq!(backoff.remaining_backoff_time(150), 10); + + backoff.increase(0, String::new()); + backoff.increase(0, String::new()); + backoff.increase(0, String::new()); + backoff.increase(0, String::new()); + + // timeout should be now at 2^8 * 10 seconds -> maximum of 600 + assert_eq!(backoff.remaining_backoff_time(590), 10); + } +} -- 2.47.3