From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pdm-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9])
	by lore.proxmox.com (Postfix) with ESMTPS id 5FF961FF13B
	for <inbox@lore.proxmox.com>; Mon, 08 Jun 2026 15:26:18 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id 168D01DAAB;
	Mon,  8 Jun 2026 15:26:16 +0200 (CEST)
From: Dominik Csapak <d.csapak@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [PATCH datacenter-manager v2 1/4] server: remote cache: prepare for
 back-off mechanism
Date: Mon,  8 Jun 2026 15:25:29 +0200
Message-ID: <20260608132539.2949407-2-d.csapak@proxmox.com>
X-Mailer: git-send-email 2.47.3
In-Reply-To: <20260608132539.2949407-1-d.csapak@proxmox.com>
References: <20260608132539.2949407-1-d.csapak@proxmox.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
	AWL                     0.049 Adjusted score from AWL reputation of From:
 address
	BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
	DMARC_MISSING             0.1 Missing DMARC policy
	KAM_DMARC_STATUS         0.01 Test Rule for DKIM or SPF Failure with Strict
 Alignment
	SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
	SPF_PASS               -0.001 SPF: sender matches SPF record
	URIBL_BLOCKED           0.001 ADMINISTRATOR NOTICE: The query to URIBL was
 blocked.  See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block
 for more information. [connection.rs,mod.rs]
Message-ID-Hash: BQJYKQ37SRXVU4YIXF7EY6RSS5WVYAKJ
X-Message-ID-Hash: BQJYKQ37SRXVU4YIXF7EY6RSS5WVYAKJ
X-MailFrom: d.csapak@proxmox.com
X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop;
 banned-address; emergency; member-moderation; nonmember-moderation;
 administrivia; implicit-dest; max-recipients; max-size; news-moderation;
 no-subject; digests; suspicious-header
X-Mailman-Version: 3.3.10
Precedence: list
List-Id: Proxmox Datacenter Manager development discussion
 <pdm-devel.lists.proxmox.com>
List-Help: <mailto:pdm-devel-request@lists.proxmox.com?subject=help>
List-Owner: <mailto:pdm-devel-owner@lists.proxmox.com>
List-Post: <mailto:pdm-devel@lists.proxmox.com>
List-Subscribe: <mailto:pdm-devel-join@lists.proxmox.com>
List-Unsubscribe: <mailto:pdm-devel-leave@lists.proxmox.com>

Replace the 'reachable' field in `HostInfo` of the `RemoteMappingCache`
with an `Option<BackOffState>` instead. `None` means it's reachable and
some state means it's not.

The back-off state itself contains information about when it was last
reachable and when it's time to try again.

The current logic of the back-off state is:

* Retry for 10 minutes before backing off
* Start with backing off for 10 seconds.
* Retrying after the current back-off time ran out adds a doubled
  back-off time (20s, 40s, etc.) in case the host is still offline.
* do that until a maximum back-off time of 3 minutes is reached.

Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
 .../tasks/remote_node_mapping.rs              |  29 ++--
 server/src/connection.rs                      |  21 ++-
 server/src/remote_cache/back_off.rs           | 128 ++++++++++++++++++
 server/src/remote_cache/mod.rs                | 114 ++++++++++++++--
 4 files changed, 264 insertions(+), 28 deletions(-)
 create mode 100644 server/src/remote_cache/back_off.rs
diff --git a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
index 9e2ebf15..867e01cf 100644
--- a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
+++ b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
@@ -25,7 +25,7 @@ use proxmox_section_config::typed::SectionConfigData;
 
 use pdm_api_types::remotes::{Remote, RemoteType};
 
-use server::remote_cache::{self, RemoteMappingCache};
+use server::remote_cache::{self, ConnectionState, RemoteMappingCache};
 use server::task_utils;
 
 const CONFIG_POLL_INTERVAL: u64 = 60;
@@ -188,22 +188,21 @@ impl CachingTask {
             log::debug!("querying remote {:?} node {:?}", remote.id, node.hostname);
 
             // if the host is new, we need to query its name
-            let query_result = match query_node_name(remote, &node.hostname).await {
-                Ok(node_name) => Some(node_name),
-                Err(err) => {
-                    log::error!(
-                        "failed to query info for remote '{}' node '{}' - {err:?}",
-                        remote.id,
-                        node.hostname
-                    );
-                    None
-                }
-            };
+            let (query_result, connection_state) =
+                match query_node_name(remote, &node.hostname).await {
+                    Ok(node_name) => (Some(node_name), ConnectionState::Reachable),
+                    Err(err) => {
+                        log::error!(
+                            "failed to query info for remote '{}' node '{}' - {err:?}",
+                            remote.id,
+                            node.hostname
+                        );
+                        (None, ConnectionState::Unreachable(err.to_string()))
+                    }
+                };
 
             let mut cache = RemoteMappingCache::write()?;
-            if let Some(info) = cache.info_by_hostname_mut(&remote.id, &node.hostname) {
-                info.reachable = query_result.is_some();
-            }
+            cache.mark_host_reachable(&remote.id, &node.hostname, connection_state);
             if let Some(node_name) = query_result {
                 cache.set_node_name(&remote.id, &node.hostname, Some(node_name));
             }
diff --git a/server/src/connection.rs b/server/src/connection.rs
index 7ad1a5b9..91a51cbf 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -25,6 +25,7 @@ use pdm_api_types::remotes::{NodeUrl, Remote, RemoteType, TlsProbeOutcome};
 use pve_api_types::client::PveClientImpl;
 
 use crate::pbs_client::PbsClient;
+use crate::remote_cache::ConnectionState;
 
 static INSTANCE: OnceLock<Box<dyn ClientFactory + Send + Sync>> = OnceLock::new();
 
@@ -381,7 +382,7 @@ impl ClientFactory for DefaultClientFactory {
     ) -> Result<Arc<PveClient>, Error> {
         let cache = crate::remote_cache::RemoteMappingCache::get();
         match cache.info_by_node_name(&remote.id, node) {
-            Some(info) if info.reachable => {
+            Some(info) if info.is_reachable() => {
                 self.make_pve_client_with_endpoint(remote, Some(&info.hostname))
             }
             _ => self.make_pve_client(remote),
@@ -550,7 +551,11 @@ impl MultiClientState {
             let entry = self.get_entry();
             log::error!("marking client {} as unreachable", entry.hostname);
             if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
-                cache.mark_host_reachable(&self.remote, &entry.hostname, false);
+                cache.mark_host_reachable(
+                    &self.remote,
+                    &entry.hostname,
+                    ConnectionState::Unreachable("unknown error".to_string()),
+                );
                 let _ = cache.save();
             }
             self.next();
@@ -775,7 +780,11 @@ macro_rules! try_request {
                             log::error!("marking {hostname:?} as reachable again!");
                             if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write()
                             {
-                                cache.mark_host_reachable(&$self.remote, &hostname, true);
+                                cache.mark_host_reachable(
+                                    &$self.remote,
+                                    &hostname,
+                                    ConnectionState::Reachable,
+                                );
                                 let _ = cache.save();
                             }
                         }
@@ -797,7 +806,11 @@ macro_rules! try_request {
                 if let Ok(result) = tokio::time::timeout($self.timeout, request).await {
                     if result.is_ok() {
                         if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
-                            cache.mark_host_reachable(&$self.remote, &hostname, true);
+                            cache.mark_host_reachable(
+                                &$self.remote,
+                                &hostname,
+                                ConnectionState::Reachable,
+                            );
                             let _ = cache.save();
                         }
                     }
diff --git a/server/src/remote_cache/back_off.rs b/server/src/remote_cache/back_off.rs
new file mode 100644
index 00000000..b2eb7fcd
--- /dev/null
+++ b/server/src/remote_cache/back_off.rs
@@ -0,0 +1,128 @@
+use serde::{Deserialize, Serialize};
+
+/// The initial back-off time when the minimum offline time was reached.
+const BACK_OFF_BASE_TIME_S: i64 = 10;
+/// The maximum back-off time between retries.
+const BACK_OFF_MAX_TIME_S: i64 = 3 * 60;
+/// Minimum time a connection must fail before back-off kicks in.
+const BACK_OFF_MIN_TIME_S: i64 = 10 * 60;
+
+/// Holds the current state for backing off an offline remote
+#[derive(Clone, Deserialize, Serialize)]
+pub struct BackOffState {
+    /// How often the back-off time is doubled since the start, up to a maximum of
+    /// [BACK_OFF_MAX_TIME_S]
+    back_off_doubling_count: u32,
+    /// The last error message we got.
+    last_error: String,
+    /// The first time it failed.
+    first_fail_time: i64,
+    /// The next time a connection can be retried.
+    next_try: i64,
+}
+
+impl BackOffState {
+    /// Create a new back-off config when a host/remote is not reachable for the first time
+    pub fn new(time: i64, error: String) -> Self {
+        Self {
+            first_fail_time: time,
+            next_try: time,
+            back_off_doubling_count: 0,
+            last_error: error,
+        }
+    }
+
+    /// Increases the back-off state when the remote is still unreachable.
+    /// Returns the next timestamp when it's allowed to retry if set.
+    pub fn retried(&mut self, time: i64, error: String) -> Option<i64> {
+        self.last_error = error;
+
+        if self.time_to_next_try(time) > 0 {
+            // still in the back off period
+            return None;
+        }
+
+        if self.back_off_doubling_count == 0 && (time - self.first_fail_time) < BACK_OFF_MIN_TIME_S
+        {
+            // failing, but did not reach the minimum fail timer yet
+            return None;
+        }
+
+        // we're outside the back-off period, and later than the minimum
+        let back_off_time = (BACK_OFF_BASE_TIME_S * 2i64.pow(self.back_off_doubling_count))
+            .min(BACK_OFF_MAX_TIME_S);
+
+        if back_off_time < BACK_OFF_MAX_TIME_S {
+            self.back_off_doubling_count += 1;
+        }
+
+        self.next_try = time + back_off_time;
+        Some(self.next_try)
+    }
+
+    /// Returns the remaining time to try again in seconds.
+    /// If it is 0, we're free to try again.
+    pub fn time_to_next_try(&self, current_time: i64) -> u64 {
+        self.next_try.saturating_sub(current_time).max(0) as u64
+    }
+
+    /// Returns the last error that was set.
+    pub fn last_error(&self) -> String {
+        self.last_error.clone()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::{BACK_OFF_MAX_TIME_S, BACK_OFF_MIN_TIME_S, BackOffState};
+
+    #[test]
+    fn test_back_off_calculation() {
+        let mut time = 0;
+        let mut back_off = BackOffState::new(time, String::new());
+        // timeout should be @ 0 seconds
+
+        assert_eq!(back_off.time_to_next_try(time + 1), 0);
+
+        time += BACK_OFF_MIN_TIME_S;
+        back_off.retried(time, String::new());
+        let mut back_off_time = 10;
+
+        assert_eq!(back_off.time_to_next_try(time + 5), 5);
+        assert_eq!(back_off.time_to_next_try(time + 10), 0);
+        assert_eq!(back_off.time_to_next_try(time + 20), 0);
+
+        back_off.retried(time + back_off_time, String::new());
+        time += back_off_time;
+        back_off_time *= 2;
+        back_off.retried(time, String::new());
+
+        assert_eq!(back_off.time_to_next_try(time), back_off_time as u64);
+
+        time += back_off_time;
+        back_off_time *= 2;
+        back_off.retried(time, String::new());
+        time += back_off_time;
+        back_off_time *= 2;
+        back_off.retried(time, String::new());
+        time += back_off_time;
+        back_off_time *= 2;
+        back_off.retried(time, String::new());
+        time += back_off_time;
+        back_off.retried(time, String::new());
+
+        // retried 7 times, back-off should have reached the max time by now
+        assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64);
+
+        // simulate 12 hour outage
+
+        let iterations = 12 * 60 * 60 / BACK_OFF_MAX_TIME_S;
+        for _ in 0..iterations {
+            time += BACK_OFF_MAX_TIME_S;
+            back_off.retried(time, String::new());
+        }
+
+        // timeout should still be at the maximum
+        assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64);
+    }
+}
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 39483e9f..b6c2f48a 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -24,10 +24,14 @@ use serde::{Deserialize, Serialize};
 
 use proxmox_product_config::replace_config;
 use proxmox_product_config::{ApiLockGuard, open_api_lockfile};
+use proxmox_time::epoch_i64;
 
 use pdm_api_types::remotes::RemoteType;
 use pdm_config::ConfigVersionCache;
 
+mod back_off;
+use back_off::BackOffState;
+
 const CACHE_FILENAME: &str = concat!(
     pdm_buildcfg::PDM_CACHE_DIR_M!(),
     "/remote-mapping-cache.json"
@@ -198,16 +202,26 @@ impl RemoteMappingCache {
     }
 
     /// Mark a host as reachable.
-    pub fn mark_host_reachable(&mut self, remote_name: &str, hostname: &str, reachable: bool) {
+    pub fn mark_host_reachable(
+        &mut self,
+        remote_name: &str,
+        hostname: &str,
+        connection_state: ConnectionState,
+    ) {
         if let Some(info) = self.info_by_hostname_mut(remote_name, hostname) {
-            info.reachable = reachable;
+            info.set_reachable(connection_state);
         }
     }
 
     /// Mark a host as reachable.
-    pub fn mark_node_reachable(&mut self, remote_name: &str, node_name: &str, reachable: bool) {
+    pub fn mark_node_reachable(
+        &mut self,
+        remote_name: &str,
+        node_name: &str,
+        connection_state: ConnectionState,
+    ) {
         if let Some(info) = self.info_by_node_name_mut(remote_name, node_name) {
-            info.reachable = reachable;
+            info.set_reachable(connection_state);
         }
     }
 
@@ -222,10 +236,70 @@ impl RemoteMappingCache {
     /// Check if a host is reachable.
     pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool {
         self.info_by_hostname(remote, hostname)
-            .is_none_or(|info| info.reachable)
+            .is_none_or(|info| info.is_reachable())
+    }
+
+    /// Get the next time to try the host and the last error if it was not reachable.
+    pub fn host_time_to_next_try(
+        &self,
+        remote: &str,
+        hostname: &str,
+        current_time: i64,
+    ) -> Option<(u64, String)> {
+        self.info_by_hostname(remote, hostname)
+            .and_then(|info| info.back_off.as_ref())
+            .map(|back_off| {
+                (
+                    back_off.time_to_next_try(current_time),
+                    back_off.last_error(),
+                )
+            })
+    }
+
+    /// Returns the remaining back-off time and optionally the last error we got.
+    pub fn remote_time_to_next_try(
+        &self,
+        remote: &str,
+        current_time: i64,
+    ) -> Option<(u64, String)> {
+        match self.remotes.get(remote) {
+            Some(remote) => {
+                let mut time = u64::MAX;
+                let mut err = String::new();
+                for info in remote.hosts.values() {
+                    if let Some(back_off) = &info.back_off {
+                        let node_time = back_off.time_to_next_try(current_time);
+                        // use the least time from the hosts
+                        if node_time < time {
+                            time = node_time;
+                            err = back_off.last_error();
+                        }
+                    } else {
+                        // we found a node that is reachable, return immediately
+                        return None;
+                    }
+                }
+
+                if time == u64::MAX {
+                    // we had no node information so we're allowed to try
+                    return None;
+                }
+
+                Some((time, err))
+            }
+            None => None, // no info about remote, are we allowed to try
+        }
     }
 }
 
+/// If a remote is reachable or not
+pub enum ConnectionState {
+    /// The host/remote/etc. is reachable
+    Reachable,
+    /// The remote/host/etc. is not reachable. Contains the error.
+    Unreachable(String),
+}
+
 /// An entry for a remote in a [`RemoteMappingCache`].
 #[derive(Clone, Deserialize, Serialize)]
 pub struct RemoteMapping {
@@ -271,9 +345,9 @@ pub struct HostInfo {
     /// This is the cluster side node name, if we know it.
     node_name: Option<String>,
 
-    /// This means we were able to reach the node.
-    /// When a client fails to connect it may update this to mark it as unreachable.
-    pub reachable: bool,
+    /// Per host back off config
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    back_off: Option<BackOffState>,
 }
 
 impl HostInfo {
@@ -281,11 +355,33 @@ impl HostInfo {
         Self {
             hostname,
             node_name: None,
-            reachable: true,
+            back_off: None,
         }
     }
 
     pub fn node_name(&self) -> Option<&str> {
         self.node_name.as_deref()
     }
+
+    /// Sets the host's reachable status.
+    /// Returns the next timestamp when it's allowed to retry if set.
+    pub fn set_reachable(&mut self, connection_state: ConnectionState) -> Option<i64> {
+        match connection_state {
+            ConnectionState::Reachable => {
+                self.back_off = None;
+            }
+            ConnectionState::Unreachable(err) => {
+                let time = epoch_i64();
+                match &mut self.back_off {
+                    Some(back_off) => return back_off.retried(time, err),
+                    None => self.back_off = Some(BackOffState::new(time, err)),
+                }
+            }
+        }
+        None
+    }
+
+    pub fn is_reachable(&self) -> bool {
+        self.back_off.is_none()
+    }
 }
-- 
2.47.3