From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pdm-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
	by lore.proxmox.com (Postfix) with ESMTPS id 3CD1A1FF13B
	for <inbox@lore.proxmox.com>; Mon, 08 Jun 2026 15:26:17 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id 872881DA4A;
	Mon,  8 Jun 2026 15:26:15 +0200 (CEST)
From: Dominik Csapak <d.csapak@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [PATCH datacenter-manager v2 3/4] server: connection: multi-client:
 use back-off state from remote cache
Date: Mon,  8 Jun 2026 15:25:31 +0200
Message-ID: <20260608132539.2949407-4-d.csapak@proxmox.com>
X-Mailer: git-send-email 2.47.3
In-Reply-To: <20260608132539.2949407-1-d.csapak@proxmox.com>
References: <20260608132539.2949407-1-d.csapak@proxmox.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
	AWL                     0.049 Adjusted score from AWL reputation of From:
 address
	BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
	DMARC_MISSING             0.1 Missing DMARC policy
	KAM_DMARC_STATUS         0.01 Test Rule for DKIM or SPF Failure with Strict
 Alignment
	SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
	SPF_PASS               -0.001 SPF: sender matches SPF record
	URIBL_BLOCKED           0.001 ADMINISTRATOR NOTICE: The query to URIBL was
 blocked.  See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block
 for more information. [mod.rs,connection.rs]
Message-ID-Hash: YMY2SXGIV237L57WLRFT5Y645G3GNPUU
X-Message-ID-Hash: YMY2SXGIV237L57WLRFT5Y645G3GNPUU
X-MailFrom: d.csapak@proxmox.com
X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop;
 banned-address; emergency; member-moderation; nonmember-moderation;
 administrivia; implicit-dest; max-recipients; max-size; news-moderation;
 no-subject; digests; suspicious-header
X-Mailman-Version: 3.3.10
Precedence: list
List-Id: Proxmox Datacenter Manager development discussion
 <pdm-devel.lists.proxmox.com>
List-Help: <mailto:pdm-devel-request@lists.proxmox.com?subject=help>
List-Owner: <mailto:pdm-devel-owner@lists.proxmox.com>
List-Post: <mailto:pdm-devel@lists.proxmox.com>
List-Subscribe: <mailto:pdm-devel-join@lists.proxmox.com>
List-Unsubscribe: <mailto:pdm-devel-leave@lists.proxmox.com>

This greatly reduces the wait time for api calls if a specific remote is
offline for a prolonged period of time. Instead of waiting for a timeout
on every api call to a remote which is known to be offline, return the
last error immediately.

Most notable change here is that the retry loop itself marks the hosts
as unreachable now instead of the iterator. With that, the 'failed()'
method of the `MultiClientState` was unused and can be removed. Adapt
all comments that noted this or mentioned the `failed()` method.

The unreachable clients are still skipped in the iterator.

If a host returns an error on an idempotent API call, we mark the host
as unreachable in case this was a transient error. Any other API call to
that host in the next 10 minutes that succeeds will reset the back-off
state again, so this should be fine.

When backing off, log it with a helpful error message.

Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
 server/src/connection.rs       | 107 ++++++++++++++++++++++-----------
 server/src/remote_cache/mod.rs |  20 +++++-
 2 files changed, 89 insertions(+), 38 deletions(-)

diff --git a/server/src/connection.rs b/server/src/connection.rs
index 91a51cbf..cf0cfb8a 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -20,6 +20,7 @@ use serde::Serialize;
 
 use proxmox_acme_api::CertificateInfo;
 use proxmox_client::{Client, HttpApiClient, HttpApiResponse, HttpApiResponseStream, TlsOptions};
+use proxmox_time::epoch_i64;
 
 use pdm_api_types::remotes::{NodeUrl, Remote, RemoteType, TlsProbeOutcome};
 use pve_api_types::client::PveClientImpl;
@@ -540,29 +541,6 @@ impl MultiClientState {
         self.current = self.current.wrapping_add(1);
     }
 
-    /// Whenever a request fails with the *current* client we move the current entry forward.
-    ///
-    /// # Note:
-    ///
-    /// With our current strategy `failed_index` is always less than `current`, but if we change
-    /// the strategy, we may want to change this to something like `1 + max(current, failed)`.
-    fn failed(&mut self, failed_index: usize) {
-        if self.current == failed_index {
-            let entry = self.get_entry();
-            log::error!("marking client {} as unreachable", entry.hostname);
-            if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
-                cache.mark_host_reachable(
-                    &self.remote,
-                    &entry.hostname,
-                    ConnectionState::Unreachable("unknown error".to_string()),
-                );
-                let _ = cache.save();
-            }
-            self.next();
-            self.skip_unreachable();
-        }
-    }
-
     /// Skip ahead as long as we're pointing to an unreachable.
     fn skip_unreachable(&mut self) {
         let cache = crate::remote_cache::RemoteMappingCache::get();
@@ -588,8 +566,7 @@ impl MultiClientState {
         &self.entries[self.index()]
     }
 
-    /// Get the current entry and its index which can be passed to `failed()` if the client fails
-    /// to connect.
+    /// Get the current entry and its index.
     fn get(&self) -> (&MultiClientEntry, usize) {
         let index = self.index();
         (&self.entries[index], self.current)
@@ -656,8 +633,8 @@ impl MultiClient {
     ///
     /// This is basically a "generator" for clients to try.
     ///
-    /// We share the "state" with other tasks. When a client fails, it is "marked" as failed and
-    /// the state "rotates" through the clients.
+    /// We share the "state" with other tasks. When a client fails, it should be "marked" as failed
+    /// so that the state "rotates" through the clients.
     /// We might be skipping clients if other tasks already tried "more" clients, but that's fine,
     /// since there's no point in trying the same remote twice simultaneously if it is currently
     /// offline...
@@ -688,10 +665,14 @@ impl MultiClient {
                     Some(TryClient::reachable(client))
                 }
                 Some((start, current)) => {
-                    // If our last request failed, the retry-loop asks for another client, mark the
-                    // one we just used as failed and check if all clients have gone through a
-                    // retry loop...
-                    state.failed(current);
+                    // If our last request failed, the retry-loop asks for another client, it marked
+                    // the one we just used as failed and we check if all clients have gone through
+                    // a retry loop...
+                    if state.current == current {
+                        state.next();
+                        state.skip_unreachable();
+                    }
+
                     if state.tried_all_since(start) {
                         // This iterator (and therefore this retry-loop) has tried all clients.
                         // Give up.
@@ -701,8 +682,7 @@ impl MultiClient {
                             state.get_at(try_unreachable.as_mut()?.next()?),
                         ));
                     }
-                    // finally just get the new current client and update `current` for the later
-                    // call to `failed()`
+                    // finally just get the new current client and update `current`
                     let (client, new_current) = state.get();
                     start_current = Some((start, new_current));
 
@@ -729,6 +709,23 @@ macro_rules! try_request {
     ($self:expr, $method:expr, $path_and_query:expr, $params:expr, $how:ident) => {
         let params = $params.map(serde_json::to_value);
         Box::pin(async move {
+            // check if we're still in the back-off time frame of the remote
+            {
+                let cache = crate::remote_cache::RemoteMappingCache::get();
+                if let Some((back_off_time, last_error)) =
+                    cache.remote_time_to_next_try(&$self.remote, epoch_i64())
+                {
+                    if back_off_time > 0 {
+                        log::debug!(
+                            "wanted to retry '{}' but still {}s until retry time",
+                            $self.remote,
+                            back_off_time
+                        );
+                        return Err(proxmox_client::Error::Connect(last_error.into()));
+                    }
+                }
+            }
+
             let params = params
                 .transpose()
                 .map_err(|err| proxmox_client::Error::Anyhow(err.into()))?;
@@ -738,14 +735,31 @@ macro_rules! try_request {
             // remember the first endpoint to retry once at the end if it only failed to connect
             let mut connect_retry = None;
             let mut first = true;
-            // The iterator in use here will automatically mark a client as faulty if we move on to
-            // the `next()` one.
+            // If a call fails here, we have to mark the host as faulty.
             for TryClient {
                 client,
                 hostname,
                 reachable,
             } in $self.try_clients()
             {
+                // check if we're still in the back-off time frame of the host
+                {
+                    let cache = crate::remote_cache::RemoteMappingCache::get();
+                    if let Some((back_off_time, _)) =
+                        cache.host_time_to_next_try(&$self.remote, &hostname, epoch_i64())
+                    {
+                        if back_off_time > 0 {
+                            log::debug!(
+                                "wanted to retry '{}' - '{}', but still {}s until retry time",
+                                $self.remote,
+                                hostname,
+                                back_off_time
+                            );
+                            continue;
+                        }
+                    }
+                }
+
                 if let Some(err) = last_err.take() {
                     let path = $path_and_query;
                     log::error!("client error on request {path}, trying another remote - {err:?}");
@@ -795,6 +809,29 @@ macro_rules! try_request {
                     }
                 }
                 first = false;
+
+                // if this client was not able to reach the remote, mark the current hostname as
+                // unreachable
+                let err = match (timed_out, last_err.as_ref()) {
+                    (true, _) => Some("request timed out".to_string()),
+                    (false, Some(err)) => Some(err.to_string()),
+                    (false, None) => None,
+                };
+                if let Some(err) = err {
+                    if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
+                        log::debug!(
+                            "could not reach host '{}' on remote '{}', marking as unreachable",
+                            hostname,
+                            $self.remote
+                        );
+                        cache.mark_host_reachable(
+                            &$self.remote,
+                            &hostname,
+                            ConnectionState::Unreachable(err),
+                        );
+                        let _ = cache.save();
+                    }
+                }
             }
 
             // best-effort: give the first endpoint one more try after all others, but only on a
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 3263705f..270118a9 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
 
 use proxmox_product_config::replace_config;
 use proxmox_product_config::{ApiLockGuard, open_api_lockfile};
-use proxmox_time::epoch_i64;
+use proxmox_time::{epoch_i64, epoch_to_rfc2822};
 
 use pdm_api_types::remotes::RemoteType;
 use pdm_config::ConfigVersionCache;
@@ -238,7 +238,14 @@ impl RemoteMappingCache {
         let unreachable = matches!(&connection_state, ConnectionState::Unreachable(_));
 
         if let Some(info) = self.info_by_hostname_mut(remote_name, hostname) {
-            info.set_reachable(connection_state);
+            if let Some(next_try) = info.set_reachable(connection_state) {
+                log::warn!(
+                    "could not reach host {} of {} for some time, backing off until {}",
+                    hostname,
+                    remote_name,
+                    epoch_to_rfc2822(next_try).unwrap_or_else(|_| next_try.to_string())
+                );
+            }
         }
 
         self.set_or_reset_canary(remote_name, unreachable);
@@ -254,7 +261,14 @@ impl RemoteMappingCache {
         let unreachable = matches!(&connection_state, ConnectionState::Unreachable(_));
 
         if let Some(info) = self.info_by_node_name_mut(remote_name, node_name) {
-            info.set_reachable(connection_state);
+            if let Some(next_try) = info.set_reachable(connection_state) {
+                log::warn!(
+                    "could not reach node {} of {} for some time, backing off until {}",
+                    node_name,
+                    remote_name,
+                    epoch_to_rfc2822(next_try).unwrap_or_else(|_| next_try.to_string())
+                );
+            }
         }
 
         self.set_or_reset_canary(remote_name, unreachable);
-- 
2.47.3