From: Dominik Csapak <d.csapak@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [PATCH datacenter-manager v2 1/4] server: remote cache: prepare for back-off mechanism
Date: Mon, 8 Jun 2026 15:25:29 +0200 [thread overview]
Message-ID: <20260608132539.2949407-2-d.csapak@proxmox.com> (raw)
In-Reply-To: <20260608132539.2949407-1-d.csapak@proxmox.com>
Replace the `reachable` field in `HostInfo` of the `RemoteMappingCache`
with an `Option<BackOffState>`. `None` means the host is reachable and
`Some(state)` means it's not.
The back-off state itself contains the last error, information about when
the host failed for the first time and when it's time to try again.
The current logic of the back-off state is:
* Retry for 10 minutes before backing off
* Start with backing off for 10 seconds.
* If the host is still offline when retrying after the current back-off
time ran out, the back-off time is doubled (20s, 40s, etc.).
* Do that until a maximum back-off time of 3 minutes is reached.
Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
.../tasks/remote_node_mapping.rs | 29 ++--
server/src/connection.rs | 21 ++-
server/src/remote_cache/back_off.rs | 128 ++++++++++++++++++
server/src/remote_cache/mod.rs | 114 ++++++++++++++--
4 files changed, 264 insertions(+), 28 deletions(-)
create mode 100644 server/src/remote_cache/back_off.rs
diff --git a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
index 9e2ebf15..867e01cf 100644
--- a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
+++ b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
@@ -25,7 +25,7 @@ use proxmox_section_config::typed::SectionConfigData;
use pdm_api_types::remotes::{Remote, RemoteType};
-use server::remote_cache::{self, RemoteMappingCache};
+use server::remote_cache::{self, ConnectionState, RemoteMappingCache};
use server::task_utils;
const CONFIG_POLL_INTERVAL: u64 = 60;
@@ -188,22 +188,21 @@ impl CachingTask {
log::debug!("querying remote {:?} node {:?}", remote.id, node.hostname);
// if the host is new, we need to query its name
- let query_result = match query_node_name(remote, &node.hostname).await {
- Ok(node_name) => Some(node_name),
- Err(err) => {
- log::error!(
- "failed to query info for remote '{}' node '{}' - {err:?}",
- remote.id,
- node.hostname
- );
- None
- }
- };
+ let (query_result, connection_state) =
+ match query_node_name(remote, &node.hostname).await {
+ Ok(node_name) => (Some(node_name), ConnectionState::Reachable),
+ Err(err) => {
+ log::error!(
+ "failed to query info for remote '{}' node '{}' - {err:?}",
+ remote.id,
+ node.hostname
+ );
+ (None, ConnectionState::Unreachable(err.to_string()))
+ }
+ };
let mut cache = RemoteMappingCache::write()?;
- if let Some(info) = cache.info_by_hostname_mut(&remote.id, &node.hostname) {
- info.reachable = query_result.is_some();
- }
+ cache.mark_host_reachable(&remote.id, &node.hostname, connection_state);
if let Some(node_name) = query_result {
cache.set_node_name(&remote.id, &node.hostname, Some(node_name));
}
diff --git a/server/src/connection.rs b/server/src/connection.rs
index 7ad1a5b9..91a51cbf 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -25,6 +25,7 @@ use pdm_api_types::remotes::{NodeUrl, Remote, RemoteType, TlsProbeOutcome};
use pve_api_types::client::PveClientImpl;
use crate::pbs_client::PbsClient;
+use crate::remote_cache::ConnectionState;
static INSTANCE: OnceLock<Box<dyn ClientFactory + Send + Sync>> = OnceLock::new();
@@ -381,7 +382,7 @@ impl ClientFactory for DefaultClientFactory {
) -> Result<Arc<PveClient>, Error> {
let cache = crate::remote_cache::RemoteMappingCache::get();
match cache.info_by_node_name(&remote.id, node) {
- Some(info) if info.reachable => {
+ Some(info) if info.is_reachable() => {
self.make_pve_client_with_endpoint(remote, Some(&info.hostname))
}
_ => self.make_pve_client(remote),
@@ -550,7 +551,11 @@ impl MultiClientState {
let entry = self.get_entry();
log::error!("marking client {} as unreachable", entry.hostname);
if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
- cache.mark_host_reachable(&self.remote, &entry.hostname, false);
+ cache.mark_host_reachable(
+ &self.remote,
+ &entry.hostname,
+ ConnectionState::Unreachable("unknown error".to_string()),
+ );
let _ = cache.save();
}
self.next();
@@ -775,7 +780,11 @@ macro_rules! try_request {
log::error!("marking {hostname:?} as reachable again!");
if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write()
{
- cache.mark_host_reachable(&$self.remote, &hostname, true);
+ cache.mark_host_reachable(
+ &$self.remote,
+ &hostname,
+ ConnectionState::Reachable,
+ );
let _ = cache.save();
}
}
@@ -797,7 +806,11 @@ macro_rules! try_request {
if let Ok(result) = tokio::time::timeout($self.timeout, request).await {
if result.is_ok() {
if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
- cache.mark_host_reachable(&$self.remote, &hostname, true);
+ cache.mark_host_reachable(
+ &$self.remote,
+ &hostname,
+ ConnectionState::Reachable,
+ );
let _ = cache.save();
}
}
diff --git a/server/src/remote_cache/back_off.rs b/server/src/remote_cache/back_off.rs
new file mode 100644
index 00000000..b2eb7fcd
--- /dev/null
+++ b/server/src/remote_cache/back_off.rs
@@ -0,0 +1,128 @@
+use serde::{Deserialize, Serialize};
+
+/// The initial back-off time when the minimum offline time was reached.
+const BACK_OFF_BASE_TIME_S: i64 = 10;
+/// The maximum back-off time between retries.
+const BACK_OFF_MAX_TIME_S: i64 = 3 * 60;
+/// Minimum time a connection must fail before back-off kicks in.
+const BACK_OFF_MIN_TIME_S: i64 = 10 * 60;
+
+/// Holds the current state for backing off an offline remote or host.
+#[derive(Clone, Deserialize, Serialize)]
+pub struct BackOffState {
+ /// How often the back-off time was doubled so far. It is no longer increased once the
+ /// resulting back-off time has reached [`BACK_OFF_MAX_TIME_S`].
+ back_off_doubling_count: u32,
+ /// The last error message we got.
+ last_error: String,
+ /// Timestamp of the first failure, as passed to `new`.
+ first_fail_time: i64,
+ /// Timestamp from which on a connection may be retried.
+ next_try: i64,
+}
+
+impl BackOffState {
+ /// Create a new back-off state when a host/remote is not reachable for the first time.
+ pub fn new(time: i64, error: String) -> Self {
+ Self {
+ first_fail_time: time,
+ next_try: time,
+ back_off_doubling_count: 0,
+ last_error: error,
+ }
+ }
+
+ /// Records another failed attempt and starts the next back-off period once one is due.
+ /// Returns the new retry timestamp if a back-off period was started, `None` otherwise.
+ pub fn retried(&mut self, time: i64, error: String) -> Option<i64> {
+ self.last_error = error;
+
+ if self.time_to_next_try(time) > 0 {
+ // still inside the current back-off period, keep it as it is
+ return None;
+ }
+
+ if self.back_off_doubling_count == 0 && (time - self.first_fail_time) < BACK_OFF_MIN_TIME_S
+ {
+ // still failing, but the minimum failure time was not reached yet: keep retrying
+ return None;
+ }
+
+ // no back-off period is active and the minimum failure time has passed: back off again
+ let back_off_time = (BACK_OFF_BASE_TIME_S * 2i64.pow(self.back_off_doubling_count))
+ .min(BACK_OFF_MAX_TIME_S);
+
+ if back_off_time < BACK_OFF_MAX_TIME_S {
+ self.back_off_doubling_count += 1;
+ }
+
+ self.next_try = time + back_off_time;
+ Some(self.next_try)
+ }
+
+ /// Returns the remaining time until `next_try`, relative to `current_time`.
+ /// If it is 0, we're free to try again.
+ pub fn time_to_next_try(&self, current_time: i64) -> u64 {
+ self.next_try.saturating_sub(current_time).max(0) as u64
+ }
+
+ /// Returns a copy of the last error that was set.
+ pub fn last_error(&self) -> String {
+ self.last_error.clone()
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::{BACK_OFF_MAX_TIME_S, BACK_OFF_MIN_TIME_S, BackOffState};
+
+ #[test]
+ fn test_back_off_calculation() {
+ let mut time = 0;
+ let mut back_off = BackOffState::new(time, String::new());
+ // timeout should be @ 0 seconds
+
+ assert_eq!(back_off.time_to_next_try(time + 1), 0);
+
+ time += BACK_OFF_MIN_TIME_S;
+ back_off.retried(time, String::new());
+ let mut back_off_time = 10;
+
+ assert_eq!(back_off.time_to_next_try(time + 5), 5);
+ assert_eq!(back_off.time_to_next_try(time + 10), 0);
+ assert_eq!(back_off.time_to_next_try(time + 20), 0);
+
+ back_off.retried(time + back_off_time, String::new());
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+
+ assert_eq!(back_off.time_to_next_try(time), back_off_time as u64);
+
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+ time += back_off_time;
+ back_off.retried(time, String::new());
+
+ // retried 7 times, back-off should have reached the max time by now
+ assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64);
+
+ // simulate 12 hour outage
+
+ let iterations = 12 * 60 * 60 / BACK_OFF_MAX_TIME_S;
+ for _ in 0..iterations {
+ time += BACK_OFF_MAX_TIME_S;
+ back_off.retried(time, String::new());
+ }
+
+ // timeout should still be at the maximum
+ assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64);
+ }
+}
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 39483e9f..b6c2f48a 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -24,10 +24,14 @@ use serde::{Deserialize, Serialize};
use proxmox_product_config::replace_config;
use proxmox_product_config::{ApiLockGuard, open_api_lockfile};
+use proxmox_time::epoch_i64;
use pdm_api_types::remotes::RemoteType;
use pdm_config::ConfigVersionCache;
+mod back_off;
+use back_off::BackOffState;
+
const CACHE_FILENAME: &str = concat!(
pdm_buildcfg::PDM_CACHE_DIR_M!(),
"/remote-mapping-cache.json"
@@ -198,16 +202,26 @@ impl RemoteMappingCache {
}
/// Mark a host as reachable.
- pub fn mark_host_reachable(&mut self, remote_name: &str, hostname: &str, reachable: bool) {
+ pub fn mark_host_reachable(
+ &mut self,
+ remote_name: &str,
+ hostname: &str,
+ connection_state: ConnectionState,
+ ) {
if let Some(info) = self.info_by_hostname_mut(remote_name, hostname) {
- info.reachable = reachable;
+ info.set_reachable(connection_state);
}
}
/// Mark a host as reachable.
- pub fn mark_node_reachable(&mut self, remote_name: &str, node_name: &str, reachable: bool) {
+ pub fn mark_node_reachable(
+ &mut self,
+ remote_name: &str,
+ node_name: &str,
+ connection_state: ConnectionState,
+ ) {
if let Some(info) = self.info_by_node_name_mut(remote_name, node_name) {
- info.reachable = reachable;
+ info.set_reachable(connection_state);
}
}
@@ -222,10 +236,70 @@ impl RemoteMappingCache {
/// Check if a host is reachable.
pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool {
self.info_by_hostname(remote, hostname)
- .is_none_or(|info| info.reachable)
+ .is_none_or(|info| info.is_reachable())
+ }
+
+ /// Get the remaining wait time and the last error of a host that is marked unreachable.
+ pub fn host_time_to_next_try(
+ &self,
+ remote: &str,
+ hostname: &str,
+ current_time: i64,
+ ) -> Option<(u64, String)> {
+ self.info_by_hostname(remote, hostname)
+ .and_then(|info| info.back_off.as_ref())
+ .map(|back_off| {
+ (
+ back_off.time_to_next_try(current_time),
+ back_off.last_error(),
+ )
+ })
+ }
+
+ /// Returns the least remaining back-off time of the remote's hosts and the matching error.
+ pub fn remote_time_to_next_try(
+ &self,
+ remote: &str,
+ current_time: i64,
+ ) -> Option<(u64, String)> {
+ match self.remotes.get(remote) {
+ Some(remote) => {
+ let mut time = u64::MAX;
+ let mut err = String::new();
+ for info in remote.hosts.values() {
+ if let Some(back_off) = &info.back_off {
+ let node_time = back_off.time_to_next_try(current_time);
+ // use the least time from the hosts, together with that host's error
+ if node_time < time {
+ time = node_time;
+ err = back_off.last_error();
+ }
+ } else {
+ // we found a node that is reachable, return immediately
+ return None;
+ }
+ }
+
+ if time == u64::MAX {
+ // we had no node information so we're allowed to try
+ return None;
+ }
+
+ Some((time, err))
+ }
+ None => None, // no info about remote, so we are allowed to try
+ }
}
}
+/// Whether a host/remote was reachable or not.
+pub enum ConnectionState {
+ /// The host/remote/etc. is reachable.
+ Reachable,
+ /// The remote/host/etc. is not reachable. Contains the error message.
+ Unreachable(String),
+}
+
/// An entry for a remote in a [`RemoteMappingCache`].
#[derive(Clone, Deserialize, Serialize)]
pub struct RemoteMapping {
@@ -271,9 +345,9 @@ pub struct HostInfo {
/// This is the cluster side node name, if we know it.
node_name: Option<String>,
- /// This means we were able to reach the node.
- /// When a client fails to connect it may update this to mark it as unreachable.
- pub reachable: bool,
+ /// Per host back off config
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ back_off: Option<BackOffState>,
}
impl HostInfo {
@@ -281,11 +355,33 @@ impl HostInfo {
Self {
hostname,
node_name: None,
- reachable: true,
+ back_off: None,
}
}
pub fn node_name(&self) -> Option<&str> {
self.node_name.as_deref()
}
+
+ /// Updates the host's back-off state according to the given connection state.
+ /// Returns the new retry timestamp if a back-off period was started for the host.
+ pub fn set_reachable(&mut self, connection_state: ConnectionState) -> Option<i64> {
+ match connection_state {
+ ConnectionState::Reachable => {
+ self.back_off = None;
+ }
+ ConnectionState::Unreachable(err) => {
+ let time = epoch_i64();
+ match &mut self.back_off {
+ Some(back_off) => return back_off.retried(time, err),
+ None => self.back_off = Some(BackOffState::new(time, err)),
+ }
+ }
+ }
+ None
+ }
+
+ pub fn is_reachable(&self) -> bool {
+ self.back_off.is_none()
+ }
}
--
2.47.3
next prev parent reply other threads:[~2026-06-08 13:26 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-08 13:25 [PATCH datacenter-manager v2 0/4] implement back-off mechanism for connection errors for remotes Dominik Csapak
2026-06-08 13:25 ` Dominik Csapak [this message]
2026-06-08 13:25 ` [PATCH datacenter-manager v2 2/4] server: remote cache: introduce canary remote when none is reachable Dominik Csapak
2026-06-08 13:25 ` [PATCH datacenter-manager v2 3/4] server: connection: multi-client: use back-off state from remote cache Dominik Csapak
2026-06-08 13:25 ` [PATCH datacenter-manager v2 4/4] tasks: remote node mapping: use host cache for PBS too Dominik Csapak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260608132539.2949407-2-d.csapak@proxmox.com \
--to=d.csapak@proxmox.com \
--cc=pdm-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.