From: Dominik Csapak <d.csapak@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [PATCH datacenter-manager v2 1/4] server: remote cache: prepare for back-off mechanism
Date: Mon, 8 Jun 2026 15:25:29 +0200 [thread overview]
Message-ID: <20260608132539.2949407-2-d.csapak@proxmox.com> (raw)
In-Reply-To: <20260608132539.2949407-1-d.csapak@proxmox.com>
Replace the 'reachable' field in `HostInfo` of the `RemoteMappingCache`
with an `Option<BackOffState>` instead. `None` means it's reachable and
some state means it's not.
The back-off state itself contains information about when it was last
reachable and when it's time to try again.
The current logic of the back-off state is:
* Retry for 10 minutes before backing off
* Start with backing off for 10 seconds.
* Retrying after the current back-off time ran out adds a doubled
back-off time (20s, 40s, etc.) in case the host is still offline.
* do that until a maximum back-off time of 3 minutes is reached.
Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
.../tasks/remote_node_mapping.rs | 29 ++--
server/src/connection.rs | 21 ++-
server/src/remote_cache/back_off.rs | 128 ++++++++++++++++++
server/src/remote_cache/mod.rs | 114 ++++++++++++++--
4 files changed, 264 insertions(+), 28 deletions(-)
create mode 100644 server/src/remote_cache/back_off.rs
diff --git a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
index 9e2ebf15..867e01cf 100644
--- a/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
+++ b/server/src/bin/proxmox-datacenter-api/tasks/remote_node_mapping.rs
@@ -25,7 +25,7 @@ use proxmox_section_config::typed::SectionConfigData;
use pdm_api_types::remotes::{Remote, RemoteType};
-use server::remote_cache::{self, RemoteMappingCache};
+use server::remote_cache::{self, ConnectionState, RemoteMappingCache};
use server::task_utils;
const CONFIG_POLL_INTERVAL: u64 = 60;
@@ -188,22 +188,21 @@ impl CachingTask {
log::debug!("querying remote {:?} node {:?}", remote.id, node.hostname);
// if the host is new, we need to query its name
- let query_result = match query_node_name(remote, &node.hostname).await {
- Ok(node_name) => Some(node_name),
- Err(err) => {
- log::error!(
- "failed to query info for remote '{}' node '{}' - {err:?}",
- remote.id,
- node.hostname
- );
- None
- }
- };
+ let (query_result, connection_state) =
+ match query_node_name(remote, &node.hostname).await {
+ Ok(node_name) => (Some(node_name), ConnectionState::Reachable),
+ Err(err) => {
+ log::error!(
+ "failed to query info for remote '{}' node '{}' - {err:?}",
+ remote.id,
+ node.hostname
+ );
+ (None, ConnectionState::Unreachable(err.to_string()))
+ }
+ };
let mut cache = RemoteMappingCache::write()?;
- if let Some(info) = cache.info_by_hostname_mut(&remote.id, &node.hostname) {
- info.reachable = query_result.is_some();
- }
+ cache.mark_host_reachable(&remote.id, &node.hostname, connection_state);
if let Some(node_name) = query_result {
cache.set_node_name(&remote.id, &node.hostname, Some(node_name));
}
diff --git a/server/src/connection.rs b/server/src/connection.rs
index 7ad1a5b9..91a51cbf 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -25,6 +25,7 @@ use pdm_api_types::remotes::{NodeUrl, Remote, RemoteType, TlsProbeOutcome};
use pve_api_types::client::PveClientImpl;
use crate::pbs_client::PbsClient;
+use crate::remote_cache::ConnectionState;
static INSTANCE: OnceLock<Box<dyn ClientFactory + Send + Sync>> = OnceLock::new();
@@ -381,7 +382,7 @@ impl ClientFactory for DefaultClientFactory {
) -> Result<Arc<PveClient>, Error> {
let cache = crate::remote_cache::RemoteMappingCache::get();
match cache.info_by_node_name(&remote.id, node) {
- Some(info) if info.reachable => {
+ Some(info) if info.is_reachable() => {
self.make_pve_client_with_endpoint(remote, Some(&info.hostname))
}
_ => self.make_pve_client(remote),
@@ -550,7 +551,11 @@ impl MultiClientState {
let entry = self.get_entry();
log::error!("marking client {} as unreachable", entry.hostname);
if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
- cache.mark_host_reachable(&self.remote, &entry.hostname, false);
+ cache.mark_host_reachable(
+ &self.remote,
+ &entry.hostname,
+ ConnectionState::Unreachable("unknown error".to_string()),
+ );
let _ = cache.save();
}
self.next();
@@ -775,7 +780,11 @@ macro_rules! try_request {
log::error!("marking {hostname:?} as reachable again!");
if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write()
{
- cache.mark_host_reachable(&$self.remote, &hostname, true);
+ cache.mark_host_reachable(
+ &$self.remote,
+ &hostname,
+ ConnectionState::Reachable,
+ );
let _ = cache.save();
}
}
@@ -797,7 +806,11 @@ macro_rules! try_request {
if let Ok(result) = tokio::time::timeout($self.timeout, request).await {
if result.is_ok() {
if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
- cache.mark_host_reachable(&$self.remote, &hostname, true);
+ cache.mark_host_reachable(
+ &$self.remote,
+ &hostname,
+ ConnectionState::Reachable,
+ );
let _ = cache.save();
}
}
diff --git a/server/src/remote_cache/back_off.rs b/server/src/remote_cache/back_off.rs
new file mode 100644
index 00000000..b2eb7fcd
--- /dev/null
+++ b/server/src/remote_cache/back_off.rs
@@ -0,0 +1,128 @@
+use serde::{Deserialize, Serialize};
+
+/// The initial back-off time when the minimum offline time was reached.
+const BACK_OFF_BASE_TIME_S: i64 = 10;
+/// The maximum back-off time between retries.
+const BACK_OFF_MAX_TIME_S: i64 = 3 * 60;
+/// Minimum time a connection must fail before back-off kicks in.
+const BACK_OFF_MIN_TIME_S: i64 = 10 * 60;
+
+/// Holds the current state for backing off an offline remote
+#[derive(Clone, Deserialize, Serialize)]
+pub struct BackOffState {
+ /// How often the back-off time is doubled since the start, up to a maximum of
+ /// [BACK_OFF_MAX_TIME_S]
+ back_off_doubling_count: u32,
+ /// The last error message we got.
+ last_error: String,
+ /// The first time it failed.
+ first_fail_time: i64,
+ /// The next time a connection can be retried.
+ next_try: i64,
+}
+
+impl BackOffState {
+ /// Create a new back-off config when a host/remote is not reachable for the first time
+ pub fn new(time: i64, error: String) -> Self {
+ Self {
+ first_fail_time: time,
+ next_try: time,
+ back_off_doubling_count: 0,
+ last_error: error,
+ }
+ }
+
+ /// Increases the back-off state when the remote is still unreachable.
+ /// Returns the next timestamp when it's allowed to retry if set.
+ pub fn retried(&mut self, time: i64, error: String) -> Option<i64> {
+ self.last_error = error;
+
+ if self.time_to_next_try(time) > 0 {
+ // still in the back off period
+ return None;
+ }
+
+ if self.back_off_doubling_count == 0 && (time - self.first_fail_time) < BACK_OFF_MIN_TIME_S
+ {
+ // failing, but did not reach the minimum fail timer yet
+ return None;
+ }
+
+ // we're outside the back-off period, and later than the minimum
+ let back_off_time = (BACK_OFF_BASE_TIME_S * 2i64.pow(self.back_off_doubling_count))
+ .min(BACK_OFF_MAX_TIME_S);
+
+ if back_off_time < BACK_OFF_MAX_TIME_S {
+ self.back_off_doubling_count += 1;
+ }
+
+ self.next_try = time + back_off_time;
+ Some(self.next_try)
+ }
+
+ /// Returns the remaining time to try again in seconds.
+ /// If it is 0, we're free to try again.
+ pub fn time_to_next_try(&self, current_time: i64) -> u64 {
+ self.next_try.saturating_sub(current_time).max(0) as u64
+ }
+
+ /// Returns the last error that was set.
+ pub fn last_error(&self) -> String {
+ self.last_error.clone()
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::{BACK_OFF_MAX_TIME_S, BACK_OFF_MIN_TIME_S, BackOffState};
+
+ #[test]
+ fn test_back_off_calculation() {
+ let mut time = 0;
+ let mut back_off = BackOffState::new(time, String::new());
+ // timeout should be @ 0 seconds
+
+ assert_eq!(back_off.time_to_next_try(time + 1), 0);
+
+ time += BACK_OFF_MIN_TIME_S;
+ back_off.retried(time, String::new());
+ let mut back_off_time = 10;
+
+ assert_eq!(back_off.time_to_next_try(time + 5), 5);
+ assert_eq!(back_off.time_to_next_try(time + 10), 0);
+ assert_eq!(back_off.time_to_next_try(time + 20), 0);
+
+ back_off.retried(time + back_off_time, String::new());
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+
+ assert_eq!(back_off.time_to_next_try(time), back_off_time as u64);
+
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+ time += back_off_time;
+ back_off_time *= 2;
+ back_off.retried(time, String::new());
+ time += back_off_time;
+ back_off.retried(time, String::new());
+
+ // retried 7 times, back-off should have reached the max time by now
+ assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64);
+
+ // simulate 12 hour outage
+
+ let iterations = 12 * 60 * 60 / BACK_OFF_MAX_TIME_S;
+ for _ in 0..iterations {
+ time += BACK_OFF_MAX_TIME_S;
+ back_off.retried(time, String::new());
+ }
+
+ // timeout should still be at the maximum
+ assert_eq!(back_off.time_to_next_try(time), BACK_OFF_MAX_TIME_S as u64);
+ }
+}
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 39483e9f..b6c2f48a 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -24,10 +24,14 @@ use serde::{Deserialize, Serialize};
use proxmox_product_config::replace_config;
use proxmox_product_config::{ApiLockGuard, open_api_lockfile};
+use proxmox_time::epoch_i64;
use pdm_api_types::remotes::RemoteType;
use pdm_config::ConfigVersionCache;
+mod back_off;
+use back_off::BackOffState;
+
const CACHE_FILENAME: &str = concat!(
pdm_buildcfg::PDM_CACHE_DIR_M!(),
"/remote-mapping-cache.json"
@@ -198,16 +202,26 @@ impl RemoteMappingCache {
}
/// Mark a host as reachable.
- pub fn mark_host_reachable(&mut self, remote_name: &str, hostname: &str, reachable: bool) {
+ pub fn mark_host_reachable(
+ &mut self,
+ remote_name: &str,
+ hostname: &str,
+ connection_state: ConnectionState,
+ ) {
if let Some(info) = self.info_by_hostname_mut(remote_name, hostname) {
- info.reachable = reachable;
+ info.set_reachable(connection_state);
}
}
/// Mark a host as reachable.
- pub fn mark_node_reachable(&mut self, remote_name: &str, node_name: &str, reachable: bool) {
+ pub fn mark_node_reachable(
+ &mut self,
+ remote_name: &str,
+ node_name: &str,
+ connection_state: ConnectionState,
+ ) {
if let Some(info) = self.info_by_node_name_mut(remote_name, node_name) {
- info.reachable = reachable;
+ info.set_reachable(connection_state);
}
}
@@ -222,10 +236,70 @@ impl RemoteMappingCache {
/// Check if a host is reachable.
pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool {
self.info_by_hostname(remote, hostname)
- .is_none_or(|info| info.reachable)
+ .is_none_or(|info| info.is_reachable())
+ }
+
+ /// Get the next time to try the host and the last error if it was not reachable.
+ pub fn host_time_to_next_try(
+ &self,
+ remote: &str,
+ hostname: &str,
+ current_time: i64,
+ ) -> Option<(u64, String)> {
+ self.info_by_hostname(remote, hostname)
+ .and_then(|info| info.back_off.as_ref())
+ .map(|back_off| {
+ (
+ back_off.time_to_next_try(current_time),
+ back_off.last_error(),
+ )
+ })
+ }
+
+ /// Returns the remaining back-off time and optionally the last error we got.
+ pub fn remote_time_to_next_try(
+ &self,
+ remote: &str,
+ current_time: i64,
+ ) -> Option<(u64, String)> {
+ match self.remotes.get(remote) {
+ Some(remote) => {
+ let mut time = u64::MAX;
+ let mut err = String::new();
+ for info in remote.hosts.values() {
+ if let Some(back_off) = &info.back_off {
+ let node_time = back_off.time_to_next_try(current_time);
+ // use the least time from the hosts
+ if node_time < time {
+ time = node_time;
+ err = back_off.last_error();
+ }
+ } else {
+ // we found a node that is reachable, return immediately
+ return None;
+ }
+ }
+
+ if time == u64::MAX {
+ // we had no node information so we're allowed to try
+ return None;
+ }
+
+ Some((time, err))
+ }
+ None => None, // no info about remote, are we allowed to try
+ }
}
}
+/// If a remote is reachable or not
+pub enum ConnectionState {
+ /// The host/remote/etc. is reachable
+ Reachable,
+ /// The remote/host/etc. is not reachable. Contains the error.
+ Unreachable(String),
+}
+
/// An entry for a remote in a [`RemoteMappingCache`].
#[derive(Clone, Deserialize, Serialize)]
pub struct RemoteMapping {
@@ -271,9 +345,9 @@ pub struct HostInfo {
/// This is the cluster side node name, if we know it.
node_name: Option<String>,
- /// This means we were able to reach the node.
- /// When a client fails to connect it may update this to mark it as unreachable.
- pub reachable: bool,
+ /// Per host back off config
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ back_off: Option<BackOffState>,
}
impl HostInfo {
@@ -281,11 +355,33 @@ impl HostInfo {
Self {
hostname,
node_name: None,
- reachable: true,
+ back_off: None,
}
}
pub fn node_name(&self) -> Option<&str> {
self.node_name.as_deref()
}
+
+ /// Sets the host's reachable status.
+ /// Returns the next timestamp when it's allowed to retry if set.
+ pub fn set_reachable(&mut self, connection_state: ConnectionState) -> Option<i64> {
+ match connection_state {
+ ConnectionState::Reachable => {
+ self.back_off = None;
+ }
+ ConnectionState::Unreachable(err) => {
+ let time = epoch_i64();
+ match &mut self.back_off {
+ Some(back_off) => return back_off.retried(time, err),
+ None => self.back_off = Some(BackOffState::new(time, err)),
+ }
+ }
+ }
+ None
+ }
+
+ pub fn is_reachable(&self) -> bool {
+ self.back_off.is_none()
+ }
}
--
2.47.3
next prev parent reply other threads:[~2026-06-08 13:26 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-08 13:25 [PATCH datacenter-manager v2 0/4] implement back-off mechanism for connection errors for remotes Dominik Csapak
2026-06-08 13:25 ` Dominik Csapak [this message]
2026-06-08 13:25 ` [PATCH datacenter-manager v2 2/4] server: remote cache: introduce canary remote when none is reachable Dominik Csapak
2026-06-08 13:25 ` [PATCH datacenter-manager v2 3/4] server: connection: multi-client: use back-off state from remote cache Dominik Csapak
2026-06-08 13:25 ` [PATCH datacenter-manager v2 4/4] tasks: remote node mapping: use host cache for PBS too Dominik Csapak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260608132539.2949407-2-d.csapak@proxmox.com \
--to=d.csapak@proxmox.com \
--cc=pdm-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox