From: Wolfgang Bumiller <w.bumiller@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [pdm-devel] [PATCH v2 datacenter-manager 5/7] server: don't try to connect to known-unreachable servers
Date: Wed, 5 Mar 2025 16:01:06 +0100
Message-ID: <20250305150108.245584-6-w.bumiller@proxmox.com>
In-Reply-To: <20250305150108.245584-1-w.bumiller@proxmox.com>
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Reviewed-by: Lukas Wagner <l.wagner@proxmox.com>
---
No changes since v1.
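
Illustration for reviewers (not part of the patch; every name below is made up):
a minimal, self-contained sketch of the selection logic the diff introduces,
with a plain HashMap standing in for the real RemoteMappingCache. It models the
unwrapped `current` counter, skipping over hosts cached as unreachable, and
marking a host unreachable when a request against it fails.

// Sketch only -- simplified model of MultiClientState; `reachable` stands in
// for the RemoteMappingCache and all names here are hypothetical.
use std::collections::HashMap;

struct State {
    current: usize,                    // never reduced modulo len, only wrapped on overflow
    hosts: Vec<String>,                // stands in for the MultiClientEntry hostnames
    reachable: HashMap<String, bool>,  // cached reachability, keyed by hostname
}

impl State {
    fn index(&self) -> usize {
        self.current % self.hosts.len()
    }

    fn is_reachable(&self, host: &str) -> bool {
        // a host without a cache entry counts as reachable
        self.reachable.get(host).copied().unwrap_or(true)
    }

    // Advance past entries cached as unreachable, at most once around the ring.
    fn skip_unreachable(&mut self) {
        for _ in 0..self.hosts.len() {
            let host = self.hosts[self.index()].clone();
            if self.is_reachable(&host) {
                return;
            }
            self.current = self.current.wrapping_add(1);
        }
    }

    // A request via the client at `failed_index` failed: mark the host
    // unreachable and move on -- unless another task already advanced `current`.
    fn failed(&mut self, failed_index: usize) {
        if self.current == failed_index {
            let host = self.hosts[self.index()].clone();
            self.reachable.insert(host, false);
            self.current = self.current.wrapping_add(1);
            self.skip_unreachable();
        }
    }
}

fn main() {
    let mut state = State {
        current: 0,
        hosts: vec!["node1".into(), "node2".into(), "node3".into()],
        reachable: HashMap::from([("node2".to_string(), false)]),
    };
    state.skip_unreachable();     // node1 is reachable, so we stay on it
    let start = state.current;
    state.failed(start);          // node1 fails: marked unreachable, node2 skipped
    assert_eq!(state.index(), 2); // we end up on node3
}
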
server/src/connection.rs | 93 +++++++++++++++++++++++++---------
server/src/remote_cache/mod.rs | 6 +++
2 files changed, 74 insertions(+), 25 deletions(-)
diff --git a/server/src/connection.rs b/server/src/connection.rs
index 3cfefba..b9c0e16 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -130,14 +130,17 @@ fn prepare_connect_multi_client(remote: &Remote) -> Result<(MultiClient, Connect
let mut clients = Vec::new();
for node in &remote.nodes {
- clients.push(Arc::new(prepare_connect_client_to_node(
- node,
- info.default_port,
- info.pve_compat,
- )?));
+ clients.push(MultiClientEntry {
+ client: Arc::new(prepare_connect_client_to_node(
+ node,
+ info.default_port,
+ info.pve_compat,
+ )?),
+ hostname: node.hostname.clone(),
+ });
}
- Ok((MultiClient::new(clients), info))
+ Ok((MultiClient::new(remote.id.clone(), clients), info))
}
/// Like [`connect()`], but with failover support for remotes which can have multiple nodes.
@@ -449,6 +452,14 @@ pub fn init(instance: Box<dyn ClientFactory + Send + Sync>) {
}
}
+/// In order to allow the [`MultiClient`] to check the cached reachability state of a client, we
+/// need to know which remote it belongs to, so store the metadata alongside the actual `Client`
+/// struct.
+struct MultiClientEntry {
+ client: Arc<Client>,
+ hostname: String,
+}
+
/// This is another wrapper around the actual HTTP client responsible for dealing with connection
/// problems: if we cannot reach a node of a cluster, this will attempt to retry a request on
/// another node.
@@ -456,19 +467,15 @@ pub fn init(instance: Box<dyn ClientFactory + Send + Sync>) {
/// # Possible improvements
///
/// - For `GET` requests we could also start a 2nd request after a shorter time out (eg. 10s).
-/// - We could use RRD data for a "best guess" where to start eg. if we know a node was offline on
-/// the last rrd polling we'd start with a different one.
-/// For this, we still need to include the node names in the clients here somehow so that we can
-/// actually manage/re-shuffle them from the outside after this struct is already created.
struct MultiClient {
state: StdMutex<MultiClientState>,
timeout: Duration,
}
impl MultiClient {
- fn new(clients: Vec<Arc<Client>>) -> Self {
+ fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
Self {
- state: StdMutex::new(MultiClientState::new(clients)),
+ state: StdMutex::new(MultiClientState::new(remote, entries)),
timeout: Duration::from_secs(60),
}
}
@@ -477,8 +484,8 @@ impl MultiClient {
where
F: Fn(&Arc<Client>),
{
- for client in &self.state.lock().unwrap().clients {
- func(client);
+ for entry in &self.state.lock().unwrap().entries {
+ func(&entry.client);
}
}
}
@@ -488,15 +495,24 @@ impl MultiClient {
struct MultiClientState {
/// The current index *not* modulo the client count.
current: usize,
- clients: Vec<Arc<Client>>,
+ remote: String,
+ entries: Vec<MultiClientEntry>,
}
impl MultiClientState {
- fn new(clients: Vec<Arc<Client>>) -> Self {
- Self {
+ fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
+ let mut this = Self {
current: 0,
- clients,
- }
+ remote,
+ entries,
+ };
+ this.skip_unreachable();
+ this
+ }
+
+ /// Moving to the next entry must wrap.
+ fn next(&mut self) {
+ self.current = self.current.wrapping_add(1);
}
/// Whenever a request fails with the *current* client we move the current entry forward.
@@ -507,20 +523,47 @@ impl MultiClientState {
/// the strategy, we may want to change this to something like `1 + max(current, failed)`.
fn failed(&mut self, failed_index: usize) {
if self.current == failed_index {
- self.current = self.current.wrapping_add(1);
+ let entry = self.get_entry();
+ log::error!("marking client {} as unreachable", entry.hostname);
+ if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
+ cache.mark_host_reachable(&self.remote, &entry.hostname, false);
+ let _ = cache.save();
+ }
+ self.next();
+ self.skip_unreachable();
+ }
+ }
+
+ /// Skip ahead as long as we're pointing to an unreachable client.
+ fn skip_unreachable(&mut self) {
+ let cache = crate::remote_cache::RemoteMappingCache::get();
+ // loop at most as many times as we have entries...
+ for _ in 0..self.entries.len() {
+ let entry = self.get_entry();
+ if !cache.host_is_reachable(&self.remote, &entry.hostname) {
+ log::error!("skipping host {} - marked unreachable", entry.hostname);
+ self.next();
+ } else {
+ return;
+ }
}
}
- /// Get `current` as an *index* (i.e. modulo `clients.len()`).
+ /// Get `current` as an *index* (i.e. modulo `entries.len()`).
fn index(&self) -> usize {
- self.current % self.clients.len()
+ self.current % self.entries.len()
+ }
+
+ /// Get the current entry.
+ fn get_entry(&self) -> &MultiClientEntry {
+ &self.entries[self.index()]
}
/// Get the current client and its index which can be passed to `failed()` if the client fails
/// to connect.
fn get(&self) -> (Arc<Client>, usize) {
let index = self.index();
- (Arc::clone(&self.clients[index]), self.current)
+ (Arc::clone(&self.entries[index].client), self.current)
}
/// Check if we already tried all clients since a specific starting index.
@@ -533,11 +576,11 @@ impl MultiClientState {
/// request-retry-loop will fail as soon as the same *number* of clients since its starting
/// point were marked as faulty without retrying them all.
fn tried_all_since(&self, start: usize) -> bool {
- self.tried_clients(start) >= self.clients.len()
+ self.tried_clients(start) >= self.entries.len()
}
/// We store the current index continuously without wrapping it modulo the client count (and
- /// only do that when indexing the `clients` array), so that we can easily check if "all
+ /// only do that when indexing the `entries` array), so that we can easily check if "all
/// currently running tasks taken together" have already tried all clients by comparing our
/// starting point to the current index.
fn tried_clients(&self, start: usize) -> usize {
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 69e79f1..57e4bf3 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -218,6 +218,12 @@ impl RemoteMappingCache {
remote.set_node_name(hostname, node_name);
}
}
+
+ /// Check if a host is reachable.
+ pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool {
+ self.info_by_hostname(remote, hostname)
+ .is_none_or(|info| info.reachable)
+ }
}
/// An entry for a remote in a [`RemoteMappingCache`].
--
2.39.5
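
A side note on the unwrapped `current` counter (illustrative only; the body of
tried_clients() is cut off in the hunk above, so the wrapping subtraction here
is an assumption): concurrent tasks sharing the same state can each check
whether, since their own starting point, as many failures as there are entries
have accumulated.

// Assumption: tried_clients() is a wrapping difference (its body is truncated above).
fn tried_clients(current: usize, start: usize) -> usize {
    current.wrapping_sub(start)
}

fn tried_all_since(current: usize, start: usize, len: usize) -> bool {
    tried_clients(current, start) >= len
}

fn main() {
    let len = 3;
    let start = 5;                             // this task saw current == 5 when it began
    assert!(!tried_all_since(6, start, len));  // one failure since then: keep retrying
    assert!(tried_all_since(8, start, len));   // three failures since then: give up
    // wrapping keeps the comparison valid even across integer overflow
    assert!(tried_all_since(1, usize::MAX - 1, len));
}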