From: Wolfgang Bumiller <w.bumiller@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [pdm-devel] [PATCH datacenter-manager 5/7] server: don't try to connect to known-unreachable servers
Date: Tue, 4 Feb 2025 10:55:52 +0100 [thread overview]
Message-ID: <20250204095554.39501-6-w.bumiller@proxmox.com> (raw)
In-Reply-To: <20250204095554.39501-1-w.bumiller@proxmox.com>
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
---
server/src/connection.rs | 93 +++++++++++++++++++++++++---------
server/src/remote_cache/mod.rs | 6 +++
2 files changed, 74 insertions(+), 25 deletions(-)
diff --git a/server/src/connection.rs b/server/src/connection.rs
index 24e2e44..7c1c237 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -130,14 +130,17 @@ fn prepare_connect_multi_client(remote: &Remote) -> Result<(MultiClient, Connect
let mut clients = Vec::new();
for node in &remote.nodes {
- clients.push(Arc::new(prepare_connect_client_to_node(
- node,
- info.default_port,
- info.pve_compat,
- )?));
+ clients.push(MultiClientEntry {
+ client: Arc::new(prepare_connect_client_to_node(
+ node,
+ info.default_port,
+ info.pve_compat,
+ )?),
+ hostname: node.hostname.clone(),
+ });
}
- Ok((MultiClient::new(clients), info))
+ Ok((MultiClient::new(remote.id.clone(), clients), info))
}
/// Like [`connect()`], but with failover support for remotes which can have multiple nodes.
@@ -449,6 +452,14 @@ pub fn init(instance: Box<dyn ClientFactory + Send + Sync>) {
}
}
+/// In order to allow the [`MultiClient`] to check the cached reachability state of a client, we
+/// need to know which remote it belongs to, so store the metadata alongside the actual `Client`
+/// struct.
+struct MultiClientEntry {
+ client: Arc<Client>,
+ hostname: String,
+}
+
/// This is another wrapper around the actual HTTP client responsible for dealing with connection
/// problems: if we cannot reach a node of a cluster, this will attempt to retry a request on
/// another node.
@@ -456,19 +467,15 @@ pub fn init(instance: Box<dyn ClientFactory + Send + Sync>) {
/// # Possible improvements
///
/// - For `GET` requests we could also start a 2nd request after a shorter time out (eg. 10s).
-/// - We could use RRD data for a "best guess" where to start eg. if we know a node was offline on
-/// the last rrd polling we'd start with a different one.
-/// For this, we still need to include the node names in the clients here somehow so that we can
-/// actually manage/re-shuffle them from the outside after this struct is already created.
struct MultiClient {
state: StdMutex<MultiClientState>,
timeout: Duration,
}
impl MultiClient {
- fn new(clients: Vec<Arc<Client>>) -> Self {
+ fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
Self {
- state: StdMutex::new(MultiClientState::new(clients)),
+ state: StdMutex::new(MultiClientState::new(remote, entries)),
timeout: Duration::from_secs(60),
}
}
@@ -477,8 +484,8 @@ impl MultiClient {
where
F: Fn(&Arc<Client>),
{
- for client in &self.state.lock().unwrap().clients {
- func(client);
+ for entry in &self.state.lock().unwrap().entries {
+ func(&entry.client);
}
}
}
@@ -488,15 +495,24 @@ impl MultiClient {
struct MultiClientState {
/// The current index *not* modulo the client count.
current: usize,
- clients: Vec<Arc<Client>>,
+ remote: String,
+ entries: Vec<MultiClientEntry>,
}
impl MultiClientState {
- fn new(clients: Vec<Arc<Client>>) -> Self {
- Self {
+ fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
+ let mut this = Self {
current: 0,
- clients,
- }
+ remote,
+ entries,
+ };
+ this.skip_unreachable();
+ this
+ }
+
+ /// Moving to the next entry must wrap.
+ fn next(&mut self) {
+ self.current = self.current.wrapping_add(1);
}
/// Whenever a request fails with the *current* client we move the current entry forward.
@@ -507,20 +523,47 @@ impl MultiClientState {
/// the strategy, we may want to change this to something like `1 + max(current, failed)`.
fn failed(&mut self, failed_index: usize) {
if self.current == failed_index {
- self.current = self.current.wrapping_add(1);
+ let entry = self.get_entry();
+ log::error!("marking client {} as unreachable", entry.hostname);
+ if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
+ cache.mark_host_reachable(&self.remote, &entry.hostname, false);
+ let _ = cache.save();
+ }
+ self.next();
+ self.skip_unreachable();
+ }
+ }
+
+ /// Skip ahead as long as we're pointing to an unreachable.
+ fn skip_unreachable(&mut self) {
+ let cache = crate::remote_cache::RemoteMappingCache::get();
+ // loop at most as many times as we have entries...
+ for _ in 0..self.entries.len() {
+ let entry = self.get_entry();
+ if !cache.host_is_reachable(&self.remote, &entry.hostname) {
+ log::error!("skipping host {} - marked unreachable", entry.hostname);
+ self.next();
+ } else {
+ return;
+ }
}
}
- /// Get `current` as an *index* (i.e. modulo `clients.len()`).
+ /// Get `current` as an *index* (i.e. modulo `entries.len()`).
fn index(&self) -> usize {
- self.current % self.clients.len()
+ self.current % self.entries.len()
+ }
+
+ /// Get the current entry.
+ fn get_entry(&self) -> &MultiClientEntry {
+ &self.entries[self.index()]
}
/// Get the current client and its index which can be passed to `failed()` if the client fails
/// to connect.
fn get(&self) -> (Arc<Client>, usize) {
let index = self.index();
- (Arc::clone(&self.clients[index]), self.current)
+ (Arc::clone(&self.entries[index].client), self.current)
}
/// Check if we already tried all clients since a specific starting index.
@@ -533,11 +576,11 @@ impl MultiClientState {
/// request-retry-loop will fail as soon as the same *number* of clients since its starting
/// point were marked as faulty without retrying them all.
fn tried_all_since(&self, start: usize) -> bool {
- self.tried_clients(start) >= self.clients.len()
+ self.tried_clients(start) >= self.entries.len()
}
/// We store the current index continuously without wrapping it modulo the client count (and
- /// only do that when indexing the `clients` array), so that we can easily check if "all
+ /// only do that when indexing the `entries` array), so that we can easily check if "all
/// currently running tasks taken together" have already tried all clients by comparing our
/// starting point to the current index.
fn tried_clients(&self, start: usize) -> usize {
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 69e79f1..57e4bf3 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -218,6 +218,12 @@ impl RemoteMappingCache {
remote.set_node_name(hostname, node_name);
}
}
+
+ /// Check if a host is reachable.
+ pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool {
+ self.info_by_hostname(remote, hostname)
+ .is_none_or(|info| info.reachable)
+ }
}
/// An entry for a remote in a [`RemoteMappingCache`].
--
2.39.5
_______________________________________________
pdm-devel mailing list
pdm-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel
next prev parent reply other threads:[~2025-02-04 9:56 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-04 9:55 [pdm-devel] [PATCH pdm 0/7] multi-remote client and node reachability cache Wolfgang Bumiller
2025-02-04 9:55 ` [pdm-devel] [PATCH datacenter-manager 1/7] server: generic multi-client wrapper Wolfgang Bumiller
2025-02-11 14:50 ` Lukas Wagner
2025-02-12 9:07 ` Wolfgang Bumiller
2025-02-04 9:55 ` [pdm-devel] [PATCH datacenter-manager 2/7] server: store pve MultiClient for re-use Wolfgang Bumiller
2025-02-04 9:55 ` [pdm-devel] [PATCH datacenter-manager 3/7] server: separate ConnectInfo from client creation Wolfgang Bumiller
2025-02-04 9:55 ` [pdm-devel] [PATCH datacenter-manager 4/7] server: cache pve node reachability and names Wolfgang Bumiller
2025-02-04 9:55 ` Wolfgang Bumiller [this message]
2025-02-04 9:55 ` [pdm-devel] [PATCH datacenter-manager 6/7] server: try previously unreachable clients as last resort Wolfgang Bumiller
2025-02-04 9:55 ` [pdm-devel] [PATCH datacenter-manager 7/7] server: add some tracing instrumentation Wolfgang Bumiller
2025-02-11 14:50 ` [pdm-devel] [PATCH pdm 0/7] multi-remote client and node reachability cache Lukas Wagner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250204095554.39501-6-w.bumiller@proxmox.com \
--to=w.bumiller@proxmox.com \
--cc=pdm-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.