From: Wolfgang Bumiller <w.bumiller@proxmox.com>
To: pdm-devel@lists.proxmox.com
Subject: [pdm-devel] [PATCH v2 datacenter-manager 6/7] server: try previously unreachable clients as last resort
Date: Wed, 5 Mar 2025 16:01:07 +0100 [thread overview]
Message-ID: <20250305150108.245584-7-w.bumiller@proxmox.com> (raw)
In-Reply-To: <20250305150108.245584-1-w.bumiller@proxmox.com>
and mark them as reachable again if they succeed
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Reviewed-by: Lukas Wagner <l.wagner@proxmox.com>
---
No changes since v1.
server/src/connection.rs | 92 ++++++++++++++++++++++++++++++++++------
1 file changed, 80 insertions(+), 12 deletions(-)
diff --git a/server/src/connection.rs b/server/src/connection.rs
index b9c0e16..3092ab3 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -469,13 +469,15 @@ struct MultiClientEntry {
/// - For `GET` requests we could also start a 2nd request after a shorter time out (eg. 10s).
struct MultiClient {
state: StdMutex<MultiClientState>,
+ remote: String,
timeout: Duration,
}
impl MultiClient {
fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
Self {
- state: StdMutex::new(MultiClientState::new(remote, entries)),
+ state: StdMutex::new(MultiClientState::new(remote.clone(), entries)),
+ remote,
timeout: Duration::from_secs(60),
}
}
@@ -559,11 +561,16 @@ impl MultiClientState {
&self.entries[self.index()]
}
- /// Get the current client and its index which can be passed to `failed()` if the client fails
+ /// Get the current entry and its index which can be passed to `failed()` if the client fails
/// to connect.
- fn get(&self) -> (Arc<Client>, usize) {
+ fn get(&self) -> (&MultiClientEntry, usize) {
let index = self.index();
- (Arc::clone(&self.entries[index].client), self.current)
+ (&self.entries[index], self.current)
+ }
+
+ /// Get a client at a specific point (which still needs to be converted to an index).
+ fn get_at(&self, at: usize) -> &MultiClientEntry {
+ &self.entries[at % self.entries.len()]
}
/// Check if we already tried all clients since a specific starting index.
@@ -588,6 +595,30 @@ impl MultiClientState {
}
}
+struct TryClient {
+ client: Arc<Client>,
+ reachable: bool,
+ hostname: String,
+}
+
+impl TryClient {
+ fn reachable(entry: &MultiClientEntry) -> Self {
+ Self {
+ client: Arc::clone(&entry.client),
+ hostname: entry.hostname.clone(),
+ reachable: true,
+ }
+ }
+
+ fn unreachable(entry: &MultiClientEntry) -> Self {
+ Self {
+ client: Arc::clone(&entry.client),
+ hostname: entry.hostname.clone(),
+ reachable: false,
+ }
+ }
+}
+
impl MultiClient {
/// This is the client usage strategy.
///
@@ -598,17 +629,28 @@ impl MultiClient {
/// We might be skipping clients if other tasks already tried "more" clients, but that's fine,
/// since there's no point in trying the same remote twice simultaneously if it is currently
/// offline...
- fn try_clients(&self) -> impl Iterator<Item = Arc<Client>> + '_ {
+ fn try_clients(&self) -> impl Iterator<Item = TryClient> + '_ {
let mut start_current = None;
let state = &self.state;
+
+ let mut unreachable_clients = Vec::new();
+ let mut try_unreachable = None::<std::vec::IntoIter<_>>;
+
std::iter::from_fn(move || {
let mut state = state.lock().unwrap();
+
+ if let Some(ref mut try_unreachable) = try_unreachable {
+ return Some(TryClient::unreachable(
+ state.get_at(try_unreachable.next()?),
+ ));
+ }
+
match start_current {
None => {
// first attempt, just use the current client and remember the starting index
let (client, index) = state.get();
start_current = Some((index, index));
- Some(client)
+ Some(TryClient::reachable(client))
}
Some((start, current)) => {
// If our last request failed, the retry-loop asks for another client, mark the
@@ -618,13 +660,24 @@ impl MultiClient {
if state.tried_all_since(start) {
// This iterator (and therefore this retry-loop) has tried all clients.
// Give up.
- return None;
+ try_unreachable =
+ Some(std::mem::take(&mut unreachable_clients).into_iter());
+ return Some(TryClient::unreachable(
+ state.get_at(try_unreachable.as_mut()?.next()?),
+ ));
}
// finally just get the new current client and update `current` for the later
// call to `failed()`
- let (client, current) = state.get();
- start_current = Some((start, current));
- Some(client)
+ let (client, new_current) = state.get();
+ start_current = Some((start, new_current));
+
+ // remember all the clients we skipped:
+ let mut at = current + 1;
+ while at != new_current {
+ unreachable_clients.push(at);
+ at = at.wrapping_add(1);
+ }
+ Some(TryClient::reachable(client))
}
}
})
@@ -647,7 +700,12 @@ macro_rules! try_request {
let mut timed_out = false;
// The iterator in use here will automatically mark a client as faulty if we move on to
// the `next()` one.
- for client in $self.try_clients() {
+ for TryClient {
+ client,
+ hostname,
+ reachable,
+ } in $self.try_clients()
+ {
if let Some(err) = last_err.take() {
log::error!("API client error, trying another remote - {err:?}");
}
@@ -661,7 +719,17 @@ macro_rules! try_request {
Ok(Err(proxmox_client::Error::Client(err))) => {
last_err = Some(err);
}
- Ok(result) => return result,
+ Ok(result) => {
+ if !reachable {
+ log::error!("marking {hostname:?} as reachable again!");
+ if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write()
+ {
+ cache.mark_host_reachable(&$self.remote, &hostname, true);
+ let _ = cache.save();
+ }
+ }
+ return result;
+ }
Err(_) => {
timed_out = true;
}
--
2.39.5
_______________________________________________
pdm-devel mailing list
pdm-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel
next prev parent reply other threads:[~2025-03-05 15:01 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-03-05 15:01 [pdm-devel] [PATCH pdm 0/7] multi-remote client and node reachability cache Wolfgang Bumiller
2025-03-05 15:01 ` [pdm-devel] [PATCH v2 datacenter-manager 1/7] server: generic multi-client wrapper Wolfgang Bumiller
2025-03-05 15:01 ` [pdm-devel] [PATCH v2 datacenter-manager 2/7] server: store pve MultiClient for re-use Wolfgang Bumiller
2025-03-05 15:01 ` [pdm-devel] [PATCH v2 datacenter-manager 3/7] server: separate ConnectInfo from client creation Wolfgang Bumiller
2025-03-05 15:01 ` [pdm-devel] [PATCH v2 datacenter-manager 4/7] server: cache pve node reachability and names Wolfgang Bumiller
2025-03-05 15:01 ` [pdm-devel] [PATCH v2 datacenter-manager 5/7] server: don't try to connect to known-unreachable servers Wolfgang Bumiller
2025-03-05 15:01 ` Wolfgang Bumiller [this message]
2025-03-05 15:01 ` [pdm-devel] [PATCH v2 datacenter-manager 7/7] server: add some tracing instrumentation Wolfgang Bumiller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250305150108.245584-7-w.bumiller@proxmox.com \
--to=w.bumiller@proxmox.com \
--cc=pdm-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal