From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pdm-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9])
	by lore.proxmox.com (Postfix) with ESMTPS id 258251FF15C
	for <inbox@lore.proxmox.com>; Wed,  5 Mar 2025 16:01:21 +0100 (CET)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id 9BE8B18D9A;
	Wed,  5 Mar 2025 16:01:14 +0100 (CET)
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
To: pdm-devel@lists.proxmox.com
Date: Wed,  5 Mar 2025 16:01:07 +0100
Message-Id: <20250305150108.245584-7-w.bumiller@proxmox.com>
X-Mailer: git-send-email 2.39.5
In-Reply-To: <20250305150108.245584-1-w.bumiller@proxmox.com>
References: <20250305150108.245584-1-w.bumiller@proxmox.com>
MIME-Version: 1.0
X-SPAM-LEVEL: Spam detection results:  0
 AWL 0.083 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING             0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
Subject: [pdm-devel] [PATCH v2 datacenter-manager 6/7] server: try
 previously unreachable clients as last resort
X-BeenThere: pdm-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Datacenter Manager development discussion
 <pdm-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pdm-devel>, 
 <mailto:pdm-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pdm-devel/>
List-Post: <mailto:pdm-devel@lists.proxmox.com>
List-Help: <mailto:pdm-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel>, 
 <mailto:pdm-devel-request@lists.proxmox.com?subject=subscribe>
Reply-To: Proxmox Datacenter Manager development discussion
 <pdm-devel@lists.proxmox.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: pdm-devel-bounces@lists.proxmox.com
Sender: "pdm-devel" <pdm-devel-bounces@lists.proxmox.com>

Once all reachable clients have failed, retry the clients previously
marked as unreachable, and mark them as reachable again if they succeed.

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Reviewed-by: Lukas Wagner <l.wagner@proxmox.com>
---
No changes since v1.

 server/src/connection.rs | 92 ++++++++++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 12 deletions(-)

diff --git a/server/src/connection.rs b/server/src/connection.rs
index b9c0e16..3092ab3 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -469,13 +469,15 @@ struct MultiClientEntry {
 /// - For `GET` requests we could also start a 2nd request after a shorter time out (eg. 10s).
 struct MultiClient {
     state: StdMutex<MultiClientState>,
+    remote: String,
     timeout: Duration,
 }
 
 impl MultiClient {
     fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
         Self {
-            state: StdMutex::new(MultiClientState::new(remote, entries)),
+            state: StdMutex::new(MultiClientState::new(remote.clone(), entries)),
+            remote,
             timeout: Duration::from_secs(60),
         }
     }
@@ -559,11 +561,16 @@ impl MultiClientState {
         &self.entries[self.index()]
     }
 
-    /// Get the current client and its index which can be passed to `failed()` if the client fails
+    /// Get the current entry and its index which can be passed to `failed()` if the client fails
     /// to connect.
-    fn get(&self) -> (Arc<Client>, usize) {
+    fn get(&self) -> (&MultiClientEntry, usize) {
         let index = self.index();
-        (Arc::clone(&self.entries[index].client), self.current)
+        (&self.entries[index], self.current)
+    }
+
+    /// Get a client at a specific point (which still needs to be converted to an index).
+    fn get_at(&self, at: usize) -> &MultiClientEntry {
+        &self.entries[at % self.entries.len()]
     }
 
     /// Check if we already tried all clients since a specific starting index.
@@ -588,6 +595,30 @@ impl MultiClientState {
     }
 }
 
+struct TryClient {
+    client: Arc<Client>,
+    reachable: bool,
+    hostname: String,
+}
+
+impl TryClient {
+    fn reachable(entry: &MultiClientEntry) -> Self {
+        Self {
+            client: Arc::clone(&entry.client),
+            hostname: entry.hostname.clone(),
+            reachable: true,
+        }
+    }
+
+    fn unreachable(entry: &MultiClientEntry) -> Self {
+        Self {
+            client: Arc::clone(&entry.client),
+            hostname: entry.hostname.clone(),
+            reachable: false,
+        }
+    }
+}
+
 impl MultiClient {
     /// This is the client usage strategy.
     ///
@@ -598,17 +629,28 @@ impl MultiClient {
     /// We might be skipping clients if other tasks already tried "more" clients, but that's fine,
     /// since there's no point in trying the same remote twice simultaneously if it is currently
     /// offline...
-    fn try_clients(&self) -> impl Iterator<Item = Arc<Client>> + '_ {
+    fn try_clients(&self) -> impl Iterator<Item = TryClient> + '_ {
         let mut start_current = None;
         let state = &self.state;
+
+        let mut unreachable_clients = Vec::new();
+        let mut try_unreachable = None::<std::vec::IntoIter<_>>;
+
         std::iter::from_fn(move || {
             let mut state = state.lock().unwrap();
+
+            if let Some(ref mut try_unreachable) = try_unreachable {
+                return Some(TryClient::unreachable(
+                    state.get_at(try_unreachable.next()?),
+                ));
+            }
+
             match start_current {
                 None => {
                     // first attempt, just use the current client and remember the starting index
                     let (client, index) = state.get();
                     start_current = Some((index, index));
-                    Some(client)
+                    Some(TryClient::reachable(client))
                 }
                 Some((start, current)) => {
                     // If our last request failed, the retry-loop asks for another client, mark the
@@ -618,13 +660,24 @@ impl MultiClient {
                     if state.tried_all_since(start) {
                         // This iterator (and therefore this retry-loop) has tried all clients.
                         // Give up.
-                        return None;
+                        try_unreachable =
+                            Some(std::mem::take(&mut unreachable_clients).into_iter());
+                        return Some(TryClient::unreachable(
+                            state.get_at(try_unreachable.as_mut()?.next()?),
+                        ));
                     }
                     // finally just get the new current client and update `current` for the later
                     // call to `failed()`
-                    let (client, current) = state.get();
-                    start_current = Some((start, current));
-                    Some(client)
+                    let (client, new_current) = state.get();
+                    start_current = Some((start, new_current));
+
+                    // remember all the clients we skipped:
+                    let mut at = current + 1;
+                    while at != new_current {
+                        unreachable_clients.push(at);
+                        at = at.wrapping_add(1);
+                    }
+                    Some(TryClient::reachable(client))
                 }
             }
         })
@@ -647,7 +700,12 @@ macro_rules! try_request {
             let mut timed_out = false;
             // The iterator in use here will automatically mark a client as faulty if we move on to
             // the `next()` one.
-            for client in $self.try_clients() {
+            for TryClient {
+                client,
+                hostname,
+                reachable,
+            } in $self.try_clients()
+            {
                 if let Some(err) = last_err.take() {
                     log::error!("API client error, trying another remote - {err:?}");
                 }
@@ -661,7 +719,17 @@ macro_rules! try_request {
                     Ok(Err(proxmox_client::Error::Client(err))) => {
                         last_err = Some(err);
                     }
-                    Ok(result) => return result,
+                    Ok(result) => {
+                        if !reachable {
+                            log::error!("marking {hostname:?} as reachable again!");
+                            if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write()
+                            {
+                                cache.mark_host_reachable(&$self.remote, &hostname, true);
+                                let _ = cache.save();
+                            }
+                        }
+                        return result;
+                    }
                     Err(_) => {
                         timed_out = true;
                     }
-- 
2.39.5



_______________________________________________
pdm-devel mailing list
pdm-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel