From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pdm-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
	by lore.proxmox.com (Postfix) with ESMTPS id A08CB1FF15E
	for <inbox@lore.proxmox.com>; Tue, 11 Feb 2025 13:06:03 +0100 (CET)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id F2E642A770;
	Tue, 11 Feb 2025 13:05:56 +0100 (CET)
From: Lukas Wagner <l.wagner@proxmox.com>
To: pdm-devel@lists.proxmox.com
Date: Tue, 11 Feb 2025 13:05:26 +0100
Message-Id: <20250211120541.163621-11-l.wagner@proxmox.com>
X-Mailer: git-send-email 2.39.5
In-Reply-To: <20250211120541.163621-1-l.wagner@proxmox.com>
References: <20250211120541.163621-1-l.wagner@proxmox.com>
MIME-Version: 1.0
X-SPAM-LEVEL: Spam detection results:  0
 AWL 0.010 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING             0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
Subject: [pdm-devel] [PATCH proxmox-datacenter-manager 10/25] metric
 collection: collect overdue metrics on startup/timer change
X-BeenThere: pdm-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Datacenter Manager development discussion
 <pdm-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pdm-devel>, 
 <mailto:pdm-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pdm-devel/>
List-Post: <mailto:pdm-devel@lists.proxmox.com>
List-Help: <mailto:pdm-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel>, 
 <mailto:pdm-devel-request@lists.proxmox.com?subject=subscribe>
Reply-To: Proxmox Datacenter Manager development discussion
 <pdm-devel@lists.proxmox.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: pdm-devel-bounces@lists.proxmox.com
Sender: "pdm-devel" <pdm-devel-bounces@lists.proxmox.com>

Due to the fact that the timer fires at aligned points in time and might
not fire right away after being set up, it could happen that we get gaps
in the data if we change the timer interval or at daemon startup.

To mitigate this, on daemon startup and also if the collection interval
changes, we
  - check if the time until the next scheduled regular collection
    plus the time *since* the last successful collection exceeds
    the configured collection interval
  - if yes, we collect immediately
  - if no, we do nothing and let the remote be collected at the
    next timer tick

Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
---
 .../src/metric_collection/collection_task.rs  | 72 +++++++++++++++++--
 1 file changed, 65 insertions(+), 7 deletions(-)

diff --git a/server/src/metric_collection/collection_task.rs b/server/src/metric_collection/collection_task.rs
index b4e3207..f0742ea 100644
--- a/server/src/metric_collection/collection_task.rs
+++ b/server/src/metric_collection/collection_task.rs
@@ -1,4 +1,7 @@
-use std::{sync::Arc, time::Duration};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
+};
 
 use anyhow::Error;
 use rand::Rng;
@@ -69,13 +72,18 @@ impl MetricCollectionTask {
     /// This function does never return.
     #[tracing::instrument(skip_all, name = "metric_collection_task")]
     pub(super) async fn run(&mut self) {
-        let mut timer = Self::setup_timer(self.settings.collection_interval_or_default());
+        let (mut timer, mut next_run) =
+            Self::setup_timer(self.settings.collection_interval_or_default());
 
         log::debug!(
             "metric collection starting up. collection interval set to {} seconds",
             self.settings.collection_interval_or_default()
         );
 
+        // Check and fetch any remote which would be overdue by the time the
+        // timer first fires.
+        self.fetch_overdue_and_save_state(next_run).await;
+
         loop {
             let old_settings = self.settings.clone();
             tokio::select! {
@@ -124,7 +132,12 @@ impl MetricCollectionTask {
                     "metric collection interval changed to {} seconds, reloading timer",
                     interval
                 );
-                timer = Self::setup_timer(interval);
+                (timer, next_run) = Self::setup_timer(interval);
+                // If we change (and therefore reset) our timer right before it fires,
+                // we could potentially miss one collection event.
+                // Therefore fetch all remotes which would be due for metric collection before
+                // the new timer fires.
+                self.fetch_overdue_and_save_state(next_run).await;
             }
         }
     }
@@ -208,12 +221,12 @@ impl MetricCollectionTask {
     /// Set up a [`tokio::time::Interval`] instance with the provided interval.
     /// The timer will be aligned, e.g. an interval of `60` will let the timer
     /// fire at minute boundaries.
-    fn setup_timer(interval: u64) -> Interval {
+    fn setup_timer(interval: u64) -> (Interval, Instant) {
         let mut timer = tokio::time::interval(Duration::from_secs(interval));
-        let first_run = task_utils::next_aligned_instant(interval).into();
-        timer.reset_at(first_run);
+        let first_run = task_utils::next_aligned_instant(interval);
+        timer.reset_at(first_run.into());
 
-        timer
+        (timer, first_run)
     }
 
     /// Convenience helper to load `remote.cfg`, logging the error
@@ -292,6 +305,51 @@ impl MetricCollectionTask {
         }
     }
 
+    /// Fetch metric data from remotes which are overdue for collection and save
+    /// collection state.
+    async fn fetch_overdue_and_save_state(&mut self, next_run: Instant) {
+        if let Some(remotes) = Self::load_remote_config() {
+            self.fetch_overdue(&remotes, next_run).await;
+            if let Err(e) = self.state.save() {
+                log::error!("could not update metric collection state: {e}");
+            }
+        }
+    }
+
+    /// Fetch metric data from remotes which are overdue for collection.
+    ///
+    /// Use this on startup of the metric collection loop as well as
+    /// when the collection interval changes.
+    ///
+    /// Does nothing if the remote config could not be read, in this case an
+    /// error is logged.
+    async fn fetch_overdue(&mut self, remotes: &SectionConfigData<Remote>, next_run: Instant) {
+        let left_until_scheduled = next_run - Instant::now();
+        let now = proxmox_time::epoch_i64();
+
+        let mut overdue = Vec::new();
+
+        for remote in &remotes.order {
+            let last_collection = self
+                .state
+                .get_status(remote)
+                .and_then(|s| s.last_collection)
+                .unwrap_or(0);
+
+            let diff = now - last_collection;
+
+            if diff + left_until_scheduled.as_secs() as i64
+                > self.settings.collection_interval_or_default() as i64
+            {
+                log::debug!(
+                    "starting metric collection for remote '{remote}' - triggered because collection is overdue"
+                );
+                overdue.push(remote.clone());
+            }
+        }
+        self.fetch_remotes(remotes, &overdue).await;
+    }
+
     /// Fetch a single remote.
     #[tracing::instrument(skip_all, fields(remote = remote.id), name = "metric_collection_task")]
     async fn fetch_single_remote(
-- 
2.39.5



_______________________________________________
pdm-devel mailing list
pdm-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel