From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: <pdm-devel-bounces@lists.proxmox.com> Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id D80141FF164 for <inbox@lore.proxmox.com>; Fri, 14 Feb 2025 14:07:19 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 6C00E19C98; Fri, 14 Feb 2025 14:07:18 +0100 (CET) From: Lukas Wagner <l.wagner@proxmox.com> To: pdm-devel@lists.proxmox.com Date: Fri, 14 Feb 2025 14:06:35 +0100 Message-Id: <20250214130653.283012-11-l.wagner@proxmox.com> X-Mailer: git-send-email 2.39.5 In-Reply-To: <20250214130653.283012-1-l.wagner@proxmox.com> References: <20250214130653.283012-1-l.wagner@proxmox.com> MIME-Version: 1.0 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.010 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: [pdm-devel] [PATCH proxmox-datacenter-manager v2 10/28] metric collection: collect overdue metrics on startup/timer change X-BeenThere: pdm-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Datacenter Manager development discussion <pdm-devel.lists.proxmox.com> List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pdm-devel>, <mailto:pdm-devel-request@lists.proxmox.com?subject=unsubscribe> List-Archive: <http://lists.proxmox.com/pipermail/pdm-devel/> List-Post: <mailto:pdm-devel@lists.proxmox.com> List-Help: <mailto:pdm-devel-request@lists.proxmox.com?subject=help> List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel>, <mailto:pdm-devel-request@lists.proxmox.com?subject=subscribe> Reply-To: Proxmox Datacenter Manager development 
discussion <pdm-devel@lists.proxmox.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: pdm-devel-bounces@lists.proxmox.com Sender: "pdm-devel" <pdm-devel-bounces@lists.proxmox.com> Due to the fact that the timer fires at aligned points in time and might not fire right away after being set up, it could happen that we get gaps in the data if we change the timer interval or at daemon startup. To mitigate this, on daemon startup and also if the collection interval changes, we - check if the time until the next scheduled regular collection plus the time *since* the last successful collection exceeds the configured collection interval - if yes, we collect immediately - if no, we do nothing and let the remote be collected at the next timer tick Signed-off-by: Lukas Wagner <l.wagner@proxmox.com> --- Notes: Changes since v1: - Document return values of `setup_timer` .../src/metric_collection/collection_task.rs | 84 +++++++++++++++---- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/server/src/metric_collection/collection_task.rs b/server/src/metric_collection/collection_task.rs index 744a7ccc..9467175b 100644 --- a/server/src/metric_collection/collection_task.rs +++ b/server/src/metric_collection/collection_task.rs @@ -1,4 +1,7 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; use anyhow::Error; use rand::Rng; @@ -69,15 +72,18 @@ impl MetricCollectionTask { /// This function never returns. #[tracing::instrument(skip_all, name = "metric_collection_task")] pub(super) async fn run(&mut self) { - let mut timer = Self::setup_timer(self.settings.collection_interval_or_default()); + let (mut timer, mut first_tick) = + Self::setup_timer(self.settings.collection_interval_or_default()); - log::debug!( - "metric collection starting up. 
Collection interval set to {} seconds.", - self.settings.collection_interval_or_default() - ); + // Check and fetch any remote which would be overdue by the time the + // timer first fires. + if let Some(remote_config) = Self::load_remote_config() { + self.fetch_overdue(&remote_config, first_tick).await; + } loop { - let old_settings = self.settings.clone(); + // Remember current collection interval. + let current_interval = self.settings.collection_interval_or_default(); tokio::select! { _ = timer.tick() => { // Reload settings in case they have changed in the meanwhile @@ -117,14 +123,17 @@ impl MetricCollectionTask { } } - let interval = self.settings.collection_interval_or_default(); + let new_interval = self.settings.collection_interval_or_default(); - if old_settings.collection_interval_or_default() != interval { - log::info!( - "metric collection interval changed to {} seconds, reloading timer", - interval - ); - timer = Self::setup_timer(interval); + if current_interval != new_interval { + (timer, first_tick) = Self::setup_timer(new_interval); + // If we change (and therefore reset) our timer right before it fires, + // we could potentially miss one collection event. + // Therefore fetch all remotes which would be due for metric collection before + // the new timer fires. + if let Some(remote_config) = Self::load_remote_config() { + self.fetch_overdue(&remote_config, first_tick).await; + } } if let Err(err) = self.state.save() { @@ -186,12 +195,16 @@ impl MetricCollectionTask { /// Set up a [`tokio::time::Interval`] instance with the provided interval. /// The timer will be aligned, e.g. an interval of `60` will let the timer /// fire at minute boundaries. - fn setup_timer(interval: u64) -> Interval { + /// + /// The return values are a tuple of the [`tokio::time::Interval`] timer instance + /// and the [`std::time::Instant`] at which the timer first fires. 
+ fn setup_timer(interval: u64) -> (Interval, Instant) { + log::debug!("setting metric collection interval timer to {interval} seconds.",); let mut timer = tokio::time::interval(Duration::from_secs(interval)); - let first_run = task_utils::next_aligned_instant(interval).into(); - timer.reset_at(first_run); + let first_run = task_utils::next_aligned_instant(interval); + timer.reset_at(first_run.into()); - timer + (timer, first_run) } /// Convenience helper to load `remote.cfg`, logging the error @@ -270,6 +283,41 @@ impl MetricCollectionTask { } } + /// Fetch metric data from remotes which are overdue for collection. + /// + /// Use this on startup of the metric collection loop as well as + /// when the collection interval changes. + async fn fetch_overdue( + &mut self, + remote_config: &SectionConfigData<Remote>, + next_run: Instant, + ) { + let left_until_scheduled = next_run - Instant::now(); + let now = proxmox_time::epoch_i64(); + + let mut overdue = Vec::new(); + + for remote in &remote_config.order { + let last_collection = self + .state + .get_status(remote) + .and_then(|s| s.last_collection) + .unwrap_or(0); + + let diff = now - last_collection; + + if diff + left_until_scheduled.as_secs() as i64 + > self.settings.collection_interval_or_default() as i64 + { + log::debug!( + "starting metric collection for remote '{remote}' - triggered because collection is overdue" + ); + overdue.push(remote.clone()); + } + } + self.fetch_remotes(remote_config, &overdue).await; + } + /// Fetch a single remote. #[tracing::instrument(skip_all, fields(remote = remote.id), name = "metric_collection_task")] async fn fetch_single_remote( -- 2.39.5 _______________________________________________ pdm-devel mailing list pdm-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel