From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: <pdm-devel-bounces@lists.proxmox.com> Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 5D3CD1FF16F for <inbox@lore.proxmox.com>; Thu, 13 Feb 2025 09:56:05 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id EED8C2E24A; Thu, 13 Feb 2025 09:56:01 +0100 (CET) Date: Thu, 13 Feb 2025 09:55:28 +0100 From: Wolfgang Bumiller <w.bumiller@proxmox.com> To: Lukas Wagner <l.wagner@proxmox.com> Message-ID: <kpzd2j2hmms6nkugixdagifikc3mii5hyqj66ofic7wsxztv7h@losm6xjj4cqi> References: <20250211120541.163621-1-l.wagner@proxmox.com> <20250211120541.163621-11-l.wagner@proxmox.com> MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: <20250211120541.163621-11-l.wagner@proxmox.com> X-SPAM-LEVEL: Spam detection results: 0 AWL 0.081 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment RCVD_IN_VALIDITY_CERTIFIED_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_RPBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_SAFE_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. 
SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: Re: [pdm-devel] [PATCH proxmox-datacenter-manager 10/25] metric collection: collect overdue metrics on startup/timer change X-BeenThere: pdm-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Datacenter Manager development discussion <pdm-devel.lists.proxmox.com> List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pdm-devel>, <mailto:pdm-devel-request@lists.proxmox.com?subject=unsubscribe> List-Archive: <http://lists.proxmox.com/pipermail/pdm-devel/> List-Post: <mailto:pdm-devel@lists.proxmox.com> List-Help: <mailto:pdm-devel-request@lists.proxmox.com?subject=help> List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel>, <mailto:pdm-devel-request@lists.proxmox.com?subject=subscribe> Reply-To: Proxmox Datacenter Manager development discussion <pdm-devel@lists.proxmox.com> Cc: pdm-devel@lists.proxmox.com Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: pdm-devel-bounces@lists.proxmox.com Sender: "pdm-devel" <pdm-devel-bounces@lists.proxmox.com> On Tue, Feb 11, 2025 at 01:05:26PM +0100, Lukas Wagner wrote: > Due to the fact that the timer fires at aligned points in time and might > now fire right away after being set up, it could happen that we get gaps > in the data if we change the timer interval or at daemon startup. 
> > To mitigate this, on daemon startup and also if the collection interval > changes, we > - check if the time until the next scheduled regular collection > plus the time *since* the last successful collection exceeds > the configured collection interval > - if yes, we collect immediately > - if no, we do nothing and let the remote be collected at the > next timer tick > > Signed-off-by: Lukas Wagner <l.wagner@proxmox.com> > --- > .../src/metric_collection/collection_task.rs | 72 +++++++++++++++++-- > 1 file changed, 65 insertions(+), 7 deletions(-) > > diff --git a/server/src/metric_collection/collection_task.rs b/server/src/metric_collection/collection_task.rs > index b4e3207..f0742ea 100644 > --- a/server/src/metric_collection/collection_task.rs > +++ b/server/src/metric_collection/collection_task.rs > @@ -1,4 +1,7 @@ > -use std::{sync::Arc, time::Duration}; > +use std::{ > + sync::Arc, > + time::{Duration, Instant}, > +}; > > use anyhow::Error; > use rand::Rng; > @@ -69,13 +72,18 @@ impl MetricCollectionTask { > /// This function does never return. > #[tracing::instrument(skip_all, name = "metric_collection_task")] > pub(super) async fn run(&mut self) { > - let mut timer = Self::setup_timer(self.settings.collection_interval_or_default()); > + let (mut timer, mut next_run) = > + Self::setup_timer(self.settings.collection_interval_or_default()); > > log::debug!( > "metric collection starting up. collection interval set to {} seconds", > self.settings.collection_interval_or_default() > ); > > + // Check and fetch any remote which would be overdue by the time the > + // timer first fires. > + self.fetch_overdue_and_save_state(next_run).await; > + > loop { > let old_settings = self.settings.clone(); > tokio::select! 
{ > @@ -124,7 +132,12 @@ impl MetricCollectionTask { > "metric collection interval changed to {} seconds, reloading timer", > interval > ); > - timer = Self::setup_timer(interval); > + (timer, next_run) = Self::setup_timer(interval); > + // If change (and therefore reset) our timer right before it fires, > + // we could potentially miss one collection event. Couldn't we instead just pass `next_run` through to `setup_timer` and call `reset_at(next_run)` on it? (`first_run` would only be used in the initial setup, so `next_run` could either be an `Option`, or the setup code does the `next_aligned_instant` call...) This should be much less code by making the new `fetch_overdue{,_and_save_state}()` functions unnecessary, or am I missing something? > + // Therefore fetch all remotes which would be due for metric collection before > + // the new timer fires. > + self.fetch_overdue_and_save_state(next_run).await; > } > } > } > @@ -208,12 +221,12 @@ impl MetricCollectionTask { > /// Set up a [`tokio::time::Interval`] instance with the provided interval. > /// The timer will be aligned, e.g. an interval of `60` will let the timer > /// fire at minute boundaries. > - fn setup_timer(interval: u64) -> Interval { > + fn setup_timer(interval: u64) -> (Interval, Instant) { > let mut timer = tokio::time::interval(Duration::from_secs(interval)); > - let first_run = task_utils::next_aligned_instant(interval).into(); > - timer.reset_at(first_run); > + let first_run = task_utils::next_aligned_instant(interval); > + timer.reset_at(first_run.into()); > > - timer > + (timer, first_run) > } > > /// Convenience helper to load `remote.cfg`, logging the error > @@ -292,6 +305,51 @@ impl MetricCollectionTask { > } > } > > + /// Fetch metric data from remotes which are overdue for collection and save > + /// collection state. 
> + async fn fetch_overdue_and_save_state(&mut self, next_run: Instant) { > + if let Some(remotes) = Self::load_remote_config() { > + self.fetch_overdue(&remotes, next_run).await; > + if let Err(e) = self.state.save() { > + log::error!("could not update metric collection state: {e}"); > + } > + } > + } > + > + /// Fetch metric data from remotes which are overdue for collection. > + /// > + /// Use this on startup of the metric collection loop as well as > + /// when the collection interval changes. > + /// > + /// Does nothing if the remote config could not be read, in this case an > + /// error is logged. > + async fn fetch_overdue(&mut self, remotes: &SectionConfigData<Remote>, next_run: Instant) { > + let left_until_scheduled = next_run - Instant::now(); > + let now = proxmox_time::epoch_i64(); > + > + let mut overdue = Vec::new(); > + > + for remote in &remotes.order { > + let last_collection = self > + .state > + .get_status(remote) > + .and_then(|s| s.last_collection) > + .unwrap_or(0); > + > + let diff = now - last_collection; > + > + if diff + left_until_scheduled.as_secs() as i64 > + > self.settings.collection_interval_or_default() as i64 > + { > + log::debug!( > + "starting metric collection for remote '{remote}' - triggered because collection is overdue" > + ); > + overdue.push(remote.clone()); > + } > + } > + self.fetch_remotes(remotes, &overdue).await; > + } > + > /// Fetch a single remote. > #[tracing::instrument(skip_all, fields(remote = remote.id), name = "metric_collection_task")] > async fn fetch_single_remote( > -- > 2.39.5 > > > > _______________________________________________ > pdm-devel mailing list > pdm-devel@lists.proxmox.com > https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel > > _______________________________________________ pdm-devel mailing list pdm-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pdm-devel