From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 4D4B16AE3D for ; Wed, 17 Feb 2021 11:53:26 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 39A8720CCF for ; Wed, 17 Feb 2021 11:52:56 +0100 (CET) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [212.186.127.180]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS id 599A220CC5 for ; Wed, 17 Feb 2021 11:52:55 +0100 (CET) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 27D99419C4 for ; Wed, 17 Feb 2021 11:52:55 +0100 (CET) Date: Wed, 17 Feb 2021 11:52:53 +0100 From: Wolfgang Bumiller To: Stefan Reiter Cc: pbs-devel@lists.proxmox.com Message-ID: <20210217105253.wfyaxp5wnj57c5yq@olga.proxmox.com> References: <20210216170710.31767-1-s.reiter@proxmox.com> <20210216170710.31767-17-s.reiter@proxmox.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20210216170710.31767-17-s.reiter@proxmox.com> User-Agent: NeoMutt/20180716 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.044 Adjusted score from AWL reputation of From: address KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment RCVD_IN_DNSWL_MED -2.3 Sender listed at https://www.dnswl.org/, medium trust SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [proxmox-restore-daemon.rs, mod.rs, api.rs, watchdog.rs] Subject: Re: [pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 17 Feb 2021 10:53:26 -0000 On Tue, Feb 16, 2021 at 06:07:04PM +0100, Stefan Reiter wrote: > Add a watchdog that will automatically shut down the VM after 10 > minutes, if no API call is received. > > This is handled using the unix 'alarm' syscall. > > Signed-off-by: Stefan Reiter > --- > src/api2/types/file_restore.rs | 3 ++ > src/bin/proxmox-restore-daemon.rs | 5 ++ > src/bin/proxmox_restore_daemon/api.rs | 22 ++++++-- > src/bin/proxmox_restore_daemon/mod.rs | 3 ++ > src/bin/proxmox_restore_daemon/watchdog.rs | 63 ++++++++++++++++++++++ > 5 files changed, 91 insertions(+), 5 deletions(-) > create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs > > diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs > index cd8df16a..710c6d83 100644 > --- a/src/api2/types/file_restore.rs > +++ b/src/api2/types/file_restore.rs > @@ -8,5 +8,8 @@ use proxmox::api::api; > pub struct RestoreDaemonStatus { > /// VM uptime in seconds > pub uptime: i64, > + /// time left until auto-shutdown, keep in mind that this is inaccurate when 'keep-timeout' is > + /// not set, as then after the status call the timer will have reset > + pub timeout: i64, > } > > diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs > index 1ec90794..d30da563 100644 > --- a/src/bin/proxmox-restore-daemon.rs > +++ b/src/bin/proxmox-restore-daemon.rs > @@ -40,6 +40,9 @@ fn main() -> Result<(), Error> { > .write_style(env_logger::WriteStyle::Never) > .init(); > > + // start watchdog, failure is a critical error as it leads to a scenario where we never exit > + watchdog_init()?; > + > proxmox_backup::tools::runtime::main(run()) > } > > @@ -77,6 +80,8 @@ fn accept_vsock_connections( > Ok(stream) => { > if sender.send(Ok(stream)).await.is_err() { > error!("connection accept channel was closed"); > + } else { > + watchdog_ping(); Should the ping not also happen at every api call in case connections get reused? > } > } > Err(err) => { > diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs > index 3c642aaf..8eb727df 100644 > --- a/src/bin/proxmox_restore_daemon/api.rs > +++ b/src/bin/proxmox_restore_daemon/api.rs > @@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method; > > use proxmox_backup::api2::types::*; > > +use super::{watchdog_remaining, watchdog_undo_ping}; > + > // NOTE: All API endpoints must have Permission::World, as the configs for authentication do not > // exist within the restore VM. Safety is guaranteed since we use a low port, so only root on the > // host can contact us - and there the proxmox-backup-client validates permissions already. > @@ -25,6 +27,16 @@ fn read_uptime() -> Result { > } > > #[api( > + input: { > + properties: { > + "keep-timeout": { > + type: bool, > + description: "If true, do not reset the watchdog timer on this API call.", > + default: false, > + optional: true, > + }, > + }, > + }, > access: { > description: "Permissions are handled outside restore VM.", > permission: &Permission::World, > @@ -34,12 +46,12 @@ fn read_uptime() -> Result { > } > )] > /// General status information > -fn status( > - _param: Value, > - _info: &ApiMethod, > - _rpcenv: &mut dyn RpcEnvironment, > -) -> Result { > +fn status(keep_timeout: bool) -> Result { > + if keep_timeout { This seems just weird. Do we really need this? > + watchdog_undo_ping(); > + } > Ok(RestoreDaemonStatus { > uptime: read_uptime()? as i64, > + timeout: watchdog_remaining(false), > }) > } > diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs > index d938a5bb..6802d31c 100644 > --- a/src/bin/proxmox_restore_daemon/mod.rs > +++ b/src/bin/proxmox_restore_daemon/mod.rs > @@ -1,3 +1,6 @@ > ///! File restore VM related functionality > mod api; > pub use api::*; > + > +mod watchdog; > +pub use watchdog::*; > diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs > new file mode 100644 > index 00000000..f722be0b > --- /dev/null > +++ b/src/bin/proxmox_restore_daemon/watchdog.rs > @@ -0,0 +1,63 @@ > +//! SIGALRM/alarm(1) based watchdog that shuts down the VM if not pinged for TIMEOUT > +use anyhow::Error; > +use std::sync::atomic::{AtomicI64, Ordering}; > + > +use nix::sys::{reboot, signal::*}; > +use nix::unistd::alarm; > + > +const TIMEOUT: u32 = 600; // seconds > +static TRIGGERED: AtomicI64 = AtomicI64::new(0); > +static LAST_TRIGGERED: AtomicI64 = AtomicI64::new(0); > + > +/// Handler is called when alarm-watchdog expires, immediately shuts down VM when triggered > +extern "C" fn alarm_handler(_signal: nix::libc::c_int) { > + // use println! instead of log, since log might buffer and not print before shut down > + println!("Watchdog expired, shutting down VM..."); > + let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err(); > + println!("'reboot' syscall failed: {}", err); > + std::process::exit(1); > +} > + > +/// Initialize alarm() based watchdog > +pub fn watchdog_init() -> Result<(), Error> { > + unsafe { > + sigaction( > + Signal::SIGALRM, Please don't use this with async code. Threads, signal handlers and async are really annoying to keep track of. This is a perlism we really shouldn't continue to use. We only have a single semi-acceptable excuse for timing signals currently and that's file locks with timeouts, as those have no alternative (yet), which use the timer_create(2) api btw. and will hopefully at some point be replaced by io-uring... Please just spawn a future using tokio::time::sleep(watchdog_remaining()).await in a loop (and don't forget to initialize `TRIGGERD` to the current time before spawning it of course ;-) ). > + &SigAction::new( > + SigHandler::Handler(alarm_handler), > + SaFlags::empty(), > + SigSet::empty(), > + ), > + )?; > + } > + > + watchdog_ping(); > + > + Ok(()) > +} > + > +/// Trigger watchdog keepalive > +pub fn watchdog_ping() { > + alarm::set(TIMEOUT); ^ then this can just go > + let cur_time = proxmox::tools::time::epoch_i64(); > + let last = TRIGGERED.swap(cur_time, Ordering::SeqCst); > + LAST_TRIGGERED.store(last, Ordering::SeqCst); > +} > + > +/// Returns the remaining time before watchdog expiry in seconds if 'current' is true, otherwise it > +/// returns the remaining time before the last ping (which is probably what you want in the API, as > +/// from an API call 'current'=true will *always* return TIMEOUT) > +pub fn watchdog_remaining(current: bool) -> i64 { > + let cur_time = proxmox::tools::time::epoch_i64(); > + let last_time = (if current { &TRIGGERED } else { &LAST_TRIGGERED }).load(Ordering::SeqCst); > + TIMEOUT as i64 - (cur_time - last_time) > +} > + > +/// Undo the last watchdog ping and set timer back to previous state, call this in the API to fake > +/// a non-resetting call > +pub fn watchdog_undo_ping() { This still makes me cringe :-P > + let set = watchdog_remaining(false); > + TRIGGERED.store(LAST_TRIGGERED.load(Ordering::SeqCst), Ordering::SeqCst); > + // make sure argument cannot be 0, as that would cancel any alarm > + alarm::set(1.max(set) as u32); > +} > -- > 2.20.1