From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 25D5F6AE9E for ; Wed, 17 Feb 2021 12:15:12 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 1B78321273 for ; Wed, 17 Feb 2021 12:14:42 +0100 (CET) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [212.186.127.180]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS id 2C42821266 for ; Wed, 17 Feb 2021 12:14:41 +0100 (CET) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id E9053461C0 for ; Wed, 17 Feb 2021 12:14:40 +0100 (CET) To: Wolfgang Bumiller Cc: pbs-devel@lists.proxmox.com References: <20210216170710.31767-1-s.reiter@proxmox.com> <20210216170710.31767-17-s.reiter@proxmox.com> <20210217105253.wfyaxp5wnj57c5yq@olga.proxmox.com> From: Stefan Reiter Message-ID: <64949c97-b8b9-01e8-8ed8-9f6c02fca94b@proxmox.com> Date: Wed, 17 Feb 2021 12:14:39 +0100 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.7.1 MIME-Version: 1.0 In-Reply-To: <20210217105253.wfyaxp5wnj57c5yq@olga.proxmox.com> Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit X-SPAM-LEVEL: Spam detection results: 0 AWL -0.025 Adjusted score from AWL reputation of From: address KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment NICE_REPLY_A -0.001 Looks like a legit reply (A) RCVD_IN_DNSWL_MED -2.3 Sender listed at https://www.dnswl.org/, medium trust SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [watchdog.rs, api.rs, proxmox-restore-daemon.rs, mod.rs] Subject: Re: [pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 17 Feb 2021 11:15:12 -0000 On 17/02/2021 11:52, Wolfgang Bumiller wrote: > On Tue, Feb 16, 2021 at 06:07:04PM +0100, Stefan Reiter wrote: >> Add a watchdog that will automatically shut down the VM after 10 >> minutes, if no API call is received. >> >> This is handled using the unix 'alarm' syscall. >> >> Signed-off-by: Stefan Reiter >> --- >> src/api2/types/file_restore.rs | 3 ++ >> src/bin/proxmox-restore-daemon.rs | 5 ++ >> src/bin/proxmox_restore_daemon/api.rs | 22 ++++++-- >> src/bin/proxmox_restore_daemon/mod.rs | 3 ++ >> src/bin/proxmox_restore_daemon/watchdog.rs | 63 ++++++++++++++++++++++ >> 5 files changed, 91 insertions(+), 5 deletions(-) >> create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs >> >> diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs >> index cd8df16a..710c6d83 100644 >> --- a/src/api2/types/file_restore.rs >> +++ b/src/api2/types/file_restore.rs >> @@ -8,5 +8,8 @@ use proxmox::api::api; >> pub struct RestoreDaemonStatus { >> /// VM uptime in seconds >> pub uptime: i64, >> + /// time left until auto-shutdown, keep in mind that this is inaccurate when 'keep-timeout' is >> + /// not set, as then after the status call the timer will have reset >> + pub timeout: i64, >> } >> >> diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs >> index 1ec90794..d30da563 100644 >> --- a/src/bin/proxmox-restore-daemon.rs >> +++ b/src/bin/proxmox-restore-daemon.rs >> @@ -40,6 +40,9 @@ fn main() -> Result<(), Error> { >> .write_style(env_logger::WriteStyle::Never) >> .init(); >> >> + // start watchdog, failure is a critical error as it leads to a scenario where we never exit >> + watchdog_init()?; >> + >> proxmox_backup::tools::runtime::main(run()) >> } >> >> @@ -77,6 +80,8 @@ fn accept_vsock_connections( >> Ok(stream) => { >> if sender.send(Ok(stream)).await.is_err() { >> error!("connection accept channel was closed"); >> + } else { >> + watchdog_ping(); > > Should the ping not also happen at every api call in case connections > get reused? > I wanted to keep as much watchdog code out of API calls, lest some new code forgets to call a ping(), but yes, I didn't think of connection reuse (it doesn't currently happen anywhere, but still good to be safe). >> } >> } >> Err(err) => { >> diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs >> index 3c642aaf..8eb727df 100644 >> --- a/src/bin/proxmox_restore_daemon/api.rs >> +++ b/src/bin/proxmox_restore_daemon/api.rs >> @@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method; >> >> use proxmox_backup::api2::types::*; >> >> +use super::{watchdog_remaining, watchdog_undo_ping}; >> + >> // NOTE: All API endpoints must have Permission::World, as the configs for authentication do not >> // exist within the restore VM. Safety is guaranteed since we use a low port, so only root on the >> // host can contact us - and there the proxmox-backup-client validates permissions already. >> @@ -25,6 +27,16 @@ fn read_uptime() -> Result { >> } >> >> #[api( >> + input: { >> + properties: { >> + "keep-timeout": { >> + type: bool, >> + description: "If true, do not reset the watchdog timer on this API call.", >> + default: false, >> + optional: true, >> + }, >> + }, >> + }, >> access: { >> description: "Permissions are handled outside restore VM.", >> permission: &Permission::World, >> @@ -34,12 +46,12 @@ fn read_uptime() -> Result { >> } >> )] >> /// General status information >> -fn status( >> - _param: Value, >> - _info: &ApiMethod, >> - _rpcenv: &mut dyn RpcEnvironment, >> -) -> Result { >> +fn status(keep_timeout: bool) -> Result { >> + if keep_timeout { > > This seems just weird. Do we really need this? > Not necessarily, but the idea I had in mind was someone running a script of sorts that calls 'proxmox-file-restore status' (for monitoring etc...) that would otherwise prevent the VMs from ever stopping... >> + watchdog_undo_ping(); >> + } >> Ok(RestoreDaemonStatus { >> uptime: read_uptime()? as i64, >> + timeout: watchdog_remaining(false), >> }) >> } >> diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs >> index d938a5bb..6802d31c 100644 >> --- a/src/bin/proxmox_restore_daemon/mod.rs >> +++ b/src/bin/proxmox_restore_daemon/mod.rs >> @@ -1,3 +1,6 @@ >> ///! File restore VM related functionality >> mod api; >> pub use api::*; >> + >> +mod watchdog; >> +pub use watchdog::*; >> diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs >> new file mode 100644 >> index 00000000..f722be0b >> --- /dev/null >> +++ b/src/bin/proxmox_restore_daemon/watchdog.rs >> @@ -0,0 +1,63 @@ >> +//! SIGALRM/alarm(1) based watchdog that shuts down the VM if not pinged for TIMEOUT >> +use anyhow::Error; >> +use std::sync::atomic::{AtomicI64, Ordering}; >> + >> +use nix::sys::{reboot, signal::*}; >> +use nix::unistd::alarm; >> + >> +const TIMEOUT: u32 = 600; // seconds >> +static TRIGGERED: AtomicI64 = AtomicI64::new(0); >> +static LAST_TRIGGERED: AtomicI64 = AtomicI64::new(0); >> + >> +/// Handler is called when alarm-watchdog expires, immediately shuts down VM when triggered >> +extern "C" fn alarm_handler(_signal: nix::libc::c_int) { >> + // use println! instead of log, since log might buffer and not print before shut down >> + println!("Watchdog expired, shutting down VM..."); >> + let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err(); >> + println!("'reboot' syscall failed: {}", err); >> + std::process::exit(1); >> +} >> + >> +/// Initialize alarm() based watchdog >> +pub fn watchdog_init() -> Result<(), Error> { >> + unsafe { >> + sigaction( >> + Signal::SIGALRM, > > Please don't use this with async code. Threads, signal handlers and > async are really annoying to keep track of. This is a perlism we really > shouldn't continue to use. We only have a single semi-acceptable excuse > for timing signals currently and that's file locks with timeouts, as > those have no alternative (yet), which use the timer_create(2) > api btw. and will hopefully at some point be replaced by io-uring... > I went with alarm() on the assumption that it might be a bit more reliable (tokio scheduler can get stuck?), and even had the idea to use an actual QEMU watchdog (that had some other issues though, that I don't quite remember atm). > Please just spawn a future using > tokio::time::sleep(watchdog_remaining()).await > in a loop (and don't forget to initialize `TRIGGERD` to the current time > before spawning it of course ;-) ). > ...though the probability of a tokio hang is probably low enough that this will do just fine too - I'll change it in v2. >> + &SigAction::new( >> + SigHandler::Handler(alarm_handler), >> + SaFlags::empty(), >> + SigSet::empty(), >> + ), >> + )?; >> + } >> + >> + watchdog_ping(); >> + >> + Ok(()) >> +} >> + >> +/// Trigger watchdog keepalive >> +pub fn watchdog_ping() { >> + alarm::set(TIMEOUT); > > ^ then this can just go > >> + let cur_time = proxmox::tools::time::epoch_i64(); >> + let last = TRIGGERED.swap(cur_time, Ordering::SeqCst); >> + LAST_TRIGGERED.store(last, Ordering::SeqCst); >> +} >> + >> +/// Returns the remaining time before watchdog expiry in seconds if 'current' is true, otherwise it >> +/// returns the remaining time before the last ping (which is probably what you want in the API, as >> +/// from an API call 'current'=true will *always* return TIMEOUT) >> +pub fn watchdog_remaining(current: bool) -> i64 { >> + let cur_time = proxmox::tools::time::epoch_i64(); >> + let last_time = (if current { &TRIGGERED } else { &LAST_TRIGGERED }).load(Ordering::SeqCst); >> + TIMEOUT as i64 - (cur_time - last_time) >> +} >> + >> +/// Undo the last watchdog ping and set timer back to previous state, call this in the API to fake >> +/// a non-resetting call >> +pub fn watchdog_undo_ping() { > > This still makes me cringe :-P > >> + let set = watchdog_remaining(false); >> + TRIGGERED.store(LAST_TRIGGERED.load(Ordering::SeqCst), Ordering::SeqCst); >> + // make sure argument cannot be 0, as that would cancel any alarm >> + alarm::set(1.max(set) as u32); >> +} >> -- >> 2.20.1