From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 7C9E36A47E for ; Tue, 16 Feb 2021 18:07:42 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 6420B190A5 for ; Tue, 16 Feb 2021 18:07:41 +0100 (CET) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [212.186.127.180]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS id 8AD4B18E1A for ; Tue, 16 Feb 2021 18:07:32 +0100 (CET) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 568A5461DF for ; Tue, 16 Feb 2021 18:07:32 +0100 (CET) From: Stefan Reiter To: pbs-devel@lists.proxmox.com Date: Tue, 16 Feb 2021 18:07:04 +0100 Message-Id: <20210216170710.31767-17-s.reiter@proxmox.com> X-Mailer: git-send-email 2.20.1 In-Reply-To: <20210216170710.31767-1-s.reiter@proxmox.com> References: <20210216170710.31767-1-s.reiter@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL -0.029 Adjusted score from AWL reputation of From: address KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment RCVD_IN_DNSWL_MED -2.3 Sender listed at https://www.dnswl.org/, medium trust SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [api.rs, proxmox-restore-daemon.rs, mod.rs, watchdog.rs] Subject: [pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 16 Feb 2021 17:07:42 -0000 Add a watchdog that will automatically shut down the VM after 10 minutes, if no API call is received. This is handled using the unix 'alarm' syscall. Signed-off-by: Stefan Reiter --- src/api2/types/file_restore.rs | 3 ++ src/bin/proxmox-restore-daemon.rs | 5 ++ src/bin/proxmox_restore_daemon/api.rs | 22 ++++++-- src/bin/proxmox_restore_daemon/mod.rs | 3 ++ src/bin/proxmox_restore_daemon/watchdog.rs | 63 ++++++++++++++++++++++ 5 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs index cd8df16a..710c6d83 100644 --- a/src/api2/types/file_restore.rs +++ b/src/api2/types/file_restore.rs @@ -8,5 +8,8 @@ use proxmox::api::api; pub struct RestoreDaemonStatus { /// VM uptime in seconds pub uptime: i64, + /// time left until auto-shutdown, keep in mind that this is inaccurate when 'keep-timeout' is + /// not set, as then after the status call the timer will have reset + pub timeout: i64, } diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs index 1ec90794..d30da563 100644 --- a/src/bin/proxmox-restore-daemon.rs +++ b/src/bin/proxmox-restore-daemon.rs @@ -40,6 +40,9 @@ fn main() -> Result<(), Error> { .write_style(env_logger::WriteStyle::Never) .init(); + // start watchdog, failure is a critical error as it leads to a scenario where we never exit + watchdog_init()?; + proxmox_backup::tools::runtime::main(run()) } @@ -77,6 +80,8 @@ fn accept_vsock_connections( Ok(stream) => { if sender.send(Ok(stream)).await.is_err() { error!("connection accept channel was closed"); + } else { + watchdog_ping(); } } Err(err) => { diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs index 3c642aaf..8eb727df 100644 --- a/src/bin/proxmox_restore_daemon/api.rs +++ b/src/bin/proxmox_restore_daemon/api.rs @@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method; use proxmox_backup::api2::types::*; +use super::{watchdog_remaining, watchdog_undo_ping}; + // NOTE: All API endpoints must have Permission::World, as the configs for authentication do not // exist within the restore VM. Safety is guaranteed since we use a low port, so only root on the // host can contact us - and there the proxmox-backup-client validates permissions already. @@ -25,6 +27,16 @@ fn read_uptime() -> Result { } #[api( + input: { + properties: { + "keep-timeout": { + type: bool, + description: "If true, do not reset the watchdog timer on this API call.", + default: false, + optional: true, + }, + }, + }, access: { description: "Permissions are handled outside restore VM.", permission: &Permission::World, @@ -34,12 +46,12 @@ fn read_uptime() -> Result { } )] /// General status information -fn status( - _param: Value, - _info: &ApiMethod, - _rpcenv: &mut dyn RpcEnvironment, -) -> Result { +fn status(keep_timeout: bool) -> Result { + if keep_timeout { + watchdog_undo_ping(); + } Ok(RestoreDaemonStatus { uptime: read_uptime()? as i64, + timeout: watchdog_remaining(false), }) } diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs index d938a5bb..6802d31c 100644 --- a/src/bin/proxmox_restore_daemon/mod.rs +++ b/src/bin/proxmox_restore_daemon/mod.rs @@ -1,3 +1,6 @@ ///! File restore VM related functionality mod api; pub use api::*; + +mod watchdog; +pub use watchdog::*; diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs new file mode 100644 index 00000000..f722be0b --- /dev/null +++ b/src/bin/proxmox_restore_daemon/watchdog.rs @@ -0,0 +1,63 @@ +//! SIGALRM/alarm(1) based watchdog that shuts down the VM if not pinged for TIMEOUT +use anyhow::Error; +use std::sync::atomic::{AtomicI64, Ordering}; + +use nix::sys::{reboot, signal::*}; +use nix::unistd::alarm; + +const TIMEOUT: u32 = 600; // seconds +static TRIGGERED: AtomicI64 = AtomicI64::new(0); +static LAST_TRIGGERED: AtomicI64 = AtomicI64::new(0); + +/// Handler is called when alarm-watchdog expires, immediately shuts down VM when triggered +extern "C" fn alarm_handler(_signal: nix::libc::c_int) { + // use println! instead of log, since log might buffer and not print before shut down + println!("Watchdog expired, shutting down VM..."); + let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err(); + println!("'reboot' syscall failed: {}", err); + std::process::exit(1); +} + +/// Initialize alarm() based watchdog +pub fn watchdog_init() -> Result<(), Error> { + unsafe { + sigaction( + Signal::SIGALRM, + &SigAction::new( + SigHandler::Handler(alarm_handler), + SaFlags::empty(), + SigSet::empty(), + ), + )?; + } + + watchdog_ping(); + + Ok(()) +} + +/// Trigger watchdog keepalive +pub fn watchdog_ping() { + alarm::set(TIMEOUT); + let cur_time = proxmox::tools::time::epoch_i64(); + let last = TRIGGERED.swap(cur_time, Ordering::SeqCst); + LAST_TRIGGERED.store(last, Ordering::SeqCst); +} + +/// Returns the remaining time before watchdog expiry in seconds if 'current' is true, otherwise it +/// returns the remaining time before the last ping (which is probably what you want in the API, as +/// from an API call 'current'=true will *always* return TIMEOUT) +pub fn watchdog_remaining(current: bool) -> i64 { + let cur_time = proxmox::tools::time::epoch_i64(); + let last_time = (if current { &TRIGGERED } else { &LAST_TRIGGERED }).load(Ordering::SeqCst); + TIMEOUT as i64 - (cur_time - last_time) +} + +/// Undo the last watchdog ping and set timer back to previous state, call this in the API to fake +/// a non-resetting call +pub fn watchdog_undo_ping() { + let set = watchdog_remaining(false); + TRIGGERED.store(LAST_TRIGGERED.load(Ordering::SeqCst), Ordering::SeqCst); + // make sure argument cannot be 0, as that would cancel any alarm + alarm::set(1.max(set) as u32); +} -- 2.20.1