all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Stefan Reiter <s.reiter@proxmox.com>
To: Wolfgang Bumiller <w.bumiller@proxmox.com>
Cc: pbs-devel@lists.proxmox.com
Subject: Re: [pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module
Date: Wed, 17 Feb 2021 12:14:39 +0100	[thread overview]
Message-ID: <64949c97-b8b9-01e8-8ed8-9f6c02fca94b@proxmox.com> (raw)
In-Reply-To: <20210217105253.wfyaxp5wnj57c5yq@olga.proxmox.com>

On 17/02/2021 11:52, Wolfgang Bumiller wrote:
> On Tue, Feb 16, 2021 at 06:07:04PM +0100, Stefan Reiter wrote:
>> Add a watchdog that will automatically shut down the VM after 10
>> minutes, if no API call is received.
>>
>> This is handled using the unix 'alarm' syscall.
>>
>> Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
>> ---
>>   src/api2/types/file_restore.rs             |  3 ++
>>   src/bin/proxmox-restore-daemon.rs          |  5 ++
>>   src/bin/proxmox_restore_daemon/api.rs      | 22 ++++++--
>>   src/bin/proxmox_restore_daemon/mod.rs      |  3 ++
>>   src/bin/proxmox_restore_daemon/watchdog.rs | 63 ++++++++++++++++++++++
>>   5 files changed, 91 insertions(+), 5 deletions(-)
>>   create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs
>>
>> diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs
>> index cd8df16a..710c6d83 100644
>> --- a/src/api2/types/file_restore.rs
>> +++ b/src/api2/types/file_restore.rs
>> @@ -8,5 +8,8 @@ use proxmox::api::api;
>>   pub struct RestoreDaemonStatus {
>>       /// VM uptime in seconds
>>       pub uptime: i64,
>> +    /// time left until auto-shutdown, keep in mind that this is inaccurate when 'keep-timeout' is
>> +    /// not set, as then after the status call the timer will have reset
>> +    pub timeout: i64,
>>   }
>>   
>> diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs
>> index 1ec90794..d30da563 100644
>> --- a/src/bin/proxmox-restore-daemon.rs
>> +++ b/src/bin/proxmox-restore-daemon.rs
>> @@ -40,6 +40,9 @@ fn main() -> Result<(), Error> {
>>           .write_style(env_logger::WriteStyle::Never)
>>           .init();
>>   
>> +    // start watchdog, failure is a critical error as it leads to a scenario where we never exit
>> +    watchdog_init()?;
>> +
>>       proxmox_backup::tools::runtime::main(run())
>>   }
>>   
>> @@ -77,6 +80,8 @@ fn accept_vsock_connections(
>>                   Ok(stream) => {
>>                       if sender.send(Ok(stream)).await.is_err() {
>>                           error!("connection accept channel was closed");
>> +                    } else {
>> +                        watchdog_ping();
> 
> Should the ping not also happen at every api call in case connections
> get reused?
> 

I wanted to keep as much watchdog code out of API calls, lest some new 
code forgets to call a ping(), but yes, I didn't think of connection 
reuse (it doesn't currently happen anywhere, but still good to be safe).

>>                       }
>>                   }
>>                   Err(err) => {
>> diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs
>> index 3c642aaf..8eb727df 100644
>> --- a/src/bin/proxmox_restore_daemon/api.rs
>> +++ b/src/bin/proxmox_restore_daemon/api.rs
>> @@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
>>   
>>   use proxmox_backup::api2::types::*;
>>   
>> +use super::{watchdog_remaining, watchdog_undo_ping};
>> +
>>   // NOTE: All API endpoints must have Permission::World, as the configs for authentication do not
>>   // exist within the restore VM. Safety is guaranteed since we use a low port, so only root on the
>>   // host can contact us - and there the proxmox-backup-client validates permissions already.
>> @@ -25,6 +27,16 @@ fn read_uptime() -> Result<f32, Error> {
>>   }
>>   
>>   #[api(
>> +    input: {
>> +        properties: {
>> +            "keep-timeout": {
>> +                type: bool,
>> +                description: "If true, do not reset the watchdog timer on this API call.",
>> +                default: false,
>> +                optional: true,
>> +            },
>> +        },
>> +    },
>>       access: {
>>           description: "Permissions are handled outside restore VM.",
>>           permission: &Permission::World,
>> @@ -34,12 +46,12 @@ fn read_uptime() -> Result<f32, Error> {
>>       }
>>   )]
>>   /// General status information
>> -fn status(
>> -    _param: Value,
>> -    _info: &ApiMethod,
>> -    _rpcenv: &mut dyn RpcEnvironment,
>> -) -> Result<RestoreDaemonStatus, Error> {
>> +fn status(keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
>> +    if keep_timeout {
> 
> This seems just weird. Do we really need this?
> 

Not necessarily, but the idea I had in mind was someone running a script 
of sorts that calls 'proxmox-file-restore status' (for monitoring 
etc...) that would otherwise prevent the VMs from ever stopping...

>> +        watchdog_undo_ping();
>> +    }
>>       Ok(RestoreDaemonStatus {
>>           uptime: read_uptime()? as i64,
>> +        timeout: watchdog_remaining(false),
>>       })
>>   }
>> diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs
>> index d938a5bb..6802d31c 100644
>> --- a/src/bin/proxmox_restore_daemon/mod.rs
>> +++ b/src/bin/proxmox_restore_daemon/mod.rs
>> @@ -1,3 +1,6 @@
>>   ///! File restore VM related functionality
>>   mod api;
>>   pub use api::*;
>> +
>> +mod watchdog;
>> +pub use watchdog::*;
>> diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs
>> new file mode 100644
>> index 00000000..f722be0b
>> --- /dev/null
>> +++ b/src/bin/proxmox_restore_daemon/watchdog.rs
>> @@ -0,0 +1,63 @@
>> +//! SIGALRM/alarm(1) based watchdog that shuts down the VM if not pinged for TIMEOUT
>> +use anyhow::Error;
>> +use std::sync::atomic::{AtomicI64, Ordering};
>> +
>> +use nix::sys::{reboot, signal::*};
>> +use nix::unistd::alarm;
>> +
>> +const TIMEOUT: u32 = 600; // seconds
>> +static TRIGGERED: AtomicI64 = AtomicI64::new(0);
>> +static LAST_TRIGGERED: AtomicI64 = AtomicI64::new(0);
>> +
>> +/// Handler is called when alarm-watchdog expires, immediately shuts down VM when triggered
>> +extern "C" fn alarm_handler(_signal: nix::libc::c_int) {
>> +    // use println! instead of log, since log might buffer and not print before shut down
>> +    println!("Watchdog expired, shutting down VM...");
>> +    let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
>> +    println!("'reboot' syscall failed: {}", err);
>> +    std::process::exit(1);
>> +}
>> +
>> +/// Initialize alarm() based watchdog
>> +pub fn watchdog_init() -> Result<(), Error> {
>> +    unsafe {
>> +        sigaction(
>> +            Signal::SIGALRM,
> 
> Please don't use this with async code. Threads, signal handlers and
> async are really annoying to keep track of.  This is a perlism we really
> shouldn't continue to use. We only have a single semi-acceptable excuse
> for timing signals currently and that's file locks with timeouts, as
> those have no alternative (yet), which use the timer_create(2)
> api btw. and will hopefully at some point be replaced by io-uring...
> 

I went with alarm() on the assumption that it might be a bit more 
reliable (tokio scheduler can get stuck?), and even had the idea to use 
an actual QEMU watchdog (that had some other issues though, that I don't 
quite remember atm).

> Please just spawn a future using
>      tokio::time::sleep(watchdog_remaining()).await
> in a loop (and don't forget to initialize `TRIGGERD` to the current time
> before spawning it of course ;-) ).
> 

...though the probability of a tokio hang is probably low enough that 
this will do just fine too - I'll change it in v2.

>> +            &SigAction::new(
>> +                SigHandler::Handler(alarm_handler),
>> +                SaFlags::empty(),
>> +                SigSet::empty(),
>> +            ),
>> +        )?;
>> +    }
>> +
>> +    watchdog_ping();
>> +
>> +    Ok(())
>> +}
>> +
>> +/// Trigger watchdog keepalive
>> +pub fn watchdog_ping() {
>> +    alarm::set(TIMEOUT);
> 
> ^ then this can just go
> 
>> +    let cur_time = proxmox::tools::time::epoch_i64();
>> +    let last = TRIGGERED.swap(cur_time, Ordering::SeqCst);
>> +    LAST_TRIGGERED.store(last, Ordering::SeqCst);
>> +}
>> +
>> +/// Returns the remaining time before watchdog expiry in seconds if 'current' is true, otherwise it
>> +/// returns the remaining time before the last ping (which is probably what you want in the API, as
>> +/// from an API call 'current'=true will *always* return TIMEOUT)
>> +pub fn watchdog_remaining(current: bool) -> i64 {
>> +    let cur_time = proxmox::tools::time::epoch_i64();
>> +    let last_time = (if current { &TRIGGERED } else { &LAST_TRIGGERED }).load(Ordering::SeqCst);
>> +    TIMEOUT as i64 - (cur_time - last_time)
>> +}
>> +
>> +/// Undo the last watchdog ping and set timer back to previous state, call this in the API to fake
>> +/// a non-resetting call
>> +pub fn watchdog_undo_ping() {
> 
> This still makes me cringe :-P
> 
>> +    let set = watchdog_remaining(false);
>> +    TRIGGERED.store(LAST_TRIGGERED.load(Ordering::SeqCst), Ordering::SeqCst);
>> +    // make sure argument cannot be 0, as that would cancel any alarm
>> +    alarm::set(1.max(set) as u32);
>> +}
>> -- 
>> 2.20.1




  reply	other threads:[~2021-02-17 11:15 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-16 17:06 [pbs-devel] [PATCH 00/22] Single file restore for VM images Stefan Reiter
2021-02-16 17:06 ` [pbs-devel] [PATCH pxar 01/22] decoder/aio: add contents() and content_size() calls Stefan Reiter
2021-02-17  7:56   ` Wolfgang Bumiller
2021-02-16 17:06 ` [pbs-devel] [PATCH pxar 02/22] decoder: add peek() Stefan Reiter
2021-02-17  8:20   ` Wolfgang Bumiller
2021-02-17  8:38     ` Stefan Reiter
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-restore-vm-data 03/22] initial commit Stefan Reiter
2021-03-15 18:35   ` [pbs-devel] applied: " Thomas Lamprecht
2021-03-16 15:33     ` Stefan Reiter
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 04/22] api2/admin/datastore: refactor list_dir_content in catalog_reader Stefan Reiter
2021-02-17  7:50   ` [pbs-devel] applied: " Thomas Lamprecht
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 05/22] api2/admin/datastore: accept "/" as path for root Stefan Reiter
2021-02-17  7:50   ` [pbs-devel] applied: " Thomas Lamprecht
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 06/22] api2/admin/datastore: refactor create_zip into pxar/extract Stefan Reiter
2021-02-17  7:50   ` [pbs-devel] applied: " Thomas Lamprecht
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 07/22] pxar/extract: add extract_sub_dir Stefan Reiter
2021-02-17  7:51   ` [pbs-devel] applied: " Thomas Lamprecht
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 08/22] pxar/extract: add sequential variants to create_zip, extract_sub_dir Stefan Reiter
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 09/22] client: extract common functions to proxmox_client_tools module Stefan Reiter
2021-02-17  6:49   ` Dietmar Maurer
2021-02-17  7:58     ` Stefan Reiter
2021-02-17  8:50       ` Dietmar Maurer
2021-02-17  9:47         ` Stefan Reiter
2021-02-17 10:12           ` Dietmar Maurer
2021-02-17  9:13   ` [pbs-devel] applied: " Dietmar Maurer
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 10/22] proxmox_client_tools: extract 'key' from client module Stefan Reiter
2021-02-17  9:11   ` Dietmar Maurer
2021-02-16 17:06 ` [pbs-devel] [PATCH proxmox-backup 11/22] file-restore: add binary and basic commands Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 12/22] file-restore: allow specifying output-format Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 13/22] rest: implement tower service for UnixStream Stefan Reiter
2021-02-17  6:52   ` [pbs-devel] applied: " Dietmar Maurer
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 14/22] client: add VsockClient to connect to virtio-vsock VMs Stefan Reiter
2021-02-17  7:24   ` [pbs-devel] applied: " Dietmar Maurer
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 15/22] file-restore-daemon: add binary with virtio-vsock API server Stefan Reiter
2021-02-17 10:17   ` Dietmar Maurer
2021-02-17 10:25   ` Dietmar Maurer
2021-02-17 10:30     ` Stefan Reiter
2021-02-17 11:13       ` Dietmar Maurer
2021-02-17 11:26   ` Dietmar Maurer
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module Stefan Reiter
2021-02-17 10:52   ` Wolfgang Bumiller
2021-02-17 11:14     ` Stefan Reiter [this message]
2021-02-17 11:29       ` Wolfgang Bumiller
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 17/22] file-restore-daemon: add disk module Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 18/22] file-restore: add basic VM/block device support Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 19/22] file-restore: improve logging of VM with logrotate Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 20/22] debian/client: add postinst hook to rebuild file-restore initramfs Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 21/22] file-restore(-daemon): implement list API Stefan Reiter
2021-02-16 17:07 ` [pbs-devel] [PATCH proxmox-backup 22/22] file-restore: add 'extract' command for VM file restore Stefan Reiter
2021-02-16 17:11 ` [pbs-devel] [PATCH 00/22] Single file restore for VM images Stefan Reiter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=64949c97-b8b9-01e8-8ed8-9f6c02fca94b@proxmox.com \
    --to=s.reiter@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    --cc=w.bumiller@proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal