From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <s.reiter@proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id 100F2670AE
 for <pbs-devel@lists.proxmox.com>; Wed, 29 Jul 2020 14:33:24 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id 9FC24E9E4
 for <pbs-devel@lists.proxmox.com>; Wed, 29 Jul 2020 14:33:23 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com
 [212.186.127.180])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS id 95AEEE996
 for <pbs-devel@lists.proxmox.com>; Wed, 29 Jul 2020 14:33:21 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1])
 by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 563C5433D5
 for <pbs-devel@lists.proxmox.com>; Wed, 29 Jul 2020 14:33:21 +0200 (CEST)
From: Stefan Reiter <s.reiter@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Wed, 29 Jul 2020 14:33:13 +0200
Message-Id: <20200729123314.10049-5-s.reiter@proxmox.com>
X-Mailer: git-send-email 2.20.1
In-Reply-To: <20200729123314.10049-1-s.reiter@proxmox.com>
References: <20200729123314.10049-1-s.reiter@proxmox.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
 AWL -0.043 Adjusted score from AWL reputation of From: address
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 RCVD_IN_DNSWL_MED        -2.3 Sender listed at https://www.dnswl.org/,
 medium trust
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
 URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See
 http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more
 information. [backup.rs]
Subject: [pbs-devel] [PATCH proxmox-backup 4/5] backup: use flock on backup
 group to forbid multiple backups at once
X-BeenThere: pbs-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Backup Server development discussion
 <pbs-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pbs-devel/>
List-Post: <mailto:pbs-devel@lists.proxmox.com>
List-Help: <mailto:pbs-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Wed, 29 Jul 2020 12:33:24 -0000

Multiple simultaneous backups within one backup group don't really make
sense and break several guarantees: a second backup started while a first
is still running would use the "known-chunks" list from that unfinished
backup, which would be empty. Using the list from the last finished backup
is not a fix either, as that snapshot could be deleted or pruned once the
first of the simultaneous backups finishes.

Fix this by allowing only one backup per backup group at a time. This is
done via a flock on the backup group directory; since the lock is held on
the open directory handle, it stays intact even across a reload.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
 src/api2/backup.rs        | 11 ++++++----
 src/backup/backup_info.rs | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
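
For reference, a minimal standalone sketch of the locking approach described
in the commit message above: a non-blocking exclusive flock taken on the
group directory and held by an RAII guard. The names (GroupLockGuard,
lock_group_dir) and the direct use of libc::flock are illustrative only;
the patch itself goes through nix::dir::Dir and the existing
tools::lock_file helper.

// Illustrative sketch only -- not the helpers used in the patch. It shows
// the idea: open the backup group directory, take a non-blocking exclusive
// flock on it, and keep the handle alive as an RAII guard.
use std::fs::File;
use std::os::unix::io::AsRawFd;
use std::path::Path;

/// Holds the open directory; the kernel drops the flock automatically
/// when the last descriptor is closed, i.e. when this guard is dropped.
pub struct GroupLockGuard {
    _dir: File,
}

pub fn lock_group_dir(path: &Path) -> std::io::Result<GroupLockGuard> {
    // A directory can be opened read-only and used as a flock target.
    let dir = File::open(path)?;

    // LOCK_EX | LOCK_NB: fail immediately instead of waiting, since another
    // backup holding the lock could run for a very long time.
    let rc = unsafe { libc::flock(dir.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) };
    if rc != 0 {
        return Err(std::io::Error::last_os_error());
    }

    Ok(GroupLockGuard { _dir: dir })
}

// Usage (hypothetical path):
//   let _guard = lock_group_dir(Path::new("/datastore/vm/100"))?;
//   ... keep the guard alive until the backup task ends ...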

diff --git a/src/api2/backup.rs b/src/api2/backup.rs
index 05978bf2..621e8c07 100644
--- a/src/api2/backup.rs
+++ b/src/api2/backup.rs
@@ -95,17 +95,17 @@ async move {
     }
 
     let last_backup = BackupInfo::last_backup(&datastore.base_path(), &backup_group).unwrap_or(None);
-    let backup_dir = BackupDir::new_with_group(backup_group, backup_time);
+    let backup_dir = BackupDir::new_with_group(backup_group.clone(), backup_time);
 
     if let Some(last) = &last_backup {
         if backup_dir.backup_time() <= last.backup_dir.backup_time() {
             bail!("backup timestamp is older than last backup.");
         }
-        // fixme: abort if last backup is still running - howto test?
-        // Idea: write upid into a file inside snapshot dir. then test if
-        // it is still running here.
     }
 
+    // lock backup group to only allow one backup per group at a time
+    let _group_guard = backup_group.lock(&datastore.base_path())?;
+
     let (path, is_new) = datastore.create_backup_dir(&backup_dir)?;
     if !is_new { bail!("backup directory already exists."); }
 
@@ -144,6 +144,9 @@ async move {
             .map(|_| Err(format_err!("task aborted")));
 
         async move {
+            // keep flock until task ends
+            let _group_guard = _group_guard;
+
             let res = select!{
                 req = req_fut => req,
                 abrt = abort_future => abrt,
diff --git a/src/backup/backup_info.rs b/src/backup/backup_info.rs
index b4f671bd..041f5785 100644
--- a/src/backup/backup_info.rs
+++ b/src/backup/backup_info.rs
@@ -3,7 +3,9 @@ use crate::tools;
 use anyhow::{bail, format_err, Error};
 use regex::Regex;
 use std::os::unix::io::RawFd;
+use nix::dir::Dir;
 
+use std::time::Duration;
 use chrono::{DateTime, TimeZone, SecondsFormat, Utc};
 
 use std::path::{PathBuf, Path};
@@ -36,6 +38,9 @@ lazy_static!{
 
 }
 
+/// Opaque type releasing the corresponding flock when dropped
+pub type BackupGroupGuard = Dir;
+
 /// BackupGroup is a directory containing a list of BackupDir
 #[derive(Debug, Eq, PartialEq, Hash, Clone)]
 pub struct BackupGroup {
@@ -130,6 +135,45 @@ impl BackupGroup {
         Ok(last)
     }
 
+    pub fn lock(&self, base_path: &Path) -> Result<BackupGroupGuard, Error> {
+        use nix::fcntl::OFlag;
+        use nix::sys::stat::Mode;
+
+        let mut path = base_path.to_owned();
+        path.push(self.group_path());
+
+        let mut handle = Dir::open(&path, OFlag::O_RDONLY, Mode::empty())
+            .map_err(|err| {
+                format_err!(
+                    "unable to open backup group directory {:?} for locking - {}",
+                    self.group_path(),
+                    err,
+                )
+            })?;
+
+        // acquire in non-blocking mode, no point in waiting here since other
+        // backups could still take a very long time
+        tools::lock_file(&mut handle, true, Some(Duration::from_nanos(0)))
+            .map_err(|err| {
+                match err.downcast_ref::<nix::Error>() {
+                    Some(nix::Error::Sys(nix::errno::Errno::EAGAIN)) => {
+                        return format_err!(
+                            "unable to acquire lock on backup group {:?} - another backup is already running",
+                            self.group_path(),
+                        );
+                    },
+                    _ => ()
+                }
+                format_err!(
+                    "unable to acquire lock on backup group {:?} - {}",
+                    self.group_path(),
+                    err,
+                )
+            })?;
+
+        Ok(handle)
+    }
+
     pub fn list_groups(base_path: &Path) -> Result<Vec<BackupGroup>, Error> {
         let mut list = Vec::new();
 
-- 
2.20.1