From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Wed, 1 Oct 2025 13:22:51 +0200
Message-ID: <20251001112251.3788-1-c.ebner@proxmox.com>
Subject: [pbs-devel] [PATCH proxmox-backup stable-3] fix #6566: backup: api: conditionally drop group and snapshot locks

To guarantee consistency in the presence of concurrent operations, the
backup protocol locks the backup group and the previous backup snapshot
(if any), and holds a lock for the newly created backup snapshot. All of
these locks are currently stored in the backup worker task and only
released on its destruction.

The backup API, however, signals a successful backup via the return
status of the `finish` call while still holding the locks. An immediate
subsequent backup by the same client to the same group can therefore
fail, because the locks cannot be acquired until the previous backup
task is fully destroyed, which can outlive the `finish` return for some
time. This manifests, for example, in a failing push sync job.

To fix this, store the lock guards in the RPC environment's shared state
instead, which allows the locks to be dropped selectively once the
backup finishes successfully. On error, keep holding the locks until the
cleanup has succeeded.

Immediate verification of new snapshots previously downgraded the lock
by dropping the exclusive lock and acquiring a shared one. Since the
exclusive lock is now already dropped by the finish call, only the
shared lock needs to be acquired. While this opens a larger time window
for concurrent prunes, the underlying possible race between verification
and prune remains in place.
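For illustration, a minimal, self-contained sketch of the guard-handling
pattern described above (the names `Guard`, `LockGuards` and `Env::finish`
are placeholders, not the actual proxmox-backup types): guards stored as
`Option`s can be released early via `Option::take()` on success, while the
error path keeps them alive until the environment itself is dropped during
cleanup.

// Stand-in for a lock guard (e.g. a flock): the lock is held while this value lives.
struct Guard(&'static str);

impl Drop for Guard {
    fn drop(&mut self) {
        println!("released {}", self.0);
    }
}

// Guards stored as Options so they can be released selectively via take().
struct LockGuards {
    previous_snapshot: Option<Guard>,
    group: Option<Guard>,
    snapshot: Option<Guard>,
}

struct Env {
    guards: LockGuards,
}

impl Env {
    // On success, drop the guards here so a follow-up backup can lock the
    // group right away; on error, return early and keep holding them until
    // the environment itself is dropped after cleanup.
    fn finish(&mut self, ok: bool) -> Result<(), &'static str> {
        if !ok {
            return Err("backup failed, keeping locks for cleanup");
        }
        self.guards.previous_snapshot.take(); // take() moves the guard out; dropping it releases the lock
        self.guards.snapshot.take();
        self.guards.group.take();
        Ok(())
    }
}

fn main() {
    let mut env = Env {
        guards: LockGuards {
            previous_snapshot: Some(Guard("previous snapshot lock")),
            group: Some(Guard("group lock")),
            snapshot: Some(Guard("snapshot lock")),
        },
    };
    env.finish(true).unwrap();
    println!("finish returned; a subsequent backup could lock the group now");
}

In the patch below, the same idea is applied to the group, previous-snapshot
and snapshot lock guards held in the backup RPC environment.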
Backported from https://git.proxmox.com/?p=proxmox-backup.git;a=commit;h=b1ece6c70c7785191321525576ed6f53e9c4bc18

Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=6566
Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
 src/api2/backup/environment.rs | 39 +++++++++++++++++++++++++++++-----
 src/api2/backup/mod.rs         | 18 +++++++++-------
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index 3d541b461..c4251c008 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -85,6 +85,27 @@ struct SharedBackupState {
     known_chunks: KnownChunksMap,
     backup_size: u64, // sums up size of all files
     backup_stat: UploadStatistic,
+    backup_lock_guards: BackupLockGuards,
+}
+
+pub struct BackupLockGuards {
+    previous_snapshot: Option<BackupLockGuard>,
+    group: Option<BackupLockGuard>,
+    snapshot: Option<BackupLockGuard>,
+}
+
+impl BackupLockGuards {
+    pub(crate) fn new(
+        previous_snapshot: Option<BackupLockGuard>,
+        group: BackupLockGuard,
+        snapshot: BackupLockGuard,
+    ) -> Self {
+        Self {
+            previous_snapshot,
+            group: Some(group),
+            snapshot: Some(snapshot),
+        }
+    }
 }
 
 impl SharedBackupState {
@@ -125,6 +146,7 @@ impl BackupEnvironment {
         worker: Arc<WorkerTask>,
         datastore: Arc<DataStore>,
         backup_dir: BackupDir,
+        backup_lock_guards: BackupLockGuards,
     ) -> Self {
         let state = SharedBackupState {
             finished: false,
@@ -135,6 +157,7 @@ impl BackupEnvironment {
             known_chunks: HashMap::new(),
             backup_size: 0,
             backup_stat: UploadStatistic::new(),
+            backup_lock_guards,
         };
 
         Self {
@@ -607,6 +630,9 @@ impl BackupEnvironment {
             bail!("backup does not contain valid files (file count == 0)");
         }
 
+        // drop previous snapshot lock
+        state.backup_lock_guards.previous_snapshot.take();
+
         // check for valid manifest and store stats
         let stats = serde_json::to_value(state.backup_stat)?;
         self.backup_dir
@@ -630,13 +656,17 @@ impl BackupEnvironment {
         // marks the backup as successful
         state.finished = true;
 
+        // drop snapshot and group lock only here so any error above will lead to
+        // the locks still being held in the env for the backup cleanup.
+        state.backup_lock_guards.snapshot.take();
+        state.backup_lock_guards.group.take();
+
         Ok(())
     }
 
     /// If verify-new is set on the datastore, this will run a new verify task
-    /// for the backup. If not, this will return and also drop the passed lock
-    /// immediately.
-    pub fn verify_after_complete(&self, excl_snap_lock: BackupLockGuard) -> Result<(), Error> {
+    /// for the backup. If not, this will return.
+    pub fn verify_after_complete(&self) -> Result<(), Error> {
         self.ensure_finished()?;
 
         if !self.datastore.verify_new() {
@@ -644,8 +674,7 @@ impl BackupEnvironment {
             return Ok(());
         }
 
-        // Downgrade to shared lock, the backup itself is finished
-        drop(excl_snap_lock);
+        // Get shared lock, the backup itself is finished
         let snap_lock = self.backup_dir.lock_shared().with_context(|| {
             format!(
                 "while trying to verify snapshot '{:?}' after completion",
diff --git a/src/api2/backup/mod.rs b/src/api2/backup/mod.rs
index 629df933e..2f3fa7527 100644
--- a/src/api2/backup/mod.rs
+++ b/src/api2/backup/mod.rs
@@ -140,7 +140,7 @@ fn upgrade_to_backup_protocol(
     };
 
     // lock backup group to only allow one backup per group at a time
-    let (owner, _group_guard) = datastore.create_locked_backup_group(
+    let (owner, group_guard) = datastore.create_locked_backup_group(
         backup_group.backup_ns(),
         backup_group.as_ref(),
         &auth_id,
@@ -179,7 +179,7 @@ fn upgrade_to_backup_protocol(
 
     let backup_dir = backup_group.backup_dir(backup_dir_arg.time)?;
 
-    let _last_guard = if let Some(last) = &last_backup {
+    let last_guard = if let Some(last) = &last_backup {
         if backup_dir.backup_time() <= last.backup_dir.backup_time() {
             bail!("backup timestamp is older than last backup.");
         }
@@ -205,12 +205,19 @@ fn upgrade_to_backup_protocol(
         auth_id.to_string(),
         true,
         move |worker| {
+            // Keep flock for the backup runtime by storing guards in backup env shared state.
+            // Drop them on successful backup finish or when dropping the env after cleanup in
+            // case of errors. The former is required for immediate subsequent backups (e.g.
+            // during a push sync) to be able to lock the group and snapshots.
+            let backup_lock_guards = BackupLockGuards::new(last_guard, group_guard, snap_guard);
+
             let mut env = BackupEnvironment::new(
                 env_type,
                 auth_id,
                 worker.clone(),
                 datastore,
                 backup_dir,
+                backup_lock_guards,
             );
 
             env.debug = debug;
@@ -264,11 +271,6 @@ fn upgrade_to_backup_protocol(
             let mut abort_future = abort_future.map(|_| Err(format_err!("task aborted")));
 
             async move {
-                // keep flock until task ends
-                let _group_guard = _group_guard;
-                let snap_guard = snap_guard;
-                let _last_guard = _last_guard;
-
                 let res = select! {
                     req = req_fut => req,
                     abrt = abort_future => abrt,
                 };
@@ -280,7 +282,7 @@ fn upgrade_to_backup_protocol(
                 }
 
                 let verify = |env: BackupEnvironment| {
-                    if let Err(err) = env.verify_after_complete(snap_guard) {
+                    if let Err(err) = env.verify_after_complete() {
                         env.log(format!(
                             "backup finished, but starting the requested verify task failed: {}",
                             err
-- 
2.39.5


_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel