From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id AB4AE1FF165 for ; Thu, 25 Sep 2025 14:41:23 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 5256D20B1B; Thu, 25 Sep 2025 14:41:55 +0200 (CEST) Date: Thu, 25 Sep 2025 14:41:19 +0200 From: Fabian =?iso-8859-1?q?Gr=FCnbichler?= To: Proxmox Backup Server development discussion References: <20250924145612.188579-1-c.ebner@proxmox.com> <20250924145612.188579-2-c.ebner@proxmox.com> In-Reply-To: <20250924145612.188579-2-c.ebner@proxmox.com> MIME-Version: 1.0 User-Agent: astroid/0.17.0 (https://github.com/astroidmail/astroid) Message-Id: <1758803678.fo1e90c9lf.astroid@yuna.none> X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1758804068007 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.047 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment KAM_SHORT 0.001 Use of a URL Shortener for very short URL RCVD_IN_VALIDITY_CERTIFIED_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_RPBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_SAFE_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: Re: [pbs-devel] [PATCH proxmox-backup 1/2] fix #6750: api: avoid possible deadlock on datastores with s3 backend X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: Proxmox Backup Server development discussion Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: pbs-devel-bounces@lists.proxmox.com Sender: "pbs-devel" On September 24, 2025 4:56 pm, Christian Ebner wrote: > Closing of the fixed or dynamic index files with s3 backend will call > async code, which must be avoided because of possible deadlocks [0]. > Therefore, perform all changes on the shared backup state and drop the > guard before uploading the fixed index file to the s3 backend. > > Account for active backend operations and check consistency, since it > must be assured that all active backend operations are finished before > the finish call can succeed. > > [0] https://docs.rs/tokio/latest/tokio/sync/struct.Mutex.html#which-kind-of-mutex-should-you-use > > Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=6750 > Signed-off-by: Christian Ebner > --- > src/api2/backup/environment.rs | 77 +++++++++++++++++++++++----------- > 1 file changed, 53 insertions(+), 24 deletions(-) > > diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs > index d5e6869cd..e535891a4 100644 > --- a/src/api2/backup/environment.rs > +++ b/src/api2/backup/environment.rs > @@ -82,6 +82,7 @@ struct SharedBackupState { > finished: bool, > uid_counter: usize, > file_counter: usize, // successfully uploaded files > + active_backend_operations: usize, > dynamic_writers: HashMap, > fixed_writers: HashMap, > known_chunks: KnownChunksMap, > @@ -135,6 +136,7 @@ impl BackupEnvironment { > finished: false, > uid_counter: 0, > file_counter: 0, > + active_backend_operations: 0, > dynamic_writers: HashMap::new(), > fixed_writers: HashMap::new(), > known_chunks: HashMap::new(), > @@ -483,15 +485,10 @@ impl BackupEnvironment { > ); > } > > - // For S3 backends, upload the index file to the object store after closing > - if let DatastoreBackend::S3(s3_client) = &self.backend { > - self.s3_upload_index(s3_client, &data.name) > - .context("failed to upload dynamic index to s3 backend")?; > - self.log(format!( > - "Uploaded dynamic index file to s3 backend: {}", > - data.name > - )) > - } > + state.file_counter += 1; > + state.backup_size += size; > + state.backup_stat = state.backup_stat + data.upload_stat; > + state.active_backend_operations += 1; > > self.log_upload_stat( > &data.name, > @@ -502,9 +499,21 @@ impl BackupEnvironment { > &data.upload_stat, > ); > > - state.file_counter += 1; > - state.backup_size += size; > - state.backup_stat = state.backup_stat + data.upload_stat; > + // never hold mutex guard during s3 upload due to possible deadlocks > + drop(state); > + > + // For S3 backends, upload the index file to the object store after closing > + if let DatastoreBackend::S3(s3_client) = &self.backend { > + self.s3_upload_index(s3_client, &data.name) > + .context("failed to upload dynamic index to s3 backend")?; > + self.log(format!( > + "Uploaded dynamic index file to s3 backend: {}", > + data.name > + )) > + } > + > + let mut state = self.state.lock().unwrap(); > + state.active_backend_operations -= 1; these two hunks are okay, although we could also reuse the registered writers map to encode whether a writer is active, being processed/closed or doesn't exist? would allow more fine-grained logging.. > Ok(()) > } > @@ -567,15 +576,10 @@ impl BackupEnvironment { > ); > } > > - // For S3 backends, upload the index file to the object store after closing > - if let DatastoreBackend::S3(s3_client) = &self.backend { > - self.s3_upload_index(s3_client, &data.name) > - .context("failed to upload fixed index to s3 backend")?; > - self.log(format!( > - "Uploaded fixed index file to object store: {}", > - data.name > - )) > - } > + state.file_counter += 1; > + state.backup_size += size; > + state.backup_stat = state.backup_stat + data.upload_stat; > + state.active_backend_operations += 1; > > self.log_upload_stat( > &data.name, > @@ -586,9 +590,21 @@ impl BackupEnvironment { > &data.upload_stat, > ); > > - state.file_counter += 1; > - state.backup_size += size; > - state.backup_stat = state.backup_stat + data.upload_stat; > + // never hold mutex guard during s3 upload due to possible deadlocks > + drop(state); > + > + // For S3 backends, upload the index file to the object store after closing > + if let DatastoreBackend::S3(s3_client) = &self.backend { > + self.s3_upload_index(s3_client, &data.name) > + .context("failed to upload fixed index to s3 backend")?; > + self.log(format!( > + "Uploaded fixed index file to object store: {}", > + data.name > + )) > + } > + > + let mut state = self.state.lock().unwrap(); > + state.active_backend_operations -= 1; > > Ok(()) > } > @@ -645,6 +661,13 @@ impl BackupEnvironment { > bail!("found open index writer - unable to finish backup"); > } > > + if state.active_backend_operations != 0 { > + bail!( > + "backup task still has {} active operations.", > + state.active_backend_operations, > + ); > + } > + > if state.file_counter == 0 { > bail!("backup does not contain valid files (file count == 0)"); > } > @@ -753,6 +776,12 @@ impl BackupEnvironment { > if !state.finished { > bail!("backup ended but finished flag is not set."); > } > + if state.active_backend_operations != 0 { > + bail!( > + "backup ended but {} active backend operations.", > + state.active_backend_operations, > + ); > + } there's now an inconsistency between ensure_finished(), which checks both conditions, and finished() (used to determine whether a connection being interrupted is benign or not!), which just checks the finished flag.. > Ok(()) > } > > -- > 2.47.3 > > > > _______________________________________________ > pbs-devel mailing list > pbs-devel@lists.proxmox.com > https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel > > > _______________________________________________ pbs-devel mailing list pbs-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel