From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Mon, 10 Nov 2025 09:31:54 +0100
From: Fabian Grünbichler
To: Proxmox Backup Server development discussion
Subject: Re: [pbs-devel] [PATCH proxmox-backup v2 3/3] datastore: insert chunk marker and touch bad chunks in locked context
Message-Id: <1762762321.tfqlrhk6b0.astroid@yuna.none>
In-Reply-To: <20251106171358.865503-4-c.ebner@proxmox.com>
References: <20251106171358.865503-1-c.ebner@proxmox.com>
 <20251106171358.865503-4-c.ebner@proxmox.com>

On November 6, 2025 6:13 pm, Christian Ebner wrote:
> Assures that both, the touching of bad chunks as well as the
> insertion of missing chunk marker files are done while the chunk
> store mutex is guarded, other operations therefore get a consistent
> state.
> 
> To achieve this, introduces a helper method which allows to run a
> callback in a locked context if the chunk file is missing.
> 
> Signed-off-by: Christian Ebner
> ---
> changes since version 1:
> - not present in previous version
> 
>  pbs-datastore/src/chunk_store.rs | 17 +++++++++++++
>  pbs-datastore/src/datastore.rs   | 42 +++++++++++++++++++-------------
>  2 files changed, 42 insertions(+), 17 deletions(-)
> 
> diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
> index b88a0a096..063bc55f6 100644
> --- a/pbs-datastore/src/chunk_store.rs
> +++ b/pbs-datastore/src/chunk_store.rs
> @@ -212,6 +212,23 @@ impl ChunkStore {
>          Ok(())
>      }
> 
> +    /// Update the chunk files atime if it exists, call the provided callback inside a chunk store
> +    /// locked scope otherwise.
> +    pub(super) fn cond_touch_chunk_or_locked<T>(
> +        &self,
> +        digest: &[u8; 32],
> +        callback: T,
> +    ) -> Result<(), Error>
> +    where
> +        T: FnOnce() -> Result<(), Error>,
> +    {
> +        let _lock = self.mutex.lock();
> +        if !self.cond_touch_chunk_no_lock(digest, false)? {
> +            callback()?;
> +        }
> +        Ok(())
> +    }
> +
>      /// Update the chunk files atime if it exists.
>      ///
>      /// If the chunk file does not exist, return with error if assert_exists is true, with
> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
> index 4527b40f4..b2f414ce1 100644
> --- a/pbs-datastore/src/datastore.rs
> +++ b/pbs-datastore/src/datastore.rs
> @@ -1302,15 +1302,16 @@ impl DataStore {
>              match s3_client {
>                  None => {
>                      // Filesystem backend
> -                    if !self.inner.chunk_store.cond_touch_chunk(digest, false)? {
> +                    self.inner.chunk_store.cond_touch_chunk_or_locked(digest, || {
>                          let hex = hex::encode(digest);
>                          warn!(
>                              "warning: unable to access non-existent chunk {hex}, required by {file_name:?}"
>                          );
> 
> -                        // touch any corresponding .bad files to keep them around, meaning if a chunk is
> -                        // rewritten correctly they will be removed automatically, as well as if no index
> -                        // file requires the chunk anymore (won't get to this loop then)
> +                        // touch any corresponding .bad files to keep them around, meaning if a
> +                        // chunk is rewritten correctly they will be removed automatically, as well
> +                        // as if no index file requires the chunk anymore (won't get to this loop
> +                        // then)
>                          for i in 0..=9 {
>                              let bad_ext = format!("{i}.bad");
>                              let mut bad_path = PathBuf::new();
> @@ -1318,22 +1319,29 @@ impl DataStore {
>                              bad_path.set_extension(bad_ext);
>                              self.inner.chunk_store.cond_touch_path(&bad_path, false)?;
>                          }
> -                    }
> +                        Ok(())
> +                    })?;

do we need to hold the mutex for touching the bad chunks? we don't when
creating them during verification (should we?), we do when removing them
during GC..

if we do, then we should probably have a pub(super) fn cond_touch_bad_chunk(s?)
in the chunk store, because each touch here is independent and there is no
overarching need to hold the mutex across all of them, and this would also
allow us to make cond_touch_path private.. (rough sketch at the end of this
mail)

should we also touch bad chunks for the S3 case?

>                  }
>                  Some(ref _s3_client) => {
>                      // Update atime on local cache marker files.
> -                    if !self.inner.chunk_store.cond_touch_chunk(digest, false)? {
> -                        let (chunk_path, _digest) = self.chunk_path(digest);
> -                        // Insert empty file as marker to tell GC phase2 that this is
> -                        // a chunk still in-use, so to keep in the S3 object store.
> -                        std::fs::File::options()
> -                            .write(true)
> -                            .create_new(true)
> -                            .open(&chunk_path)
> -                            .with_context(|| {
> -                                format!("failed to create marker for chunk {}", hex::encode(digest))
> -                            })?;
> -                    }
> +                    self.inner
> +                        .chunk_store
> +                        .cond_touch_chunk_or_locked(digest, || {
> +                            let (chunk_path, _digest) = self.chunk_path(digest);
> +                            // Insert empty file as marker to tell GC phase2 that this is
> +                            // a chunk still in-use, so to keep in the S3 object store.
> +                            std::fs::File::options()
> +                                .write(true)
> +                                .create_new(true)
> +                                .open(&chunk_path)
> +                                .with_context(|| {
> +                                    format!(
> +                                        "failed to create marker for chunk {}",
> +                                        hex::encode(digest)
> +                                    )
> +                                })?;
> +                            Ok(())
> +                        })?;

AFAICT, we can fix this together with the other S3-races by obtaining the
flock on the chunk here?
i.e.,

// without flock first, since a chunk missing is the unlikely path
// (corruption detected by verify, or manual damage to the chunk store)
if !cond_touch_chunk {
    // now with flock, to protect the next two calls against concurrent
    // insert+uploads/renames/..
    flock {
        // somebody else could have inserted it since we checked without
        // locking above
        if !cond_touch_chunk {
            // insert empty marker only if chunk is not there
            store.clear_chunk
        }
    }
}

if (not-)caching were a property of the datastore, instead of the backup
session, we could even just fetch the missing chunk into the cache here if
caching is enabled

>                  }
>              }
>          }
> -- 
> 2.47.3
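
to make both suggestions above a bit more concrete, two rough sketches
follow. all names and signatures below are placeholders/assumptions rather
than existing chunk store API: cond_touch_bad_chunks and ensure_s3_marker are
hypothetical, and the closure parameters just stand in for
ChunkStore::cond_touch_path / cond_touch_chunk and for whatever per-chunk
flock helper the other S3 fixes end up introducing.

use std::fs::File;
use std::path::{Path, PathBuf};

use anyhow::{Context, Error};

/// touch the .0.bad ..= .9.bad companions of a chunk; each touch is
/// independent, so nothing has to hold the chunk store mutex across the loop
fn cond_touch_bad_chunks<T>(chunk_path: &Path, cond_touch_path: T) -> Result<(), Error>
where
    T: Fn(&Path, bool) -> Result<bool, Error>, // stand-in for ChunkStore::cond_touch_path
{
    for i in 0..=9 {
        let mut bad_path = PathBuf::from(chunk_path);
        bad_path.set_extension(format!("{i}.bad"));
        cond_touch_path(&bad_path, false)?;
    }
    Ok(())
}

/// double-checked marker creation for the S3 cache: unlocked check first,
/// re-check under the per-chunk flock, only then insert the empty marker
fn ensure_s3_marker<C, L, G>(
    chunk_path: &Path,
    cond_touch_chunk: C, // e.g. || chunk_store.cond_touch_chunk(digest, false)
    lock_chunk: L,       // stand-in for the per-chunk flock helper
) -> Result<(), Error>
where
    C: Fn() -> Result<bool, Error>,
    L: FnOnce() -> Result<G, Error>,
{
    // fast path without flock: chunk (or marker) is already there
    if cond_touch_chunk()? {
        return Ok(());
    }

    // slow path: take the flock and re-check, since a concurrent
    // insert/upload/rename could have created the chunk in the meantime
    let _guard = lock_chunk()?;
    if !cond_touch_chunk()? {
        // still missing under the lock: create the empty marker so GC
        // phase2 keeps the chunk on the S3 object store
        File::options()
            .write(true)
            .create_new(true)
            .open(chunk_path)
            .with_context(|| format!("failed to create marker {chunk_path:?}"))?;
    }
    Ok(())
}

keeping the first check unlocked means the common case (chunk or marker
already present) stays exactly as cheap as it is today.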