Date: Mon, 03 Mar 2025 14:51:24 +0100
From: Fabian Grünbichler
To: Proxmox Backup Server development discussion
Subject: Re: [pbs-devel] [PATCH proxmox-backup 4/9] fix #5982: garbage collection: check atime updates are honored
Message-Id: <1741007036.5kgeh44pj2.astroid@yuna.none>
In-Reply-To: <20250219164847.757184-5-c.ebner@proxmox.com>
References: <20250219164847.757184-1-c.ebner@proxmox.com> <20250219164847.757184-5-c.ebner@proxmox.com>

On February 19, 2025 5:48 pm, Christian Ebner wrote:
> Check if the filesystem backing the chunk store actually updates the
> atime to avoid potential data loss in phase 2 of garbage collection
> in case the atime update is not honored.
>
> Perform the check before phase 1 of garbage collection, as well as
> on datastore creation. The latter to early detect and disallow
> datastore creation on filesystem configurations which otherwise most
> likely would lead to data losses.
>
> Enable the atime update check by default, but allow to opt-out by
> setting a datastore tuning parameter flag for backwards compatibility.
> This is honored by both garbage collection and datastore creation.
>
> The check uses a 4 MiB fixed sized, unencrypted and compressed chunk
> as test marker, inserted if not present. This all-zero chunk is very
> likely anyways for unencrypted backup contents with large all-zero
> regions using fixed size chunking (e.g. VMs).
>
> The check sets the chunk's atime twice, once to set it to 1 second
> into the past, and once to set it to now. By this one can detect if
> the atime is set at all and if it is also set if the filesystem is e.g.
> mounted via relatime, which is intended to defer the atime update.

that's not how relatime works?
see below..

>
> Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=5982
> Signed-off-by: Christian Ebner
> ---
> changes since version 1:
> - Use all-zero fixed size chunk for atime update test
> - Test on datastore creation and before garbage collection phase 1
> - fail datastore creation and garbage collection jobs if check fails,
>   but keep opt-out
>
>  pbs-datastore/src/chunk_store.rs | 80 ++++++++++++++++++++++++++++++--
>  pbs-datastore/src/datastore.rs   | 10 ++++
>  src/api2/config/datastore.rs     |  1 +
>  3 files changed, 86 insertions(+), 5 deletions(-)
>
> diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
> index 7bdcb0297..be57e3c87 100644
> --- a/pbs-datastore/src/chunk_store.rs
> +++ b/pbs-datastore/src/chunk_store.rs
> @@ -1,6 +1,7 @@
>  use std::os::unix::io::AsRawFd;
>  use std::path::{Path, PathBuf};
>  use std::sync::{Arc, Mutex};
> +use std::time::{Duration, SystemTime, UNIX_EPOCH};
>
>  use anyhow::{bail, format_err, Error};
>  use tracing::info;
> @@ -13,6 +14,7 @@ use proxmox_sys::process_locker::{
>  };
>  use proxmox_worker_task::WorkerTaskContext;
>
> +use crate::data_blob::DataChunkBuilder;
>  use crate::file_formats::{
>      COMPRESSED_BLOB_MAGIC_1_0, ENCRYPTED_BLOB_MAGIC_1_0, UNCOMPRESSED_BLOB_MAGIC_1_0,
>  };
> @@ -96,6 +98,7 @@ impl ChunkStore {
>          uid: nix::unistd::Uid,
>          gid: nix::unistd::Gid,
>          sync_level: DatastoreFSyncLevel,
> +        atime_update_check: bool,
>      ) -> Result
>      where
>          P: Into,
> @@ -150,7 +153,16 @@ impl ChunkStore {
>              }
>          }
>
> -        Self::open(name, base, sync_level)
> +        let chunk_store = Self::open(name, base, sync_level)?;
> +        if atime_update_check {
> +            chunk_store
> +                .check_fs_atime_update()
> +                .map(|()| info!("atime update check successful."))?;
> +        } else {
> +            info!("atime update check skipped.");
> +        }
> +
> +        Ok(chunk_store)
>      }
>
>      fn lockfile_path>(base: P) -> PathBuf {
> @@ -445,6 +457,51 @@ impl ChunkStore {
>          Ok(())
>      }
>
> +    /// Check if atime updates are honored by the filesystem backing the chunk store.
> +    ///
> +    /// Sets the atime twice, once into the past by one second, checking that this was honored and
> +    /// then to now again, as otherwise it is not guaranteed to also cover time ranges by `relatime`.
> +    /// Uses a 4 MiB fixed size, compressed but unencrypted chunk to test. The chunk is inserted in
> +    /// the chunk store if not yet present.
> +    pub fn check_fs_atime_update(&self) -> Result<(), Error> {
> +        let (zero_chunk, digest) = DataChunkBuilder::build_zero_chunk(None, 4096 * 1024, true)?;
> +        self.insert_chunk(&zero_chunk, &digest)?;
> +
> +        let past = SystemTime::now().duration_since(UNIX_EPOCH)? - Duration::from_secs(1);
> +        let times: [libc::timespec; 2] = [
> +            libc::timespec {
> +                tv_sec: past.as_secs().try_into()?,
> +                tv_nsec: past.subsec_nanos().into(),
> +            },
> +            libc::timespec {
> +                tv_sec: 0,
> +                tv_nsec: UTIME_OMIT,
> +            },
> +        ];

this sets *atime* one second in the past, while not touching *mtime*.
this is not something we ever need? if this fails, something is weird,
but it wouldn't mean that atime handling was broken for our use case..

> +
> +        use nix::NixPath;
> +        let (path, _digest) = self.chunk_path(&digest);
> +        path.with_nix_path(|cstr| unsafe {
> +            let tmp = libc::utimensat(-1, cstr.as_ptr(), &times[0], libc::AT_SYMLINK_NOFOLLOW);
> +            nix::errno::Errno::result(tmp)
> +        })??;
> +
> +        let metadata_past = std::fs::metadata(&path).map_err(Error::from)?;
> +        let atime_past = metadata_past.accessed()?;
> +        if past != atime_past.duration_since(UNIX_EPOCH)? {
> +            bail!("setting atime to the past failed, is atime support enabled on datastore backing filesystem?");
> +        }
> +
> +        self.cond_touch_chunk(&digest, true)?;
> +
> +        let metadata_now = std::fs::metadata(&path).map_err(Error::from)?;
> +        let atime_now = metadata_now.accessed()?;
> +        if atime_past >= atime_now {
> +            bail!("atime update check failed, is atime support enabled on datastore backing filesystem?");
> +        }

if something implements explicit timestamp updates to work like relatime does for read-triggered updates, this would still potentially fail - relatime covers checking *mtime* (which we haven't updated above!) first to see if *atime* should be updated when doing reads. it's basically a performance optimization (skip most atime updates for read-heavy workloads) without breaking a very specific use case (that of mutt abusing atime to know when it last read a local mailbox[0], which would break if atime were never updated on reads).

I think what we actually want to do at GC time is:
- (potentially) insert the chunk
- stat the chunk
- touch the chunk (regularly)
- stat the chunk again
- compare the timestamps

we now have three cases:

A) the old atime is before the new atime => the storage doesn't discard *all* atime updates and we can proceed
B) the old atime and the new atime are identical and atime is *before* mtime and ctime => atime should have been updated even under "relatime-like" semantics, very likely this storage is discarding all atime updates, we should abort with an error
C) the old atime and the new atime are identical, but atime is *after* ctime/mtime => we need to check whether we just ran into relatime-like behaviour

C means we either abort as well (if we don't want to support such setups), or we try to detect if setting both mtime and atime works, and force the GC into doing that.

we might want to add a "wait a second" between the first stat and the touching to account for filesystems with coarse timestamp granularity (inserting touches, and other tasks might touch at "the same time" as well by accident), since the requested timestamps might be modified to become "the greatest value supported by the filesystem that is not greater than the specified time" (utimensat(2)), and one second doesn't really hurt in the scope of a GC..
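something like this rough sketch (not part of the patch and untested - Linux-only, uses just the libc and anyhow crates the patch already depends on; the function name, the one-second sleep and the error messages are purely illustrative):

use std::ffi::CString;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::MetadataExt;
use std::path::Path;

use anyhow::{bail, Error};

/// Check whether atime updates are honored for the (already inserted) chunk at `path`,
/// following the stat/touch/stat scheme sketched above.
fn check_atime_updates(path: &Path) -> Result<(), Error> {
    let before = std::fs::metadata(path)?;

    // wait a bit so a filesystem with coarse timestamp granularity can tell the
    // two stat() calls apart (timestamps may get rounded down, see utimensat(2))
    std::thread::sleep(std::time::Duration::from_secs(1));

    // "touch" the chunk like a regular GC touch would: set atime to now, leave mtime alone
    let c_path = CString::new(path.as_os_str().as_bytes())?;
    let times = [
        libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_NOW },  // atime := now
        libc::timespec { tv_sec: 0, tv_nsec: libc::UTIME_OMIT }, // mtime untouched
    ];
    let res = unsafe {
        libc::utimensat(
            libc::AT_FDCWD,
            c_path.as_ptr(),
            times.as_ptr(),
            libc::AT_SYMLINK_NOFOLLOW,
        )
    };
    if res != 0 {
        return Err(std::io::Error::last_os_error().into());
    }

    let after = std::fs::metadata(path)?;

    let old_atime = (before.atime(), before.atime_nsec());
    let new_atime = (after.atime(), after.atime_nsec());

    // case A: atime moved forward, updates are not (all) discarded
    if old_atime < new_atime {
        return Ok(());
    }

    // atime did not move - check whether relatime-like semantics could explain that
    let mtime = (before.mtime(), before.mtime_nsec());
    let ctime = (before.ctime(), before.ctime_nsec());
    if old_atime <= mtime || old_atime <= ctime {
        // case B: atime was not newer than mtime/ctime, so even a relatime-like
        // policy should have updated it - atime updates are being discarded
        bail!("atime updates are not honored by the datastore backing filesystem");
    }

    // case C: atime is newer than mtime/ctime, might just be relatime-like behaviour
    bail!("could not verify atime updates (relatime-like behaviour?)");
}

(whether case C aborts like this or instead falls back to forcing GC to update both atime and mtime is the policy question above - the sketch just errors out.)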
note that the 24h grace period is totally independent of this - it comes from lazytime and relates to how often changes of timestamps are *persisted on disk*. that shouldn't affect us for local storages (since we talk to the kernel and always get the updated view anyway, no matter if it is in-memory or on-disk. if the PBS system crashes GC is gone as well and needs to start over, no risk of inconsistency).

we might have a problem if the storage uses lazytime internally and lives somewhere else:
- GC starts phase1
- updates a few atimes
- storage crashes and loses some of the updates while I/O to it blocks
- storage comes back up
- GC continues not knowing that some of the atime updates have been lost in the ether
- GC enters phase2
- chunk timestamp makes it eligible for deletion, even though it was touched in phase1
- data loss

I don't think we can really prevent this other than by ditching the whole atime mechanism and instead keep a list of written chunks in each backup writer, and generate such a list in phase1, merging them all and using that for phase2 (which is really expensive as it scales with the number of touched chunks and would require syncing up writers and GC).

thankfully I'd expect most such failure scenarios to actually abort the GC anyway for I/O error reasons..

at datastore creation we could do more in-depth checks (including "faking" past timestamps and seeing if updates are honored for those if they are not honored when doing the simple tests) - if we think we gain something from that?

0: http://www.mutt.org/doc/manual/#new-mail-formats

> +
> +        Ok(())
> +    }
> +
>      pub fn insert_chunk(&self, chunk: &DataBlob, digest: &[u8; 32]) -> Result<(bool, u64), Error> {
>          // unwrap: only `None` in unit tests
>          assert!(self.locker.is_some());
> @@ -631,8 +688,15 @@ fn test_chunk_store1() {
>      let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
>          .unwrap()
>          .unwrap();
> -    let chunk_store =
> -        ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None).unwrap();
> +    let chunk_store = ChunkStore::create(
> +        "test",
> +        &path,
> +        user.uid,
> +        user.gid,
> +        DatastoreFSyncLevel::None,
> +        true,
> +    )
> +    .unwrap();
>
>      let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
>          .build()
>          .unwrap();
>
>      let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
>      assert!(exists);
>
> -    let chunk_store =
> -        ChunkStore::create("test", &path, user.uid, user.gid, DatastoreFSyncLevel::None);
> +    let chunk_store = ChunkStore::create(
> +        "test",
> +        &path,
> +        user.uid,
> +        user.gid,
> +        DatastoreFSyncLevel::None,
> +        true,
> +    );
>      assert!(chunk_store.is_err());
>
>      if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
> index 75c0c16ab..9b01790cc 100644
> --- a/pbs-datastore/src/datastore.rs
> +++ b/pbs-datastore/src/datastore.rs
> @@ -1170,6 +1170,16 @@ impl DataStore {
>              upid: Some(upid.to_string()),
>              ..Default::default()
>          };
> +        let tuning: DatastoreTuning = serde_json::from_value(
> +            DatastoreTuning::API_SCHEMA
> +                .parse_property_string(gc_store_config.tuning.as_deref().unwrap_or(""))?,
> +        )?;
> +        if tuning.gc_atime_check.unwrap_or(true) {
> +            self.inner.chunk_store.check_fs_atime_update()?;
> +            info!("Filesystem atime update check successful.");
> +        } else {
> +            info!("Filesystem atime update check disabled by datastore tuning options.");
> +        }
>
>          info!("Start GC phase1 (mark used chunks)");
>
> diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
> index fe3260f6d..1cd38f2db 100644
> --- a/src/api2/config/datastore.rs
> +++ b/src/api2/config/datastore.rs
> @@ -119,6 +119,7 @@ pub(crate) fn do_create_datastore(
>              backup_user.uid,
>              backup_user.gid,
>              tuning.sync_level.unwrap_or_default(),
> +            tuning.gc_atime_check.unwrap_or(true),
>          )
>          .map(|_| ())
>      } else {
> --
> 2.39.5

_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel