From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 9F4A01FF141 for ; Tue, 05 May 2026 15:02:39 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 9480B1EFC; Tue, 5 May 2026 15:02:38 +0200 (CEST) Message-ID: <189d0714-90c1-4607-95d0-3ae74febef59@proxmox.com> Date: Tue, 5 May 2026 15:02:29 +0200 MIME-Version: 1.0 User-Agent: Mozilla Thunderbird Subject: Re: [PATCH proxmox-backup 4/4] api/datastore: use maintenance-mode lock to protect against changes To: =?UTF-8?Q?Fabian_Gr=C3=BCnbichler?= , pbs-devel@lists.proxmox.com References: <20260505081137.227901-1-c.ebner@proxmox.com> <20260505081137.227901-5-c.ebner@proxmox.com> <1777980053.m1uthlu73r.astroid@yuna.none> Content-Language: en-US, de-DE From: Christian Ebner In-Reply-To: <1777980053.m1uthlu73r.astroid@yuna.none> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1777986044087 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.070 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. 
[proxmox.com,lib.rs,datastore.rs] Message-ID-Hash: PIRFTIW4YIWABBYLFQVFEWKKF56Z7RGN X-Message-ID-Hash: PIRFTIW4YIWABBYLFQVFEWKKF56Z7RGN X-MailFrom: c.ebner@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox Backup Server development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: On 5/5/26 2:12 PM, Fabian Grünbichler wrote: > On May 5, 2026 10:11 am, Christian Ebner wrote: >> Instead of locking the whole config, which causes also unrelated >> datastores to not be available, use a per-datastore maintenance mode >> lock to protect against changes during potentially longer running >> operations such as s3-refresh or removable datastore mount/unmount. >> >> However keep the current semantic of first setting the maintenance >> mode, allow to still opt out from the mode while waiting for active >> operations to finish, and only then enforce the maintenance mode >> during the s3 refresh or removable datastore mount operation. >> >> unset_maintenance() is adapted to either reacquire the datastore >> config lock or use the optional lock guard. 
>> >> Fixes: https://forum.proxmox.com/threads/183244/ >> Signed-off-by: Christian Ebner >> --- >> pbs-datastore/src/datastore.rs | 2 ++ >> pbs-datastore/src/lib.rs | 11 ++++++++++- >> src/api2/admin/datastore.rs | 26 ++++++++++++++++++++------ >> src/api2/config/datastore.rs | 6 +++++- >> 4 files changed, 37 insertions(+), 8 deletions(-) >> >> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs >> index 34efcd398..aa372cd73 100644 >> --- a/pbs-datastore/src/datastore.rs >> +++ b/pbs-datastore/src/datastore.rs >> @@ -3036,6 +3036,7 @@ impl DataStore { >> let (mut config, _digest) = pbs_config::datastore::config()?; >> let mut datastore_config: DataStoreConfig = config.lookup("datastore", name)?; >> >> + let maintenance_mode_lock = crate::maintenance_mode_lock(name)?; >> datastore_config.set_maintenance_mode(Some(MaintenanceMode { > > meh that this is in pbs-api-types, else we could require it being passed > the lock to ensure we never miss call sites.. Yeah, unfortunately. That is the reason I kept it independent. > > right now, datastore creation doesn't take the maintenance_mode_lock > despite setting it (in the reuse-existing S3 case), and it suffers from > a similar problem - it holds the lock for the whole duration of the > creation, but that also entails S3 requests that are allowed to take up > to 30m(!) - and other expensive things. have we maybe found our second > use case for this lock? if we hold it during creation of datastores, we > could drop the config lock before re-acquiring it for persisting the > config entry? or we need to move this into a two-phase creation, first > add the entry with maintenance mode "creating", then while only holding > the maintenance_mode_lock do all the prep work, and then at the end > re-acquire the config lock and unset the maintenance mode? this would > have considerable overlap with how we handle s3 refreshing.. 
> > IMHO moving to a two-step creation would be cleanest, since we've also > had reports in the past about a long chunk store initialization being > cumbersome (since it effectively blocks all modifications of the config > while it is going on). Good catch, the s3 requests there and the creation are indeed worth the suggested 2-step approach with a corresponding maintenance-mode and holding its lock. > >> ty: MaintenanceType::Delete, >> message: None, >> @@ -3043,6 +3044,7 @@ >> >> config.set_data(name, "datastore", &datastore_config)?; >> pbs_config::datastore::save_config(&config)?; >> + drop(maintenance_mode_lock); >> drop(config_lock); >> >> let (operations, _lock) = task_tracking::get_active_operations_locked(name)?; >> diff --git a/pbs-datastore/src/lib.rs b/pbs-datastore/src/lib.rs >> index 48acba4c8..823f3bf2f 100644 >> --- a/pbs-datastore/src/lib.rs >> +++ b/pbs-datastore/src/lib.rs >> @@ -159,8 +159,9 @@ >> >> use std::os::unix::io::AsRawFd; >> use std::path::Path; >> +use std::time::Duration; >> >> -use anyhow::{bail, Error}; >> +use anyhow::{bail, Context, Error}; >> >> use pbs_config::BackupLockGuard; >> >> @@ -271,3 +272,11 @@ >> >> Ok(lock) >> } >> + >> +/// Acquire an exclusive lock for the datastore's maintenance-mode >> +pub fn maintenance_mode_lock(store: &str) -> Result { >> + lock_helper(store, Path::new("maintenance-mode.lck"), |p| { >> + pbs_config::open_backup_lockfile(p, Some(Duration::from_secs(0)), true) > > this might warrant a comment, usually we open config related locks with > a timeout to prevent flaky locking in case of concurrency, waiting a few > seconds is normally fine. > > but here, we must always first acquire the datastore config lock (should > we encode that in the signature??) anyway, which makes the 0 > timeout/non-blocking behaviour okay, if I understand it correctly? Right, it makes sense to pass in the config lock to "prove" that it was acquired first. 
But it could make sense to add the short delay here nevertheless. After all, another operation might have dropped the config lock already, while still holding onto the maintenance-mode lock. OTOH the operations which do hold it for longer probably will outlive the timeout, so a bit torn as I'm not too sure if there is much benefit. > >> + .context("unable to acquire exclusive datastore's maintenance-mode lock") >> + }) >> +} >> diff --git a/src/api2/admin/datastore.rs b/src/api2/admin/datastore.rs >> index 1e0a78f3a..883091e6f 100644 >> --- a/src/api2/admin/datastore.rs >> +++ b/src/api2/admin/datastore.rs >> @@ -61,8 +61,8 @@ use pbs_datastore::index::IndexFile; >> use pbs_datastore::manifest::BackupManifest; >> use pbs_datastore::prune::compute_prune_info; >> use pbs_datastore::{ >> - check_backup_owner, ensure_datastore_is_mounted, task_tracking, BackupDir, DataStore, >> - LocalChunkReader, StoreProgress, >> + check_backup_owner, ensure_datastore_is_mounted, maintenance_mode_lock, task_tracking, >> + BackupDir, DataStore, LocalChunkReader, StoreProgress, >> }; >> use pbs_tools::json::required_string_param; >> use proxmox_rest_server::{formatter, worker_is_active, WorkerTask}; >> @@ -2705,9 +2705,13 @@ fn expect_maintenance_type( >> } >> >> fn unset_maintenance( >> - _lock: pbs_config::BackupLockGuard, >> + lock: Option, > > I think this should take Option<(lock, lock)>, and > expect_maintenance_type should acquire and return both locks, but it > needs to be adapted further, because atm it is broken: > > 1. start s3-refresh while active operations are running > 2. s3-refresh will be in loop waiting for operations to disappear > 3. roughly every second, it will call expect_maintenance_type > 4. if somebody else is holding the datastore config lock for more than > ten seconds (let's say you started a re-use existing datastore > creation with S3 ;)), expect_maintenance_type will error out! Yeah, point 4 is broken in any case... 
:/ But will fix according to the suggestions above. > > we currently call expect_maintenance_type in three places: > 1. to check whether the maintenance mode changed under us while waiting > for operations to stop (no need for locking here) > 2. to check whether the maintenance mode changed under us if we abort > waiting - we don't really need to lock here if we move this call to > unset_maintenance > 3. to do a final check before we do the actual work - this one uses the > locks then, but we could just as well lock first, then check? > > what do you think about the following (untested!): > > ----8<---- > diff --git a/src/api2/admin/datastore.rs b/src/api2/admin/datastore.rs > index 883091e6f..1acb0baf3 100644 > --- a/src/api2/admin/datastore.rs > +++ b/src/api2/admin/datastore.rs > @@ -49,7 +49,7 @@ use pbs_api_types::{ > VERIFY_JOB_READ_THREADS_SCHEMA, VERIFY_JOB_VERIFY_THREADS_SCHEMA, > }; > use pbs_client::pxar::{create_tar, create_zip}; > -use pbs_config::CachedUserInfo; > +use pbs_config::{BackupLockGuard, CachedUserInfo}; > use pbs_datastore::backup_info::BackupInfo; > use pbs_datastore::cached_chunk_reader::CachedChunkReader; > use pbs_datastore::catalog::{ArchiveEntry, CatalogReader}; > @@ -2686,11 +2686,7 @@ pub fn mount(store: String, rpcenv: &mut dyn RpcEnvironment) -> Result Ok(json!(upid)) > } > > -fn expect_maintenance_type( > - store: &str, > - maintenance_type: MaintenanceType, > -) -> Result<(pbs_config::BackupLockGuard, DataStoreConfig), Error> { > - let lock = pbs_config::datastore::lock_config()?; > +fn expect_maintenance_type(store: &str, maintenance_type: MaintenanceType) -> Result<(), Error> { > let (section_config, _digest) = pbs_config::datastore::config()?; > let store_config: DataStoreConfig = section_config.lookup("datastore", store)?; > > @@ -2701,20 +2697,24 @@ fn expect_maintenance_type( > bail!("maintenance mode is not '{maintenance_type}'"); > } > > - Ok((lock, store_config)) > + Ok(()) > } > > fn unset_maintenance( > - lock: 
Option, > - mut config: DataStoreConfig, > + maintenance_lock: Option, > + store: &str, > + expected: MaintenanceType, > ) -> Result<(), Error> { > - let _lock = match lock { > + let _lock = pbs_config::datastore::lock_config()?; > + let _maintenance_mode_lock = match maintenance_lock { > Some(lock) => lock, > - None => pbs_config::datastore::lock_config()?, > + None => maintenance_mode_lock(store)?, > }; > + expect_maintenance_type(store, expected)?; > let (mut section_config, _digest) = pbs_config::datastore::config()?; > - config.maintenance_mode = None; > - section_config.set_data(&config.name, "datastore", &config)?; > + let mut store_config: DataStoreConfig = section_config.lookup("datastore", store)?; > + store_config.maintenance_mode = None; > + section_config.set_data(store, "datastore", &store_config)?; > pbs_config::datastore::save_config(§ion_config)?; > Ok(()) > } > @@ -2914,23 +2914,20 @@ fn run_maintenance_locked( > } > > if aborted || worker.abort_requested() { > - let _ = expect_maintenance_type(store, maintenance_expected) > - .inspect_err(|e| warn!("maintenance mode was not as expected: {e}")) > - .and_then(|(lock, config)| { > - unset_maintenance(Some(lock), config) > - .inspect_err(|e| warn!("could not reset maintenance mode: {e}")) > - }); > + let _ = unset_maintenance(None, store, maintenance_expected) > + .inspect_err(|e| warn!("could not reset maintenance mode: {e}")); > bail!("aborted, due to user request"); > } > > - let (lock, config) = expect_maintenance_type(store, maintenance_expected)?; > - let _maintenance_mode_lock = maintenance_mode_lock(&config.name)?; > + let lock = pbs_config::datastore::lock_config()?; > + let maintenance_mode_lock = maintenance_mode_lock(store)?; > + expect_maintenance_type(store, maintenance_expected)?; > // avoid blocking config access/updates, but keep maintenance-mode locked > // until done. 
> drop(lock); > > callback()?; > - unset_maintenance(None, config) > + unset_maintenance(Some(maintenance_mode_lock), store, maintenance_expected) > .map_err(|e| format_err!("could not reset maintenance mode: {e}"))?; > > Ok(()) > ---->8---- > > the tiny bit of double work of reparsing the config is IMHO not that > bad, since this is not on any hot path.. Hmm, yes, can incorporate this, moving the locking and expected mode check into the unset helper makes indeed sense. > > > > >> mut config: DataStoreConfig, >> ) -> Result<(), Error> { >> + let _lock = match lock { >> + Some(lock) => lock, >> + None => pbs_config::datastore::lock_config()?, >> + }; >> let (mut section_config, _digest) = pbs_config::datastore::config()?; >> config.maintenance_mode = None; >> section_config.set_data(&config.name, "datastore", &config)?; >> @@ -2763,6 +2767,7 @@ async fn do_unmount(store: String, auth_id: Authid, to_stdout: bool) -> Result> let (mut section_config, _digest) = pbs_config::datastore::config()?; >> let mut datastore: DataStoreConfig = section_config.lookup("datastore", &store)?; >> >> + let maintenance_mode_lock = maintenance_mode_lock(&store)?; >> datastore.set_maintenance_mode(Some(MaintenanceMode { >> ty: MaintenanceType::Unmount, >> message: None, >> @@ -2770,6 +2775,7 @@ async fn do_unmount(store: String, auth_id: Authid, to_stdout: bool) -> Result> section_config.set_data(&store, "datastore", &datastore)?; >> pbs_config::datastore::save_config(§ion_config)?; >> >> + drop(maintenance_mode_lock); >> drop(_lock); >> >> if let Ok(proxy_pid) = proxmox_rest_server::read_pid(pbs_buildcfg::PROXMOX_BACKUP_PROXY_PID_FN) >> @@ -2845,6 +2851,7 @@ pub fn s3_refresh(store: String, rpcenv: &mut dyn RpcEnvironment) -> Result> let (mut section_config, _digest) = pbs_config::datastore::config()?; >> let mut datastore_config: DataStoreConfig = section_config.lookup("datastore", &store)?; >> >> + let maintenance_mode_lock = maintenance_mode_lock(&store)?; >> 
datastore_config.set_maintenance_mode(Some(MaintenanceMode { >> ty: MaintenanceType::S3Refresh, >> message: None, >> @@ -2852,6 +2859,7 @@ pub fn s3_refresh(store: String, rpcenv: &mut dyn RpcEnvironment) -> Result> >> section_config.set_data(&store, "datastore", &datastore_config)?; >> pbs_config::datastore::save_config(§ion_config)?; >> + drop(maintenance_mode_lock); >> drop(_lock); >> >> let upid = WorkerTask::new_thread( >> @@ -2875,7 +2883,8 @@ pub(crate) fn do_s3_refresh(store: &str, worker: &dyn WorkerTaskContext) -> Resu >> } >> >> /// Wait for no more active operations on the given datastore and run the provided callback in >> -/// a datastore locked context, protecting against maintenance mode changes. >> +/// a maintenance mode locked context, protecting against leaving the mode for the functions >> +/// lifetime. >> fn run_maintenance_locked( >> store: &str, >> maintenance_expected: MaintenanceType, >> @@ -2908,15 +2917,20 @@ fn run_maintenance_locked( >> let _ = expect_maintenance_type(store, maintenance_expected) >> .inspect_err(|e| warn!("maintenance mode was not as expected: {e}")) >> .and_then(|(lock, config)| { >> - unset_maintenance(lock, config) >> + unset_maintenance(Some(lock), config) >> .inspect_err(|e| warn!("could not reset maintenance mode: {e}")) >> }); >> bail!("aborted, due to user request"); >> } >> >> let (lock, config) = expect_maintenance_type(store, maintenance_expected)?; >> + let _maintenance_mode_lock = maintenance_mode_lock(&config.name)?; >> + // avoid blocking config access/updates, but keep maintenance-mode locked >> + // until done. 
>> + drop(lock); >> + >> callback()?; >> - unset_maintenance(lock, config) >> + unset_maintenance(None, config) >> .map_err(|e| format_err!("could not reset maintenance mode: {e}"))?; >> >> Ok(()) >> diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs >> index 16e85a636..b07e7fc7e 100644 >> --- a/src/api2/config/datastore.rs >> +++ b/src/api2/config/datastore.rs >> @@ -31,7 +31,9 @@ use crate::api2::config::tape_backup_job::{delete_tape_backup_job, list_tape_bac >> use crate::api2::config::verify::delete_verification_job; >> use pbs_config::CachedUserInfo; >> >> -use pbs_datastore::{get_datastore_mount_status, DataStore, S3_DATASTORE_IN_USE_MARKER}; >> +use pbs_datastore::{ >> + get_datastore_mount_status, maintenance_mode_lock, DataStore, S3_DATASTORE_IN_USE_MARKER, >> +}; >> use proxmox_rest_server::WorkerTask; >> use proxmox_s3_client::{S3ObjectKey, S3_HTTP_REQUEST_TIMEOUT}; >> >> @@ -540,6 +542,7 @@ pub fn update_datastore( >> data.tuning = None; >> } >> DeletableProperty::MaintenanceMode => { >> + let _maintenance_mode_lock = maintenance_mode_lock(&name)?; >> data.set_maintenance_mode(None)?; >> } >> DeletableProperty::NotificationThresholds => { >> @@ -638,6 +641,7 @@ pub fn update_datastore( >> )?), >> None => None, >> }; >> + let _maintenance_mode_lock = maintenance_mode_lock(&name)?; >> data.set_maintenance_mode(maintenance_mode)?; >> } >> >> -- >> 2.47.3 >> >> >> >> >> >>