* [pbs-devel] [PATCH proxmox-backup 1/4] datastore/api/backup: prepare for fix of #3935 by adding lock helpers
2025-03-10 14:36 [pbs-devel] [PATCH proxmox-backup 0/4] refactor datastore locking to use tmpfs Shannon Sterz
@ 2025-03-10 14:36 ` Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 2/4] fix #3935: datastore/api/backup: move datastore locking to '/run' Shannon Sterz
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Shannon Sterz @ 2025-03-10 14:36 UTC (permalink / raw)
To: pbs-devel
to avoid duplicate code, add helpers for locking groups and snapshots
to the BackupGroup and BackupDir traits respectively and refactor
existing code to use them.
this also changes the error semantics of the refactored
functions as locking related error messages are now unified and, thus,
less specific.
Signed-off-by: Shannon Sterz <s.sterz@proxmox.com>
---
pbs-datastore/src/backup_info.rs | 39 +++++++++++++---
pbs-datastore/src/datastore.rs | 66 ++++++++++------------------
pbs-datastore/src/snapshot_reader.rs | 7 +--
src/api2/backup/environment.rs | 9 +---
src/api2/backup/mod.rs | 8 +---
src/api2/reader/mod.rs | 7 +--
src/backup/verify.rs | 12 ++---
src/server/sync.rs | 10 ++---
8 files changed, 68 insertions(+), 90 deletions(-)
diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
index ef2b982c..42ce74e9 100644
--- a/pbs-datastore/src/backup_info.rs
+++ b/pbs-datastore/src/backup_info.rs
@@ -5,7 +5,9 @@ use std::sync::Arc;
use anyhow::{bail, format_err, Error};
-use proxmox_sys::fs::{lock_dir_noblock, replace_file, CreateOptions};
+use proxmox_sys::fs::{
+ lock_dir_noblock, lock_dir_noblock_shared, replace_file, CreateOptions, DirLockGuard,
+};
use pbs_api_types::{
Authid, BackupGroupDeleteStats, BackupNamespace, BackupType, GroupFilter, VerifyState,
@@ -199,9 +201,8 @@ impl BackupGroup {
/// Returns `BackupGroupDeleteStats`, containing the number of deleted snapshots
/// and number of protected snaphsots, which therefore were not removed.
pub fn destroy(&self) -> Result<BackupGroupDeleteStats, Error> {
+ let _guard = self.lock()?;
let path = self.full_group_path();
- let _guard =
- proxmox_sys::fs::lock_dir_noblock(&path, "backup group", "possible running backup")?;
log::info!("removing backup group {:?}", path);
let mut delete_stats = BackupGroupDeleteStats::default();
@@ -237,6 +238,15 @@ impl BackupGroup {
self.store
.set_owner(&self.ns, self.as_ref(), auth_id, force)
}
+
+ /// Lock a group exclusively
+ pub fn lock(&self) -> Result<DirLockGuard, Error> {
+ lock_dir_noblock(
+ &self.full_group_path(),
+ "backup group",
+ "possible running backup",
+ )
+ }
}
impl AsRef<pbs_api_types::BackupNamespace> for BackupGroup {
@@ -442,15 +452,31 @@ impl BackupDir {
.map_err(|err| format_err!("unable to acquire manifest lock {:?} - {}", &path, err))
}
+ /// Lock this snapshot exclusively
+ pub fn lock(&self) -> Result<DirLockGuard, Error> {
+ lock_dir_noblock(
+ &self.full_path(),
+ "snapshot",
+ "backup is running or snapshot is in use",
+ )
+ }
+
+ /// Acquire a shared lock on this snapshot
+ pub fn lock_shared(&self) -> Result<DirLockGuard, Error> {
+ lock_dir_noblock_shared(
+ &self.full_path(),
+ "snapshot",
+ "backup is running or snapshot is in use, could not acquire shared lock",
+ )
+ }
+
/// Destroy the whole snapshot, bails if it's protected
///
/// Setting `force` to true skips locking and thus ignores if the backup is currently in use.
pub fn destroy(&self, force: bool) -> Result<(), Error> {
- let full_path = self.full_path();
-
let (_guard, _manifest_guard);
if !force {
- _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?;
+ _guard = self.lock()?;
_manifest_guard = self.lock_manifest()?;
}
@@ -458,6 +484,7 @@ impl BackupDir {
bail!("cannot remove protected snapshot"); // use special error type?
}
+ let full_path = self.full_path();
log::info!("removing backup snapshot {:?}", full_path);
std::fs::remove_dir_all(&full_path).map_err(|err| {
format_err!("removing backup snapshot {:?} failed - {}", full_path, err,)
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index 75c0c16a..b3804b5a 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -13,8 +13,8 @@ use proxmox_human_byte::HumanByte;
use proxmox_schema::ApiType;
use proxmox_sys::error::SysError;
+use proxmox_sys::fs::DirLockGuard;
use proxmox_sys::fs::{file_read_optional_string, replace_file, CreateOptions};
-use proxmox_sys::fs::{lock_dir_noblock, DirLockGuard};
use proxmox_sys::linux::procfs::MountInfo;
use proxmox_sys::process_locker::ProcessLockSharedGuard;
use proxmox_worker_task::WorkerTaskContext;
@@ -774,41 +774,31 @@ impl DataStore {
///
/// This also acquires an exclusive lock on the directory and returns the lock guard.
pub fn create_locked_backup_group(
- &self,
+ self: &Arc<Self>,
ns: &BackupNamespace,
backup_group: &pbs_api_types::BackupGroup,
auth_id: &Authid,
) -> Result<(Authid, DirLockGuard), Error> {
- // create intermediate path first:
- let mut full_path = self.base_path();
- for ns in ns.components() {
- full_path.push("ns");
- full_path.push(ns);
- }
- full_path.push(backup_group.ty.as_str());
- std::fs::create_dir_all(&full_path)?;
+ let backup_group = self.backup_group(ns.clone(), backup_group.clone());
- full_path.push(&backup_group.id);
+ // create intermediate path first
+ let full_path = backup_group.full_group_path();
- // create the last component now
+ std::fs::create_dir_all(full_path.parent().ok_or_else(|| {
+ format_err!("could not construct parent path for group {backup_group:?}")
+ })?)?;
+
+ // now create the group, this allows us to check whether it existed before
match std::fs::create_dir(&full_path) {
Ok(_) => {
- let guard = lock_dir_noblock(
- &full_path,
- "backup group",
- "another backup is already running",
- )?;
- self.set_owner(ns, backup_group, auth_id, false)?;
- let owner = self.get_owner(ns, backup_group)?; // just to be sure
+ let guard = backup_group.lock()?;
+ self.set_owner(ns, backup_group.group(), auth_id, false)?;
+ let owner = self.get_owner(ns, backup_group.group())?; // just to be sure
Ok((owner, guard))
}
Err(ref err) if err.kind() == io::ErrorKind::AlreadyExists => {
- let guard = lock_dir_noblock(
- &full_path,
- "backup group",
- "another backup is already running",
- )?;
- let owner = self.get_owner(ns, backup_group)?; // just to be sure
+ let guard = backup_group.lock()?;
+ let owner = self.get_owner(ns, backup_group.group())?; // just to be sure
Ok((owner, guard))
}
Err(err) => bail!("unable to create backup group {:?} - {}", full_path, err),
@@ -819,29 +809,17 @@ impl DataStore {
///
/// The BackupGroup directory needs to exist.
pub fn create_locked_backup_dir(
- &self,
+ self: &Arc<Self>,
ns: &BackupNamespace,
backup_dir: &pbs_api_types::BackupDir,
) -> Result<(PathBuf, bool, DirLockGuard), Error> {
- let full_path = self.snapshot_path(ns, backup_dir);
- let relative_path = full_path.strip_prefix(self.base_path()).map_err(|err| {
- format_err!(
- "failed to produce correct path for backup {backup_dir} in namespace {ns}: {err}"
- )
- })?;
+ let backup_dir = self.backup_dir(ns.clone(), backup_dir.clone())?;
+ let relative_path = backup_dir.relative_path();
- let lock = || {
- lock_dir_noblock(
- &full_path,
- "snapshot",
- "internal error - tried creating snapshot that's already in use",
- )
- };
-
- match std::fs::create_dir(&full_path) {
- Ok(_) => Ok((relative_path.to_owned(), true, lock()?)),
+ match std::fs::create_dir(backup_dir.full_path()) {
+ Ok(_) => Ok((relative_path, true, backup_dir.lock()?)),
Err(ref e) if e.kind() == io::ErrorKind::AlreadyExists => {
- Ok((relative_path.to_owned(), false, lock()?))
+ Ok((relative_path, false, backup_dir.lock()?))
}
Err(e) => Err(e.into()),
}
@@ -1305,7 +1283,7 @@ impl DataStore {
bail!("snapshot {} does not exist!", backup_dir.dir());
}
- let _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?;
+ let _guard = backup_dir.lock()?;
let protected_path = backup_dir.protected_file();
if protection {
diff --git a/pbs-datastore/src/snapshot_reader.rs b/pbs-datastore/src/snapshot_reader.rs
index 36349888..604d35c1 100644
--- a/pbs-datastore/src/snapshot_reader.rs
+++ b/pbs-datastore/src/snapshot_reader.rs
@@ -1,14 +1,12 @@
use std::fs::File;
use std::os::unix::io::{AsRawFd, FromRawFd};
use std::path::Path;
-use std::sync::Arc;
use std::rc::Rc;
+use std::sync::Arc;
use anyhow::{bail, Error};
use nix::dir::Dir;
-use proxmox_sys::fs::lock_dir_noblock_shared;
-
use pbs_api_types::{
print_store_and_ns, ArchiveType, BackupNamespace, Operation, CLIENT_LOG_BLOB_NAME,
MANIFEST_BLOB_NAME,
@@ -48,8 +46,7 @@ impl SnapshotReader {
bail!("snapshot {} does not exist!", snapshot.dir());
}
- let locked_dir =
- lock_dir_noblock_shared(&snapshot_path, "snapshot", "locked by another operation")?;
+ let locked_dir = snapshot.lock_shared()?;
let datastore_name = datastore.name().to_string();
let manifest = match snapshot.load_manifest() {
diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index 99d885e2..133932b3 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -8,7 +8,7 @@ use ::serde::Serialize;
use serde_json::{json, Value};
use proxmox_router::{RpcEnvironment, RpcEnvironmentType};
-use proxmox_sys::fs::{lock_dir_noblock_shared, replace_file, CreateOptions};
+use proxmox_sys::fs::{replace_file, CreateOptions};
use pbs_api_types::Authid;
use pbs_datastore::backup_info::{BackupDir, BackupInfo};
@@ -645,12 +645,7 @@ impl BackupEnvironment {
// Downgrade to shared lock, the backup itself is finished
drop(excl_snap_lock);
- let snap_lock = lock_dir_noblock_shared(
- &self.backup_dir.full_path(),
- "snapshot",
- "snapshot is already locked by another operation",
- )?;
-
+ let snap_lock = self.backup_dir.lock_shared()?;
let worker_id = format!(
"{}:{}/{}/{:08X}",
self.datastore.name(),
diff --git a/src/api2/backup/mod.rs b/src/api2/backup/mod.rs
index 82c6438a..d5513893 100644
--- a/src/api2/backup/mod.rs
+++ b/src/api2/backup/mod.rs
@@ -17,7 +17,6 @@ use proxmox_router::{
};
use proxmox_schema::*;
use proxmox_sortable_macro::sortable;
-use proxmox_sys::fs::lock_dir_noblock_shared;
use pbs_api_types::{
ArchiveType, Authid, BackupNamespace, BackupType, Operation, VerifyState,
@@ -186,12 +185,7 @@ fn upgrade_to_backup_protocol(
}
// lock last snapshot to prevent forgetting/pruning it during backup
- let full_path = last.backup_dir.full_path();
- Some(lock_dir_noblock_shared(
- &full_path,
- "snapshot",
- "base snapshot is already locked by another operation",
- )?)
+ Some(last.backup_dir.lock_shared()?)
} else {
None
};
diff --git a/src/api2/reader/mod.rs b/src/api2/reader/mod.rs
index 328141c8..5a9678ff 100644
--- a/src/api2/reader/mod.rs
+++ b/src/api2/reader/mod.rs
@@ -16,7 +16,6 @@ use proxmox_router::{
};
use proxmox_schema::{BooleanSchema, ObjectSchema};
use proxmox_sortable_macro::sortable;
-use proxmox_sys::fs::lock_dir_noblock_shared;
use pbs_api_types::{
ArchiveType, Authid, Operation, BACKUP_ARCHIVE_NAME_SCHEMA, BACKUP_ID_SCHEMA,
@@ -129,11 +128,7 @@ fn upgrade_to_backup_reader_protocol(
bail!("snapshot {} does not exist.", backup_dir.dir());
}
- let _guard = lock_dir_noblock_shared(
- &backup_dir.full_path(),
- "snapshot",
- "locked by another operation",
- )?;
+ let _guard = backup_dir.lock_shared()?;
let path = datastore.base_path();
diff --git a/src/backup/verify.rs b/src/backup/verify.rs
index f652c262..10a64fda 100644
--- a/src/backup/verify.rs
+++ b/src/backup/verify.rs
@@ -4,10 +4,9 @@ use std::sync::{Arc, Mutex};
use std::time::Instant;
use anyhow::{bail, Error};
-use nix::dir::Dir;
+use proxmox_sys::fs::DirLockGuard;
use tracing::{error, info, warn};
-use proxmox_sys::fs::lock_dir_noblock_shared;
use proxmox_worker_task::WorkerTaskContext;
use pbs_api_types::{
@@ -307,11 +306,8 @@ pub fn verify_backup_dir(
return Ok(true);
}
- let snap_lock = lock_dir_noblock_shared(
- &backup_dir.full_path(),
- "snapshot",
- "locked by another operation",
- );
+ let snap_lock = backup_dir.lock_shared();
+
match snap_lock {
Ok(snap_lock) => {
verify_backup_dir_with_lock(verify_worker, backup_dir, upid, filter, snap_lock)
@@ -334,7 +330,7 @@ pub fn verify_backup_dir_with_lock(
backup_dir: &BackupDir,
upid: UPID,
filter: Option<&dyn Fn(&BackupManifest) -> bool>,
- _snap_lock: Dir,
+ _snap_lock: DirLockGuard,
) -> Result<bool, Error> {
let datastore_name = verify_worker.datastore.name();
let backup_dir_name = backup_dir.dir();
diff --git a/src/server/sync.rs b/src/server/sync.rs
index 4dd46c5a..abe7316a 100644
--- a/src/server/sync.rs
+++ b/src/server/sync.rs
@@ -10,6 +10,7 @@ use std::time::Duration;
use anyhow::{bail, format_err, Context, Error};
use futures::{future::FutureExt, select};
use hyper::http::StatusCode;
+use proxmox_sys::fs::DirLockGuard;
use serde_json::json;
use tracing::{info, warn};
@@ -105,7 +106,7 @@ pub(crate) struct RemoteSourceReader {
}
pub(crate) struct LocalSourceReader {
- pub(crate) _dir_lock: Arc<Mutex<proxmox_sys::fs::DirLockGuard>>,
+ pub(crate) _dir_lock: Arc<Mutex<DirLockGuard>>,
pub(crate) path: PathBuf,
pub(crate) datastore: Arc<DataStore>,
}
@@ -478,13 +479,8 @@ impl SyncSource for LocalSource {
dir: &BackupDir,
) -> Result<Arc<dyn SyncSourceReader>, Error> {
let dir = self.store.backup_dir(ns.clone(), dir.clone())?;
- let dir_lock = proxmox_sys::fs::lock_dir_noblock_shared(
- &dir.full_path(),
- "snapshot",
- "locked by another operation",
- )?;
Ok(Arc::new(LocalSourceReader {
- _dir_lock: Arc::new(Mutex::new(dir_lock)),
+ _dir_lock: Arc::new(Mutex::new(dir.lock()?)),
path: dir.full_path(),
datastore: dir.datastore().clone(),
}))
--
2.39.5
_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
^ permalink raw reply [flat|nested] 5+ messages in thread
* [pbs-devel] [PATCH proxmox-backup 2/4] fix #3935: datastore/api/backup: move datastore locking to '/run'
2025-03-10 14:36 [pbs-devel] [PATCH proxmox-backup 0/4] refactor datastore locking to use tmpfs Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 1/4] datastore/api/backup: prepare for fix of #3935 by adding lock helpers Shannon Sterz
@ 2025-03-10 14:36 ` Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 3/4] fix #3935: datastore: move manifest locking to new locking method Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 4/4] fix: api: avoid race condition in set_backup_owner Shannon Sterz
3 siblings, 0 replies; 5+ messages in thread
From: Shannon Sterz @ 2025-03-10 14:36 UTC (permalink / raw)
To: pbs-devel
to avoid issues when removing a group or snapshot directory where two
threads hold a lock to the same directory, move locking to the tmpfs
backed '/run' directory. also adds double stat'ing to make it possible
to remove locks without certain race condition issues.
Signed-off-by: Shannon Sterz <s.sterz@proxmox.com>
---
pbs-config/src/lib.rs | 13 ++-
pbs-datastore/Cargo.toml | 1 +
pbs-datastore/src/backup_info.rs | 156 ++++++++++++++++++++++-----
pbs-datastore/src/datastore.rs | 6 +-
pbs-datastore/src/snapshot_reader.rs | 17 ++-
src/api2/backup/environment.rs | 5 +-
src/backup/verify.rs | 4 +-
src/server/sync.rs | 4 +-
8 files changed, 166 insertions(+), 40 deletions(-)
diff --git a/pbs-config/src/lib.rs b/pbs-config/src/lib.rs
index 20a8238d..1b4b9989 100644
--- a/pbs-config/src/lib.rs
+++ b/pbs-config/src/lib.rs
@@ -22,6 +22,7 @@ pub use config_version_cache::ConfigVersionCache;
use anyhow::{format_err, Error};
use nix::unistd::{Gid, Group, Uid, User};
+use std::os::unix::prelude::AsRawFd;
pub use pbs_buildcfg::{BACKUP_GROUP_NAME, BACKUP_USER_NAME};
@@ -46,13 +47,19 @@ pub fn backup_group() -> Result<nix::unistd::Group, Error> {
}
pub struct BackupLockGuard {
- _file: Option<std::fs::File>,
+ file: Option<std::fs::File>,
+}
+
+impl AsRawFd for BackupLockGuard {
+ fn as_raw_fd(&self) -> i32 {
+ self.file.as_ref().map_or(-1, |f| f.as_raw_fd())
+ }
}
#[doc(hidden)]
/// Note: do not use for production code, this is only intended for tests
pub unsafe fn create_mocked_lock() -> BackupLockGuard {
- BackupLockGuard { _file: None }
+ BackupLockGuard { file: None }
}
/// Open or create a lock file owned by user "backup" and lock it.
@@ -76,7 +83,7 @@ pub fn open_backup_lockfile<P: AsRef<std::path::Path>>(
let timeout = timeout.unwrap_or(std::time::Duration::new(10, 0));
let file = proxmox_sys::fs::open_file_locked(&path, timeout, exclusive, options)?;
- Ok(BackupLockGuard { _file: Some(file) })
+ Ok(BackupLockGuard { file: Some(file) })
}
/// Atomically write data to file owned by "root:backup" with permission "0640"
diff --git a/pbs-datastore/Cargo.toml b/pbs-datastore/Cargo.toml
index 4ebc5fdc..7623adc2 100644
--- a/pbs-datastore/Cargo.toml
+++ b/pbs-datastore/Cargo.toml
@@ -35,6 +35,7 @@ proxmox-lang.workspace=true
proxmox-schema = { workspace = true, features = [ "api-macro" ] }
proxmox-serde = { workspace = true, features = [ "serde_json" ] }
proxmox-sys.workspace = true
+proxmox-systemd.workspace = true
proxmox-time.workspace = true
proxmox-uuid.workspace = true
proxmox-worker-task.workspace = true
diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
index 42ce74e9..5496e053 100644
--- a/pbs-datastore/src/backup_info.rs
+++ b/pbs-datastore/src/backup_info.rs
@@ -1,13 +1,15 @@
use std::fmt;
-use std::os::unix::io::RawFd;
+use std::os::unix::io::{AsRawFd, RawFd};
+use std::os::unix::prelude::OsStrExt;
+use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
+use std::time::Duration;
use anyhow::{bail, format_err, Error};
-use proxmox_sys::fs::{
- lock_dir_noblock, lock_dir_noblock_shared, replace_file, CreateOptions, DirLockGuard,
-};
+use proxmox_sys::fs::{replace_file, CreateOptions};
+use proxmox_systemd::escape_unit;
use pbs_api_types::{
Authid, BackupGroupDeleteStats, BackupNamespace, BackupType, GroupFilter, VerifyState,
@@ -18,6 +20,8 @@ use pbs_config::{open_backup_lockfile, BackupLockGuard};
use crate::manifest::{BackupManifest, MANIFEST_LOCK_NAME};
use crate::{DataBlob, DataStore};
+pub const DATASTORE_LOCKS_DIR: &str = "/run/proxmox-backup/locks";
+
/// BackupGroup is a directory containing a list of BackupDir
#[derive(Clone)]
pub struct BackupGroup {
@@ -223,6 +227,7 @@ impl BackupGroup {
delete_stats.increment_removed_groups();
}
+ let _ = std::fs::remove_file(self.lock_path());
Ok(delete_stats)
}
@@ -239,13 +244,25 @@ impl BackupGroup {
.set_owner(&self.ns, self.as_ref(), auth_id, force)
}
- /// Lock a group exclusively
- pub fn lock(&self) -> Result<DirLockGuard, Error> {
- lock_dir_noblock(
- &self.full_group_path(),
- "backup group",
- "possible running backup",
- )
+ /// Returns a file name for locking a group.
+ ///
+ /// The lock file will be located in:
+ /// `${DATASTORE_LOCKS_DIR}/${datastore name}/${lock_file_path_helper(rpath)}`
+ /// where `rpath` is the relative path of the group.
+ fn lock_path(&self) -> PathBuf {
+ let path = Path::new(DATASTORE_LOCKS_DIR).join(self.store.name());
+
+ let rpath = Path::new(self.group.ty.as_str()).join(&self.group.id);
+
+ path.join(lock_file_path_helper(&self.ns, rpath))
+ }
+
+ /// Locks a group exclusively.
+ pub fn lock(&self) -> Result<BackupLockGuard, Error> {
+ lock_helper(self.store.name(), &self.lock_path(), |p| {
+ open_backup_lockfile(p, Some(Duration::from_secs(0)), true)
+ .map_err(|err| format_err!("unable to acquire backup group lock {p:?} - {err}"))
+ })
}
}
@@ -452,22 +469,35 @@ impl BackupDir {
.map_err(|err| format_err!("unable to acquire manifest lock {:?} - {}", &path, err))
}
- /// Lock this snapshot exclusively
- pub fn lock(&self) -> Result<DirLockGuard, Error> {
- lock_dir_noblock(
- &self.full_path(),
- "snapshot",
- "backup is running or snapshot is in use",
- )
+ /// Returns a file name for locking a snapshot.
+ ///
+ /// The lock file will be located in:
+ /// `${DATASTORE_LOCKS_DIR}/${datastore name}/${lock_file_path_helper(rpath)}`
+ /// where `rpath` is the relative path of the snapshot.
+ fn lock_path(&self) -> PathBuf {
+ let path = Path::new(DATASTORE_LOCKS_DIR).join(self.store.name());
+
+ let rpath = Path::new(self.dir.group.ty.as_str())
+ .join(&self.dir.group.id)
+ .join(&self.backup_time_string);
+
+ path.join(lock_file_path_helper(&self.ns, rpath))
}
- /// Acquire a shared lock on this snapshot
- pub fn lock_shared(&self) -> Result<DirLockGuard, Error> {
- lock_dir_noblock_shared(
- &self.full_path(),
- "snapshot",
- "backup is running or snapshot is in use, could not acquire shared lock",
- )
+ /// Locks a snapshot exclusively.
+ pub fn lock(&self) -> Result<BackupLockGuard, Error> {
+ lock_helper(self.store.name(), &self.lock_path(), |p| {
+ open_backup_lockfile(p, Some(Duration::from_secs(0)), true)
+ .map_err(|err| format_err!("unable to acquire snapshot lock {p:?} - {err}"))
+ })
+ }
+
+ /// Acquires a shared lock on a snapshot.
+ pub fn lock_shared(&self) -> Result<BackupLockGuard, Error> {
+ lock_helper(self.store.name(), &self.lock_path(), |p| {
+ open_backup_lockfile(p, Some(Duration::from_secs(0)), false)
+ .map_err(|err| format_err!("unable to acquire shared snapshot lock {p:?} - {err}"))
+ })
}
/// Destroy the whole snapshot, bails if it's protected
@@ -490,11 +520,13 @@ impl BackupDir {
format_err!("removing backup snapshot {:?} failed - {}", full_path, err,)
})?;
- // the manifest doesn't exist anymore, no need to keep the lock (already done by guard?)
+ // remove no longer needed lock files
if let Ok(path) = self.manifest_lock_path() {
let _ = std::fs::remove_file(path); // ignore errors
}
+ let _ = std::fs::remove_file(self.lock_path()); // ignore errors
+
Ok(())
}
@@ -688,3 +720,75 @@ fn list_backup_files<P: ?Sized + nix::NixPath>(
Ok(files)
}
+
+/// Creates a path to a lock file depending on the relative path of an object (snapshot, group,
+/// manifest) in a datastore. First all namespaces will be concatenated with a colon (ns-folder).
+/// Then the actual file name will depend on the length of the relative path without namespaces. If
+/// it is shorter than 255 characters in its unit encoded form, than the unit encoded form will be
+/// used directly. If not, the file name will consist of the first 80 character, the last 80
+/// characters and the hash of the unit encoded relative path without namespaces. It will also be
+/// placed into a "hashed" subfolder in the namespace folder.
+///
+/// Examples:
+///
+/// - vm-100
+/// - vm-100-2022\x2d05\x2d02T08\x3a11\x3a33Z
+/// - ns1:ns2:ns3:ns4:ns5:ns6:ns7/vm-100-2022\x2d05\x2d02T08\x3a11\x3a33Z
+///
+/// A "hashed" lock file would look like this:
+/// - ns1:ns2:ns3/hashed/$first_eighty...$last_eighty-$hash
+fn lock_file_path_helper(ns: &BackupNamespace, path: PathBuf) -> PathBuf {
+ let to_return = PathBuf::from(
+ ns.components()
+ .map(String::from)
+ .reduce(|acc, n| format!("{acc}:{n}"))
+ .unwrap_or_default(),
+ );
+
+ let path_bytes = path.as_os_str().as_bytes();
+
+ let enc = escape_unit(path_bytes, true);
+
+ if enc.len() < 255 {
+ return to_return.join(enc);
+ }
+
+ let to_return = to_return.join("hashed");
+
+ let first_eigthy = &enc[..80];
+ let last_eighty = &enc[enc.len() - 80..];
+ let hash = hex::encode(openssl::sha::sha256(path_bytes));
+
+ to_return.join(format!("{first_eigthy}...{last_eighty}-{hash}"))
+}
+
+/// Helps implement the double stat'ing procedure. It avoids certain race conditions upon lock
+/// deletion.
+///
+/// It also creates the base directory for lock files.
+fn lock_helper<F>(
+ store_name: &str,
+ path: &std::path::Path,
+ lock_fn: F,
+) -> Result<BackupLockGuard, Error>
+where
+ F: Fn(&std::path::Path) -> Result<BackupLockGuard, Error>,
+{
+ let mut lock_dir = Path::new(DATASTORE_LOCKS_DIR).join(store_name);
+
+ if let Some(parent) = path.parent() {
+ lock_dir = lock_dir.join(parent);
+ };
+
+ std::fs::create_dir_all(&lock_dir)?;
+
+ let lock = lock_fn(path)?;
+
+ let stat = nix::sys::stat::fstat(lock.as_raw_fd())?.st_ino;
+
+ if nix::sys::stat::stat(path).map_or(true, |st| stat != st.st_ino) {
+ bail!("could not acquire lock, another thread modified the lock file");
+ }
+
+ Ok(lock)
+}
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index b3804b5a..15377a8a 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -13,7 +13,6 @@ use proxmox_human_byte::HumanByte;
use proxmox_schema::ApiType;
use proxmox_sys::error::SysError;
-use proxmox_sys::fs::DirLockGuard;
use proxmox_sys::fs::{file_read_optional_string, replace_file, CreateOptions};
use proxmox_sys::linux::procfs::MountInfo;
use proxmox_sys::process_locker::ProcessLockSharedGuard;
@@ -24,6 +23,7 @@ use pbs_api_types::{
DataStoreConfig, DatastoreFSyncLevel, DatastoreTuning, GarbageCollectionStatus,
MaintenanceMode, MaintenanceType, Operation, UPID,
};
+use pbs_config::BackupLockGuard;
use crate::backup_info::{BackupDir, BackupGroup};
use crate::chunk_store::ChunkStore;
@@ -778,7 +778,7 @@ impl DataStore {
ns: &BackupNamespace,
backup_group: &pbs_api_types::BackupGroup,
auth_id: &Authid,
- ) -> Result<(Authid, DirLockGuard), Error> {
+ ) -> Result<(Authid, BackupLockGuard), Error> {
let backup_group = self.backup_group(ns.clone(), backup_group.clone());
// create intermediate path first
@@ -812,7 +812,7 @@ impl DataStore {
self: &Arc<Self>,
ns: &BackupNamespace,
backup_dir: &pbs_api_types::BackupDir,
- ) -> Result<(PathBuf, bool, DirLockGuard), Error> {
+ ) -> Result<(PathBuf, bool, BackupLockGuard), Error> {
let backup_dir = self.backup_dir(ns.clone(), backup_dir.clone())?;
let relative_path = backup_dir.relative_path();
diff --git a/pbs-datastore/src/snapshot_reader.rs b/pbs-datastore/src/snapshot_reader.rs
index 604d35c1..63836e39 100644
--- a/pbs-datastore/src/snapshot_reader.rs
+++ b/pbs-datastore/src/snapshot_reader.rs
@@ -4,8 +4,11 @@ use std::path::Path;
use std::rc::Rc;
use std::sync::Arc;
-use anyhow::{bail, Error};
+use anyhow::{bail, format_err, Error};
use nix::dir::Dir;
+use nix::fcntl::OFlag;
+use nix::sys::stat::Mode;
+use pbs_config::BackupLockGuard;
use pbs_api_types::{
print_store_and_ns, ArchiveType, BackupNamespace, Operation, CLIENT_LOG_BLOB_NAME,
@@ -26,6 +29,10 @@ pub struct SnapshotReader {
datastore_name: String,
file_list: Vec<String>,
locked_dir: Dir,
+
+ // while this is never read, the lock needs to be kept until the
+ // reader is dropped to ensure valid locking semantics
+ _lock: BackupLockGuard,
}
impl SnapshotReader {
@@ -46,7 +53,12 @@ impl SnapshotReader {
bail!("snapshot {} does not exist!", snapshot.dir());
}
- let locked_dir = snapshot.lock_shared()?;
+ let lock = snapshot.lock_shared()?;
+
+ let locked_dir =
+ Dir::open(&snapshot_path, OFlag::O_RDONLY, Mode::empty()).map_err(|err| {
+ format_err!("unable to open snapshot directory {snapshot_path:?} - {err}")
+ })?;
let datastore_name = datastore.name().to_string();
let manifest = match snapshot.load_manifest() {
@@ -77,6 +89,7 @@ impl SnapshotReader {
datastore_name,
file_list,
locked_dir,
+ _lock: lock,
})
}
diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index 133932b3..e4cd20b6 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -1,5 +1,6 @@
use anyhow::{bail, format_err, Error};
-use nix::dir::Dir;
+
+use pbs_config::BackupLockGuard;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tracing::info;
@@ -635,7 +636,7 @@ impl BackupEnvironment {
/// If verify-new is set on the datastore, this will run a new verify task
/// for the backup. If not, this will return and also drop the passed lock
/// immediately.
- pub fn verify_after_complete(&self, excl_snap_lock: Dir) -> Result<(), Error> {
+ pub fn verify_after_complete(&self, excl_snap_lock: BackupLockGuard) -> Result<(), Error> {
self.ensure_finished()?;
if !self.datastore.verify_new() {
diff --git a/src/backup/verify.rs b/src/backup/verify.rs
index 10a64fda..3d2cba8a 100644
--- a/src/backup/verify.rs
+++ b/src/backup/verify.rs
@@ -1,10 +1,10 @@
+use pbs_config::BackupLockGuard;
use std::collections::HashSet;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use anyhow::{bail, Error};
-use proxmox_sys::fs::DirLockGuard;
use tracing::{error, info, warn};
use proxmox_worker_task::WorkerTaskContext;
@@ -330,7 +330,7 @@ pub fn verify_backup_dir_with_lock(
backup_dir: &BackupDir,
upid: UPID,
filter: Option<&dyn Fn(&BackupManifest) -> bool>,
- _snap_lock: DirLockGuard,
+ _snap_lock: BackupLockGuard,
) -> Result<bool, Error> {
let datastore_name = verify_worker.datastore.name();
let backup_dir_name = backup_dir.dir();
diff --git a/src/server/sync.rs b/src/server/sync.rs
index abe7316a..da003cdc 100644
--- a/src/server/sync.rs
+++ b/src/server/sync.rs
@@ -10,7 +10,7 @@ use std::time::Duration;
use anyhow::{bail, format_err, Context, Error};
use futures::{future::FutureExt, select};
use hyper::http::StatusCode;
-use proxmox_sys::fs::DirLockGuard;
+use pbs_config::BackupLockGuard;
use serde_json::json;
use tracing::{info, warn};
@@ -106,7 +106,7 @@ pub(crate) struct RemoteSourceReader {
}
pub(crate) struct LocalSourceReader {
- pub(crate) _dir_lock: Arc<Mutex<DirLockGuard>>,
+ pub(crate) _dir_lock: Arc<Mutex<BackupLockGuard>>,
pub(crate) path: PathBuf,
pub(crate) datastore: Arc<DataStore>,
}
--
2.39.5
_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
^ permalink raw reply [flat|nested] 5+ messages in thread
* [pbs-devel] [PATCH proxmox-backup 3/4] fix #3935: datastore: move manifest locking to new locking method
2025-03-10 14:36 [pbs-devel] [PATCH proxmox-backup 0/4] refactor datastore locking to use tmpfs Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 1/4] datastore/api/backup: prepare for fix of #3935 by adding lock helpers Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 2/4] fix #3935: datastore/api/backup: move datastore locking to '/run' Shannon Sterz
@ 2025-03-10 14:36 ` Shannon Sterz
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 4/4] fix: api: avoid race condition in set_backup_owner Shannon Sterz
3 siblings, 0 replies; 5+ messages in thread
From: Shannon Sterz @ 2025-03-10 14:36 UTC (permalink / raw)
To: pbs-devel
adds double stat'ing and removes directory hierarchy to bring manifest
locking in-line with other locks used by the BackupDir trait.
Signed-off-by: Shannon Sterz <s.sterz@proxmox.com>
---
pbs-datastore/src/backup_info.rs | 34 ++++++++++++++++----------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
index 5496e053..4bfd4393 100644
--- a/pbs-datastore/src/backup_info.rs
+++ b/pbs-datastore/src/backup_info.rs
@@ -448,25 +448,28 @@ impl BackupDir {
/// Returns the filename to lock a manifest
///
/// Also creates the basedir. The lockfile is located in
- /// '/run/proxmox-backup/locks/{datastore}/[ns/{ns}/]+{type}/{id}/{timestamp}.index.json.lck'
- fn manifest_lock_path(&self) -> Result<PathBuf, Error> {
- let mut path = PathBuf::from(&format!("/run/proxmox-backup/locks/{}", self.store.name()));
- path.push(self.relative_path());
+ /// `${DATASTORE_LOCKS_DIR}/${datastore name}/${lock_file_path_helper(rpath)}.index.json.lck`
+ /// where rpath is the relative path of the snapshot.
+ fn manifest_lock_path(&self) -> PathBuf {
+ let path = Path::new(DATASTORE_LOCKS_DIR).join(self.store.name());
- std::fs::create_dir_all(&path)?;
- let ts = self.backup_time_string();
- path.push(format!("{ts}{MANIFEST_LOCK_NAME}"));
+ let rpath = Path::new(self.dir.group.ty.as_str())
+ .join(&self.dir.group.id)
+ .join(&self.backup_time_string)
+ .join(MANIFEST_LOCK_NAME);
- Ok(path)
+ path.join(lock_file_path_helper(&self.ns, rpath))
}
/// Locks the manifest of a snapshot, for example, to update or delete it.
pub(crate) fn lock_manifest(&self) -> Result<BackupLockGuard, Error> {
- let path = self.manifest_lock_path()?;
-
- // actions locking the manifest should be relatively short, only wait a few seconds
- open_backup_lockfile(&path, Some(std::time::Duration::from_secs(5)), true)
- .map_err(|err| format_err!("unable to acquire manifest lock {:?} - {}", &path, err))
+ lock_helper(self.store.name(), &self.manifest_lock_path(), |p| {
+ // update_manifest should never take a long time, so if
+ // someone else has the lock we can simply block a bit
+ // and should get it soon
+ open_backup_lockfile(p, Some(Duration::from_secs(5)), true)
+ .map_err(|err| format_err!("unable to acquire manifest lock {p:?} - {err}"))
+ })
}
/// Returns a file name for locking a snapshot.
@@ -521,10 +524,7 @@ impl BackupDir {
})?;
// remove no longer needed lock files
- if let Ok(path) = self.manifest_lock_path() {
- let _ = std::fs::remove_file(path); // ignore errors
- }
-
+ let _ = std::fs::remove_file(self.manifest_lock_path()); // ignore errors
let _ = std::fs::remove_file(self.lock_path()); // ignore errors
Ok(())
--
2.39.5
_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
^ permalink raw reply [flat|nested] 5+ messages in thread
* [pbs-devel] [PATCH proxmox-backup 4/4] fix: api: avoid race condition in set_backup_owner
2025-03-10 14:36 [pbs-devel] [PATCH proxmox-backup 0/4] refactor datastore locking to use tmpfs Shannon Sterz
` (2 preceding siblings ...)
2025-03-10 14:36 ` [pbs-devel] [PATCH proxmox-backup 3/4] fix #3935: datastore: move manifest locking to new locking method Shannon Sterz
@ 2025-03-10 14:36 ` Shannon Sterz
3 siblings, 0 replies; 5+ messages in thread
From: Shannon Sterz @ 2025-03-10 14:36 UTC (permalink / raw)
To: pbs-devel
when two clients change the owner of a backup store, a race condition
arose. add locking to avoid this.
Signed-off-by: Shannon Sterz <s.sterz@proxmox.com>
---
src/api2/admin/datastore.rs | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/api2/admin/datastore.rs b/src/api2/admin/datastore.rs
index cfe319c4..b16e609a 100644
--- a/src/api2/admin/datastore.rs
+++ b/src/api2/admin/datastore.rs
@@ -2347,10 +2347,9 @@ pub async fn set_backup_owner(
let datastore = DataStore::lookup_datastore(&store, Some(Operation::Write))?;
let backup_group = datastore.backup_group(ns, backup_group);
+ let owner = backup_group.get_owner()?;
if owner_check_required {
- let owner = backup_group.get_owner()?;
-
let allowed = match (owner.is_token(), new_owner.is_token()) {
(true, true) => {
// API token to API token, owned by same user
@@ -2397,6 +2396,12 @@ pub async fn set_backup_owner(
);
}
+ let _guard = backup_group.lock()?;
+
+ if owner != backup_group.get_owner()? {
+ bail!("{owner} does not own this group anymore");
+ }
+
backup_group.set_owner(&new_owner, true)?;
Ok(())
--
2.39.5
_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
^ permalink raw reply [flat|nested] 5+ messages in thread