From: Stefan Sterz <s.sterz@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH proxmox-backup v3 3/5] fix #3935: datastore/api/backup: move datastore locking to '/run'
Date: Tue, 24 May 2022 10:28:14 +0200 [thread overview]
Message-ID: <20220524082816.97270-4-s.sterz@proxmox.com> (raw)
In-Reply-To: <20220524082816.97270-1-s.sterz@proxmox.com>
to avoid issues when removing a group or snapshot directory where two
threads hold a lock to the same directory, move locking to the tmpfs
backed '/run' directory and use a flat file structure. also adds
double stat'ing to make it possible to remove locks without certain
race condition issues.
Signed-off-by: Stefan Sterz <s.sterz@proxmox.com>
---
pbs-config/src/lib.rs | 7 ++
pbs-datastore/src/backup_info.rs | 121 +++++++++++++++++++++++----
pbs-datastore/src/datastore.rs | 7 +-
pbs-datastore/src/snapshot_reader.rs | 17 +++-
src/api2/backup/environment.rs | 5 +-
src/backup/verify.rs | 4 +-
6 files changed, 133 insertions(+), 28 deletions(-)
diff --git a/pbs-config/src/lib.rs b/pbs-config/src/lib.rs
index f9b63922..81f106c4 100644
--- a/pbs-config/src/lib.rs
+++ b/pbs-config/src/lib.rs
@@ -21,6 +21,7 @@ pub use config_version_cache::ConfigVersionCache;
use anyhow::{format_err, Error};
use nix::unistd::{Gid, Group, Uid, User};
+use std::os::unix::prelude::AsRawFd;
pub use pbs_buildcfg::{BACKUP_GROUP_NAME, BACKUP_USER_NAME};
@@ -46,6 +47,12 @@ pub fn backup_group() -> Result<nix::unistd::Group, Error> {
pub struct BackupLockGuard(Option<std::fs::File>);
+impl AsRawFd for BackupLockGuard {
+ fn as_raw_fd(&self) -> i32 {
+ self.0.as_ref().map_or(-1, |f| f.as_raw_fd())
+ }
+}
+
#[doc(hidden)]
/// Note: do not use for production code, this is only intended for tests
pub unsafe fn create_mocked_lock() -> BackupLockGuard {
diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
index 0711fcfa..9bbe68ac 100644
--- a/pbs-datastore/src/backup_info.rs
+++ b/pbs-datastore/src/backup_info.rs
@@ -1,14 +1,15 @@
use std::convert::TryFrom;
use std::fmt;
-use std::os::unix::io::RawFd;
+use std::os::unix::io::{AsRawFd, RawFd};
+use std::path::Path;
use std::path::PathBuf;
use std::sync::Arc;
+use std::time::Duration;
use anyhow::{bail, format_err, Error};
-use proxmox_sys::fs::{
- lock_dir_noblock, lock_dir_noblock_shared, replace_file, CreateOptions, DirLockGuard,
-};
+use proxmox_sys::fs::{replace_file, CreateOptions};
+use proxmox_sys::systemd::escape_unit;
use pbs_api_types::{
Authid, BackupNamespace, BackupType, GroupFilter, BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
@@ -20,6 +21,8 @@ use crate::manifest::{
};
use crate::{DataBlob, DataStore};
+pub const DATASTORE_LOCKS_DIR: &str = "/run/proxmox-backup/locks";
+
/// BackupGroup is a directory containing a list of BackupDir
#[derive(Clone)]
pub struct BackupGroup {
@@ -216,6 +219,8 @@ impl BackupGroup {
})?;
}
+ let _ = std::fs::remove_file(self.lock_path());
+
Ok(removed_all_snaps)
}
@@ -232,13 +237,29 @@ impl BackupGroup {
.set_owner(&self.ns, self.as_ref(), auth_id, force)
}
- /// Lock a group exclusively
- pub fn lock(&self) -> Result<DirLockGuard, Error> {
- lock_dir_noblock(
- &self.full_group_path(),
- "backup group",
- "possible running backup",
- )
+ /// Returns a file name for locking a group.
+ ///
+ /// The lock file will be located in:
+ /// `${DATASTORE_LOCKS_DIR}/${datastore name}/${lock_file_name_helper(rpath)}`
+ /// where `rpath` is the relative path of the group.
+ ///
+ /// If the group's relative path contains non-Unicode sequences they will be replaced via
+ /// [std::ffi::OsStr::to_string_lossy()].
+ fn lock_path(&self) -> PathBuf {
+ let path = Path::new(DATASTORE_LOCKS_DIR).join(self.store.name());
+
+ let rpath = self.relative_group_path();
+ let rpath = rpath.as_os_str().to_string_lossy();
+
+ path.join(lock_file_name_helper(&rpath))
+ }
+
+ /// Locks a group exclusively.
+ pub fn lock(&self) -> Result<BackupLockGuard, Error> {
+ lock_helper(self.store.name(), &self.lock_path(), |p| {
+ open_backup_lockfile(p, Some(Duration::from_secs(0)), true)
+ .map_err(|err| format_err!("unable to acquire backup group lock {p:?} - {err}"))
+ })
}
}
@@ -443,14 +464,37 @@ impl BackupDir {
.map_err(|err| format_err!("unable to acquire manifest lock {:?} - {}", &path, err))
}
- /// Lock this snapshot exclusively
- pub fn lock(&self) -> Result<DirLockGuard, Error> {
- lock_dir_noblock(&self.full_path(), "snapshot", "possibly running or in use")
+ /// Returns a file name for locking a snapshot.
+ ///
+ /// The lock file will be located in:
+ /// `${DATASTORE_LOCKS_DIR}/${datastore name}/${lock_file_name_helper(rpath)}`
+ /// where `rpath` is the relative path of the snapshot.
+ ///
+ /// If the snapshot's relative path contains non-Unicode sequences they will be replaced via
+ /// [std::ffi::OsStr::to_string_lossy()].
+ fn lock_path(&self) -> PathBuf {
+ let path = Path::new(DATASTORE_LOCKS_DIR).join(self.store.name());
+
+ let rpath = self.relative_path();
+ let rpath = rpath.as_os_str().to_string_lossy();
+
+ path.join(lock_file_name_helper(&rpath))
+ }
+
+ /// Locks a snapshot exclusively.
+ pub fn lock(&self) -> Result<BackupLockGuard, Error> {
+ lock_helper(self.store.name(), &self.lock_path(), |p| {
+ open_backup_lockfile(p, Some(Duration::from_secs(0)), true)
+ .map_err(|err| format_err!("unable to acquire snapshot lock {p:?} - {err}"))
+ })
}
- /// Acquire a shared lock on this snapshot
- pub fn lock_shared(&self) -> Result<DirLockGuard, Error> {
- lock_dir_noblock_shared(&self.full_path(), "snapshot", "possibly running or in use")
+ /// Acquires a shared lock on a snapshot.
+ pub fn lock_shared(&self) -> Result<BackupLockGuard, Error> {
+ lock_helper(self.store.name(), &self.lock_path(), |p| {
+ open_backup_lockfile(p, Some(Duration::from_secs(0)), false)
+ .map_err(|err| format_err!("unable to acquire shared snapshot lock {p:?} - {err}"))
+ })
}
/// Destroy the whole snapshot, bails if it's protected
@@ -473,11 +517,13 @@ impl BackupDir {
format_err!("removing backup snapshot {:?} failed - {}", full_path, err,)
})?;
- // the manifest doesn't exist anymore, no need to keep the lock (already done by guard?)
+ // remove no longer needed lock files
if let Ok(path) = self.manifest_lock_path() {
let _ = std::fs::remove_file(path); // ignore errors
}
+ let _ = std::fs::remove_file(self.lock_path()); // ignore errors
+
Ok(())
}
@@ -664,3 +710,42 @@ fn list_backup_files<P: ?Sized + nix::NixPath>(
Ok(files)
}
+
+/// Encodes a string so it can be used as a lock file name.
+///
+/// The first 64 characters will be the sha256 hash of `path` then a hyphen followed by up to 100
+/// characters of the unit encoded version of `path`.
+fn lock_file_name_helper(path: &str) -> String {
+ let enc = escape_unit(path, true);
+ let from = enc.len().checked_sub(100).unwrap_or(0);
+ let enc = enc[from..].to_string();
+ let hash = hex::encode(openssl::sha::sha256(path.as_bytes()));
+
+ format!("{hash}-{enc}")
+}
+
+/// Helps implement the double stat'ing procedure. It avoids certain race conditions upon lock
+/// deletion.
+///
+/// It also creates the base directory for lock files.
+fn lock_helper<F>(
+ store_name: &str,
+ path: &std::path::Path,
+ lock_fn: F,
+) -> Result<BackupLockGuard, Error>
+where
+ F: Fn(&std::path::Path) -> Result<BackupLockGuard, Error>,
+{
+ let lock_dir = Path::new(DATASTORE_LOCKS_DIR).join(store_name);
+ std::fs::create_dir_all(&lock_dir)?;
+
+ let lock = lock_fn(path)?;
+
+ let stat = nix::sys::stat::fstat(lock.as_raw_fd())?.st_ino;
+
+ if nix::sys::stat::stat(path).map_or(true, |st| stat != st.st_ino) {
+ bail!("could not acquire lock, another thread modified the lock file");
+ }
+
+ Ok(lock)
+}
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index f74ea04b..1fc7efa7 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -11,7 +11,6 @@ use nix::unistd::{unlinkat, UnlinkatFlags};
use proxmox_schema::ApiType;
-use proxmox_sys::fs::DirLockGuard;
use proxmox_sys::fs::{file_read_optional_string, replace_file, CreateOptions};
use proxmox_sys::process_locker::ProcessLockSharedGuard;
use proxmox_sys::WorkerTaskContext;
@@ -21,7 +20,7 @@ use pbs_api_types::{
Authid, BackupNamespace, BackupType, ChunkOrder, DataStoreConfig, DatastoreTuning,
GarbageCollectionStatus, HumanByte, Operation, UPID,
};
-use pbs_config::ConfigVersionCache;
+use pbs_config::{BackupLockGuard, ConfigVersionCache};
use crate::backup_info::{BackupDir, BackupGroup};
use crate::chunk_store::ChunkStore;
@@ -615,7 +614,7 @@ impl DataStore {
ns: &BackupNamespace,
backup_group: &pbs_api_types::BackupGroup,
auth_id: &Authid,
- ) -> Result<(Authid, DirLockGuard), Error> {
+ ) -> Result<(Authid, BackupLockGuard), Error> {
let backup_group =
BackupGroup::new(Arc::new(self.clone()), ns.clone(), backup_group.clone());
@@ -650,7 +649,7 @@ impl DataStore {
&self,
ns: &BackupNamespace,
backup_dir: &pbs_api_types::BackupDir,
- ) -> Result<(PathBuf, bool, DirLockGuard), Error> {
+ ) -> Result<(PathBuf, bool, BackupLockGuard), Error> {
let backup_dir = Arc::new(self.clone()).backup_dir(ns.clone(), backup_dir.clone())?;
let relative_path = backup_dir.relative_path();
diff --git a/pbs-datastore/src/snapshot_reader.rs b/pbs-datastore/src/snapshot_reader.rs
index 8c7eefe1..40083900 100644
--- a/pbs-datastore/src/snapshot_reader.rs
+++ b/pbs-datastore/src/snapshot_reader.rs
@@ -3,8 +3,11 @@ use std::os::unix::io::{AsRawFd, FromRawFd};
use std::path::Path;
use std::sync::Arc;
-use anyhow::{bail, Error};
+use anyhow::{bail, format_err, Error};
use nix::dir::Dir;
+use nix::fcntl::OFlag;
+use nix::sys::stat::Mode;
+use pbs_config::BackupLockGuard;
use pbs_api_types::{BackupNamespace, DatastoreWithNamespace, Operation};
@@ -23,6 +26,10 @@ pub struct SnapshotReader {
datastore_name: String,
file_list: Vec<String>,
locked_dir: Dir,
+
+ // while this is never read, the lock needs to be kept until the
+ // reader is dropped to ensure valid locking semantics
+ _lock: BackupLockGuard,
}
impl SnapshotReader {
@@ -42,7 +49,12 @@ impl SnapshotReader {
ns: snapshot.backup_ns().clone(),
};
- let locked_dir = snapshot.lock_shared()?;
+ let lock = snapshot.lock_shared()?;
+
+ let path = snapshot.full_path();
+
+ let locked_dir = Dir::open(&path, OFlag::O_RDONLY, Mode::empty())
+ .map_err(|err| format_err!("unable to open snapshot directory {path:?} - {err}"))?;
let datastore_name = datastore.name().to_string();
let manifest = match snapshot.load_manifest() {
@@ -74,6 +86,7 @@ impl SnapshotReader {
datastore_name,
file_list,
locked_dir,
+ _lock: lock,
})
}
diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index 8c1c42db..c4280b3f 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -1,5 +1,6 @@
use anyhow::{bail, format_err, Error};
-use nix::dir::Dir;
+
+use pbs_config::BackupLockGuard;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
@@ -632,7 +633,7 @@ impl BackupEnvironment {
/// If verify-new is set on the datastore, this will run a new verify task
/// for the backup. If not, this will return and also drop the passed lock
/// immediately.
- pub fn verify_after_complete(&self, snap_lock: Dir) -> Result<(), Error> {
+ pub fn verify_after_complete(&self, snap_lock: BackupLockGuard) -> Result<(), Error> {
self.ensure_finished()?;
if !self.datastore.verify_new() {
diff --git a/src/backup/verify.rs b/src/backup/verify.rs
index dfe1dc56..2a89ab87 100644
--- a/src/backup/verify.rs
+++ b/src/backup/verify.rs
@@ -1,4 +1,4 @@
-use nix::dir::Dir;
+use pbs_config::BackupLockGuard;
use std::collections::HashSet;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
@@ -351,7 +351,7 @@ pub fn verify_backup_dir_with_lock(
backup_dir: &BackupDir,
upid: UPID,
filter: Option<&dyn Fn(&BackupManifest) -> bool>,
- _snap_lock: Dir,
+ _snap_lock: BackupLockGuard,
) -> Result<bool, Error> {
let manifest = match backup_dir.load_manifest() {
Ok((manifest, _)) => manifest,
--
2.30.2
next prev parent reply other threads:[~2022-05-24 8:28 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-24 8:28 [pbs-devel] [PATCH proxmox-backup v3 0/5] refactor datastore locking to use tmpfs Stefan Sterz
2022-05-24 8:28 ` [pbs-devel] [PATCH proxmox-backup v3 1/5] fix #3935: datastore/api/backup: add lock helpers to backup dir/group Stefan Sterz
2022-05-24 8:28 ` [pbs-devel] [PATCH proxmox-backup v3 2/5] fix: datastore: make relative_group_path() return relative path Stefan Sterz
2022-05-24 8:28 ` Stefan Sterz [this message]
2022-05-24 8:28 ` [pbs-devel] [PATCH proxmox-backup v3 4/5] fix #3935: datastore: move manifest locking to new locking method Stefan Sterz
2022-05-24 8:28 ` [pbs-devel] [PATCH proxmox-backup v3 5/5] fix: api: avoid race condition in set_backup_owner Stefan Sterz
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220524082816.97270-4-s.sterz@proxmox.com \
--to=s.sterz@proxmox.com \
--cc=pbs-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.