From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 998681FF142 for ; Tue, 07 Apr 2026 15:18:55 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 53CC11B34C; Tue, 7 Apr 2026 15:19:30 +0200 (CEST) Date: Tue, 07 Apr 2026 15:18:50 +0200 From: Fabian =?iso-8859-1?q?Gr=FCnbichler?= Subject: Re: [PATCH proxmox-backup v6 2/8] datastore: add move_group To: Hannes Laimer , pbs-devel@lists.proxmox.com References: <20260331123409.198353-1-h.laimer@proxmox.com> <20260331123409.198353-3-h.laimer@proxmox.com> In-Reply-To: <20260331123409.198353-3-h.laimer@proxmox.com> MIME-Version: 1.0 User-Agent: astroid/0.17.0 (https://github.com/astroidmail/astroid) Message-Id: <1775565710.jsgunkzqjf.astroid@yuna.none> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: quoted-printable X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1775567868212 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.053 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment RCVD_IN_VALIDITY_CERTIFIED_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_RPBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_SAFE_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. 
SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: RWGKYLJFEDW4PYW3BVVDHXFZGHCQE3WF X-Message-ID-Hash: RWGKYLJFEDW4PYW3BVVDHXFZGHCQE3WF X-MailFrom: f.gruenbichler@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox Backup Server development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: On March 31, 2026 2:34 pm, Hannes Laimer wrote: > Add support for moving a single backup group to a different namespace > within the same datastore. >=20 > For the filesystem backend, the group directory is relocated with a > single rename(2). For S3, all objects under the source prefix are > copied to the target prefix and then deleted. >=20 > An exclusive group lock and exclusive locks on every snapshot are > acquired before the move, mirroring the locking strategy used when > destroying a group. This ensures no concurrent readers, writers, or > verify tasks are operating on any snapshot being moved. Existence > checks are performed under those locks to avoid TOCTOU races. 
>=20 > Signed-off-by: Hannes Laimer > --- > pbs-datastore/src/backup_info.rs | 148 ++++++++++++++++++++++++++++++- > pbs-datastore/src/datastore.rs | 56 ++++++++++++ > 2 files changed, 203 insertions(+), 1 deletion(-) >=20 > diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_= info.rs > index c33eb307..a8a56198 100644 > --- a/pbs-datastore/src/backup_info.rs > +++ b/pbs-datastore/src/backup_info.rs > @@ -9,7 +9,7 @@ use std::time::Duration; > use anyhow::{bail, format_err, Context, Error}; > use const_format::concatcp; > =20 > -use proxmox_s3_client::S3PathPrefix; > +use proxmox_s3_client::{S3ObjectKey, S3PathPrefix}; > use proxmox_sys::fs::{lock_dir_noblock, lock_dir_noblock_shared, replace= _file, CreateOptions}; > use proxmox_systemd::escape_unit; > =20 > @@ -273,6 +273,152 @@ impl BackupGroup { > Ok(delete_stats) > } > =20 > + /// Move this group to a new namespace. > + /// > + /// For the filesystem backend, uses `rename` to atomically relocate= the group directory. For > + /// the S3 backend, copies all objects to the destination prefix fir= st, then renames the local > + /// cache directory, then deletes the source objects. A copy failure= returns an error with the > + /// group intact at source. A delete failure is logged as a warning = - any un-deleted source > + /// objects are orphaned and must be removed manually. > + /// > + /// The caller must have created the target type directory > + /// (e.g. `{target_ns}/{backup_type}/`) before calling this method. > + /// > + /// The caller must hold an exclusive group lock and exclusive locks= on all snapshots in the > + /// group. This prevents concurrent readers, writers, and verify tas= ks from operating on the > + /// group and ensures no new objects are added between the S3 copy s= weep and the subsequent > + /// deletes. 
see below > + pub(crate) fn move_to( > + &self, > + target_ns: &BackupNamespace, > + backend: &DatastoreBackend, > + ) -> Result<(), Error> { > + let src_path =3D self.full_group_path(); > + let target_path =3D self.store.group_path(target_ns, &self.group= ); > + > + log::info!("moving backup group {src_path:?} to {target_path:?}"= ); nit: this line is changed in the next patch, that hunk should probably move here into this patch? > + > + match backend { > + DatastoreBackend::Filesystem =3D> { > + std::fs::rename(&src_path, &target_path).with_context(||= { > + format!("failed to move group {src_path:?} to {targe= t_path:?}") > + })?; > + // Remove the now-stale source lock file. The caller's l= ock guard > + // still holds the flock via the open FD until it is dro= pped. > + let _ =3D std::fs::remove_file(self.lock_path()); the comment here is wrong, a new attempt to lock will obtain the lock by virtue of it operating on a new file and new FD.. which in turn means that if we want to drop the lock file here, we also need to consume the guard. in practice, we drop it anyway right after, but why not make the interface enforce that from the start, rather than risk introducing subtle issues down the line if the code is changed? what about the snapshot lock files, those are not removed? note that they'd need to be removed before the group path, else another call could re-create the group and race.. 
> + } > + DatastoreBackend::S3(s3_client) =3D> { > + // Build S3 key prefixes for source and target groups, e= .g.: > + // src: ".cnt/a/b/vm/100/" > + // tgt: ".cnt/a/c/vm/100/" > + let src_rel =3D self.relative_group_path(); > + let src_rel_str =3D src_rel > + .to_str() > + .ok_or_else(|| format_err!("invalid source group pat= h"))?; > + let src_prefix_str =3D format!("{S3_CONTENT_PREFIX}/{src= _rel_str}/"); > + > + let mut tgt_rel =3D target_ns.path(); > + tgt_rel.push(self.group.ty.as_str()); > + tgt_rel.push(&self.group.id); > + let tgt_rel_str =3D tgt_rel > + .to_str() > + .ok_or_else(|| format_err!("invalid target group pat= h"))?; > + let tgt_prefix_str =3D format!("{S3_CONTENT_PREFIX}/{tgt= _rel_str}/"); > + > + // S3 list_objects returns full keys with the store name= as the leading component, > + // e.g. "mystore/.cnt/a/b/vm/100/2026-01-01T00:00:00Z/dr= ive-scsi0.img.fidx". > + // Strip "mystore/" to get the relative key used by copy= _object. > + let store_prefix =3D format!("{}/", self.store.name()); > + > + log::debug!( > + "S3 move: listing prefix '{src_prefix_str}', store_p= refix=3D'{store_prefix}', tgt_prefix=3D'{tgt_prefix_str}'" > + ); > + > + // Copy all objects to the target prefix first. No sourc= e objects are deleted > + // until all copies succeed, so a copy failure leaves th= e group intact at source. 
> + let prefix =3D S3PathPrefix::Some(src_prefix_str.clone()= ); > + let mut token: Option =3D None; > + let mut src_keys =3D Vec::new(); > + > + loop { > + let result =3D proxmox_async::runtime::block_on( > + s3_client.list_objects_v2(&prefix, token.as_dere= f()), > + ) > + .context("failed to list group objects on S3 backend= ")?; > + > + log::debug!( > + "S3 move: listed {} objects (truncated=3D{})", > + result.contents.len(), > + result.is_truncated > + ); > + > + for item in result.contents { > + let full_key_str: &str =3D &item.key; > + log::debug!("S3 move: processing key '{full_key_= str}'"); > + let rel_key =3D > + full_key_str.strip_prefix(&store_prefix).ok_= or_else(|| { > + format_err!("unexpected key prefix in '{= full_key_str}'") > + })?; > + let src_key =3D S3ObjectKey::try_from(rel_key)?; > + > + // Replace the source group prefix with the targ= et prefix, > + // keeping the snapshot dir and filename (suffix= ) intact. > + let suffix =3D rel_key > + .strip_prefix(&src_prefix_str) > + .ok_or_else(|| format_err!("unexpected key f= ormat '{rel_key}'"))?; > + let dst_key_str =3D format!("{tgt_prefix_str}{su= ffix}"); > + let dst_key =3D S3ObjectKey::try_from(dst_key_st= r.as_str())?; > + > + log::debug!("S3 move: copy '{rel_key}' -> '{dst_= key_str}'"); > + proxmox_async::runtime::block_on( > + s3_client.copy_object(src_key.clone(), dst_k= ey), > + ) > + .with_context(|| format!("failed to copy S3 obje= ct '{rel_key}'"))?; > + src_keys.push(src_key); > + } > + > + if result.is_truncated { > + token =3D result.next_continuation_token; > + } else { > + break; > + } > + } > + > + // All copies succeeded. Remove any pre-created target d= irectory (the > + // caller may have created one) before renaming - rename= (2) requires > + // the target to be empty if it exists. > + if target_path.exists() { > + let _ =3D std::fs::remove_dir_all(&target_path); this seems wrong (but also, why does it only happen for S3?) 
> + } > + // Rename the local cache directory before deleting sour= ce objects so that > + // the local cache reflects the target state as soon as = possible. > + std::fs::rename(&src_path, &target_path).with_context(||= { > + format!("failed to move group {src_path:?} to {targe= t_path:?}") > + })?; > + let _ =3D std::fs::remove_file(self.lock_path()); see above and below! > + > + // Delete source objects. In case of a delete failure th= e group is already at the > + // target (S3 copies + local cache). Treat delete failur= es as warnings so the > + // caller does not misreport the group as "failed to mov= e". > + // Un-deleted sources must be removed manually. > + log::debug!("S3 move: deleting {} source objects", src_k= eys.len()); > + for src_key in src_keys { > + log::debug!("S3 move: delete '{src_key:?}'"); > + if let Err(err) =3D > + proxmox_async::runtime::block_on(s3_client.delet= e_object(src_key.clone())) > + { > + log::warn!( > + "S3 move: failed to delete source object '{s= rc_key:?}' \ > + (group already at target, orphaned object re= quires manual removal): {err:#}" > + ); > + } > + } > + } > + } > + > + Ok(()) > + } > + > /// Helper function, assumes that no more snapshots are present in t= he group. > fn remove_group_dir(&self) -> Result<(), Error> { > let note_path =3D self.store.group_notes_path(&self.ns, &self.gr= oup); > diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore= .rs > index ef378c69..d74e7e42 100644 > --- a/pbs-datastore/src/datastore.rs > +++ b/pbs-datastore/src/datastore.rs > @@ -1014,6 +1014,62 @@ impl DataStore { > backup_group.destroy(&self.backend()?) > } > =20 > + /// Move a single backup group to a different namespace within the s= ame datastore. > + /// > + /// Acquires an exclusive group lock and exclusive locks on every sn= apshot in the group to > + /// ensure no concurrent readers, writers, or verify tasks are opera= ting on any snapshot. 
> + /// This mirrors the locking strategy of `BackupGroup::destroy()`. > + pub fn move_group( > + self: &Arc, > + source_ns: &BackupNamespace, > + group: &pbs_api_types::BackupGroup, > + target_ns: &BackupNamespace, > + ) -> Result<(), Error> { > + if source_ns =3D=3D target_ns { > + bail!("source and target namespace must be different"); > + } > + we should assert that we are using new style locking here, and not allow moving if we are still using legacy dir-based locking. > + let source_group =3D self.backup_group(source_ns.clone(), group.= clone()); > + let target_group =3D self.backup_group(target_ns.clone(), group.= clone()); > + > + // Acquire exclusive group lock - prevents new snapshot addition= s/removals. > + let _group_lock =3D source_group > + .lock() > + .with_context(|| format!("failed to lock group '{group}' for= move"))?; we also need to lock the new group (which doesn't require the group to exist if we are requiring new style locking), else somebody else could do that while we are preparing the move, or during copying for the S3 case. and then we should check that it doesn't exist yet here as well - we still need to re-check that later on, because creating a backup group will first create the dir, and then obtain the lock.. > + // Acquire exclusive lock on each snapshot - ensures no concurre= nt readers, verify > + // tasks, or other operations are active on any snapshot in this= group. this potentially means *a lot* of locks. maybe we should consider moving in batches and only locking each batch in turn? that gives us an upper bound for the number of open file descriptors for a single move task, and for the common case it shouldn't take much longer to move (up to) a few thousand snapshot dirs than a single group dir? or we could check and move in batches if there are more than N snapshots? if we move in batches, we can create and lock the target group here, instead of just locking and checking it doesn't exist.. 
> + let mut _snap_locks =3D Vec::new(); > + for snap in source_group.iter_snapshots()? { > + let snap =3D snap?; > + _snap_locks.push(snap.lock().with_context(|| { > + format!( > + "cannot move group '{group}': snapshot '{}' is in us= e", > + snap.dir(), > + ) > + })?); > + } > + > + // Check existence under locks to avoid TOCTOU races. > + if !self.namespace_exists(target_ns) { > + bail!("target namespace '{target_ns}' does not exist"); > + } > + if !source_group.exists() { > + bail!("group '{group}' does not exist in namespace '{source_= ns}'"); > + } > + if target_group.exists() { > + bail!("group '{group}' already exists in target namespace '{= target_ns}'"); > + } these checks should happen before the snapshots are locked.. > + > + let backend =3D self.backend()?; > + > + std::fs::create_dir_all(self.type_path(target_ns, group.ty)).wit= h_context(|| { > + format!("failed to create type directory in '{target_ns}' fo= r move") > + })?; > + > + source_group.move_to(target_ns, &backend) > + } > + > /// Remove a backup directory including all content > pub fn remove_backup_dir( > self: &Arc, > --=20 > 2.47.3 >=20 >=20 >=20 >=20 >=20 >=20