From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pbs-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
	by lore.proxmox.com (Postfix) with ESMTPS id A23921FF136
	for <inbox@lore.proxmox.com>; Mon, 20 Apr 2026 16:50:30 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id 85784558E;
	Mon, 20 Apr 2026 16:50:30 +0200 (CEST)
Date: Mon, 20 Apr 2026 16:49:50 +0200
From: Fabian =?iso-8859-1?q?Gr=FCnbichler?= <f.gruenbichler@proxmox.com>
Subject: Re: [PATCH proxmox-backup v7 2/9] datastore: add move-group
To: Hannes Laimer <h.laimer@proxmox.com>, pbs-devel@lists.proxmox.com
References: <20260416171830.266553-1-h.laimer@proxmox.com>
	<20260416171830.266553-3-h.laimer@proxmox.com>
In-Reply-To: <20260416171830.266553-3-h.laimer@proxmox.com>
MIME-Version: 1.0
User-Agent: astroid/0.17.0 (https://github.com/astroidmail/astroid)
Message-Id: <1776689762.plz71v8ptr.astroid@yuna.none>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable
X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2
X-Bm-Transport-Timestamp: 1776696509735
X-SPAM-LEVEL: Spam detection results:  0
	AWL                     0.054 Adjusted score from AWL reputation of From:
 address
	BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
	DMARC_MISSING             0.1 Missing DMARC policy
	KAM_DMARC_STATUS         0.01 Test Rule for DKIM or SPF Failure with Strict
 Alignment
	SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
	SPF_PASS               -0.001 SPF: sender matches SPF record
Message-ID-Hash: KCDFNHOURRTJBS23CRYDXWKVQE4E5N44
X-Message-ID-Hash: KCDFNHOURRTJBS23CRYDXWKVQE4E5N44
X-MailFrom: f.gruenbichler@proxmox.com
X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop;
 banned-address; emergency; member-moderation; nonmember-moderation;
 administrivia; implicit-dest; max-recipients; max-size; news-moderation;
 no-subject; digests; suspicious-header
X-Mailman-Version: 3.3.10
Precedence: list
List-Id: Proxmox Backup Server development discussion
 <pbs-devel.lists.proxmox.com>
List-Help: <mailto:pbs-devel-request@lists.proxmox.com?subject=help>
List-Owner: <mailto:pbs-devel-owner@lists.proxmox.com>
List-Post: <mailto:pbs-devel@lists.proxmox.com>
List-Subscribe: <mailto:pbs-devel-join@lists.proxmox.com>
List-Unsubscribe: <mailto:pbs-devel-leave@lists.proxmox.com>

On April 16, 2026 7:18 pm, Hannes Laimer wrote:
> Add support for moving a single backup group to a different namespace
> within the same datastore.
>=20
> For the filesystem backend each snapshot directory is renamed
> individually. For S3 all objects are copied to the target prefix
> before deleting the source, per snapshot.
>=20
> Exclusive locks on the group and all its snapshots are acquired
> before the move to ensure no concurrent operations are active.
> Snapshots are locked and moved in batches to avoid exhausting file
> descriptors on groups with many snapshots.
>=20
> If the target group already exists and merging is enabled, the
> individual snapshots are moved into the existing target group
> provided both groups have the same owner and the source snapshots
> are strictly newer than the target snapshots.
>=20
> Signed-off-by: Hannes Laimer <h.laimer@proxmox.com>
> ---
>  pbs-datastore/src/backup_info.rs | 165 +++++++++++++++++++++++++++-
>  pbs-datastore/src/datastore.rs   | 177 +++++++++++++++++++++++++++++++
>  2 files changed, 340 insertions(+), 2 deletions(-)
>=20
> diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_=
info.rs
> index c33eb307..46dd9af1 100644
> --- a/pbs-datastore/src/backup_info.rs
> +++ b/pbs-datastore/src/backup_info.rs
> @@ -9,7 +9,7 @@ use std::time::Duration;
>  use anyhow::{bail, format_err, Context, Error};
>  use const_format::concatcp;
> =20
> -use proxmox_s3_client::S3PathPrefix;
> +use proxmox_s3_client::{S3ObjectKey, S3PathPrefix};
>  use proxmox_sys::fs::{lock_dir_noblock, lock_dir_noblock_shared, replace=
_file, CreateOptions};
>  use proxmox_systemd::escape_unit;
> =20
> @@ -273,8 +273,169 @@ impl BackupGroup {
>          Ok(delete_stats)
>      }
> =20
> +    /// Check merge invariants for moving this group's snapshots into an=
 existing
> +    /// target group. Returns an error if ownership differs or snapshot =
times overlap.
> +    pub(crate) fn check_merge_invariants(&self, target_ns: &BackupNamesp=
ace) -> Result<(), Error> {

this could take the other group right away? there's only a single call
site (below) that already has the instance of the target group
available.. and it would make it possible to re-use this logic for
inter-datastore group merge checks in the future as well ;)

> +        let target_group =3D BackupGroup {
> +            store: self.store.clone(),
> +            ns: target_ns.clone(),
> +            group: self.group.clone(),
> +        };
> +
> +        let src_owner =3D self.get_owner()?;
> +        let tgt_owner =3D target_group.get_owner()?;
> +        if src_owner !=3D tgt_owner {
> +            bail!(
> +                "cannot merge group '{}/{}': owner mismatch (source: {sr=
c_owner}, \
> +                target: {tgt_owner})",
> +                self.group.ty,
> +                self.group.id,
> +            );

this error is lacking important context - we are attempting to merge two
groups, so we should mention both of them (or both namespaces)?

> +        }
> +
> +        let src_oldest =3D self
> +            .iter_snapshots()?
> +            .collect::<Result<Vec<_>, _>>()?

this collect is not needed (and should probably be avoided if there are
a lot of snapshots ;)).

let iter =3D self.iter_snapshots()?;
let oldest =3D iter
  .filter_map(|s| s.ok())
  .map(|s| s.backup_time())
  .min();

> +            .iter()
> +            .map(|s| s.backup_time())
> +            .min();
> +        let tgt_newest =3D target_group
> +            .iter_snapshots()?
> +            .collect::<Result<Vec<_>, _>>()?
> +            .iter()
> +            .map(|s| s.backup_time())
> +            .max();

same here, though instead of max we could short-circuit and check for
the first snapshot that is >=3D src_oldest?

> +        if let (Some(src_oldest), Some(tgt_newest)) =3D (src_oldest, tgt=
_newest) {
> +            if src_oldest <=3D tgt_newest {
> +                bail!(
> +                    "cannot merge group '{}/{}': snapshot time overlap \
> +                    (oldest source: {src_oldest}, newest target: {tgt_ne=
west})",
> +                    self.group.ty,
> +                    self.group.id,
> +                );
> +            }
> +        }
> +
> +        Ok(())
> +    }
> +
> +    /// Move a single snapshot to a target group path.
> +    ///
> +    /// For the filesystem backend, renames the snapshot directory. For =
S3, copies all
> +    /// objects under the snapshot prefix to the target, renames the loc=
al cache directory,
> +    /// then deletes the source objects. A copy failure returns an error=
 with the snapshot
> +    /// intact at source. A delete failure is logged as a warning.
> +    ///
> +    /// The caller must hold an exclusive lock on the snapshot being mov=
ed.
> +    pub(crate) fn move_snapshot(

should this be renamed to `move` and implemented on `BackupDir`? &self
is only used for accessing the store path/name, and that is available in
exactly the same fashion there..

> +        &self,
> +        snap: &BackupDir,
> +        target_group_path: &Path,

shouldn't this be a BackupGroup? or even a BackupDir?

> +        backend: &DatastoreBackend,
> +    ) -> Result<(), Error> {
> +        let src_snap_path =3D snap.full_path();
> +        let dst_snap_path =3D target_group_path.join(snap.backup_time_st=
ring());

should we assert here that source and target are in the same store?

> +
> +        match backend {
> +            DatastoreBackend::Filesystem =3D> {
> +                std::fs::rename(&src_snap_path, &dst_snap_path).with_con=
text(|| {
> +                    format!("failed to move snapshot {src_snap_path:?} t=
o {dst_snap_path:?}")
> +                })?;
> +            }
> +            DatastoreBackend::S3(s3_client) =3D> {
> +                let src_rel =3D snap.relative_path();
> +                let src_rel_str =3D src_rel
> +                    .to_str()
> +                    .ok_or_else(|| format_err!("invalid source snapshot =
path"))?;
> +                let src_prefix_str =3D format!("{S3_CONTENT_PREFIX}/{src=
_rel_str}/");
> +
> +                let dst_snap_rel =3D dst_snap_path
> +                    .strip_prefix(self.store.base_path())

then we could use relative_path here as well?

> +                    .with_context(|| {
> +                        format!(
> +                            "target snapshot path {dst_snap_path:?} does=
 not start with \
> +                            datastore base path {:?}",
> +                            self.store.base_path(),
> +                        )
> +                    })?;
> +                let dst_rel_str =3D dst_snap_rel
> +                    .to_str()
> +                    .ok_or_else(|| format_err!("invalid target snapshot =
path"))?;
> +                let dst_prefix_str =3D format!("{S3_CONTENT_PREFIX}/{dst=
_rel_str}/");
> +
> +                let store_prefix =3D format!("{}/", self.store.name());
> +
> +                // Copy all objects for this snapshot to the target pref=
ix. On
> +                // failure the source snapshot remains intact and any pa=
rtial
> +                // target copies stay as leftovers (visible via the API =
for cleanup).
> +                let prefix =3D S3PathPrefix::Some(src_prefix_str.clone()=
);
> +                let mut token: Option<String> =3D None;
> +                let mut src_keys =3D Vec::new();
> +
> +                loop {
> +                    let result =3D proxmox_async::runtime::block_on(
> +                        s3_client.list_objects_v2(&prefix, token.as_dere=
f()),
> +                    )
> +                    .context("failed to list snapshot objects on S3 back=
end")?;
> +
> +                    for item in result.contents {
> +                        let full_key_str: &str =3D &item.key;
> +                        let rel_key =3D
> +                            full_key_str.strip_prefix(&store_prefix).ok_=
or_else(|| {
> +                                format_err!("unexpected key prefix in '{=
full_key_str}'")
> +                            })?;
> +                        let src_key =3D S3ObjectKey::try_from(rel_key)?;
> +
> +                        let suffix =3D rel_key
> +                            .strip_prefix(&src_prefix_str)
> +                            .ok_or_else(|| format_err!("unexpected key f=
ormat '{rel_key}'"))?;
> +                        let dst_key_str =3D format!("{dst_prefix_str}{su=
ffix}");
> +                        let dst_key =3D S3ObjectKey::try_from(dst_key_st=
r.as_str())?;
> +
> +                        proxmox_async::runtime::block_on(
> +                            s3_client.copy_object(src_key.clone(), dst_k=
ey),
> +                        )
> +                        .with_context(|| format!("failed to copy S3 obje=
ct '{rel_key}'"))?;
> +                        src_keys.push(src_key);
> +                    }
> +
> +                    if result.is_truncated {
> +                        token =3D result.next_continuation_token;
> +                    } else {
> +                        break;
> +                    }
> +                }
> +
> +                // Rename local cache directory.
> +                std::fs::rename(&src_snap_path, &dst_snap_path).with_con=
text(|| {
> +                    format!("failed to move snapshot cache {src_snap_pat=
h:?} to {dst_snap_path:?}")
> +                })?;
> +
> +                // Delete source S3 objects. Treat failures as warnings =
since the
> +                // snapshot is already at the target.
> +                for src_key in src_keys {
> +                    if let Err(err) =3D
> +                        proxmox_async::runtime::block_on(s3_client.delet=
e_object(src_key.clone()))
> +                    {
> +                        log::warn!(
> +                            "S3 move: failed to delete source object '{s=
rc_key:?}' \
> +                            (snapshot already at target, orphaned object=
 requires manual removal): \
> +                            {err:#}"
> +                        );
> +                    }
> +                }
> +            }
> +        }
> +
> +        // Clean up stale source lock files under /run for this snapshot=
.
> +        let _ =3D std::fs::remove_file(snap.manifest_lock_path());
> +        let _ =3D std::fs::remove_file(snap.lock_path());
> +
> +        Ok(())
> +    }
> +
>      /// Helper function, assumes that no more snapshots are present in t=
he group.
> -    fn remove_group_dir(&self) -> Result<(), Error> {
> +    pub(crate) fn remove_group_dir(&self) -> Result<(), Error> {
>          let note_path =3D self.store.group_notes_path(&self.ns, &self.gr=
oup);
>          if let Err(err) =3D std::fs::remove_file(&note_path) {
>              if err.kind() !=3D std::io::ErrorKind::NotFound {
> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore=
.rs
> index f1475d77..f88c356e 100644
> --- a/pbs-datastore/src/datastore.rs
> +++ b/pbs-datastore/src/datastore.rs
> @@ -88,6 +88,15 @@ const CHUNK_LOCK_TIMEOUT: Duration =3D Duration::from_=
secs(3 * 60 * 60);
>  const S3_DELETE_BATCH_LIMIT: usize =3D 100;
>  // max defer time for s3 batch deletions
>  const S3_DELETE_DEFER_LIMIT_SECONDS: Duration =3D Duration::from_secs(60=
 * 5);
> +// upper bound on concurrent snapshot locks during move operations to av=
oid exhausting FDs
> +const MAX_SNAPSHOT_LOCKS: usize =3D 512;
> +
> +/// Error from a per-group move attempt. Soft errors (e.g. lock conflict=
s) are
> +/// retryable, hard errors are not.
> +pub(crate) enum MoveGroupError {
> +    Soft(Error),
> +    Hard(Error),
> +}
> =20
>  /// checks if auth_id is owner, or, if owner is a token, if
>  /// auth_id is the user of the token
> @@ -1129,6 +1138,174 @@ impl DataStore {
>          backup_group.destroy(&self.backend()?)
>      }
> =20
> +    /// Move a single backup group to a different namespace within the s=
ame datastore.
> +    pub fn move_group(
> +        self: &Arc<Self>,
> +        source_ns: &BackupNamespace,
> +        group: &pbs_api_types::BackupGroup,
> +        target_ns: &BackupNamespace,
> +        merge_group: bool,
> +    ) -> Result<(), Error> {
> +        if *OLD_LOCKING {
> +            bail!("move operations require new-style file-based locking"=
);
> +        }
> +        if source_ns =3D=3D target_ns {
> +            bail!("source and target namespace must be different");
> +        }
> +
> +        let source_group =3D self.backup_group(source_ns.clone(), group.=
clone());
> +
> +        // Pre-lock existence checks to avoid unnecessary locking overhe=
ad.
> +        if !self.namespace_exists(target_ns) {
> +            bail!("target namespace '{target_ns}' does not exist");
> +        }
> +        if !source_group.exists() {
> +            bail!("group '{group}' does not exist in namespace '{source_=
ns}'");
> +        }
> +
> +        let backend =3D self.backend()?;
> +
> +        self.lock_and_move_group(&source_group, source_ns, target_ns, &b=
ackend, merge_group)
> +            .map_err(|err| match err {
> +                MoveGroupError::Soft(err) | MoveGroupError::Hard(err) =
=3D> err,
> +            })
> +    }
> +
> +    /// Lock and move a single group. Acquires exclusive locks on both s=
ource and target
> +    /// group paths, then moves snapshots in batches - each batch is loc=
ked, moved, and
> +    /// released before the next batch starts. This ensures no concurren=
t readers (e.g.
> +    /// verify) can operate on snapshots being moved, without exhausting=
 file descriptors.
> +    ///
> +    /// Returns a `MoveGroupError` on failure, indicating whether the fa=
ilure was a
> +    /// transient lock conflict (retryable) or a hard error.
> +    pub(crate) fn lock_and_move_group(

this can be private

> +        self: &Arc<Self>,
> +        group: &BackupGroup,
> +        source_ns: &BackupNamespace,
> +        target_ns: &BackupNamespace,

the order of parameters is different than above..

> +        backend: &DatastoreBackend,
> +        merge_groups: bool,
> +    ) -> Result<(), MoveGroupError> {
> +        let target_group_ns =3D group
> +            .backup_ns()
> +            .map_prefix(source_ns, target_ns)
> +            .map_err(MoveGroupError::Hard)?;
> +
> +        let target_group =3D self.backup_group(target_group_ns.clone(), =
group.group().clone());
> +
> +        // Acquire exclusive group locks - source prevents new snapshot =
additions/removals,
> +        // target prevents concurrent creation at the destination path.
> +        let _group_lock =3D group.lock().map_err(MoveGroupError::Soft)?;
> +        let _target_group_lock =3D target_group.lock().map_err(MoveGroup=
Error::Soft)?;

this (continued below)

> +        let merge =3D target_group.exists();
> +
> +        if merge && !merge_groups {
> +            return Err(MoveGroupError::Hard(format_err!(
> +                "group '{}' already exists in target namespace '{target_=
group_ns}' \
> +                and merging is disabled",
> +                group.group(),
> +            )));
> +        }
> +
> +        if merge {
> +            group
> +                .check_merge_invariants(&target_group_ns)
> +                .map_err(MoveGroupError::Hard)?;
> +        }
> +
> +        let mut snapshots: Vec<_> =3D group
> +            .iter_snapshots()
> +            .map_err(MoveGroupError::Hard)?
> +            .collect::<Result<Vec<_>, _>>()
> +            .map_err(MoveGroupError::Hard)?;
> +        // Sort by time so that a partial failure (e.g. batch N succeeds=
 but N+1
> +        // fails) leaves a clean time-ordered split: all moved snapshots=
 are older
> +        // than all remaining ones. This allows a merge retry to succeed=
 since the
> +        // invariant (source oldest > target newest) still holds.
> +        snapshots.sort_by_key(|s| s.backup_time());

if we need the full list of sorted backups, why not use
group.list_backups() and BackupInfo::sort_list ?

> +
> +        // Ensure the target type and group directories exist.
> +        std::fs::create_dir_all(self.type_path(&target_group_ns, group.g=
roup().ty))
> +            .map_err(|err| MoveGroupError::Hard(err.into()))?;
> +        let target_group_path =3D self.group_path(&target_group_ns, grou=
p.group());
> +        if !merge {
> +            std::fs::create_dir_all(&target_group_path)
> +                .map_err(|err| MoveGroupError::Hard(err.into()))?;
> +            // Copy owner so the group is visible in the UI during the m=
ove (especially
> +            // relevant for S3 where the copy can take a while).
> +            if let Ok(owner) =3D group.get_owner() {
> +                if let Err(err) =3D target_group.set_owner(&owner, true)=
 {
> +                    warn!(
> +                        "failed to set owner for target group '{}' in '{=
target_group_ns}': {err:#}",
> +                        group.group(),
> +                    );
> +                }
> +            }
> +        }

(continued from above) and this could be replaced with a call to
`create_locked_backup_group`. downside - differentiating between
"currently locked" and "other error" is harder, but the upside is that
that code already handles a lot of the logic for us, and is how backup,
tape restore and pull jobs create new groups or add snapshots to
existing groups..

maybe it would be worth it to refactor that implementation, and a add
new non-pub variant/wrapper that
- returns whether it created the group or not
- allows us to differentiate locking from other errors?

> +
> +        log::info!(
> +            "{} group '{}/{}' from '{}' to '{target_group_ns}'",
> +            if merge { "merging" } else { "moving" },
> +            group.group().ty,
> +            group.group().id,
> +            group.backup_ns(),
> +        );
> +
> +        // Lock and move snapshots in batches. Each batch is locked, mov=
ed, and released
> +        // before the next batch starts. This ensures no concurrent read=
er (e.g. verify,
> +        // which only takes a snapshot-level shared lock) can operate on=
 a snapshot while
> +        // it is being moved.
> +        for chunk in snapshots.chunks(MAX_SNAPSHOT_LOCKS) {
> +            let _batch_locks: Vec<_> =3D chunk
> +                .iter()
> +                .map(|snap| snap.lock().map_err(MoveGroupError::Soft))
> +                .collect::<Result<Vec<_>, _>>()?;

this is funky, but I guess it does the job ;) I'd feel a bit more
comfortable if the Vec<_> were explicit though..

> +
> +            for snap in chunk {
> +                group
> +                    .move_snapshot(snap, &target_group_path, backend)
> +                    .map_err(MoveGroupError::Hard)?;

see above, I think we can leverage the type system a bit more here..

> +            }
> +            // batch locks released here
> +        }
> +
> +        // Copy group notes to the target (unless merging into a group t=
hat already has notes).
> +        let src_notes_path =3D self.group_notes_path(group.backup_ns(), =
group.group());
> +        let dst_notes_path =3D self.group_notes_path(&target_group_ns, g=
roup.group());
> +        if src_notes_path.exists() && !dst_notes_path.exists() {
> +            if let Err(err) =3D std::fs::copy(&src_notes_path, &dst_note=
s_path) {

should this be a rename? should we warn if merging and the notes have
diverged? what if this is on S3?

> +                warn!(
> +                    "failed to copy group notes from {src_notes_path:?} =
to {dst_notes_path:?}: {err}"
> +                );
> +            }
> +        }
> +

the part starting here is basically group.destroy(), but we already have
the lock.. since it's pbs-datastore internal, we could change it to
consume the lock, and call it here? it only has two call sites, and we
don't risk the logic below diverging if new additional files are added
in the future..

> +        // Remove the now-empty source group directory (owner, notes, di=
r, group lock).
> +        group.remove_group_dir().map_err(MoveGroupError::Hard)?;
> +
> +        // For S3: delete orphaned source group-level objects (owner, no=
tes).
> +        if let DatastoreBackend::S3(s3_client) =3D backend {
> +            let src_group_rel =3D group.relative_group_path();
> +            let src_group_rel_str =3D src_group_rel
> +                .to_str()
> +                .ok_or_else(|| MoveGroupError::Hard(format_err!("invalid=
 source group path")))?;
> +            for filename in [GROUP_OWNER_FILE_NAME, GROUP_NOTES_FILE_NAM=
E] {
> +                let key_str =3D format!("{S3_CONTENT_PREFIX}/{src_group_=
rel_str}/{filename}");
> +                if let Ok(key) =3D S3ObjectKey::try_from(key_str.as_str(=
)) {
> +                    if let Err(err) =3D proxmox_async::runtime::block_on=
(s3_client.delete_object(key))
> +                    {
> +                        log::warn!(
> +                            "S3 move: failed to delete source group obje=
ct '{key_str}': {err:#}"
> +                        );
> +                    }
> +                }
> +            }
> +        }
> +
> +        Ok(())
> +    }
> +
>      /// Remove a backup directory including all content
>      pub fn remove_backup_dir(
>          self: &Arc<Self>,
> --=20
> 2.47.3
>=20
>=20
>=20
>=20
>=20
>=20