From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH proxmox-backup 3/4] datastore: introduce reverse chunk digest lookup table
Date: Mon, 19 Jan 2026 14:27:06 +0100 [thread overview]
Message-ID: <20260119132707.686523-4-c.ebner@proxmox.com> (raw)
In-Reply-To: <20260119132707.686523-1-c.ebner@proxmox.com>
Store the [digest]->[namespace]->[group]->[snapshot] relation
by inserting the items into the respective indices, generating their
keys and storing the referencing keys in a hashmap, allowing for
easy lookup. This will be used to track chunk references of snapshot
index files during phase 1 of garbage collection and to accumulate
statistics after filling in the relevant raw chunk size information
during phase 2 of garbage collection.
The actual hierarchy is only reconstructed for each chunk digest on
accumulation, making it possible to identify unique chunks and perform
chunk reference counting.
Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
pbs-datastore/src/reverse_digest_map.rs | 245 ++++++++++++++++++++++++
1 file changed, 245 insertions(+)
diff --git a/pbs-datastore/src/reverse_digest_map.rs b/pbs-datastore/src/reverse_digest_map.rs
index 323e7315d..1c3efa8a8 100644
--- a/pbs-datastore/src/reverse_digest_map.rs
+++ b/pbs-datastore/src/reverse_digest_map.rs
@@ -1,7 +1,10 @@
use std::collections::{BTreeMap, HashMap};
use std::hash::BuildHasher;
+use tracing::info;
+
use pbs_api_types::{BackupDir, BackupGroup, BackupNamespace};
+use proxmox_human_byte::HumanByte;
#[derive(Copy, Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
/// Backup snapshot index key
@@ -102,3 +105,245 @@ impl NamespaceIndex {
self.index.get(key)
}
}
+
+#[derive(Default)]
+/// Store metadata associated with given digest such as raw chunk size and
+/// reference counted list of snapshot keys referencing given chunk.
+struct DigestMetadata {
+    // Raw chunk size, filled in via `set_raw_chunk_size` during GC phase 2
+    raw_size: u64,
+    // Logical chunk size as recorded by the referencing index files
+    logical_size: u64,
+    // Per-snapshot reference count: how often each snapshot references this chunk
+    refcount: BTreeMap<SKey, u32>,
+}
+
+impl DigestMetadata {
+    /// Create metadata for a chunk first referenced by snapshot `skey`,
+    /// carrying the chunk's logical size.
+    ///
+    /// The raw chunk size is left at 0 and filled in later, during phase 2
+    /// of garbage collection.
+    fn from_skey_with_logical_size(skey: SKey, logical_size: u64) -> Self {
+        Self {
+            raw_size: 0,
+            logical_size,
+            refcount: BTreeMap::from([(skey, 1)]),
+        }
+    }
+}
+
+#[derive(Default)]
+/// Reverse lookup table mapping chunk digests to the snapshots referencing
+/// them, with namespace, group and snapshot data deduplicated via index
+/// tables and referenced by compact keys.
+pub(super) struct ReverseDigestMap {
+    // Map associating digest to snapshot index with reference count (digest metadata)
+    digests: HashMap<[u8; 32], DigestMetadata>,
+    // indexes storing the actual namespace, group and snapshot data
+    namespaces: NamespaceIndex,
+    groups: GroupIndex,
+    snapshots: SnapshotIndex,
+}
+
+impl ReverseDigestMap {
+    /// Associate digest or update reference count for backup snapshot of given namespace.
+    ///
+    /// Returns [`true`] if the digest key was not yet present.
+    pub(super) fn insert(
+        &mut self,
+        digest: &[u8; 32],
+        namespace: &BackupNamespace,
+        snapshot: &BackupDir,
+        logical_size: u64,
+    ) -> bool {
+        let mut is_new = true;
+        // Deduplicate namespace/group/snapshot data via the respective
+        // indices, only the compact keys are stored per digest.
+        let nkey = self.namespaces.insert(namespace.clone());
+        let gkey = self.groups.insert(snapshot.group.clone());
+        let skey = self
+            .snapshots
+            .insert(BackupTime::from(snapshot.time), nkey, gkey);
+
+        self.digests
+            .entry(*digest)
+            .and_modify(|digest_info| {
+                is_new = false;
+                // The same snapshot may reference the digest multiple times,
+                // so count duplicates per snapshot key.
+                digest_info
+                    .refcount
+                    .entry(skey)
+                    .and_modify(|count| *count += 1)
+                    .or_insert(1);
+                digest_info.logical_size = logical_size;
+            })
+            .or_insert_with(|| DigestMetadata::from_skey_with_logical_size(skey, logical_size));
+
+        is_new
+    }
+
+    /// Set the raw chunk size metadata information for associated digest.
+    ///
+    /// Inserts the digest without snapshots associated if not present.
+    pub(super) fn set_raw_chunk_size(&mut self, digest: &[u8; 32], raw_size: u64) {
+        self.digests
+            .entry(*digest)
+            .and_modify(|digest_info| digest_info.raw_size = raw_size)
+            // fix: previously the digest was never inserted when absent,
+            // contradicting the documented contract above
+            .or_insert_with(|| DigestMetadata {
+                raw_size,
+                ..Default::default()
+            });
+    }
+}
+
+#[derive(Debug, Default)]
+/// Chunk statistics aggregated per entry (snapshot, group, namespace or the
+/// whole datastore), accumulated over all digests referencing the entry.
+pub(super) struct AccumulatedStats {
+    // Sum of raw chunk sizes over all referenced digests
+    raw_size: u64,
+    // Sum of logical sizes, each reference counted individually
+    logical_size: u64,
+    // Raw size of chunks exclusively referenced by this entry
+    unique_raw_size: u64,
+    // Number of referenced chunks, each digest counted once per entry
+    refcount: u64,
+    // Number of chunks referenced by this entry only
+    uniquecount: u64,
+    // Number of chunks shared with other references (deduplicated)
+    dedupcount: u64,
+}
+
+// Hierarchy for snapshots on individual digest: [NKey] -> [GKey] -> [(SKey, count)]
+type DigestHierarchy = HashMap<NKey, HashMap<GKey, Vec<(SKey, u32)>>>;
+
+/// Format a namespace for the statistics listing.
+///
+/// The root namespace has an empty path representation, so an explicit
+/// `[root]` marker is used to keep it distinguishable in the output.
+fn format_namespace(namespace: &BackupNamespace) -> String {
+    if !namespace.is_root() {
+        namespace.display_as_path().to_string()
+    } else {
+        "[root]".to_string()
+    }
+}
+
+#[derive(Default)]
+/// Accumulates per-namespace/group/snapshot chunk statistics, keyed by the
+/// formatted entry path string.
+pub(super) struct DigestStatAccumulator {
+    list: HashMap<String, AccumulatedStats>,
+}
+
+impl DigestStatAccumulator {
+    // Accumulate statistics by consuming the digest map
+    //
+    // For each digest the namespace -> group -> snapshot hierarchy is
+    // reconstructed from the index keys to decide chunk uniqueness, then the
+    // statistics are accounted on snapshot, group, namespace and datastore
+    // level and finally listed via the task log, sorted by entry name.
+    pub(super) fn accumulate_and_list(mut self, digest_map: ReverseDigestMap) {
+        let mut datastore_stats = AccumulatedStats::default();
+
+        for digest_info in digest_map.digests.into_values() {
+            // Skip digests without any snapshot references
+            if digest_info.refcount.is_empty() {
+                continue;
+            }
+
+            // Generate [namespaces] -> [groups] -> [snapshots] reference hierarchy for given digest
+            // to account for statistics. The hierarchy is then used to account for chunk uniqueness
+            // and raw size.
+            let mut hierarchy: DigestHierarchy = HashMap::new();
+            for (skey, count) in digest_info.refcount {
+                if let Some((nkey, gkey, _snapshot_time)) = digest_map.snapshots.lookup(&skey) {
+                    hierarchy
+                        .entry(*nkey)
+                        .or_default()
+                        .entry(*gkey)
+                        .or_default()
+                        .push((skey, count));
+                }
+            }
+
+            // A chunk counts as unique when it is referenced exactly once
+            // overall: the flag is refined while descending the hierarchy,
+            // each level checking its fan-out before the contained entries
+            // are accounted — ordering matters here.
+            let mut unique = hierarchy.len() <= 1;
+            let raw_size = digest_info.raw_size;
+            let logical_size = digest_info.logical_size;
+
+            for (nkey, gkey_map) in hierarchy {
+                if gkey_map.len() > 1 {
+                    unique = false;
+                }
+
+                let namespace = digest_map.namespaces.lookup(&nkey).unwrap();
+                let mut logical_namespace_sum = 0;
+
+                for (gkey, skey_list) in gkey_map {
+                    if skey_list.len() > 1 {
+                        unique = false;
+                    }
+
+                    let group = digest_map.groups.lookup(&gkey).unwrap();
+                    let mut logical_group_sum = 0;
+
+                    for (skey, duplicates) in skey_list {
+                        // A single snapshot referencing the chunk more than
+                        // once also means the chunk is not unique.
+                        if duplicates > 1 {
+                            unique = false;
+                        }
+
+                        let (_nkey, _gkey, timestamp) = digest_map.snapshots.lookup(&skey).unwrap();
+                        let snapshot =
+                            BackupDir::from((group.clone(), Into::<i64>::into(*timestamp)));
+
+                        let snapshot = format!("{}/{snapshot}", format_namespace(namespace));
+                        // Logical data is accounted once per reference, the
+                        // raw size once per snapshot entry
+                        let logical_snapshot_sum = duplicates as u64 * logical_size;
+                        self.account(snapshot, raw_size, logical_snapshot_sum, unique);
+                        logical_group_sum += logical_snapshot_sum;
+                    }
+
+                    let group = format!("{}/{group}", format_namespace(namespace));
+                    self.account(group, raw_size, logical_group_sum, unique);
+                    logical_namespace_sum += logical_group_sum;
+                }
+
+                let namespace = format_namespace(namespace);
+                self.account(namespace, raw_size, logical_namespace_sum, unique);
+            }
+
+            // Datastore level summary, each digest counted exactly once
+            datastore_stats.raw_size += raw_size;
+            datastore_stats.refcount += 1;
+            if unique {
+                datastore_stats.uniquecount += 1;
+                datastore_stats.unique_raw_size += raw_size;
+            } else {
+                datastore_stats.dedupcount += 1;
+            }
+        }
+
+        // List entries in deterministic, path-sorted order
+        let mut list: Vec<(String, AccumulatedStats)> = self.list.into_iter().collect();
+        list.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+
+        for (name, stats) in list {
+            info!(
+                "{name}: {} ({} referenced / {} unique ({}) / {} deduped chunks), deduplication factor: {:.2}",
+                HumanByte::from(stats.raw_size),
+                stats.refcount,
+                stats.uniquecount,
+                HumanByte::from(stats.unique_raw_size),
+                stats.dedupcount,
+                Self::dedup_factor(stats.raw_size, stats.logical_size),
+            );
+        }
+
+        info!("===");
+        info!(
+            "Datastore: {} ({} referenced / {} unique / {} deduped chunks)",
+            HumanByte::from(datastore_stats.raw_size),
+            datastore_stats.refcount,
+            datastore_stats.uniquecount,
+            datastore_stats.dedupcount,
+        );
+        info!("===");
+    }
+
+    /// Account for statistics on given key, inserted if not yet present
+    fn account(&mut self, key: String, raw_size: u64, logical_size: u64, unique: bool) {
+        // Pre-compute the per-call increments depending on uniqueness
+        let (uniquecount, dedupcount, unique_raw_size) =
+            if unique { (1, 0, raw_size) } else { (0, 1, 0) };
+        self.list
+            .entry(key)
+            .and_modify(|stats| {
+                stats.raw_size += raw_size;
+                stats.unique_raw_size += unique_raw_size;
+                stats.logical_size += logical_size;
+                stats.refcount += 1;
+                stats.uniquecount += uniquecount;
+                stats.dedupcount += dedupcount;
+            })
+            .or_insert(AccumulatedStats {
+                raw_size,
+                unique_raw_size,
+                logical_size,
+                refcount: 1,
+                uniquecount,
+                dedupcount,
+            });
+    }
+
+    /// Deduplication factor: logical over raw size, guarding against a raw
+    /// size of zero by defaulting to factor 1.
+    fn dedup_factor(raw_size: u64, logical_size: u64) -> f64 {
+        if raw_size > 0 {
+            logical_size as f64 / raw_size as f64
+        } else {
+            1.0
+        }
+    }
+}
--
2.47.3
_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
next prev parent reply other threads:[~2026-01-19 13:27 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-19 13:27 [pbs-devel] [RFC proxmox-backup 0/4] fix #5799: Gather per-namespace/group/snapshot storage usage stats Christian Ebner
2026-01-19 13:27 ` [pbs-devel] [PATCH proxmox-backup 1/4] chunk store: restrict chunk sweep helper method to module parent Christian Ebner
2026-01-19 13:27 ` [pbs-devel] [PATCH proxmox-backup 2/4] datastore: add namespace/group/snapshot indices for reverse lookups Christian Ebner
2026-01-19 13:27 ` Christian Ebner [this message]
2026-01-19 13:27 ` [pbs-devel] [PATCH proxmox-backup 4/4] fix #5799: GC: track chunk digests and accumulate statistics Christian Ebner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260119132707.686523-4-c.ebner@proxmox.com \
--to=c.ebner@proxmox.com \
--cc=pbs-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox