From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 7B6B71FF146 for ; Mon, 19 Jan 2026 14:27:18 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 085EA1ECE2; Mon, 19 Jan 2026 14:27:26 +0100 (CET) From: Christian Ebner To: pbs-devel@lists.proxmox.com Date: Mon, 19 Jan 2026 14:27:06 +0100 Message-ID: <20260119132707.686523-4-c.ebner@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260119132707.686523-1-c.ebner@proxmox.com> References: <20260119132707.686523-1-c.ebner@proxmox.com> MIME-Version: 1.0 X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1768829187478 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.048 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: [pbs-devel] [PATCH proxmox-backup 3/4] datastore: introduce reverse chunk digest lookup table X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: Proxmox Backup Server development discussion Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: pbs-devel-bounces@lists.proxmox.com Sender: "pbs-devel" Store the [digest]->[namespace]->[group]->[snapshot] relation by inserting the items in the respective indices, generating their key and store the referencing keys in a hashmap, allowing for easy lookup. 
This will be used to track chunk references of snapshot index files during phase 1 of garbage collection and accumulate statistics after filling relevant raw chunk size information during phase 2 of garbage collection. The actual hierarchy is only reconstructed for each chunk digest on accumulation, allowing to identify unique chunks and perform chunk reference counting. Signed-off-by: Christian Ebner --- pbs-datastore/src/reverse_digest_map.rs | 245 ++++++++++++++++++++++++ 1 file changed, 245 insertions(+) diff --git a/pbs-datastore/src/reverse_digest_map.rs b/pbs-datastore/src/reverse_digest_map.rs index 323e7315d..1c3efa8a8 100644 --- a/pbs-datastore/src/reverse_digest_map.rs +++ b/pbs-datastore/src/reverse_digest_map.rs @@ -1,7 +1,10 @@ use std::collections::{BTreeMap, HashMap}; use std::hash::BuildHasher; +use tracing::info; + use pbs_api_types::{BackupDir, BackupGroup, BackupNamespace}; +use proxmox_human_byte::HumanByte; #[derive(Copy, Clone, Eq, Hash, PartialEq, PartialOrd, Ord)] /// Backup snapshot index key @@ -102,3 +105,245 @@ impl NamespaceIndex { self.index.get(key) } } + +#[derive(Default)] +/// Store metadata associated with given digest such as raw chunk size and +/// reference counted list of snapshot keys referencing given chunk. 
+struct DigestMetadata {
+    raw_size: u64,
+    logical_size: u64,
+    refcount: BTreeMap<SKey, usize>,
+}
+
+impl DigestMetadata {
+    fn from_skey_with_logical_size(skey: SKey, logical_size: u64) -> Self {
+        let mut refcount = BTreeMap::new();
+        refcount.insert(skey, 1);
+        Self {
+            raw_size: 0,
+            logical_size,
+            refcount,
+        }
+    }
+}
+
+#[derive(Default)]
+pub(super) struct ReverseDigestMap {
+    // Map associating digest to snapshot index with reference count (digest metadata)
+    digests: HashMap<[u8; 32], DigestMetadata>,
+    // indexes storing the actual namespace, group and snapshot data
+    namespaces: NamespaceIndex,
+    groups: GroupIndex,
+    snapshots: SnapshotIndex,
+}
+
+impl ReverseDigestMap {
+    /// Associate digest or update reference count for backup snapshot of given namespace.
+    ///
+    /// Returns [`true`] if the digest key was not yet present.
+    pub(super) fn insert(
+        &mut self,
+        digest: &[u8; 32],
+        namespace: &BackupNamespace,
+        snapshot: &BackupDir,
+        logical_size: u64,
+    ) -> bool {
+        let mut is_new = true;
+        let nkey = self.namespaces.insert(namespace.clone());
+        let gkey = self.groups.insert(snapshot.group.clone());
+        let skey = self
+            .snapshots
+            .insert(BackupTime::from(snapshot.time), nkey, gkey);
+
+        self.digests
+            .entry(*digest)
+            .and_modify(|digest_info| {
+                is_new = false;
+                digest_info
+                    .refcount
+                    .entry(skey)
+                    .and_modify(|count| *count += 1)
+                    .or_insert(1);
+                digest_info.logical_size = logical_size;
+            })
+            .or_insert(DigestMetadata::from_skey_with_logical_size(
+                skey,
+                logical_size,
+            ));
+
+        is_new
+    }
+
+    /// Set the raw chunk size metadata information for associated digest.
+    ///
+    /// Inserts the digest without snapshots associated if not present.
+    pub(super) fn set_raw_chunk_size(&mut self, digest: &[u8; 32], raw_size: u64) {
+        // `or_default` also inserts the digest without associated snapshots when
+        // not yet present, as stated by the doc comment above; a bare `and_modify`
+        // would silently drop absent digests and make the `refcount.is_empty()`
+        // guard in the accumulator unreachable.
+        self.digests.entry(*digest).or_default().raw_size = raw_size;
+    }
+}
+
+#[derive(Debug, Default)]
+pub(super) struct AccumulatedStats {
+    raw_size: u64,
+    logical_size: u64,
+    unique_raw_size: u64,
+    refcount: u64,
+    uniquecount: u64,
+    dedupcount: u64,
+}
+
+// Hierarchy for snapshots on individual digest: [NKey] -> [GKey] -> [(SKey, count)]
+type DigestHierarchy = HashMap<NKey, HashMap<GKey, Vec<(SKey, usize)>>>;
+
+fn format_namespace(namespace: &BackupNamespace) -> String {
+    if namespace.is_root() {
+        "[root]".to_string()
+    } else {
+        namespace.display_as_path().to_string()
+    }
+}
+
+#[derive(Default)]
+pub(super) struct DigestStatAccumulator {
+    list: HashMap<String, AccumulatedStats>,
+}
+
+impl DigestStatAccumulator {
+    // Accumulate statistics by consuming the digest map
+    pub(super) fn accumulate_and_list(mut self, digest_map: ReverseDigestMap) {
+        let mut datastore_stats = AccumulatedStats::default();
+
+        for digest_info in digest_map.digests.into_values() {
+            if digest_info.refcount.is_empty() {
+                continue;
+            }
+
+            // Generate [namespaces] -> [groups] -> [snapshots] reference hierarchy for given digest
+            // to account for statistics. The hierarchy is then used to account for chunk uniqueness
+            // and raw size.
+            let mut hierarchy: DigestHierarchy = HashMap::new();
+            for (skey, count) in digest_info.refcount {
+                if let Some((nkey, gkey, _snapshot_time)) = digest_map.snapshots.lookup(&skey) {
+                    hierarchy
+                        .entry(*nkey)
+                        .or_default()
+                        .entry(*gkey)
+                        .or_default()
+                        .push((skey, count));
+                }
+            }
+
+            let mut unique = hierarchy.len() <= 1;
+            let raw_size = digest_info.raw_size;
+            let logical_size = digest_info.logical_size;
+
+            for (nkey, gkey_map) in hierarchy {
+                if gkey_map.len() > 1 {
+                    unique = false;
+                }
+
+                let namespace = digest_map.namespaces.lookup(&nkey).unwrap();
+                let mut logical_namespace_sum = 0;
+
+                for (gkey, skey_list) in gkey_map {
+                    if skey_list.len() > 1 {
+                        unique = false;
+                    }
+
+                    let group = digest_map.groups.lookup(&gkey).unwrap();
+                    let mut logical_group_sum = 0;
+
+                    for (skey, duplicates) in skey_list {
+                        if duplicates > 1 {
+                            unique = false;
+                        }
+
+                        let (_nkey, _gkey, timestamp) = digest_map.snapshots.lookup(&skey).unwrap();
+                        let snapshot =
+                            BackupDir::from((group.clone(), Into::<i64>::into(*timestamp)));
+
+                        let snapshot = format!("{}/{snapshot}", format_namespace(namespace));
+                        let logical_snapshot_sum = duplicates as u64 * logical_size;
+                        self.account(snapshot, raw_size, logical_snapshot_sum, unique);
+                        logical_group_sum += logical_snapshot_sum;
+                    }
+
+                    let group = format!("{}/{group}", format_namespace(namespace));
+                    self.account(group, raw_size, logical_group_sum, unique);
+                    logical_namespace_sum += logical_group_sum;
+                }
+
+                let namespace = format_namespace(namespace);
+                self.account(namespace, raw_size, logical_namespace_sum, unique);
+            }
+
+            datastore_stats.raw_size += raw_size;
+            datastore_stats.refcount += 1;
+            if unique {
+                datastore_stats.uniquecount += 1;
+                datastore_stats.unique_raw_size += raw_size;
+            } else {
+                datastore_stats.dedupcount += 1;
+            }
+        }
+
+        let mut list: Vec<(String, AccumulatedStats)> = self.list.into_iter().collect();
+        list.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+
+        for (name, stats) in list {
+            info!(
+                "{name}: {} ({} referenced / {} unique ({}) / {} deduped chunks), deduplication factor: {:.2}",
+                HumanByte::from(stats.raw_size),
+                stats.refcount,
+                stats.uniquecount,
+                HumanByte::from(stats.unique_raw_size),
+                stats.dedupcount,
+                Self::dedup_factor(stats.raw_size, stats.logical_size),
+            );
+        }
+
+        info!("===");
+        info!(
+            "Datastore: {} ({} referenced / {} unique / {} deduped chunks)",
+            HumanByte::from(datastore_stats.raw_size),
+            datastore_stats.refcount,
+            datastore_stats.uniquecount,
+            datastore_stats.dedupcount,
+        );
+        info!("===");
+    }
+
+    /// Account for statistics on given key, inserted if not yet present
+    fn account(&mut self, key: String, raw_size: u64, logical_size: u64, unique: bool) {
+        let (uniquecount, dedupcount, unique_raw_size) =
+            if unique { (1, 0, raw_size) } else { (0, 1, 0) };
+        self.list
+            .entry(key)
+            .and_modify(|stats| {
+                stats.raw_size += raw_size;
+                stats.unique_raw_size += unique_raw_size;
+                stats.logical_size += logical_size;
+                stats.refcount += 1;
+                stats.uniquecount += uniquecount;
+                stats.dedupcount += dedupcount;
+            })
+            .or_insert(AccumulatedStats {
+                raw_size,
+                unique_raw_size,
+                logical_size,
+                refcount: 1,
+                uniquecount,
+                dedupcount,
+            });
+    }
+
+    fn dedup_factor(raw_size: u64, logical_size: u64) -> f64 {
+        if raw_size > 0 {
+            logical_size as f64 / raw_size as f64
+        } else {
+            1.0
+        }
+    }
+}
-- 
2.47.3


_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel