From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Thu, 9 Nov 2023 19:46:13 +0100
Message-Id: <20231109184614.1611127-26-c.ebner@proxmox.com>
In-Reply-To: <20231109184614.1611127-1-c.ebner@proxmox.com>
References: <20231109184614.1611127-1-c.ebner@proxmox.com>
Subject: [pbs-devel] [PATCH v4 proxmox-backup 25/26] pxar: add heuristic to reduce reused chunk fragmentation

For multiple consecutive runs with metadata-based file change detection,
the referencing of existing chunks can lead to fragmentation and an
increased size of the index file. In order to reduce this, check which
relative share of the chunks was referenced by the previous run and
re-chunk files where a constant threshold value is not reached.
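The idea can be illustrated with a minimal sketch (not the patch code; the
function name `should_reference` and its parameters are invented for this
example, only the 3/4 ratio check mirrors the FRAGMENTATION_THRESHOLD
introduced below):

    /// Decide whether a file should reference the previously uploaded chunks
    /// or be re-chunked, based on how much of those chunks the previous run
    /// actually referenced.
    fn should_reference(referenced_bytes: u64, chunk_bytes: u64, threshold: f64) -> bool {
        if chunk_bytes == 0 {
            return false;
        }
        // Re-use the old chunks only if the previously referenced share is
        // large enough; otherwise re-chunking avoids further fragmenting the
        // index.
        referenced_bytes as f64 / chunk_bytes as f64 >= threshold
    }

For example, `should_reference(700 * 1024, 1024 * 1024, 0.75)` returns false,
so such a file would be re-chunked rather than referenced.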
Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
Changes since version 3:
- not present in version 3

 pbs-client/src/pxar/create.rs | 55 +++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index 8b283b23..56e1de7a 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -1,7 +1,8 @@
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::ffi::{CStr, CString, OsStr};
 use std::fmt;
 use std::io::{self, Read};
+use std::ops::Bound::Included;
 use std::os::unix::ffi::OsStrExt;
 use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::path::{Path, PathBuf};
@@ -36,6 +37,7 @@ use crate::pxar::tools::assert_single_path_component;
 use crate::pxar::Flags;
 
 const MAX_FILE_SIZE: u64 = 1024;
+const FRAGMENTATION_THRESHOLD: f64 = 0.75;
 
 /// Pxar options for creating a pxar archive/stream
 #[derive(Default)]
@@ -214,6 +216,7 @@ struct Archiver {
     forced_boundaries: Arc>>,
     appendix: Appendix,
     prev_appendix: Option,
+    ref_offsets: BTreeMap,
 }
 
 type Encoder<'a, T> = pxar::encoder::aio::Encoder<'a, T>;
@@ -265,21 +268,23 @@ where
         )?);
     }
 
-    let (appendix_start, prev_cat_parent) = if let Some(ref mut prev_ref) = options.previous_ref {
-        let entry = prev_ref
-            .catalog
-            .lookup_recursive(prev_ref.archive_name.as_bytes())?;
-        let parent = match entry.attr {
-            DirEntryAttribute::Archive { .. } => Some(entry),
-            _ => None,
+    let (appendix_start, prev_cat_parent, ref_offsets) =
+        if let Some(ref mut prev_ref) = options.previous_ref {
+            let entry = prev_ref
+                .catalog
+                .lookup_recursive(prev_ref.archive_name.as_bytes())?;
+            let parent = match entry.attr {
+                DirEntryAttribute::Archive { .. } => Some(entry),
+                _ => None,
+            };
+            let appendix_start = prev_ref
+                .catalog
+                .appendix_offset(prev_ref.archive_name.as_bytes())?;
+            let ref_offsets = prev_ref.catalog.fetch_offsets()?;
+            (appendix_start, parent, ref_offsets)
+        } else {
+            (None, None, BTreeMap::new())
         };
-        let appendix_start = prev_ref
-            .catalog
-            .appendix_offset(prev_ref.archive_name.as_bytes())?;
-        (appendix_start, parent)
-    } else {
-        (None, None)
-    };
 
     let mut archiver = Archiver {
         feature_flags,
@@ -299,6 +304,7 @@ where
         forced_boundaries,
         appendix: Appendix::new(),
         prev_appendix: appendix_start,
+        ref_offsets,
     };
 
     if let Some(ref mut catalog) = archiver.catalog {
@@ -745,6 +751,25 @@ impl Archiver {
         let (indices, start_padding, end_padding) =
             prev_ref.index.indices(start_offset, end_offset)?;
 
+        // Heuristic to reduce chunk fragmentation by only referencing a file if its chunks
+        // were also referenced by the previous run. If the file spans more than 2 chunks,
+        // it is always referenced; otherwise check that at least 3/4 of the chunks were
+        // referenced in the last backup run.
+        if indices.len() < 3 {
+            let chunk_sum = indices.iter().fold(0f64, |sum, ind| sum + ind.end() as f64);
+            let chunks_start = start_offset - start_padding;
+            let chunks_end = end_offset + end_padding;
+            let refs_sum = self
+                .ref_offsets
+                .range((Included(chunks_start), Included(chunks_end)))
+                .fold(0f64, |sum, (_, size)| sum + *size as f64);
+
+            // Does not cover the attribute size information
+            if refs_sum / chunk_sum < FRAGMENTATION_THRESHOLD {
+                return Ok(false);
+            }
+        }
+
         let appendix_ref_offset = self.appendix.insert(indices, start_padding);
         let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
         self.add_appendix_ref(encoder, file_name, appendix_ref_offset, file_size)
-- 
2.39.2
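For context, the inclusive range lookup on `ref_offsets` used by the heuristic
can be exercised in isolation (a standalone sketch; the offsets and sizes are
made-up values, only the `BTreeMap::range` call with `Included` bounds reflects
the pattern used in the patch):

    use std::collections::BTreeMap;
    use std::ops::Bound::Included;

    fn main() {
        // Hypothetical map of chunk offsets to referenced sizes, standing in
        // for the archiver's `ref_offsets` field.
        let mut ref_offsets: BTreeMap<u64, u64> = BTreeMap::new();
        ref_offsets.insert(0, 4096);
        ref_offsets.insert(4096, 2048);
        ref_offsets.insert(8192, 4096);

        // Sum the referenced sizes of all chunks whose offset falls within
        // the inclusive range [chunks_start, chunks_end].
        let (chunks_start, chunks_end) = (0u64, 8192u64);
        let refs_sum: u64 = ref_offsets
            .range((Included(chunks_start), Included(chunks_end)))
            .map(|(_, size)| *size)
            .sum();
        assert_eq!(refs_sum, 4096 + 2048 + 4096);
    }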