From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <c.ebner@proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id A75F2A0CAE
 for <pbs-devel@lists.proxmox.com>; Thu,  9 Nov 2023 19:47:32 +0100 (CET)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id 684DD187C3
 for <pbs-devel@lists.proxmox.com>; Thu,  9 Nov 2023 19:46:40 +0100 (CET)
Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com
 [94.136.29.106])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS
 for <pbs-devel@lists.proxmox.com>; Thu,  9 Nov 2023 19:46:38 +0100 (CET)
Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1])
 by proxmox-new.maurer-it.com (Proxmox) with ESMTP id AD48D478BD
 for <pbs-devel@lists.proxmox.com>; Thu,  9 Nov 2023 19:46:37 +0100 (CET)
From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Thu,  9 Nov 2023 19:46:13 +0100
Message-Id: <20231109184614.1611127-26-c.ebner@proxmox.com>
X-Mailer: git-send-email 2.39.2
In-Reply-To: <20231109184614.1611127-1-c.ebner@proxmox.com>
References: <20231109184614.1611127-1-c.ebner@proxmox.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
 AWL 0.063 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING             0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
 T_SCC_BODY_TEXT_LINE    -0.01 -
Subject: [pbs-devel] [PATCH v4 proxmox-backup 25/26] pxar: add heuristic to
 reduce reused chunk fragmentation
X-BeenThere: pbs-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Backup Server development discussion
 <pbs-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pbs-devel/>
List-Post: <mailto:pbs-devel@lists.proxmox.com>
List-Help: <mailto:pbs-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Thu, 09 Nov 2023 18:47:32 -0000

For multiple consecutive runs with metadata based file change detection
the referencing of existing chunks can lead to fragmentation and an
increased size of the index file.

In order to reduce this, check which chunks relative size has been
referenced by the previous run and re-chunk files where a constant
threshold value has been underrun.

Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
Changes since version 3:
- no present in version 3

 pbs-client/src/pxar/create.rs | 55 +++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index 8b283b23..56e1de7a 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -1,7 +1,8 @@
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::ffi::{CStr, CString, OsStr};
 use std::fmt;
 use std::io::{self, Read};
+use std::ops::Bound::Included;
 use std::os::unix::ffi::OsStrExt;
 use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::path::{Path, PathBuf};
@@ -36,6 +37,7 @@ use crate::pxar::tools::assert_single_path_component;
 use crate::pxar::Flags;
 
 const MAX_FILE_SIZE: u64 = 1024;
+const FRAGMENTATION_THRESHOLD: f64 = 0.75;
 
 /// Pxar options for creating a pxar archive/stream
 #[derive(Default)]
@@ -214,6 +216,7 @@ struct Archiver {
     forced_boundaries: Arc<Mutex<VecDeque<InjectChunks>>>,
     appendix: Appendix,
     prev_appendix: Option<AppendixStartOffset>,
+    ref_offsets: BTreeMap<u64, u64>,
 }
 
 type Encoder<'a, T> = pxar::encoder::aio::Encoder<'a, T>;
@@ -265,21 +268,23 @@ where
         )?);
     }
 
-    let (appendix_start, prev_cat_parent) = if let Some(ref mut prev_ref) = options.previous_ref {
-        let entry = prev_ref
-            .catalog
-            .lookup_recursive(prev_ref.archive_name.as_bytes())?;
-        let parent = match entry.attr {
-            DirEntryAttribute::Archive { .. } => Some(entry),
-            _ => None,
+    let (appendix_start, prev_cat_parent, ref_offsets) =
+        if let Some(ref mut prev_ref) = options.previous_ref {
+            let entry = prev_ref
+                .catalog
+                .lookup_recursive(prev_ref.archive_name.as_bytes())?;
+            let parent = match entry.attr {
+                DirEntryAttribute::Archive { .. } => Some(entry),
+                _ => None,
+            };
+            let appendix_start = prev_ref
+                .catalog
+                .appendix_offset(prev_ref.archive_name.as_bytes())?;
+            let ref_offsets = prev_ref.catalog.fetch_offsets()?;
+            (appendix_start, parent, ref_offsets)
+        } else {
+            (None, None, BTreeMap::new())
         };
-        let appendix_start = prev_ref
-            .catalog
-            .appendix_offset(prev_ref.archive_name.as_bytes())?;
-        (appendix_start, parent)
-    } else {
-        (None, None)
-    };
 
     let mut archiver = Archiver {
         feature_flags,
@@ -299,6 +304,7 @@ where
         forced_boundaries,
         appendix: Appendix::new(),
         prev_appendix: appendix_start,
+        ref_offsets,
     };
 
     if let Some(ref mut catalog) = archiver.catalog {
@@ -745,6 +751,25 @@ impl Archiver {
         let (indices, start_padding, end_padding) =
             prev_ref.index.indices(start_offset, end_offset)?;
 
+        // Heuristic to reduce chunk fragmentation by only referencing files if the previous
+        // run referenced the files. If the files spans more than 2 chunks, it should always be
+        // referenced, otherwise check if at least 3/4 of the chunk where referenced in the
+        // last backup run.
+        if indices.len() < 3 {
+            let chunk_sum = indices.iter().fold(0f64, |sum, ind| sum + ind.end() as f64);
+            let chunks_start = start_offset - start_padding;
+            let chunks_end = end_offset + end_padding;
+            let refs_sum = self
+                .ref_offsets
+                .range((Included(chunks_start), Included(chunks_end)))
+                .fold(0f64, |sum, (_, size)| sum + *size as f64);
+
+            // Does not cover the attribute size information
+            if refs_sum / chunk_sum < FRAGMENTATION_THRESHOLD {
+                return Ok(false);
+            }
+        }
+
         let appendix_ref_offset = self.appendix.insert(indices, start_padding);
         let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
         self.add_appendix_ref(encoder, file_name, appendix_ref_offset, file_size)
-- 
2.39.2