From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 95293BC069 for ; Thu, 28 Mar 2024 13:38:06 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 7625B9FF4 for ; Thu, 28 Mar 2024 13:37:48 +0100 (CET) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [94.136.29.106]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS for ; Thu, 28 Mar 2024 13:37:45 +0100 (CET) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id AE18542936 for ; Thu, 28 Mar 2024 13:37:45 +0100 (CET) From: Christian Ebner To: pbs-devel@lists.proxmox.com Date: Thu, 28 Mar 2024 13:36:52 +0100 Message-Id: <20240328123707.336951-44-c.ebner@proxmox.com> X-Mailer: git-send-email 2.39.2 In-Reply-To: <20240328123707.336951-1-c.ebner@proxmox.com> References: <20240328123707.336951-1-c.ebner@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL 0.029 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: [pbs-devel] [PATCH v3 proxmox-backup 43/58] client: pxar: implement store to insert chunks on caching X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: 
Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 28 Mar 2024 12:38:06 -0000 In preparation for the look-ahead caching used to temporarily store entries before encoding them in the pxar archive, being able to decide whether to re-use or re-encode regular file entries. Allows to insert and store reused chunks in the archiver, deduplicating chunks upon insert when possible. Signed-off-by: Christian Ebner --- changes since version 2: - Strongly adapted and refactored: keep track also of paddings introduced by reusing the chunks, making a suggestion whether to re-use, re-encode or check next entry based on threshold - completely removed code which allowed to calculate offsets based on chunks found in the middle, they must either be a continuation of the end or be added after, otherwise offsets are not monotonically increasing, which is required for sequential restore pbs-client/src/pxar/create.rs | 126 +++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs index 335e3556f..95a91a59b 100644 --- a/pbs-client/src/pxar/create.rs +++ b/pbs-client/src/pxar/create.rs @@ -20,7 +20,7 @@ use pathpatterns::{MatchEntry, MatchFlag, MatchList, MatchType, PatternFlag}; use pbs_datastore::index::IndexFile; use proxmox_sys::error::SysError; use pxar::accessor::aio::Accessor; -use pxar::encoder::{LinkOffset, SeqWrite}; +use pxar::encoder::{LinkOffset, PayloadOffset, SeqWrite}; use pxar::Metadata; use proxmox_io::vec; @@ -36,6 +36,128 @@ use crate::pxar::metadata::errno_is_unsupported; use crate::pxar::tools::assert_single_path_component; use crate::pxar::Flags; +const CHUNK_PADDING_THRESHOLD: f64 = 0.1; + +#[derive(Default)] +struct ReusedChunks { + start_boundary: PayloadOffset, + total: PayloadOffset, + padding: u64, + chunks: Vec<(u64, ReusableDynamicEntry)>, + must_flush_first: bool, + 
suggestion: Suggested, +} + +#[derive(Copy, Clone, Default)] +enum Suggested { + #[default] + CheckNext, + Reuse, + Reencode, +} + +impl ReusedChunks { + fn new() -> Self { + Self::default() + } + + fn start_boundary(&self) -> PayloadOffset { + self.start_boundary + } + + fn is_empty(&self) -> bool { + self.chunks.is_empty() + } + + fn suggested(&self) -> Suggested { + self.suggestion + } + + fn insert( + &mut self, + indices: Vec, + boundary: PayloadOffset, + start_padding: u64, + end_padding: u64, + ) -> PayloadOffset { + if self.is_empty() { + self.start_boundary = boundary; + } + + if let Some(offset) = self.last_digest_matched(&indices) { + if let Some((padding, last)) = self.chunks.last_mut() { + // Existing chunk, update padding based on pre-existing one + // Start padding is expected to be larger than previous padding + *padding += start_padding - last.size(); + self.padding += start_padding - last.size(); + } + + for chunk in indices.into_iter().skip(1) { + self.total = self.total.add(chunk.size()); + self.chunks.push((0, chunk)); + } + + if let Some((padding, _last)) = self.chunks.last_mut() { + *padding += end_padding; + self.padding += end_padding; + } + + let padding_ratio = self.padding as f64 / self.total.raw() as f64; + if self.chunks.len() > 1 && padding_ratio < CHUNK_PADDING_THRESHOLD { + self.suggestion = Suggested::Reuse; + } + + self.start_boundary.add(offset + start_padding) + } else { + let offset = self.total.raw(); + + if let Some(first) = indices.first() { + self.total = self.total.add(first.size()); + self.chunks.push((start_padding, first.clone())); + // New chunk, all start padding counts + self.padding += start_padding; + } + + for chunk in indices.into_iter().skip(1) { + self.total = self.total.add(chunk.size()); + self.chunks.push((chunk.size(), chunk)); + } + + if let Some((padding, _last)) = self.chunks.last_mut() { + *padding += end_padding; + self.padding += end_padding; + } + + if self.chunks.len() > 2 { + let padding_ratio = 
self.padding as f64 / self.total.raw() as f64; + if padding_ratio < CHUNK_PADDING_THRESHOLD { + self.suggestion = Suggested::Reuse; + } else { + self.suggestion = Suggested::Reencode; + } + } + + self.start_boundary.add(offset + start_padding) + } + } + + fn last_digest_matched(&self, indices: &[ReusableDynamicEntry]) -> Option { + let digest = if let Some(first) = indices.first() { + first.digest() + } else { + return None; + }; + + if let Some(last) = self.chunks.last() { + if last.1.digest() == digest { + return Some(self.total.raw() - last.1.size()); + } + } + + None + } +} + /// Pxar options for creating a pxar archive/stream #[derive(Default, Clone)] pub struct PxarCreateOptions { @@ -147,6 +269,7 @@ struct Archiver { hardlinks: HashMap, file_copy_buffer: Vec, skip_e2big_xattr: bool, + reused_chunks: ReusedChunks, forced_boundaries: Option>>>, } @@ -239,6 +362,7 @@ where hardlinks: HashMap::new(), file_copy_buffer: vec::undefined(4 * 1024 * 1024), skip_e2big_xattr: options.skip_e2big_xattr, + reused_chunks: ReusedChunks::new(), forced_boundaries, }; -- 2.39.2