From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 4C1571FF38E for ; Tue, 14 May 2024 12:35:37 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 25667476E; Tue, 14 May 2024 12:35:12 +0200 (CEST) From: Christian Ebner To: pbs-devel@lists.proxmox.com Date: Tue, 14 May 2024 12:34:14 +0200 Message-Id: <20240514103421.289431-59-c.ebner@proxmox.com> X-Mailer: git-send-email 2.39.2 In-Reply-To: <20240514103421.289431-1-c.ebner@proxmox.com> References: <20240514103421.289431-1-c.ebner@proxmox.com> MIME-Version: 1.0 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.026 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: [pbs-devel] [PATCH v6 proxmox-backup 58/65] datastore: chunker: implement chunker for payload stream X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: Proxmox Backup Server development discussion Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: pbs-devel-bounces@lists.proxmox.com Sender: "pbs-devel" Implement the Chunker trait for a dedicated payload stream chunker, which extends the regular chunker by the option to suggest boundaries to be used over the hash-based boundaries whenever possible. 
Signed-off-by: Christian Ebner --- pbs-datastore/src/chunker.rs | 89 ++++++++++++++++++++++++++++++++++++ pbs-datastore/src/lib.rs | 2 +- 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/pbs-datastore/src/chunker.rs b/pbs-datastore/src/chunker.rs index d75e63fa8..bfa1c8ca1 100644 --- a/pbs-datastore/src/chunker.rs +++ b/pbs-datastore/src/chunker.rs @@ -1,3 +1,5 @@ +use std::sync::mpsc::Receiver; + /// Note: window size 32 or 64, is faster because we can /// speedup modulo operations, but always computes hash 0 /// for constant data streams .. 0,0,0,0,0,0 @@ -46,6 +48,16 @@ pub struct ChunkerImpl { window: [u8; CA_CHUNKER_WINDOW_SIZE], } +/// Sliding window chunker (Buzhash) with boundary suggestions +/// +/// Suggest to chunk at a given boundary instead of the regular chunk boundary for better alignment +/// with file payload boundaries. +pub struct PayloadChunker { + chunker: ChunkerImpl, + current_suggested: Option, + suggested_boundaries: Receiver, +} + const BUZHASH_TABLE: [u32; 256] = [ 0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68, 0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801, 0x7ebf5191, 0x841135c7, 0x65cc53b3, 0x280a597c, 0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494, @@ -221,6 +233,83 @@ impl Chunker for ChunkerImpl { } } +impl PayloadChunker { + /// Create a new PayloadChunker instance, which produces and average + /// chunk size of `chunk_size_avg` (need to be a power of two), if no + /// suggested boundaries are provided. + /// Use suggested boundaries instead, whenever the chunk size is within + /// the min - max range. 
+ pub fn new(chunk_size_avg: usize, suggested_boundaries: Receiver) -> Self { + Self { + chunker: ChunkerImpl::new(chunk_size_avg), + current_suggested: None, + suggested_boundaries, + } + } +} + +impl Chunker for PayloadChunker { + fn scan(&mut self, data: &[u8], ctx: &Context) -> usize { + let pos = ctx.total - data.len() as u64; + + loop { + if let Some(boundary) = self.current_suggested { + if boundary < ctx.base + pos { + log::debug!("Boundary {boundary} in past"); + // ignore passed boundaries + self.current_suggested = None; + continue; + } + + if boundary > ctx.base + ctx.total { + log::debug!("Boundary {boundary} in future"); + // boundary in future, cannot decide yet + return self.chunker.scan(data, ctx); + } + + let chunk_size = (boundary - ctx.base) as usize; + if chunk_size < self.chunker.chunk_size_min { + log::debug!("Chunk size {chunk_size} below minimum chunk size"); + // chunk to small, ignore boundary + self.current_suggested = None; + continue; + } + + if chunk_size <= self.chunker.chunk_size_max { + self.current_suggested = None; + // calculate boundary relative to start of given data buffer + let len = chunk_size - pos as usize; + if len == 0 { + // passed this one, previous scan did not know about boundary just yet + return self.chunker.scan(data, ctx); + } + self.chunker.reset(); + log::debug!( + "Chunk at suggested boundary: {boundary}, chunk size: {chunk_size}" + ); + return len; + } + + log::debug!("Chunk {chunk_size} to big, regular scan"); + // chunk to big, cannot decide yet + // scan for hash based chunk boundary instead + return self.chunker.scan(data, ctx); + } + + if let Ok(boundary) = self.suggested_boundaries.try_recv() { + self.current_suggested = Some(boundary); + } else { + log::debug!("No suggested boundary, regular scan"); + return self.chunker.scan(data, ctx); + } + } + } + + fn reset(&mut self) { + self.chunker.reset(); + } +} + #[test] fn test_chunker1() { let mut buffer = Vec::new(); diff --git a/pbs-datastore/src/lib.rs 
b/pbs-datastore/src/lib.rs index 24429626c..3e4aa34c2 100644 --- a/pbs-datastore/src/lib.rs +++ b/pbs-datastore/src/lib.rs @@ -196,7 +196,7 @@ pub use backup_info::{BackupDir, BackupGroup, BackupInfo}; pub use checksum_reader::ChecksumReader; pub use checksum_writer::ChecksumWriter; pub use chunk_store::ChunkStore; -pub use chunker::{Chunker, ChunkerImpl}; +pub use chunker::{Chunker, ChunkerImpl, PayloadChunker}; pub use crypt_reader::CryptReader; pub use crypt_writer::CryptWriter; pub use data_blob::DataBlob; -- 2.39.2 _______________________________________________ pbs-devel mailing list pbs-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel