From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id A210793203 for ; Mon, 8 Apr 2024 15:54:34 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 82C91C919 for ; Mon, 8 Apr 2024 15:54:34 +0200 (CEST) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [94.136.29.106]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS for ; Mon, 8 Apr 2024 15:54:33 +0200 (CEST) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 7D7564432A for ; Mon, 8 Apr 2024 15:54:33 +0200 (CEST) Message-ID: Date: Mon, 8 Apr 2024 15:54:31 +0200 MIME-Version: 1.0 User-Agent: Mozilla Thunderbird To: Proxmox Backup Server development discussion , =?UTF-8?Q?Fabian_Gr=C3=BCnbichler?= References: <20240328123707.336951-1-c.ebner@proxmox.com> <20240328123707.336951-41-c.ebner@proxmox.com> <1712241225.maig1bup9p.astroid@yuna.none> Content-Language: en-US, de-DE From: Christian Ebner In-Reply-To: <1712241225.maig1bup9p.astroid@yuna.none> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL 0.031 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 
ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [create.rs, catar.rs, api.rs, main.rs] Subject: Re: [pbs-devel] [PATCH v3 proxmox-backup 40/58] client: chunk stream: add dynamic entries injection queues X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 08 Apr 2024 13:54:34 -0000 On 4/4/24 16:52, Fabian Grünbichler wrote: > On March 28, 2024 1:36 pm, Christian Ebner wrote: >> Adds a queue to the chunk stream to request forced boundaries at a >> given offset within the stream and inject reused dynamic entries >> after this boundary. >> >> The chunks are then passed along to the uploader stream using the >> injection queue, which inserts them during upload. >> >> Signed-off-by: Christian Ebner >> --- >> changes since version 2: >> - combined queues into new optional struct >> - refactoring >> >> examples/test_chunk_speed2.rs | 2 +- >> pbs-client/src/backup_writer.rs | 89 +++++++++++-------- >> pbs-client/src/chunk_stream.rs | 36 +++++++- >> pbs-client/src/pxar/create.rs | 6 +- >> pbs-client/src/pxar_backup_stream.rs | 7 +- >> proxmox-backup-client/src/main.rs | 31 ++++--- >> .../src/proxmox_restore_daemon/api.rs | 1 + >> pxar-bin/src/main.rs | 1 + >> tests/catar.rs | 1 + >> 9 files changed, 121 insertions(+), 53 deletions(-) >> >> diff --git a/examples/test_chunk_speed2.rs b/examples/test_chunk_speed2.rs >> index 3f69b436d..22dd14ce2 100644 >> --- a/examples/test_chunk_speed2.rs >> +++ b/examples/test_chunk_speed2.rs >> @@ -26,7 +26,7 @@ async fn run() -> Result<(), Error> { >> .map_err(Error::from); >> >> //let chunk_stream = FixedChunkStream::new(stream, 4*1024*1024); >> - let mut chunk_stream = ChunkStream::new(stream, None); >> + let mut chunk_stream = ChunkStream::new(stream, None, 
None); >> >> let start_time = std::time::Instant::now(); >> >> diff --git a/pbs-client/src/backup_writer.rs b/pbs-client/src/backup_writer.rs >> index 8bd0e4f36..032d93da7 100644 >> --- a/pbs-client/src/backup_writer.rs >> +++ b/pbs-client/src/backup_writer.rs >> @@ -1,4 +1,4 @@ >> -use std::collections::HashSet; >> +use std::collections::{HashSet, VecDeque}; >> use std::future::Future; >> use std::os::unix::fs::OpenOptionsExt; >> use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; >> @@ -23,6 +23,7 @@ use pbs_tools::crypt_config::CryptConfig; >> >> use proxmox_human_byte::HumanByte; >> >> +use super::inject_reused_chunks::{InjectChunks, InjectReusedChunks, InjectedChunksInfo}; >> use super::merge_known_chunks::{MergeKnownChunks, MergedChunkInfo}; >> >> use super::{H2Client, HttpClient}; >> @@ -265,6 +266,7 @@ impl BackupWriter { >> archive_name: &str, >> stream: impl Stream>, >> options: UploadOptions, >> + injection_queue: Option>>>, >> ) -> Result { >> let known_chunks = Arc::new(Mutex::new(HashSet::new())); >> >> @@ -341,6 +343,7 @@ impl BackupWriter { >> None >> }, >> options.compress, >> + injection_queue, >> ) >> .await?; >> >> @@ -637,6 +640,7 @@ impl BackupWriter { >> known_chunks: Arc>>, >> crypt_config: Option>, >> compress: bool, >> + injection_queue: Option>>>, >> ) -> impl Future> { >> let total_chunks = Arc::new(AtomicUsize::new(0)); >> let total_chunks2 = total_chunks.clone(); >> @@ -663,48 +667,63 @@ impl BackupWriter { >> let index_csum_2 = index_csum.clone(); >> >> stream >> - .and_then(move |data| { >> - let chunk_len = data.len(); >> + .inject_reused_chunks( >> + injection_queue.unwrap_or_default(), >> + stream_len, >> + reused_len.clone(), >> + index_csum.clone(), >> + ) >> + .and_then(move |chunk_info| match chunk_info { > > for this part here I am still not sure whether doing all of the > accounting here wouldn't be nicer.. 
> Moved almost all the accounting to here, only stream len is still required for the offset calculation in `inject_reused_chunks`. > >> diff --git a/pbs-client/src/chunk_stream.rs b/pbs-client/src/chunk_stream.rs >> index a45420ca0..6ac0c638b 100644 >> --- a/pbs-client/src/chunk_stream.rs >> +++ b/pbs-client/src/chunk_stream.rs >> @@ -38,15 +38,17 @@ pub struct ChunkStream { >> chunker: Chunker, >> buffer: BytesMut, >> scan_pos: usize, >> + injection_data: Option, >> } >> >> impl ChunkStream { >> - pub fn new(input: S, chunk_size: Option) -> Self { >> + pub fn new(input: S, chunk_size: Option, injection_data: Option) -> Self { >> Self { >> input, >> chunker: Chunker::new(chunk_size.unwrap_or(4 * 1024 * 1024)), >> buffer: BytesMut::new(), >> scan_pos: 0, >> + injection_data, >> } >> } >> } >> @@ -64,6 +66,34 @@ where >> fn poll_next(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { >> let this = self.get_mut(); >> loop { >> + if let Some(InjectionData { >> + boundaries, >> + injections, >> + consumed, >> + }) = this.injection_data.as_mut() >> + { >> + // Make sure to release this lock as soon as possible >> + let mut boundaries = boundaries.lock().unwrap(); >> + if let Some(inject) = boundaries.pop_front() { > > here I am a bit more wary that this popping and re-pushing might hurt > performance.. > >> + let max = *consumed + this.buffer.len() as u64; >> + if inject.boundary <= max { >> + let chunk_size = (inject.boundary - *consumed) as usize; >> + let result = this.buffer.split_to(chunk_size); > > a comment or better variable naming would make this easier to follow > along.. > > "result" is a forced chunk that is created here because we've reached a > point where we want to inject something afterwards.. > Improved the variable naming and added comments to clarify the functionality for the upcoming version of the patches. 
> once more I am wondering here whether for the payload stream, a vastly > simplified chunker that just picks the boundaries based on re-use and > payload size(s) (to avoid the one file == one chunk pathological case > for lots of small files) wouldn't improve performance :) Do you suggest having 2 chunker implementations, and for the payload stream having the chunk boundaries provided by some interface, rather than performing the chunking based on the statistical sliding-window approach? As you mentioned in response to Dietmar on patch 49 of this patch series version?