From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id DED441FF165 for ; Thu, 6 Nov 2025 09:54:39 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 182F4106E7; Thu, 6 Nov 2025 09:55:21 +0100 (CET) Message-ID: <528f6574-c7b1-42cb-ac2a-988c3fbacf08@proxmox.com> Date: Thu, 6 Nov 2025 09:54:45 +0100 MIME-Version: 1.0 User-Agent: Mozilla Thunderbird To: Proxmox Backup Server development discussion , Nicolas Frey References: <20251105155129.517430-1-n.frey@proxmox.com> <20251105155129.517430-3-n.frey@proxmox.com> Content-Language: en-US, de-DE From: Christian Ebner In-Reply-To: <20251105155129.517430-3-n.frey@proxmox.com> X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1762419266825 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.047 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: Re: [pbs-devel] [PATCH proxmox-backup 1/4] api: verify: move chunk loading into parallel handler X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: Proxmox Backup Server development discussion Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="us-ascii"; Format="flowed" Errors-To: pbs-devel-bounces@lists.proxmox.com Sender: "pbs-devel" On 11/5/25 4:51 PM, Nicolas Frey wrote: > This way, the chunks will be loaded in parallel in addition to being > checked in parallel. 
> > Depending on the underlying storage, this can speed up reading chunks > from disk, especially when the underlying storage is IO depth > dependent, and the CPU is faster than the storage. > > Originally-by: Dominik Csapak > Signed-off-by: Nicolas Frey > --- > src/backup/verify.rs | 120 +++++++++++++++++++++++++++---------------- > 1 file changed, 75 insertions(+), 45 deletions(-) > > diff --git a/src/backup/verify.rs b/src/backup/verify.rs > index e33fdf50..7f91f38c 100644 > --- a/src/backup/verify.rs > +++ b/src/backup/verify.rs > @@ -1,6 +1,6 @@ > use pbs_config::BackupLockGuard; > use std::collections::HashSet; > -use std::sync::atomic::{AtomicUsize, Ordering}; > +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; > use std::sync::{Arc, Mutex}; > use std::time::Instant; > > @@ -20,7 +20,7 @@ use pbs_datastore::index::{ChunkReadInfo, IndexFile}; > use pbs_datastore::manifest::{BackupManifest, FileInfo}; > use pbs_datastore::{DataBlob, DataStore, DatastoreBackend, StoreProgress}; > > -use crate::tools::parallel_handler::ParallelHandler; > +use crate::tools::parallel_handler::{ParallelHandler, SendHandle}; > > use crate::backup::hierarchy::ListAccessibleBackupGroups; > > @@ -156,23 +156,20 @@ impl VerifyWorker { > > let start_time = Instant::now(); > > - let mut read_bytes = 0; > - let mut decoded_bytes = 0; > + let read_bytes = Arc::new(AtomicU64::new(0)); > + let decoded_bytes = Arc::new(AtomicU64::new(0)); > > - let datastore2 = Arc::clone(&self.datastore); > - let corrupt_chunks2 = Arc::clone(&self.corrupt_chunks); > - let verified_chunks2 = Arc::clone(&self.verified_chunks); > - let errors2 = Arc::clone(&errors); > - > - let decoder_pool = ParallelHandler::new( > - "verify chunk decoder", > - 4, > + let decoder_pool = ParallelHandler::new("verify chunk decoder", 4, { > + let datastore = Arc::clone(&self.datastore); > + let corrupt_chunks = Arc::clone(&self.corrupt_chunks); > + let verified_chunks = Arc::clone(&self.verified_chunks); > + let errors = 
Arc::clone(&errors); Hmm, this really warrants an internal state struct for bundling these and only cloning the arc around that. E.g. I would propose to introduce a struct along the lines of: struct IndexVerifyState { read_bytes: AtomicU64, decoded_bytes: AtomicU64, errors: AtomicUsize, datastore: Arc<DataStore>, corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>, start_time: Instant, } Which can then be wrapped in an Arc and also passed along as parameter to the inner methods where required. See the diff below (untested), which could be used for further refining this. > move |(chunk, digest, size): (DataBlob, [u8; 32], u64)| { > let chunk_crypt_mode = match chunk.crypt_mode() { > Err(err) => { > - corrupt_chunks2.lock().unwrap().insert(digest); > + corrupt_chunks.lock().unwrap().insert(digest); > info!("can't verify chunk, unknown CryptMode - {err}"); > - errors2.fetch_add(1, Ordering::SeqCst); > + errors.fetch_add(1, Ordering::SeqCst); > return Ok(()); > } > Ok(mode) => mode, > @@ -182,21 +179,21 @@ impl VerifyWorker { > info!( > "chunk CryptMode {chunk_crypt_mode:?} does not match index CryptMode {crypt_mode:?}" > ); > - errors2.fetch_add(1, Ordering::SeqCst); > + errors.fetch_add(1, Ordering::SeqCst); > } > > if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) { > - corrupt_chunks2.lock().unwrap().insert(digest); > + corrupt_chunks.lock().unwrap().insert(digest); > info!("{err}"); > - errors2.fetch_add(1, Ordering::SeqCst); > - Self::rename_corrupted_chunk(datastore2.clone(), &digest); > + errors.fetch_add(1, Ordering::SeqCst); > + Self::rename_corrupted_chunk(datastore.clone(), &digest); > } else { > - verified_chunks2.lock().unwrap().insert(digest); > + verified_chunks.lock().unwrap().insert(digest); > } > > Ok(()) > - }, > - ); > + } > + }); > > let skip_chunk = |digest: &[u8; 32]| -> bool { > if self.verified_chunks.lock().unwrap().contains(digest) { > @@ -223,6 +220,29 @@ impl VerifyWorker { > .datastore > .get_chunks_in_order(&*index, skip_chunk, check_abort)?; > > + let 
reader_pool = ParallelHandler::new("read chunks", 4, { > + let decoder_pool = decoder_pool.channel(); > + let datastore = Arc::clone(&self.datastore); > + let corrupt_chunks = Arc::clone(&self.corrupt_chunks); > + let read_bytes = Arc::clone(&read_bytes); > + let decoded_bytes = Arc::clone(&decoded_bytes); > + let errors = Arc::clone(&errors); > + let backend = self.backend.clone(); > + > + move |info: ChunkReadInfo| { > + Self::verify_chunk_by_backend( > + &backend, > + Arc::clone(&datastore), > + Arc::clone(&corrupt_chunks), > + Arc::clone(&read_bytes), > + Arc::clone(&decoded_bytes), > + Arc::clone(&errors), > + &decoder_pool, > + &info, > + ) > + } > + }); > + > for (pos, _) in chunk_list { > self.worker.check_abort()?; > self.worker.fail_on_shutdown()?; > @@ -234,19 +254,16 @@ impl VerifyWorker { > continue; // already verified or marked corrupt > } > > - self.verify_chunk_by_backend( > - &info, > - &mut read_bytes, > - &mut decoded_bytes, > - Arc::clone(&errors), > - &decoder_pool, > - )?; > + reader_pool.send(info)?; > } > > - decoder_pool.complete()?; > + reader_pool.complete()?; > > let elapsed = start_time.elapsed().as_secs_f64(); > > + let read_bytes = read_bytes.load(Ordering::SeqCst); > + let decoded_bytes = decoded_bytes.load(Ordering::SeqCst); > + > let read_bytes_mib = (read_bytes as f64) / (1024.0 * 1024.0); > let decoded_bytes_mib = (decoded_bytes as f64) / (1024.0 * 1024.0); > > @@ -266,26 +283,31 @@ impl VerifyWorker { > Ok(()) > } > > + #[allow(clippy::too_many_arguments)] > fn verify_chunk_by_backend( > - &self, > - info: &ChunkReadInfo, > - read_bytes: &mut u64, > - decoded_bytes: &mut u64, > + backend: &DatastoreBackend, > + datastore: Arc, > + corrupt_chunks: Arc>>, > + read_bytes: Arc, > + decoded_bytes: Arc, > errors: Arc, > - decoder_pool: &ParallelHandler<(DataBlob, [u8; 32], u64)>, > + decoder_pool: &SendHandle<(DataBlob, [u8; 32], u64)>, > + info: &ChunkReadInfo, > ) -> Result<(), Error> { > - match &self.backend { > - 
DatastoreBackend::Filesystem => match self.datastore.load_chunk(&info.digest) { > - Err(err) => self.add_corrupt_chunk( > + match backend { > + DatastoreBackend::Filesystem => match datastore.load_chunk(&info.digest) { > + Err(err) => Self::add_corrupt_chunk( > + datastore, > + corrupt_chunks, > info.digest, > errors, > &format!("can't verify chunk, load failed - {err}"), > ), > Ok(chunk) => { > let size = info.size(); > - *read_bytes += chunk.raw_size(); > + read_bytes.fetch_add(chunk.raw_size(), Ordering::SeqCst); > decoder_pool.send((chunk, info.digest, size))?; > - *decoded_bytes += size; > + decoded_bytes.fetch_add(size, Ordering::SeqCst); > } > }, > DatastoreBackend::S3(s3_client) => { > @@ -302,9 +324,9 @@ impl VerifyWorker { > match chunk_result { > Ok(chunk) => { > let size = info.size(); > - *read_bytes += chunk.raw_size(); > + read_bytes.fetch_add(chunk.raw_size(), Ordering::SeqCst); > decoder_pool.send((chunk, info.digest, size))?; > - *decoded_bytes += size; > + decoded_bytes.fetch_add(size, Ordering::SeqCst); > } > Err(err) => { > errors.fetch_add(1, Ordering::SeqCst); > @@ -312,7 +334,9 @@ impl VerifyWorker { > } > } > } > - Ok(None) => self.add_corrupt_chunk( > + Ok(None) => Self::add_corrupt_chunk( > + datastore, > + corrupt_chunks, > info.digest, > errors, > &format!( > @@ -330,13 +354,19 @@ impl VerifyWorker { > Ok(()) > } > > - fn add_corrupt_chunk(&self, digest: [u8; 32], errors: Arc, message: &str) { > + fn add_corrupt_chunk( > + datastore: Arc, > + corrupt_chunks: Arc>>, > + digest: [u8; 32], > + errors: Arc, > + message: &str, > + ) { > // Panic on poisoned mutex > - let mut corrupt_chunks = self.corrupt_chunks.lock().unwrap(); > + let mut corrupt_chunks = corrupt_chunks.lock().unwrap(); > corrupt_chunks.insert(digest); > error!(message); > errors.fetch_add(1, Ordering::SeqCst); > - Self::rename_corrupted_chunk(self.datastore.clone(), &digest); > + Self::rename_corrupted_chunk(datastore.clone(), &digest); > } > > fn verify_fixed_index(&self, 
backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> { diff --git a/src/backup/verify.rs b/src/backup/verify.rs index 7f91f38c9..e01405750 100644 --- a/src/backup/verify.rs +++ b/src/backup/verify.rs @@ -152,24 +152,24 @@ impl VerifyWorker { index: Box, crypt_mode: CryptMode, ) -> Result<(), Error> { - let errors = Arc::new(AtomicUsize::new(0)); - - let start_time = Instant::now(); - - let read_bytes = Arc::new(AtomicU64::new(0)); - let decoded_bytes = Arc::new(AtomicU64::new(0)); + let index_verify_state = Arc::new(IndexVerifyState { + read_bytes: AtomicU64::new(0), + decoded_bytes: AtomicU64::new(0), + errors: AtomicUsize::new(0), + datastore: Arc::clone(&self.datastore), + corrupt_chunks: Arc::clone(&self.corrupt_chunks), + start_time: Instant::now(), + }); let decoder_pool = ParallelHandler::new("verify chunk decoder", 4, { - let datastore = Arc::clone(&self.datastore); - let corrupt_chunks = Arc::clone(&self.corrupt_chunks); + let state = Arc::clone(&index_verify_state); let verified_chunks = Arc::clone(&self.verified_chunks); - let errors = Arc::clone(&errors); move |(chunk, digest, size): (DataBlob, [u8; 32], u64)| { let chunk_crypt_mode = match chunk.crypt_mode() { Err(err) => { - corrupt_chunks.lock().unwrap().insert(digest); + state.corrupt_chunks.lock().unwrap().insert(digest); info!("can't verify chunk, unknown CryptMode - {err}"); - errors.fetch_add(1, Ordering::SeqCst); + state.errors.fetch_add(1, Ordering::SeqCst); return Ok(()); } Ok(mode) => mode, @@ -179,14 +179,14 @@ impl VerifyWorker { info!( "chunk CryptMode {chunk_crypt_mode:?} does not match index CryptMode {crypt_mode:?}" ); - errors.fetch_add(1, Ordering::SeqCst); + state.errors.fetch_add(1, Ordering::SeqCst); } if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) { - corrupt_chunks.lock().unwrap().insert(digest); + state.corrupt_chunks.lock().unwrap().insert(digest); info!("{err}"); - errors.fetch_add(1, Ordering::SeqCst); - 
Self::rename_corrupted_chunk(datastore.clone(), &digest); + state.errors.fetch_add(1, Ordering::SeqCst); + Self::rename_corrupted_chunk(state.datastore.clone(), &digest); } else { verified_chunks.lock().unwrap().insert(digest); } @@ -201,7 +201,7 @@ impl VerifyWorker { } else if self.corrupt_chunks.lock().unwrap().contains(digest) { let digest_str = hex::encode(digest); info!("chunk {digest_str} was marked as corrupt"); - errors.fetch_add(1, Ordering::SeqCst); + index_verify_state.errors.fetch_add(1, Ordering::SeqCst); true } else { false @@ -222,24 +222,11 @@ impl VerifyWorker { let reader_pool = ParallelHandler::new("read chunks", 4, { let decoder_pool = decoder_pool.channel(); - let datastore = Arc::clone(&self.datastore); - let corrupt_chunks = Arc::clone(&self.corrupt_chunks); - let read_bytes = Arc::clone(&read_bytes); - let decoded_bytes = Arc::clone(&decoded_bytes); - let errors = Arc::clone(&errors); + let state = Arc::clone(&index_verify_state); let backend = self.backend.clone(); move |info: ChunkReadInfo| { - Self::verify_chunk_by_backend( - &backend, - Arc::clone(&datastore), - Arc::clone(&corrupt_chunks), - Arc::clone(&read_bytes), - Arc::clone(&decoded_bytes), - Arc::clone(&errors), - &decoder_pool, - &info, - ) + Self::verify_chunk_by_backend(&backend, Arc::clone(&state), &decoder_pool, &info) } }); @@ -259,10 +246,10 @@ impl VerifyWorker { reader_pool.complete()?; - let elapsed = start_time.elapsed().as_secs_f64(); + let elapsed = index_verify_state.start_time.elapsed().as_secs_f64(); - let read_bytes = read_bytes.load(Ordering::SeqCst); - let decoded_bytes = decoded_bytes.load(Ordering::SeqCst); + let read_bytes = index_verify_state.read_bytes.load(Ordering::SeqCst); + let decoded_bytes = index_verify_state.decoded_bytes.load(Ordering::SeqCst); let read_bytes_mib = (read_bytes as f64) / (1024.0 * 1024.0); let decoded_bytes_mib = (decoded_bytes as f64) / (1024.0 * 1024.0); @@ -270,13 +257,13 @@ impl VerifyWorker { let read_speed = read_bytes_mib / 
elapsed; let decode_speed = decoded_bytes_mib / elapsed; - let error_count = errors.load(Ordering::SeqCst); + let error_count = index_verify_state.errors.load(Ordering::SeqCst); info!( " verified {read_bytes_mib:.2}/{decoded_bytes_mib:.2} MiB in {elapsed:.2} seconds, speed {read_speed:.2}/{decode_speed:.2} MiB/s ({error_count} errors)" ); - if errors.load(Ordering::SeqCst) > 0 { + if index_verify_state.errors.load(Ordering::SeqCst) > 0 { bail!("chunks could not be verified"); } @@ -286,28 +273,26 @@ impl VerifyWorker { #[allow(clippy::too_many_arguments)] fn verify_chunk_by_backend( backend: &DatastoreBackend, - datastore: Arc, - corrupt_chunks: Arc>>, - read_bytes: Arc, - decoded_bytes: Arc, - errors: Arc, + state: Arc, decoder_pool: &SendHandle<(DataBlob, [u8; 32], u64)>, info: &ChunkReadInfo, ) -> Result<(), Error> { match backend { - DatastoreBackend::Filesystem => match datastore.load_chunk(&info.digest) { + DatastoreBackend::Filesystem => match state.datastore.load_chunk(&info.digest) { Err(err) => Self::add_corrupt_chunk( - datastore, - corrupt_chunks, + Arc::clone(&state.datastore), + Arc::clone(&state.corrupt_chunks), info.digest, - errors, + &state.errors, &format!("can't verify chunk, load failed - {err}"), ), Ok(chunk) => { let size = info.size(); - read_bytes.fetch_add(chunk.raw_size(), Ordering::SeqCst); + state + .read_bytes + .fetch_add(chunk.raw_size(), Ordering::SeqCst); decoder_pool.send((chunk, info.digest, size))?; - decoded_bytes.fetch_add(size, Ordering::SeqCst); + state.decoded_bytes.fetch_add(size, Ordering::SeqCst); } }, DatastoreBackend::S3(s3_client) => { @@ -324,28 +309,30 @@ impl VerifyWorker { match chunk_result { Ok(chunk) => { let size = info.size(); - read_bytes.fetch_add(chunk.raw_size(), Ordering::SeqCst); + state + .read_bytes + .fetch_add(chunk.raw_size(), Ordering::SeqCst); decoder_pool.send((chunk, info.digest, size))?; - decoded_bytes.fetch_add(size, Ordering::SeqCst); + state.decoded_bytes.fetch_add(size, Ordering::SeqCst); 
} Err(err) => { - errors.fetch_add(1, Ordering::SeqCst); + state.errors.fetch_add(1, Ordering::SeqCst); error!("can't verify chunk, load failed - {err}"); } } } Ok(None) => Self::add_corrupt_chunk( - datastore, - corrupt_chunks, + Arc::clone(&state.datastore), + Arc::clone(&state.corrupt_chunks), info.digest, - errors, + &state.errors, &format!( "can't verify missing chunk with digest {}", hex::encode(info.digest) ), ), Err(err) => { - errors.fetch_add(1, Ordering::SeqCst); + state.errors.fetch_add(1, Ordering::SeqCst); error!("can't verify chunk, load failed - {err}"); } } @@ -358,7 +345,7 @@ impl VerifyWorker { datastore: Arc, corrupt_chunks: Arc>>, digest: [u8; 32], - errors: Arc, + errors: &AtomicUsize, message: &str, ) { // Panic on poisoned mutex @@ -681,3 +668,12 @@ impl VerifyWorker { } } } + +struct IndexVerifyState { + read_bytes: AtomicU64, + decoded_bytes: AtomicU64, + errors: AtomicUsize, + datastore: Arc, + corrupt_chunks: Arc>>, + start_time: Instant, +} _______________________________________________ pbs-devel mailing list pbs-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel