From: Stefan Reiter <s.reiter@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Wed, 30 Sep 2020 16:16:01 +0200
Message-Id: <20200930141601.27233-6-s.reiter@proxmox.com>
In-Reply-To: <20200930141601.27233-1-s.reiter@proxmox.com>
References: <20200930141601.27233-1-s.reiter@proxmox.com>
Subject: [pbs-devel] [RFC proxmox-backup 5/5] backup: validate chunk existence in background

Reused chunks will not be uploaded, and thus never touched. We need to
verify their existence manually to ensure a valid backup.

Since we know that all chunks the client may reuse must be recorded in the
previous snapshot (which is locked during backup and can't be forgotten),
we can do the validation in the background, while the backup is still
running, and only join at the end if there's still work left.

The tradeoff is that we don't yet know which chunks the client will *not*
reuse later in the backup, so we have to check them all.

This also means we can revert the changes to the KnownChunksMap type made
in 43772efc6e.
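As a rough illustration of the pattern (an editorial sketch only, not the
ParallelHandler-based code in the patch below): reused-chunk digests are
handed to a single background thread over a channel as they are registered,
and the main task joins that thread when the backup finishes. The names
ChunkValidator and chunk_dir, and the on-disk chunk layout, are assumptions
made purely for this example.

    use std::path::PathBuf;
    use std::sync::mpsc::{channel, Sender};
    use std::thread::{self, JoinHandle};

    struct ChunkValidator {
        tx: Option<Sender<[u8; 32]>>,
        worker: Option<JoinHandle<Result<(), String>>>,
    }

    impl ChunkValidator {
        fn new(chunk_dir: PathBuf) -> Self {
            let (tx, rx) = channel::<[u8; 32]>();
            // One background worker is enough: it only checks chunk paths.
            let worker = thread::spawn(move || {
                for digest in rx {
                    let hex: String = digest.iter().map(|b| format!("{:02x}", b)).collect();
                    // assumed layout: <chunk_dir>/<first 4 hex digits>/<full digest>
                    let path = chunk_dir.join(&hex[0..4]).join(&hex);
                    if !path.exists() {
                        return Err(format!("reused chunk {} does not exist", hex));
                    }
                }
                Ok(())
            });
            Self { tx: Some(tx), worker: Some(worker) }
        }

        /// Called for every chunk the client registers as known (reused).
        fn send(&self, digest: [u8; 32]) -> Result<(), String> {
            self.tx.as_ref().unwrap().send(digest).map_err(|e| e.to_string())
        }

        /// Called when the backup finishes: close the channel and join the worker.
        fn complete(mut self) -> Result<(), String> {
            drop(self.tx.take()); // closing the channel ends the worker's loop
            self.worker
                .take()
                .unwrap()
                .join()
                .map_err(|_| "validator thread panicked".to_string())?
        }
    }

    fn main() -> Result<(), String> {
        // Demonstrates the failure path: the zero digest won't exist on disk,
        // so complete() returns the error raised by the background thread.
        let validator = ChunkValidator::new(PathBuf::from("/tmp/.chunks"));
        validator.send([0u8; 32])?;
        validator.complete()
    }

In the patch itself this role is filled by the existing ParallelHandler
helper, which additionally surfaces worker errors to the upload path early
via check_abort().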
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
 src/api2/backup/environment.rs | 155 ++++++++++++++++++++++-----------
 1 file changed, 105 insertions(+), 50 deletions(-)

diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index a8c9ddb4..08ecc290 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -11,8 +11,9 @@ use proxmox::api::{RpcEnvironment, RpcEnvironmentType};
 
 use crate::api2::types::{Userid, SnapshotVerifyState, VerifyState};
 use crate::backup::*;
-use crate::server::WorkerTask;
+use crate::server::{WorkerTask, UPID};
 use crate::server::formatter::*;
+use crate::tools::ParallelHandler;
 use hyper::{Body, Response};
 
 #[derive(Copy, Clone, Serialize)]
@@ -66,8 +67,14 @@ struct FixedWriterState {
     incremental: bool,
 }
 
-// key=digest, value=(length, existance checked)
-type KnownChunksMap = HashMap<[u8;32], (u32, bool)>;
+// key=digest, value=length
+type KnownChunksMap = HashMap<[u8;32], u32>;
+
+enum ValidateHandlerState {
+    NotInitialized,
+    NotNeeded,
+    Running(ParallelHandler<'static, [u8;32]>),
+}
 
 struct SharedBackupState {
     finished: bool,
@@ -78,6 +85,7 @@ struct SharedBackupState {
     known_chunks: KnownChunksMap,
     backup_size: u64, // sums up size of all files
     backup_stat: UploadStatistic,
+    validate_handler: ValidateHandlerState,
 }
 
 impl SharedBackupState {
@@ -131,6 +139,7 @@ impl BackupEnvironment {
             known_chunks: HashMap::new(),
             backup_size: 0,
             backup_stat: UploadStatistic::new(),
+            validate_handler: ValidateHandlerState::NotInitialized,
         };
 
         Self {
@@ -156,11 +165,89 @@ impl BackupEnvironment {
 
         state.ensure_unfinished()?;
 
-        state.known_chunks.insert(digest, (length, false));
+        state.known_chunks.insert(digest, length);
+
+        match &state.validate_handler {
+            ValidateHandlerState::NotInitialized => {
+                if self.last_backup_has_recent_verify()? {
+                    state.validate_handler = ValidateHandlerState::NotNeeded;
+                } else {
+                    let handler = self.start_validate_handler();
+                    handler.send(digest)?;
+                    state.validate_handler = ValidateHandlerState::Running(handler);
+                }
+            },
+            ValidateHandlerState::Running(handler) => {
+                handler.send(digest)?;
+            },
+            ValidateHandlerState::NotNeeded => {}
+        }
 
         Ok(())
     }
 
+    fn start_validate_handler(&self) -> ParallelHandler<'static, [u8;32]> {
+        let datastore = Arc::clone(&self.datastore);
+        let upid = Arc::new(self.worker.upid().clone());
+        let last_backup = Arc::new(self.last_backup.clone());
+        ParallelHandler::new(
+            "verify handler",
+            1, // one worker is enough, and means we don't need a lock to mark the prev snapshot
+            move |digest| {
+                Self::validate_chunk_existance(
+                    &digest,
+                    Arc::clone(&datastore),
+                    Arc::clone(&upid),
+                    Arc::clone(&last_backup)
+                )
+            },
+            false // don't block on send
+        )
+    }
+
+    fn validate_chunk_existance(
+        digest: &[u8;32],
+        datastore: Arc<DataStore>,
+        upid: Arc<UPID>,
+        last_backup: Arc<Option<BackupInfo>>,
+    ) -> Result<(), Error> {
+        if !datastore.chunk_path(digest).0.exists() {
+            // Chunk is missing, mark last snapshot (which references it) as "verify failed"
+            let mark_msg = if let Some(ref last_backup) = *last_backup {
+                let last_dir = &last_backup.backup_dir;
+                let verify_state = SnapshotVerifyState {
+                    state: VerifyState::Failed,
+                    upid: UPID::clone(upid.as_ref()),
+                };
+
+                let res = proxmox::try_block!{
+                    let (mut manifest, _) = datastore.load_manifest(last_dir)?;
+                    manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
+                    datastore.store_manifest(last_dir, serde_json::to_value(manifest)?)
+                };
+
+                if let Err(err) = res {
+                    format!("tried marking previous snapshot as bad, \
+                        but got error accessing manifest: {}", err)
+                } else {
+                    "marked previous snapshot as bad, please use \
+                        'verify' for a detailed check".to_owned()
+                }
+            } else {
+                "internal error: no base backup registered to mark invalid".to_owned()
+            };
+
+            bail!(
+                "chunk '{}' was attempted to be reused but doesn't exist - {}",
+                digest_to_hex(digest),
+                mark_msg
+            );
+        }
+
+        Ok(())
+    }
+
+
     /// Register fixed length chunks after upload.
     ///
     /// Like `register_chunk()`, but additionally record statistics for
@@ -176,6 +263,9 @@ impl BackupEnvironment {
         let mut state = self.state.lock().unwrap();
 
         state.ensure_unfinished()?;
+        if let ValidateHandlerState::Running(handler) = &state.validate_handler {
+            handler.check_abort()?;
+        }
 
         let mut data = match state.fixed_writers.get_mut(&wid) {
             Some(data) => data,
@@ -198,7 +288,7 @@ impl BackupEnvironment {
         if is_duplicate { data.upload_stat.duplicates += 1; }
 
         // register chunk
-        state.known_chunks.insert(digest, (size, true));
+        state.known_chunks.insert(digest, size);
 
         Ok(())
     }
@@ -218,6 +308,9 @@ impl BackupEnvironment {
         let mut state = self.state.lock().unwrap();
 
         state.ensure_unfinished()?;
+        if let ValidateHandlerState::Running(handler) = &state.validate_handler {
+            handler.check_abort()?;
+        }
 
         let mut data = match state.dynamic_writers.get_mut(&wid) {
             Some(data) => data,
@@ -231,7 +324,7 @@ impl BackupEnvironment {
         if is_duplicate { data.upload_stat.duplicates += 1; }
 
         // register chunk
-        state.known_chunks.insert(digest, (size, true));
+        state.known_chunks.insert(digest, size);
 
         Ok(())
     }
@@ -240,7 +333,7 @@ impl BackupEnvironment {
         let state = self.state.lock().unwrap();
 
         match state.known_chunks.get(digest) {
-            Some((len, _)) => Some(*len),
+            Some(len) => Some(*len),
             None => None,
         }
     }
@@ -483,47 +576,6 @@ impl BackupEnvironment {
         }
     }
 
-    /// Ensure all chunks referenced in this backup actually exist.
-    /// Only call *after* all writers have been closed, to avoid race with GC.
-    /// In case of error, mark the previous backup as 'verify failed'.
-    fn verify_chunk_existance(&self, known_chunks: &KnownChunksMap) -> Result<(), Error> {
-        for (digest, (_, checked)) in known_chunks.iter() {
-            if !checked && !self.datastore.chunk_path(digest).0.exists() {
-                let mark_msg = if let Some(ref last_backup) = self.last_backup {
-                    let last_dir = &last_backup.backup_dir;
-                    let verify_state = SnapshotVerifyState {
-                        state: VerifyState::Failed,
-                        upid: self.worker.upid().clone(),
-                    };
-
-                    let res = proxmox::try_block!{
-                        let (mut manifest, _) = self.datastore.load_manifest(last_dir)?;
-                        manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
-                        self.datastore.store_manifest(last_dir, serde_json::to_value(manifest)?)
-                    };
-
-                    if let Err(err) = res {
-                        format!("tried marking previous snapshot as bad, \
-                            but got error accessing manifest: {}", err)
-                    } else {
-                        "marked previous snapshot as bad, please use \
-                            'verify' for a detailed check".to_owned()
-                    }
-                } else {
-                    "internal error: no base backup registered to mark invalid".to_owned()
-                };
-
-                bail!(
-                    "chunk '{}' was attempted to be reused but doesn't exist - {}",
-                    digest_to_hex(digest),
-                    mark_msg
-                );
-            }
-        }
-
-        Ok(())
-    }
-
     /// Mark backup as finished
     pub fn finish_backup(&self) -> Result<(), Error> {
         let mut state = self.state.lock().unwrap();
@@ -560,8 +612,11 @@ impl BackupEnvironment {
             }
         }
 
-        if !self.last_backup_has_recent_verify()? {
-            self.verify_chunk_existance(&state.known_chunks)?;
+        // stop verify handler and verify remaining chunks
+        let handler = std::mem::replace(&mut state.validate_handler, ValidateHandlerState::NotInitialized);
+        if let ValidateHandlerState::Running(handler) = handler {
+            self.worker.log("waiting for validate thread to complete");
+            handler.complete()?;
         }
 
         // marks the backup as successful
-- 
2.20.1