From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id B2E5B601F6 for ; Wed, 14 Oct 2020 14:16:49 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id B0C7D9B26 for ; Wed, 14 Oct 2020 14:16:49 +0200 (CEST) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [212.186.127.180]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS id BD8369AC3 for ; Wed, 14 Oct 2020 14:16:47 +0200 (CEST) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 89E9E45D57 for ; Wed, 14 Oct 2020 14:16:47 +0200 (CEST) From: Stefan Reiter To: pbs-devel@lists.proxmox.com Date: Wed, 14 Oct 2020 14:16:34 +0200 Message-Id: <20201014121639.25276-7-s.reiter@proxmox.com> X-Mailer: git-send-email 2.20.1 In-Reply-To: <20201014121639.25276-1-s.reiter@proxmox.com> References: <20201014121639.25276-1-s.reiter@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL -0.038 Adjusted score from AWL reputation of From: address KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment RCVD_IN_DNSWL_MED -2.3 Sender listed at https://www.dnswl.org/, medium trust SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. 
[datastore.rs] Subject: [pbs-devel] [PATCH proxmox-backup 06/11] gc: avoid race between phase1 and forget/prune X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 14 Oct 2020 12:16:49 -0000 ...by saving all forgotten snapshots into a HashSet when we detect a GC phase1 currently running. If GC then hits a NotFound, it can check if the snapshot was simply forgotten behind its back, or if an actual error occurred and it needs to abort (since it might delete still referenced chunks, if the error is transient and the index file is still there). We have to attach the error message in {fixed,dynamic}_index via the .context() method, otherwise the original std::io::Error gets lost and we can't check for NotFound. Signed-off-by: Stefan Reiter --- @Dietmar: I went with the anyhow .context() method again - I know we talked about this, but using a std::io::Error directly also doesn't solve the problem, since we do a (io_)format_err(...) which removes the context again. And if we return the IO error directly, we'd have to add that text formatting to all call sites, or drop it entirely, which I both rather dislike. 
src/backup/datastore.rs | 56 +++++++++++++++++++++++++++++++++---- src/backup/dynamic_index.rs | 5 +++- src/backup/fixed_index.rs | 7 +++-- 3 files changed, 60 insertions(+), 8 deletions(-) diff --git a/src/backup/datastore.rs b/src/backup/datastore.rs index a77c5f1d..d1758408 100644 --- a/src/backup/datastore.rs +++ b/src/backup/datastore.rs @@ -37,6 +37,9 @@ pub struct DataStore { chunk_store: Arc, gc_mutex: Mutex, last_gc_status: Mutex, + + // bool indicates if phase1 is currently active + removed_during_gc: Mutex<(bool, HashSet)>, } impl DataStore { @@ -83,6 +86,7 @@ impl DataStore { chunk_store: Arc::new(chunk_store), gc_mutex: Mutex::new(false), last_gc_status: Mutex::new(gc_status), + removed_during_gc: Mutex::new((false, HashSet::new())), }) } @@ -230,6 +234,14 @@ impl DataStore { _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?; } + // Acquire lock and keep it during remove operation, so there's no + // chance for a race between adding to the hash and actually removing + // the dir (phase1 might otherwise start in-between) + let mut removed_guard = self.removed_during_gc.lock().unwrap(); + if removed_guard.0 { + removed_guard.1.insert(self.snapshot_path(&backup_dir)); + } + log::info!("removing backup snapshot {:?}", full_path); std::fs::remove_dir_all(&full_path) .map_err(|err| { @@ -449,6 +461,21 @@ impl DataStore { let mut last_percentage: usize = 0; + let handle_notfound = |err: Error, path: &Path| { + if let Some(ioerr) = err.downcast_ref::() { + if ioerr.kind() == std::io::ErrorKind::NotFound { + let (_, removed_hash) = &*self.removed_during_gc.lock().unwrap(); + let backup_dir = path.parent(); + if backup_dir.is_some() && removed_hash.contains(backup_dir.unwrap()) { + // index file not found, but we know that it was deleted by + // a concurrent 'forget', so we can safely ignore + return Ok(()) + } + } + } + Err(err) + }; + for path in image_list { worker.check_abort()?; @@ -456,11 +483,17 @@ impl DataStore { if let 
Ok(archive_type) = archive_type(&path) { if archive_type == ArchiveType::FixedIndex { - let index = self.open_fixed_reader(&path)?; - self.index_mark_used_chunks(index, &path, status, worker)?; + let index = self.open_fixed_reader(&path); + match index { + Ok(index) => self.index_mark_used_chunks(index, &path, status, worker)?, + Err(err) => handle_notfound(err, &path)? + } } else if archive_type == ArchiveType::DynamicIndex { - let index = self.open_dynamic_reader(&path)?; - self.index_mark_used_chunks(index, &path, status, worker)?; + let index = self.open_dynamic_reader(&path); + match index { + Ok(index) => self.index_mark_used_chunks(index, &path, status, worker)?, + Err(err) => handle_notfound(err, &path)? + } } } done += 1; @@ -506,7 +539,20 @@ impl DataStore { crate::task_log!(worker, "Start GC phase1 (mark used chunks)"); - self.mark_used_chunks(&mut gc_status, worker)?; + { + let mut guard = self.removed_during_gc.lock().unwrap(); + guard.0 = true; + } + let mark_res = self.mark_used_chunks(&mut gc_status, worker); + { + let mut guard = self.removed_during_gc.lock().unwrap(); + guard.0 = false; + guard.1.clear(); + } + + if let Err(err) = mark_res { + bail!(err); + } crate::task_log!(worker, "Start GC phase2 (sweep unused chunks)"); self.chunk_store.sweep_unused_chunks( diff --git a/src/backup/dynamic_index.rs b/src/backup/dynamic_index.rs index 8731a418..0776209c 100644 --- a/src/backup/dynamic_index.rs +++ b/src/backup/dynamic_index.rs @@ -86,7 +86,10 @@ impl DynamicIndexReader { File::open(path) .map_err(Error::from) .and_then(Self::new) - .map_err(|err| format_err!("Unable to open dynamic index {:?} - {}", path, err)) + .map_err(|err| { + let msg = format!("Unable to open dynamic index {:?} - {}", path, &err); + err.context(msg) + }) } pub fn new(mut file: std::fs::File) -> Result { diff --git a/src/backup/fixed_index.rs b/src/backup/fixed_index.rs index eff50055..6a1e66f5 100644 --- a/src/backup/fixed_index.rs +++ b/src/backup/fixed_index.rs @@ -1,4 
+1,4 @@ -use anyhow::{bail, format_err, Error}; +use anyhow::{bail, Error}; use std::io::{Seek, SeekFrom}; use super::chunk_stat::*; @@ -61,7 +61,10 @@ impl FixedIndexReader { File::open(path) .map_err(Error::from) .and_then(|file| Self::new(file)) - .map_err(|err| format_err!("Unable to open fixed index {:?} - {}", path, err)) + .map_err(|err| { + let msg = format!("Unable to open fixed index {:?} - {}", path, &err); + err.context(msg) + }) } pub fn new(mut file: std::fs::File) -> Result { -- 2.20.1