From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <s.reiter@proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id 4A3CD60882
 for <pbs-devel@lists.proxmox.com>; Thu, 15 Oct 2020 12:49:28 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id 44D8011FD4
 for <pbs-devel@lists.proxmox.com>; Thu, 15 Oct 2020 12:49:28 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com
 [212.186.127.180])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS id F3A4711FAA
 for <pbs-devel@lists.proxmox.com>; Thu, 15 Oct 2020 12:49:25 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1])
 by proxmox-new.maurer-it.com (Proxmox) with ESMTP id B464B45D9A
 for <pbs-devel@lists.proxmox.com>; Thu, 15 Oct 2020 12:49:25 +0200 (CEST)
From: Stefan Reiter <s.reiter@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Thu, 15 Oct 2020 12:49:13 +0200
Message-Id: <20201015104916.21170-2-s.reiter@proxmox.com>
X-Mailer: git-send-email 2.20.1
In-Reply-To: <20201015104916.21170-1-s.reiter@proxmox.com>
References: <20201015104916.21170-1-s.reiter@proxmox.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
 AWL -0.036 Adjusted score from AWL reputation of From: address
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 RCVD_IN_DNSWL_MED        -2.3 Sender listed at https://www.dnswl.org/,
 medium trust
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
 URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See
 http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more
 information. [datastore.rs]
Subject: [pbs-devel] [PATCH v2 proxmox-backup 1/4] gc: avoid race between
 phase1 and forget/prune
X-BeenThere: pbs-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Backup Server development discussion
 <pbs-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pbs-devel/>
List-Post: <mailto:pbs-devel@lists.proxmox.com>
List-Help: <mailto:pbs-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Thu, 15 Oct 2020 10:49:28 -0000

...by saving the paths of all forgotten snapshots into a HashSet when we
detect a GC phase1 currently running. If GC then hits a NotFound while
opening an index file, it can check if the snapshot was simply forgotten
behind its back, or if an actual error occurred and it needs to abort
(since it might otherwise delete still referenced chunks, if the error is
transient and the index file is still there).

We open the index file directly via std::fs::File::open instead of going
through open_{fixed,dynamic}_reader, otherwise the original std::io::Error
gets lost and we can't check for NotFound.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---

v2:
* use File::open directly and catch std::io::Error that way, avoid Error.context

 src/backup/datastore.rs | 62 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 7 deletions(-)

diff --git a/src/backup/datastore.rs b/src/backup/datastore.rs
index 7dd2624c..ca8ca438 100644
--- a/src/backup/datastore.rs
+++ b/src/backup/datastore.rs
@@ -36,6 +36,9 @@ pub struct DataStore {
     chunk_store: Arc<ChunkStore>,
     gc_mutex: Mutex<bool>,
     last_gc_status: Mutex<GarbageCollectionStatus>,
+
+    // bool indicates if phase1 is currently active
+    removed_during_gc: Mutex<(bool, HashSet<PathBuf>)>,
 }
 
 impl DataStore {
@@ -82,6 +85,7 @@ impl DataStore {
             chunk_store: Arc::new(chunk_store),
             gc_mutex: Mutex::new(false),
             last_gc_status: Mutex::new(gc_status),
+            removed_during_gc: Mutex::new((false, HashSet::new())),
         })
     }
 
@@ -236,6 +240,14 @@ impl DataStore {
             _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?;
         }
 
+        // Acquire lock and keep it during remove operation, so there's no
+        // chance for a race between adding to the hash and actually removing
+        // the dir (phase1 might otherwise start in-between)
+        let mut removed_guard = self.removed_during_gc.lock().unwrap();
+        if removed_guard.0 {
+            removed_guard.1.insert(self.snapshot_path(&backup_dir));
+        }
+
         log::info!("removing backup snapshot {:?}", full_path);
         std::fs::remove_dir_all(&full_path)
             .map_err(|err| {
@@ -461,12 +473,35 @@ impl DataStore {
             tools::fail_on_shutdown()?;
 
             if let Ok(archive_type) = archive_type(&path) {
-                if archive_type == ArchiveType::FixedIndex {
-                    let index = self.open_fixed_reader(&path)?;
-                    self.index_mark_used_chunks(index, &path, status, worker)?;
-                } else if archive_type == ArchiveType::DynamicIndex {
-                    let index = self.open_dynamic_reader(&path)?;
-                    self.index_mark_used_chunks(index, &path, status, worker)?;
+                let full_path =  self.chunk_store.relative_path(&path);
+
+                match std::fs::File::open(&full_path) {
+                    Ok(file) => {
+                        if archive_type == ArchiveType::FixedIndex {
+                            let index = FixedIndexReader::new(file)?;
+                            self.index_mark_used_chunks(index, &path, status, worker)?;
+                        } else if archive_type == ArchiveType::DynamicIndex {
+                            let index = DynamicIndexReader::new(file)?;
+                            self.index_mark_used_chunks(index, &path, status, worker)?;
+                        }
+                    },
+                    Err(err) => {
+                        if err.kind() == std::io::ErrorKind::NotFound {
+                            let (_, removed_hash) = &*self.removed_during_gc.lock().unwrap();
+                            let backup_dir = path.parent();
+                            if backup_dir.is_some() && removed_hash.contains(backup_dir.unwrap()) {
+                                // index file not found, but we know that it was deleted by
+                                // a concurrent 'forget', so we can safely ignore
+                            } else {
+                                bail!(
+                                    "index file not found but hasn't been removed by forget/prune, aborting GC - {}",
+                                    err
+                                )
+                            }
+                        } else {
+                            bail!(err)
+                        }
+                    }
                 }
             }
             done += 1;
@@ -512,7 +547,20 @@ impl DataStore {
 
             crate::task_log!(worker, "Start GC phase1 (mark used chunks)");
 
-            self.mark_used_chunks(&mut gc_status, worker)?;
+            {
+                let mut guard = self.removed_during_gc.lock().unwrap();
+                guard.0 = true;
+            }
+            let mark_res = self.mark_used_chunks(&mut gc_status, worker);
+            {
+                let mut guard = self.removed_during_gc.lock().unwrap();
+                guard.0 = false;
+                guard.1.clear();
+            }
+
+            if let Err(err) = mark_res {
+                bail!(err);
+            }
 
             crate::task_log!(worker, "Start GC phase2 (sweep unused chunks)");
             self.chunk_store.sweep_unused_chunks(
-- 
2.20.1