public inbox for pbs-devel@lists.proxmox.com
 help / color / mirror / Atom feed
From: Stefan Reiter <s.reiter@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH proxmox-backup 06/11] gc: avoid race between phase1 and forget/prune
Date: Wed, 14 Oct 2020 14:16:34 +0200	[thread overview]
Message-ID: <20201014121639.25276-7-s.reiter@proxmox.com> (raw)
In-Reply-To: <20201014121639.25276-1-s.reiter@proxmox.com>

...by saving all forgotten snapshots into a HashSet when we detect a GC
phase1 currently running. If GC then hits a NotFound, it can check if
the snapshot was simply forgotten behind its back, or if an actual error
occurred and it needs to abort (since it might delete still referenced
chunks, if the error is transient and the index file is still there).

We have to attach the error message in {fixed,dynamic}_index via the
.context() method, otherwise the original std::io::Error gets lost and
we can't check for NotFound.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---

@Dietmar: I went with the anyhow .context() method again - I know we talked
about this, but using a std::io::Error directly also doesn't solve the problem,
since we do a (io_)format_err(...) which removes the context again. And if we
return the IO error directly, we'd have to add that text formatting to all call
sites, or drop it entirely, both of which I rather dislike.

 src/backup/datastore.rs     | 56 +++++++++++++++++++++++++++++++++----
 src/backup/dynamic_index.rs |  5 +++-
 src/backup/fixed_index.rs   |  7 +++--
 3 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/src/backup/datastore.rs b/src/backup/datastore.rs
index a77c5f1d..d1758408 100644
--- a/src/backup/datastore.rs
+++ b/src/backup/datastore.rs
@@ -37,6 +37,9 @@ pub struct DataStore {
     chunk_store: Arc<ChunkStore>,
     gc_mutex: Mutex<bool>,
     last_gc_status: Mutex<GarbageCollectionStatus>,
+
+    // bool indicates if phase1 is currently active
+    removed_during_gc: Mutex<(bool, HashSet<PathBuf>)>,
 }
 
 impl DataStore {
@@ -83,6 +86,7 @@ impl DataStore {
             chunk_store: Arc::new(chunk_store),
             gc_mutex: Mutex::new(false),
             last_gc_status: Mutex::new(gc_status),
+            removed_during_gc: Mutex::new((false, HashSet::new())),
         })
     }
 
@@ -230,6 +234,14 @@ impl DataStore {
             _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?;
         }
 
+        // Acquire lock and keep it during remove operation, so there's no
+        // chance for a race between adding to the hash and actually removing
+        // the dir (phase1 might otherwise start in-between)
+        let mut removed_guard = self.removed_during_gc.lock().unwrap();
+        if removed_guard.0 {
+            removed_guard.1.insert(self.snapshot_path(&backup_dir));
+        }
+
         log::info!("removing backup snapshot {:?}", full_path);
         std::fs::remove_dir_all(&full_path)
             .map_err(|err| {
@@ -449,6 +461,21 @@ impl DataStore {
 
         let mut last_percentage: usize = 0;
 
+        let handle_notfound = |err: Error, path: &Path| {
+            if let Some(ioerr) = err.downcast_ref::<std::io::Error>() {
+                if ioerr.kind() == std::io::ErrorKind::NotFound {
+                    let (_, removed_hash) = &*self.removed_during_gc.lock().unwrap();
+                    let backup_dir = path.parent();
+                    if backup_dir.is_some() && removed_hash.contains(backup_dir.unwrap()) {
+                        // index file not found, but we know that it was deleted by
+                        // a concurrent 'forget', so we can safely ignore
+                        return Ok(())
+                    }
+                }
+            }
+            Err(err)
+        };
+
         for path in image_list {
 
             worker.check_abort()?;
@@ -456,11 +483,17 @@ impl DataStore {
 
             if let Ok(archive_type) = archive_type(&path) {
                 if archive_type == ArchiveType::FixedIndex {
-                    let index = self.open_fixed_reader(&path)?;
-                    self.index_mark_used_chunks(index, &path, status, worker)?;
+                    let index = self.open_fixed_reader(&path);
+                    match index {
+                        Ok(index) => self.index_mark_used_chunks(index, &path, status, worker)?,
+                        Err(err) => handle_notfound(err, &path)?
+                    }
                 } else if archive_type == ArchiveType::DynamicIndex {
-                    let index = self.open_dynamic_reader(&path)?;
-                    self.index_mark_used_chunks(index, &path, status, worker)?;
+                    let index = self.open_dynamic_reader(&path);
+                    match index {
+                        Ok(index) => self.index_mark_used_chunks(index, &path, status, worker)?,
+                        Err(err) => handle_notfound(err, &path)?
+                    }
                 }
             }
             done += 1;
@@ -506,7 +539,20 @@ impl DataStore {
 
             crate::task_log!(worker, "Start GC phase1 (mark used chunks)");
 
-            self.mark_used_chunks(&mut gc_status, worker)?;
+            {
+                let mut guard = self.removed_during_gc.lock().unwrap();
+                guard.0 = true;
+            }
+            let mark_res = self.mark_used_chunks(&mut gc_status, worker);
+            {
+                let mut guard = self.removed_during_gc.lock().unwrap();
+                guard.0 = false;
+                guard.1.clear();
+            }
+
+            if let Err(err) = mark_res {
+                bail!(err);
+            }
 
             crate::task_log!(worker, "Start GC phase2 (sweep unused chunks)");
             self.chunk_store.sweep_unused_chunks(
diff --git a/src/backup/dynamic_index.rs b/src/backup/dynamic_index.rs
index 8731a418..0776209c 100644
--- a/src/backup/dynamic_index.rs
+++ b/src/backup/dynamic_index.rs
@@ -86,7 +86,10 @@ impl DynamicIndexReader {
         File::open(path)
             .map_err(Error::from)
             .and_then(Self::new)
-            .map_err(|err| format_err!("Unable to open dynamic index {:?} - {}", path, err))
+            .map_err(|err| {
+                let msg = format!("Unable to open dynamic index {:?} - {}", path, &err);
+                err.context(msg)
+            })
     }
 
     pub fn new(mut file: std::fs::File) -> Result<Self, Error> {
diff --git a/src/backup/fixed_index.rs b/src/backup/fixed_index.rs
index eff50055..6a1e66f5 100644
--- a/src/backup/fixed_index.rs
+++ b/src/backup/fixed_index.rs
@@ -1,4 +1,4 @@
-use anyhow::{bail, format_err, Error};
+use anyhow::{bail, Error};
 use std::io::{Seek, SeekFrom};
 
 use super::chunk_stat::*;
@@ -61,7 +61,10 @@ impl FixedIndexReader {
         File::open(path)
             .map_err(Error::from)
             .and_then(|file| Self::new(file))
-            .map_err(|err| format_err!("Unable to open fixed index {:?} - {}", path, err))
+            .map_err(|err| {
+                let msg = format!("Unable to open fixed index {:?} - {}", path, &err);
+                err.context(msg)
+            })
     }
 
     pub fn new(mut file: std::fs::File) -> Result<Self, Error> {
-- 
2.20.1





  parent reply	other threads:[~2020-10-14 12:16 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-14 12:16 [pbs-devel] [PATCH 00/11] Locking and rustdoc improvements Stefan Reiter
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 01/11] prune: respect snapshot flock Stefan Reiter
2020-10-15  5:11   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 02/11] prune: never fail, just warn about failed removals Stefan Reiter
2020-10-15  5:12   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 03/11] backup: use shared flock for base snapshot Stefan Reiter
2020-10-15  5:12   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 04/11] reader: acquire shared flock on open snapshot Stefan Reiter
2020-10-15  5:13   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 05/11] verify: acquire shared snapshot flock and skip on error Stefan Reiter
2020-10-15  5:13   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` Stefan Reiter [this message]
2020-10-15  5:17   ` [pbs-devel] [PATCH proxmox-backup 06/11] gc: avoid race between phase1 and forget/prune Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 07/11] datastore: remove load_manifest_json Stefan Reiter
2020-10-15  5:28   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 08/11] datastore: add manifest locking Stefan Reiter
2020-10-15  5:25   ` Dietmar Maurer
2020-10-15  7:04     ` Fabian Grünbichler
2020-10-15  5:39   ` Dietmar Maurer
2020-10-15  7:53     ` Stefan Reiter
2020-10-15  5:43   ` Dietmar Maurer
2020-10-15  7:53     ` Stefan Reiter
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 09/11] datastore: remove individual snapshots before group Stefan Reiter
2020-10-15  5:51   ` [pbs-devel] applied: " Dietmar Maurer
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 10/11] rustdoc: add crate level doc Stefan Reiter
2020-10-14 12:16 ` [pbs-devel] [PATCH proxmox-backup 11/11] rustdoc: overhaul backup rustdoc and add locking table Stefan Reiter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201014121639.25276-7-s.reiter@proxmox.com \
    --to=s.reiter@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal