all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH proxmox-backup 5/5] fix #5331: garbage collection: avoid multiple chunk atime updates
Date: Fri, 21 Feb 2025 15:01:10 +0100	[thread overview]
Message-ID: <20250221140110.377328-6-c.ebner@proxmox.com> (raw)
In-Reply-To: <20250221140110.377328-1-c.ebner@proxmox.com>

Reduce the number of atime updates on the same chunk by logically
iterating over image index files, following the incremental backup
logic. By inserting paths for encountered images during
`list_images` using the GroupedImageList structure, the iteration
now visits the same image filenames within the same namespace and
group in an order based on the snapshot timestamp. For each image,
keep track of the encountered chunk digests, and remember these as
seen for the next snapshot. Chunks which have been encountered in the
previous image index but are no longer present are removed from
the list after each image, in order to reduce the memory footprint.

Fixes: https://bugzilla.proxmox.com/show_bug.cgi?id=5331
Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
 pbs-datastore/src/datastore.rs | 70 ++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index f9047820a..992812269 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -971,14 +971,14 @@ impl DataStore {
         ListGroups::new(Arc::clone(self), ns)?.collect()
     }
 
-    fn list_images(&self) -> Result<Vec<PathBuf>, Error> {
+    fn list_images(&self) -> Result<GroupedImageList, Error> {
         let base = self.base_path();
 
-        let mut list = vec![];
+        let mut list = GroupedImageList::new();
 
         use walkdir::WalkDir;
 
-        let walker = WalkDir::new(base).into_iter();
+        let walker = WalkDir::new(&base).into_iter();
 
         // make sure we skip .chunks (and other hidden files to keep it simple)
         fn is_hidden(entry: &walkdir::DirEntry) -> bool {
@@ -1022,7 +1022,7 @@ impl DataStore {
                 if archive_type == ArchiveType::FixedIndex
                     || archive_type == ArchiveType::DynamicIndex
                 {
-                    list.push(path);
+                    list.insert(&path, &base)?;
                 }
             }
         }
@@ -1035,6 +1035,7 @@ impl DataStore {
         &self,
         index: I,
         file_name: &Path, // only used for error reporting
+        touched_chunks: &mut TouchedChunks,
         status: &mut GarbageCollectionStatus,
         worker: &dyn WorkerTaskContext,
     ) -> Result<(), Error> {
@@ -1045,6 +1046,12 @@ impl DataStore {
             worker.check_abort()?;
             worker.fail_on_shutdown()?;
             let digest = index.index_digest(pos).unwrap();
+
+            // Avoid multiple expensive atime updates by utimensat
+            if touched_chunks.insert(*digest) {
+                continue;
+            }
+
             if !self.inner.chunk_store.cond_touch_chunk(digest, false)? {
                 let hex = hex::encode(digest);
                 warn!(
@@ -1069,6 +1076,7 @@ impl DataStore {
     fn mark_used_chunks_do(
         &self,
         img: &Path,
+        touched_chunks: &mut TouchedChunks,
         status: &mut GarbageCollectionStatus,
         worker: &dyn WorkerTaskContext,
     ) -> Result<(), Error> {
@@ -1079,12 +1087,12 @@ impl DataStore {
                         let index = FixedIndexReader::new(file).map_err(|err| {
                             format_err!("can't read index '{}' - {err}", img.to_string_lossy())
                         })?;
-                        self.index_mark_used_chunks(index, img, status, worker)?;
+                        self.index_mark_used_chunks(index, img, touched_chunks, status, worker)?;
                     } else if archive_type == ArchiveType::DynamicIndex {
                         let index = DynamicIndexReader::new(file).map_err(|err| {
                             format_err!("can't read index '{}' - {err}", img.to_string_lossy())
                         })?;
-                        self.index_mark_used_chunks(index, img, status, worker)?;
+                        self.index_mark_used_chunks(index, img, touched_chunks, status, worker)?;
                     }
                 }
             }
@@ -1099,38 +1107,44 @@ impl DataStore {
         status: &mut GarbageCollectionStatus,
         worker: &dyn WorkerTaskContext,
     ) -> Result<(), Error> {
-        let image_list = self.list_images()?;
+        let mut image_list = self.list_images()?;
         let image_count = image_list.len();
 
         let mut last_percentage: usize = 0;
 
-        let mut strange_paths_count: u64 = 0;
-
-        for (i, img) in image_list.into_iter().enumerate() {
-            worker.check_abort()?;
-            worker.fail_on_shutdown()?;
-
-            if let Some(backup_dir_path) = img.parent() {
-                let backup_dir_path = backup_dir_path.strip_prefix(self.base_path())?;
-                if let Some(backup_dir_str) = backup_dir_path.to_str() {
-                    if pbs_api_types::parse_ns_and_snapshot(backup_dir_str).is_err() {
-                        strange_paths_count += 1;
+        // Optimize for avoiding updates of chunks atime in same group with same
+        // image names multiple times.
+        let mut touched_chunks = TouchedChunks::new();
+        let mut processed_images = 0;
+        for (_group, images) in image_list.groups.iter_mut() {
+            for (_image, snapshots) in images.iter_mut() {
+                // Sort by snapshot timestamp to iterate over consecutive snapshots for each image.
+                snapshots.sort_by(|a, b| a.0.cmp(&b.0));
+                for (_timestamp, img) in snapshots {
+                    worker.check_abort()?;
+                    worker.fail_on_shutdown()?;
+
+                    self.mark_used_chunks_do(img, &mut touched_chunks, status, worker)?;
+                    touched_chunks.reset();
+
+                    let percentage = (processed_images + 1) * 100 / image_count;
+                    if percentage > last_percentage {
+                        info!(
+                            "marked {percentage}% ({} of {image_count} index files)",
+                            processed_images + 1,
+                        );
+                        last_percentage = percentage;
                     }
+                    processed_images += 1;
                 }
             }
+        }
 
-            self.mark_used_chunks_do(&img, status, worker)?;
-
-            let percentage = (i + 1) * 100 / image_count;
-            if percentage > last_percentage {
-                info!(
-                    "marked {percentage}% ({} of {image_count} index files)",
-                    i + 1,
-                );
-                last_percentage = percentage;
-            }
+        for img in &image_list.strange_path_images {
+            self.mark_used_chunks_do(img, &mut touched_chunks, status, worker)?;
         }
 
+        let strange_paths_count = image_list.strange_path_images.len();
         if strange_paths_count > 0 {
             info!(
                 "found (and marked) {strange_paths_count} index files outside of expected directory scheme"
-- 
2.39.5



_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel


  parent reply	other threads:[~2025-02-21 14:02 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-21 14:01 [pbs-devel] [PATCH proxmox-backup 0/5] GC: avoid multiple " Christian Ebner
2025-02-21 14:01 ` [pbs-devel] [PATCH proxmox-backup 1/5] datastore: restrict datastores list_images method scope to module Christian Ebner
2025-02-21 14:01 ` [pbs-devel] [PATCH proxmox-backup 2/5] garbage collection: refactor archive type based chunk marking logic Christian Ebner
2025-02-21 14:01 ` [pbs-devel] [PATCH proxmox-backup 3/5] garbage collection: add structure for optimized image iteration Christian Ebner
2025-03-05 13:47   ` Fabian Grünbichler
2025-03-07  8:24     ` Christian Ebner
2025-03-07  8:53       ` Fabian Grünbichler
2025-03-07  8:59         ` Christian Ebner
2025-02-21 14:01 ` [pbs-devel] [PATCH proxmox-backup 4/5] garbage collection: allow to keep track of already touched chunks Christian Ebner
2025-02-21 14:01 ` Christian Ebner [this message]
2025-02-21 15:35 ` [pbs-devel] [PATCH proxmox-backup 0/5] GC: avoid multiple atime updates Roland
2025-02-21 15:49   ` Christian Ebner
2025-02-22 17:50     ` Roland
2025-03-10 11:18 ` Christian Ebner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250221140110.377328-6-c.ebner@proxmox.com \
    --to=c.ebner@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal