public inbox for pbs-devel@lists.proxmox.com
 help / color / mirror / Atom feed
From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [RFC proxmox-backup 19/20] fix #3174: archiver: reuse files with unchanged metadata
Date: Fri, 22 Sep 2023 09:16:20 +0200	[thread overview]
Message-ID: <20230922071621.12670-20-c.ebner@proxmox.com> (raw)
In-Reply-To: <20230922071621.12670-1-c.ebner@proxmox.com>

During pxar archive encoding, check regular files against the metadata
stored in the previous backup's catalog, if present.

Instead of re-encoding files with unchanged metadata and a file size above
a given threshold, mark the entries as appendix references in the
pxar archive and append the chunks containing the file payload in the
appendix.

Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
 pbs-client/src/pxar/create.rs             | 149 +++++++++++++++++++++-
 src/tape/file_formats/snapshot_archive.rs |   2 +-
 2 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index d6afc465..cb9af26f 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -24,7 +24,7 @@ use proxmox_io::vec;
 use proxmox_lang::c_str;
 use proxmox_sys::fs::{self, acl, xattr};
 
-use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader};
+use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader, DirEntryAttribute};
 use pbs_datastore::dynamic_index::{DynamicEntry, DynamicIndexReader};
 
 use crate::inject_reused_chunks::InjectChunks;
@@ -32,6 +32,8 @@ use crate::pxar::metadata::errno_is_unsupported;
 use crate::pxar::tools::assert_single_path_component;
 use crate::pxar::Flags;
 
+const MAX_FILE_SIZE: u64 = 128;
+
 /// Pxar options for creating a pxar archive/stream
 #[derive(Default)]
 pub struct PxarCreateOptions {
@@ -218,7 +220,14 @@ where
     archiver
         .archive_dir_contents(&mut encoder, source_dir, true)
         .await?;
-    encoder.finish().await?;
+
+    if archiver.inject.1.len() > 0 {
+        let (appendix_offset, appendix_size) = archiver.add_appendix(&mut encoder).await?;
+        encoder.finish(Some((appendix_offset, appendix_size))).await?;
+    } else {
+        encoder.finish(None).await?;
+    }
+
     Ok(())
 }
 
@@ -529,6 +538,132 @@ impl Archiver {
         Ok(())
     }
 
+    /// Call `encoder.add_appendix` and queue the reused chunks for injection.
+    ///
+    /// The chunks collected in `self.inject.1` are pushed to `self.forced_boundaries`
+    /// in batches, each carrying the value returned by `encoder.position_add` as
+    /// its boundary.
+    ///
+    /// Returns the offset given by `encoder.add_appendix` and the summed chunk ends.
+    async fn add_appendix<T: SeqWrite + Send>(
+        &mut self,
+        encoder: &mut Encoder<'_, T>,
+    ) -> Result<(LinkOffset, u64), Error> {
+        let total: u64 = self.inject.1.iter().map(|chunk| chunk.end()).sum();
+        let appendix_offset = encoder.add_appendix(total).await?;
+        let mut queue = self.forced_boundaries.lock().unwrap();
+        let mut boundary = encoder.position_add(0);
+
+        // Queue reused chunks in batches of 128 to not exceed upload post req size limit
+        for batch in self.inject.1.chunks(128) {
+            let size: usize = batch.iter().map(|chunk| chunk.end() as usize).sum();
+            queue.push_back(InjectChunks {
+                boundary,
+                chunks: batch.to_vec(),
+                size,
+            });
+            boundary = encoder.position_add(size as u64);
+        }
+
+        Ok((appendix_offset, total))
+    }
+
+    /// Reuse the chunks of a regular file from the previous backup if its size and
+    /// mtime match the previous catalog entry.
+    ///
+    /// Returns `Ok(true)` if an appendix reference was encoded for the file and
+    /// `Ok(false)` if the caller has to encode the file itself.
+    async fn reuse_if_metadata_unchanged<T: SeqWrite + Send>(
+        &mut self,
+        encoder: &mut Encoder<'_, T>,
+        c_file_name: &CStr,
+        metadata: &Metadata,
+        stat: &FileStat,
+    ) -> Result<bool, Error> {
+        let prev_ref = match self.previous_ref {
+            None => return Ok(false),
+            Some(ref mut prev_ref) => prev_ref,
+        };
+
+        // The previous catalog is queried with the archive name prepended to the path
+        let path = Path::new(prev_ref.archive_name.as_str()).join(&self.path);
+        let catalog_entry = prev_ref
+            .catalog
+            .lookup_recursive(path.as_os_str().as_bytes())?;
+
+        match catalog_entry.attr {
+            DirEntryAttribute::File {
+                size,
+                mtime,
+                link_offset,
+            } => {
+                let file_size = stat.st_size as u64;
+                if mtime == stat.st_mtime && size == file_size {
+                    if let Some(ref catalog) = self.catalog {
+                        catalog.lock().unwrap().add_file(
+                            c_file_name,
+                            file_size,
+                            stat.st_mtime,
+                            link_offset,
+                        )?;
+                    }
+
+                    // Filename header
+                    let mut metadata_bytes = std::mem::size_of::<pxar::format::Header>();
+                    // Filename payload
+                    metadata_bytes += std::mem::size_of_val(c_file_name);
+                    // Metadata with headers and payloads
+                    metadata_bytes += metadata.calculate_byte_len();
+                    // Payload header
+                    metadata_bytes += std::mem::size_of::<pxar::format::Header>();
+
+                    // Byte range of the whole entry, starting at its recorded offset
+                    let metadata_bytes = u64::try_from(metadata_bytes)?;
+                    let start = link_offset.raw();
+                    let end = start + metadata_bytes + file_size;
+                    let (indices, total_size, padding_start) =
+                        prev_ref.index.indices(start, end)?;
+
+                    let mut appendix_offset = self.inject.0 as u64 + padding_start;
+
+                    // Is the first chunk of this file the chunk that was queued last?
+                    let first_is_queued = matches!(
+                        (self.inject.1.last(), indices.first()),
+                        (Some(last), Some(first)) if first.digest() == last.digest()
+                    );
+                    if first_is_queued {
+                        // Already got that chunk, do not append it again and correct
+                        // appendix_offset to be relative to chunk before this one
+                        appendix_offset -= indices[0].end();
+                        // Append all following chunks, if any
+                        self.inject.0 += indices[1..]
+                            .iter()
+                            .fold(0, |sum, index| sum + index.end() as usize);
+                        self.inject.1.extend_from_slice(&indices[1..]);
+                    } else {
+                        // Nothing queued yet or the chunks differ, so append all of
+                        // them; skipping this leaves the reference without payload
+                        self.inject.0 += total_size;
+                        self.inject.1.extend_from_slice(&indices);
+                    }
+
+                    let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
+                    self.add_appendix_ref(encoder, file_name, metadata, appendix_offset, file_size)
+                        .await?;
+
+                    return Ok(true);
+                }
+            }
+            DirEntryAttribute::Hardlink => {
+                // Catalog contains a hardlink, but the hard link was not present in the current
+                // pxar archive. So be sure to reencode this file instead of reusing it.
+                return Ok(false);
+            }
+            _ => println!("Unexpected attribute type, expected 'File' or 'Hardlink'"),
+        }
+        Ok(false)
+    }
+
     async fn add_entry<T: SeqWrite + Send>(
         &mut self,
         encoder: &mut Encoder<'_, T>,
@@ -595,6 +730,14 @@ impl Archiver {
                 }
 
                 let file_size = stat.st_size as u64;
+                if file_size > MAX_FILE_SIZE
+                    && self
+                        .reuse_if_metadata_unchanged(encoder, c_file_name, &metadata, stat)
+                        .await?
+                {
+                    return Ok(());
+                }
+
                 let offset: LinkOffset = self
                     .add_regular_file(encoder, fd, file_name, &metadata, file_size)
                     .await?;
@@ -712,7 +855,7 @@ impl Archiver {
         self.fs_feature_flags = old_fs_feature_flags;
         self.current_st_dev = old_st_dev;
 
-        encoder.finish().await?;
+        encoder.finish(None).await?;
         result
     }
 
diff --git a/src/tape/file_formats/snapshot_archive.rs b/src/tape/file_formats/snapshot_archive.rs
index 252384b5..4bbf4727 100644
--- a/src/tape/file_formats/snapshot_archive.rs
+++ b/src/tape/file_formats/snapshot_archive.rs
@@ -88,7 +88,7 @@ pub fn tape_write_snapshot_archive<'a>(
                 proxmox_lang::io_bail!("file '{}' shrunk while reading", filename);
             }
         }
-        encoder.finish()?;
+        encoder.finish(None)?;
         Ok(())
     });
 
-- 
2.39.2





  parent reply	other threads:[~2023-09-22  7:17 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-22  7:16 [pbs-devel] [RFC pxar proxmox-backup 00/20] fix #3174: improve file-level backup Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 1/20] fix #3174: encoder: impl fn new for LinkOffset Christian Ebner
2023-09-27 12:08   ` Wolfgang Bumiller
2023-09-27 12:26     ` Christian Ebner
2023-09-28  6:49       ` Wolfgang Bumiller
2023-09-28  7:52         ` Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 2/20] fix #3174: decoder: factor out skip_bytes from skip_entry Christian Ebner
2023-09-27 11:32   ` Wolfgang Bumiller
2023-09-27 11:53     ` Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 3/20] fix #3174: decoder: impl skip_bytes for sync dec Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 4/20] fix #3174: metadata: impl fn to calc byte size Christian Ebner
2023-09-27 11:38   ` Wolfgang Bumiller
2023-09-27 11:55     ` Christian Ebner
2023-09-28  8:07       ` Christian Ebner
2023-09-28  9:00         ` Wolfgang Bumiller
2023-09-28  9:27           ` Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 5/20] fix #3174: enc/dec: impl PXAR_APPENDIX_REF entrytype Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 6/20] fix #3174: enc/dec: impl PXAR_APPENDIX entrytype Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 7/20] fix #3174: encoder: add helper to incr encoder pos Christian Ebner
2023-09-27 12:07   ` Wolfgang Bumiller
2023-09-27 12:20     ` Christian Ebner
2023-09-28  7:04       ` Wolfgang Bumiller
2023-09-28  7:50         ` Christian Ebner
2023-09-28  8:32           ` Wolfgang Bumiller
2023-09-22  7:16 ` [pbs-devel] [RFC pxar 8/20] fix #3174: enc/dec: impl PXAR_APPENDIX_TAIL entrytype Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 09/20] fix #3174: index: add fn index list from start/end-offsets Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 10/20] fix #3174: index: add fn digest for DynamicEntry Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 11/20] fix #3174: api: double catalog upload size Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 12/20] fix #3174: catalog: incl pxar archives file offset Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 13/20] fix #3174: archiver/extractor: impl appendix ref Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 14/20] fix #3174: extractor: impl seq restore from appendix Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 15/20] fix #3174: archiver: store ref to previous backup Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 16/20] fix #3174: upload stream: impl reused chunk injector Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 17/20] fix #3174: chunker: add forced boundaries Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 18/20] fix #3174: backup writer: inject queued chunk in upload steam Christian Ebner
2023-09-22  7:16 ` Christian Ebner [this message]
2023-09-26  7:01   ` [pbs-devel] [RFC proxmox-backup 19/20] fix #3174: archiver: reuse files with unchanged metadata Christian Ebner
2023-09-22  7:16 ` [pbs-devel] [RFC proxmox-backup 20/20] fix #3174: client: Add incremental flag to backup creation Christian Ebner
2023-09-26  7:11   ` Christian Ebner
2023-09-26  7:15 ` [pbs-devel] [RFC pxar proxmox-backup 00/20] fix #3174: improve file-level backup Christian Ebner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230922071621.12670-20-c.ebner@proxmox.com \
    --to=c.ebner@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal