From: Christian Ebner <c.ebner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [RFC proxmox-backup 19/20] fix #3174: archiver: reuse files with unchanged metadata
Date: Fri, 22 Sep 2023 09:16:20 +0200 [thread overview]
Message-ID: <20230922071621.12670-20-c.ebner@proxmox.com> (raw)
In-Reply-To: <20230922071621.12670-1-c.ebner@proxmox.com>
During pxar archive encoding, check regular files against the metadata
stored in the previous backup's catalog, if present.
Instead of re-encoding files with unchanged metadata and a file size above
a given threshold, mark the entries as appendix references in the
pxar archive and append the chunks containing the file payload to the
appendix.
Signed-off-by: Christian Ebner <c.ebner@proxmox.com>
---
pbs-client/src/pxar/create.rs | 149 +++++++++++++++++++++-
src/tape/file_formats/snapshot_archive.rs | 2 +-
2 files changed, 147 insertions(+), 4 deletions(-)
diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
index d6afc465..cb9af26f 100644
--- a/pbs-client/src/pxar/create.rs
+++ b/pbs-client/src/pxar/create.rs
@@ -24,7 +24,7 @@ use proxmox_io::vec;
use proxmox_lang::c_str;
use proxmox_sys::fs::{self, acl, xattr};
-use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader};
+use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader, DirEntryAttribute};
use pbs_datastore::dynamic_index::{DynamicEntry, DynamicIndexReader};
use crate::inject_reused_chunks::InjectChunks;
@@ -32,6 +32,8 @@ use crate::pxar::metadata::errno_is_unsupported;
use crate::pxar::tools::assert_single_path_component;
use crate::pxar::Flags;
+const MAX_FILE_SIZE: u64 = 128;
+
/// Pxar options for creating a pxar archive/stream
#[derive(Default)]
pub struct PxarCreateOptions {
@@ -218,7 +220,14 @@ where
archiver
.archive_dir_contents(&mut encoder, source_dir, true)
.await?;
- encoder.finish().await?;
+
+ if archiver.inject.1.len() > 0 {
+ let (appendix_offset, appendix_size) = archiver.add_appendix(&mut encoder).await?;
+ encoder.finish(Some((appendix_offset, appendix_size))).await?;
+ } else {
+ encoder.finish(None).await?;
+ }
+
Ok(())
}
@@ -529,6 +538,132 @@ impl Archiver {
Ok(())
}
+ /// Queues all reused chunks collected in `self.inject.1` for injection and registers
+ /// the appendix with the encoder.
+ ///
+ /// Returns the offset handed back by `encoder.add_appendix` together with the sum of
+ /// the `end()` values of all queued chunks.
+ ///
+ /// # Errors
+ ///
+ /// Propagates any error returned by `encoder.add_appendix`.
+ async fn add_appendix<T: SeqWrite + Send>(
+     &mut self,
+     encoder: &mut Encoder<'_, T>,
+ ) -> Result<(LinkOffset, u64), Error> {
+     let total: u64 = self.inject.1.iter().map(|chunk| chunk.end()).sum();
+     let appendix_offset = encoder.add_appendix(total).await?;
+     let mut boundaries = self.forced_boundaries.lock().unwrap();
+     // NOTE(review): `position_add(0)` is presumably used to read the current encoder
+     // position without advancing it - confirm against the encoder implementation.
+     let mut position = encoder.position_add(0);
+
+     // Inject reused chunks in batches of 128 to not exceed upload post req size limit
+     for batch in self.inject.1.chunks(128) {
+         let size: usize = batch.iter().map(|chunk| chunk.end() as usize).sum();
+         boundaries.push_back(InjectChunks {
+             boundary: position,
+             chunks: batch.to_vec(),
+             size,
+         });
+         position = encoder.position_add(size as u64);
+     }
+
+     Ok((appendix_offset, total))
+ }
+
+ /// Tries to reference a regular file's payload from the previous backup instead of
+ /// re-encoding it.
+ ///
+ /// The current path is looked up in the previous backup's catalog. If the entry found
+ /// is a regular file whose size and mtime equal the values in `stat`, the file is
+ /// added to the current catalog (if one is set), the chunks of the previous index
+ /// covering the entry are queued in `self.inject`, and an appendix reference is
+ /// encoded in place of the file contents.
+ ///
+ /// Returns `Ok(true)` if an appendix reference was encoded, and `Ok(false)` if the
+ /// caller has to encode the file the regular way.
+ ///
+ /// # Errors
+ ///
+ /// Propagates errors from the catalog lookup, the catalog writer, the index lookup,
+ /// the byte count conversion and the encoder.
+ async fn reuse_if_metadata_unchanged<T: SeqWrite + Send>(
+     &mut self,
+     encoder: &mut Encoder<'_, T>,
+     c_file_name: &CStr,
+     metadata: &Metadata,
+     stat: &FileStat,
+ ) -> Result<bool, Error> {
+     let prev_ref = match self.previous_ref {
+         None => return Ok(false),
+         Some(ref mut prev_ref) => prev_ref,
+     };
+
+     let path = Path::new(prev_ref.archive_name.as_str()).join(&self.path);
+     let catalog_entry = prev_ref
+         .catalog
+         .lookup_recursive(path.as_os_str().as_bytes())?;
+
+     match catalog_entry.attr {
+         DirEntryAttribute::File {
+             size,
+             mtime,
+             link_offset,
+         } => {
+             let file_size = stat.st_size as u64;
+             if mtime != stat.st_mtime || size != file_size {
+                 // Metadata changed since the previous backup, re-encode the file.
+                 return Ok(false);
+             }
+
+             if let Some(ref catalog) = self.catalog {
+                 catalog.lock().unwrap().add_file(
+                     c_file_name,
+                     file_size,
+                     stat.st_mtime,
+                     link_offset,
+                 )?;
+             }
+
+             // Filename header
+             let mut metadata_bytes = std::mem::size_of::<pxar::format::Header>();
+             // Filename payload
+             metadata_bytes += std::mem::size_of_val(c_file_name);
+             // Metadata with headers and payloads
+             metadata_bytes += metadata.calculate_byte_len();
+             // Payload header
+             metadata_bytes += std::mem::size_of::<pxar::format::Header>();
+
+             let metadata_bytes = u64::try_from(metadata_bytes)?;
+             let start = link_offset.raw();
+             let end = start + metadata_bytes + file_size;
+             let (indices, total_size, padding_start) = prev_ref.index.indices(start, end)?;
+
+             let mut appendix_offset = self.inject.0 as u64 + padding_start;
+
+             // If the first chunk of this file is the chunk queued last, it must not be
+             // queued a second time. Remember its `end()` value in that case.
+             let shared_first_chunk = match (self.inject.1.last(), indices.first()) {
+                 (Some(queued_last), Some(new_first))
+                     if new_first.digest() == queued_last.digest() =>
+                 {
+                     Some(new_first.end())
+                 }
+                 _ => None,
+             };
+
+             match shared_first_chunk {
+                 Some(first_chunk_len) => {
+                     // Already got that chunk, do not append it again and correct
+                     // appendix_offset to be relative to chunk before this one
+                     appendix_offset -= first_chunk_len;
+                     // `indices` is non-empty here, so slicing from 1 cannot panic; the
+                     // slice is empty if the file lives entirely in the shared chunk.
+                     let remaining = &indices[1..];
+                     self.inject.0 += remaining
+                         .iter()
+                         .map(|index| index.end() as usize)
+                         .sum::<usize>();
+                     self.inject.1.extend_from_slice(remaining);
+                 }
+                 None => {
+                     // Nothing queued yet, or the last queued chunk is a different one:
+                     // all chunks covering this file have to be queued. Previously the
+                     // differing-digest case queued nothing, leaving the appendix
+                     // reference pointing at payload that was never injected.
+                     self.inject.0 += total_size;
+                     self.inject.1.extend_from_slice(&indices);
+                 }
+             }
+
+             let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
+             let _offset = self
+                 .add_appendix_ref(encoder, file_name, metadata, appendix_offset, file_size)
+                 .await?;
+
+             return Ok(true);
+         }
+         DirEntryAttribute::Hardlink => {
+             // Catalog contains a hardlink, but the hard link was not present in the current
+             // pxar archive. So be sure to reencode this file instead of reusing it.
+             return Ok(false);
+         }
+         _ => println!("Unexpected attribute type, expected 'File' or 'Hardlink'"),
+     }
+     Ok(false)
+ }
+
async fn add_entry<T: SeqWrite + Send>(
&mut self,
encoder: &mut Encoder<'_, T>,
@@ -595,6 +730,14 @@ impl Archiver {
}
let file_size = stat.st_size as u64;
+ if file_size > MAX_FILE_SIZE
+ && self
+ .reuse_if_metadata_unchanged(encoder, c_file_name, &metadata, stat)
+ .await?
+ {
+ return Ok(());
+ }
+
let offset: LinkOffset = self
.add_regular_file(encoder, fd, file_name, &metadata, file_size)
.await?;
@@ -712,7 +855,7 @@ impl Archiver {
self.fs_feature_flags = old_fs_feature_flags;
self.current_st_dev = old_st_dev;
- encoder.finish().await?;
+ encoder.finish(None).await?;
result
}
diff --git a/src/tape/file_formats/snapshot_archive.rs b/src/tape/file_formats/snapshot_archive.rs
index 252384b5..4bbf4727 100644
--- a/src/tape/file_formats/snapshot_archive.rs
+++ b/src/tape/file_formats/snapshot_archive.rs
@@ -88,7 +88,7 @@ pub fn tape_write_snapshot_archive<'a>(
proxmox_lang::io_bail!("file '{}' shrunk while reading", filename);
}
}
- encoder.finish()?;
+ encoder.finish(None)?;
Ok(())
});
--
2.39.2
next prev parent reply other threads:[~2023-09-22 7:17 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-09-22 7:16 [pbs-devel] [RFC pxar proxmox-backup 00/20] fix #3174: improve file-level backup Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 1/20] fix #3174: encoder: impl fn new for LinkOffset Christian Ebner
2023-09-27 12:08 ` Wolfgang Bumiller
2023-09-27 12:26 ` Christian Ebner
2023-09-28 6:49 ` Wolfgang Bumiller
2023-09-28 7:52 ` Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 2/20] fix #3174: decoder: factor out skip_bytes from skip_entry Christian Ebner
2023-09-27 11:32 ` Wolfgang Bumiller
2023-09-27 11:53 ` Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 3/20] fix #3174: decoder: impl skip_bytes for sync dec Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 4/20] fix #3174: metadata: impl fn to calc byte size Christian Ebner
2023-09-27 11:38 ` Wolfgang Bumiller
2023-09-27 11:55 ` Christian Ebner
2023-09-28 8:07 ` Christian Ebner
2023-09-28 9:00 ` Wolfgang Bumiller
2023-09-28 9:27 ` Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 5/20] fix #3174: enc/dec: impl PXAR_APPENDIX_REF entrytype Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 6/20] fix #3174: enc/dec: impl PXAR_APPENDIX entrytype Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 7/20] fix #3174: encoder: add helper to incr encoder pos Christian Ebner
2023-09-27 12:07 ` Wolfgang Bumiller
2023-09-27 12:20 ` Christian Ebner
2023-09-28 7:04 ` Wolfgang Bumiller
2023-09-28 7:50 ` Christian Ebner
2023-09-28 8:32 ` Wolfgang Bumiller
2023-09-22 7:16 ` [pbs-devel] [RFC pxar 8/20] fix #3174: enc/dec: impl PXAR_APPENDIX_TAIL entrytype Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 09/20] fix #3174: index: add fn index list from start/end-offsets Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 10/20] fix #3174: index: add fn digest for DynamicEntry Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 11/20] fix #3174: api: double catalog upload size Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 12/20] fix #3174: catalog: incl pxar archives file offset Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 13/20] fix #3174: archiver/extractor: impl appendix ref Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 14/20] fix #3174: extractor: impl seq restore from appendix Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 15/20] fix #3174: archiver: store ref to previous backup Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 16/20] fix #3174: upload stream: impl reused chunk injector Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 17/20] fix #3174: chunker: add forced boundaries Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 18/20] fix #3174: backup writer: inject queued chunk in upload steam Christian Ebner
2023-09-22 7:16 ` Christian Ebner [this message]
2023-09-26 7:01 ` [pbs-devel] [RFC proxmox-backup 19/20] fix #3174: archiver: reuse files with unchanged metadata Christian Ebner
2023-09-22 7:16 ` [pbs-devel] [RFC proxmox-backup 20/20] fix #3174: client: Add incremental flag to backup creation Christian Ebner
2023-09-26 7:11 ` Christian Ebner
2023-09-26 7:15 ` [pbs-devel] [RFC pxar proxmox-backup 00/20] fix #3174: improve file-level backup Christian Ebner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230922071621.12670-20-c.ebner@proxmox.com \
--to=c.ebner@proxmox.com \
--cc=pbs-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal