From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 7755BDAF6 for ; Fri, 22 Sep 2023 09:17:28 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 3DF9E6CA3 for ; Fri, 22 Sep 2023 09:16:56 +0200 (CEST) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [94.136.29.106]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS for ; Fri, 22 Sep 2023 09:16:53 +0200 (CEST) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 48A534877F for ; Fri, 22 Sep 2023 09:16:53 +0200 (CEST) From: Christian Ebner To: pbs-devel@lists.proxmox.com Date: Fri, 22 Sep 2023 09:16:20 +0200 Message-Id: <20230922071621.12670-20-c.ebner@proxmox.com> X-Mailer: git-send-email 2.39.2 In-Reply-To: <20230922071621.12670-1-c.ebner@proxmox.com> References: <20230922071621.12670-1-c.ebner@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL 0.106 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: [pbs-devel] [RFC proxmox-backup 19/20] fix #3174: archiver: reuse files with unchanged metadata X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 22 Sep 2023 07:17:28 -0000 During pxar archive encoding, check regular files against their previous backup catalogs metadata, if present. Instead of re-encoding files with unchanged metadata with file size over a given threshold limit, mark the entries as appendix references in the pxar archive and append the chunks containing the file payload in the appendix. Signed-off-by: Christian Ebner --- pbs-client/src/pxar/create.rs | 149 +++++++++++++++++++++- src/tape/file_formats/snapshot_archive.rs | 2 +- 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs index d6afc465..cb9af26f 100644 --- a/pbs-client/src/pxar/create.rs +++ b/pbs-client/src/pxar/create.rs @@ -24,7 +24,7 @@ use proxmox_io::vec; use proxmox_lang::c_str; use proxmox_sys::fs::{self, acl, xattr}; -use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader}; +use pbs_datastore::catalog::{BackupCatalogWriter, CatalogReader, DirEntryAttribute}; use pbs_datastore::dynamic_index::{DynamicEntry, DynamicIndexReader}; use crate::inject_reused_chunks::InjectChunks; @@ -32,6 +32,8 @@ use crate::pxar::metadata::errno_is_unsupported; use crate::pxar::tools::assert_single_path_component; use crate::pxar::Flags; +const MAX_FILE_SIZE: u64 = 128; + /// Pxar options for creating a pxar archive/stream #[derive(Default)] pub struct PxarCreateOptions { @@ -218,7 +220,14 @@ where archiver .archive_dir_contents(&mut encoder, source_dir, true) .await?; - encoder.finish().await?; + + if archiver.inject.1.len() > 0 { + let (appendix_offset, appendix_size) = archiver.add_appendix(&mut encoder).await?; + encoder.finish(Some((appendix_offset, appendix_size))).await?; + } else { + encoder.finish(None).await?; + } + Ok(()) } @@ -529,6 +538,132 @@ impl Archiver { Ok(()) } + async fn add_appendix( + &mut self, + encoder: &mut Encoder<'_, T>, + ) -> Result<(LinkOffset, u64), Error> { + let total = self + .inject + .1 + .iter() + .fold(0, |sum, inject| sum + inject.end()); + let appendix_offset = encoder.add_appendix(total).await?; + let mut boundaries = self.forced_boundaries.lock().unwrap(); + let mut position = encoder.position_add(0); + + // Inject reused chunks in patches of 128 to not exceed upload post req size limit + for injects in self.inject.1.chunks(128) { + let size = injects + .iter() + .fold(0, |sum, inject| sum + inject.end() as usize); + let inject_chunks = InjectChunks { + boundary: position, + chunks: injects.to_vec(), + size, + }; + boundaries.push_back(inject_chunks); + position = encoder.position_add(size as u64); + } + + Ok((appendix_offset, total)) + } + + async fn reuse_if_metadata_unchanged( + &mut self, + encoder: &mut Encoder<'_, T>, + c_file_name: &CStr, + metadata: &Metadata, + stat: &FileStat, + ) -> Result { + let prev_ref = match self.previous_ref { + None => return Ok(false), + Some(ref mut prev_ref) => prev_ref + }; + + let path = Path::new(prev_ref.archive_name.as_str()).join(self.path.clone()); + let catalog_entry = prev_ref + .catalog + .lookup_recursive(path.as_os_str().as_bytes())?; + + match catalog_entry.attr { + DirEntryAttribute::File { + size, + mtime, + link_offset, + } => { + let file_size = stat.st_size as u64; + if mtime == stat.st_mtime && size == file_size { + if let Some(ref catalog) = self.catalog { + catalog.lock().unwrap().add_file( + c_file_name, + file_size, + stat.st_mtime, + link_offset, + )?; + } + + // Filename header + let mut metadata_bytes = std::mem::size_of::(); + // Filename payload + metadata_bytes += std::mem::size_of_val(c_file_name); + // Metadata with headers and payloads + metadata_bytes += metadata.calculate_byte_len(); + // Payload header + metadata_bytes += std::mem::size_of::(); + + let metadata_bytes = u64::try_from(metadata_bytes)?; + let chunk_start_offset = link_offset.raw(); + let start = chunk_start_offset; + let end = chunk_start_offset + metadata_bytes + file_size; + let (indices, total_size, padding_start) = + prev_ref.index.indices(start, end)?; + + let mut appendix_offset = self.inject.0 as u64 + padding_start; + + if let (Some(current_end), Some(new_start)) = + (self.inject.1.last(), indices.first()) + { + if new_start.digest() == current_end.digest() { + // Already got that chunk, do not append it again and correct + // appendix_offset to be relative to chunk before this one + appendix_offset -= new_start.end(); + if indices.len() > 1 { + // Append all following chunks + self.inject.0 += indices[1..] + .iter() + .fold(0, |sum, index| sum + index.end() as usize); + self.inject.1.extend_from_slice(&indices[1..]); + } + } + } else { + self.inject.0 += total_size; + self.inject.1.extend_from_slice(&indices); + } + + let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref(); + let _offset = self + .add_appendix_ref( + encoder, + file_name, + &metadata, + appendix_offset, + file_size, + ) + .await?; + + return Ok(true); + } + } + DirEntryAttribute::Hardlink => { + // Catalog contains a hardlink, but the hard link was not present in the current + // pxar archive. So be sure to reencode this file instead of reusing it. + return Ok(false) + } + _ => println!("Unexpected attribute type, expected 'File' or 'Hardlink'"), + } + Ok(false) + } + async fn add_entry( &mut self, encoder: &mut Encoder<'_, T>, @@ -595,6 +730,14 @@ impl Archiver { } let file_size = stat.st_size as u64; + if file_size > MAX_FILE_SIZE + && self + .reuse_if_metadata_unchanged(encoder, c_file_name, &metadata, stat) + .await? + { + return Ok(()); + } + let offset: LinkOffset = self .add_regular_file(encoder, fd, file_name, &metadata, file_size) .await?; @@ -712,7 +855,7 @@ impl Archiver { self.fs_feature_flags = old_fs_feature_flags; self.current_st_dev = old_st_dev; - encoder.finish().await?; + encoder.finish(None).await?; result } diff --git a/src/tape/file_formats/snapshot_archive.rs b/src/tape/file_formats/snapshot_archive.rs index 252384b5..4bbf4727 100644 --- a/src/tape/file_formats/snapshot_archive.rs +++ b/src/tape/file_formats/snapshot_archive.rs @@ -88,7 +88,7 @@ pub fn tape_write_snapshot_archive<'a>( proxmox_lang::io_bail!("file '{}' shrunk while reading", filename); } } - encoder.finish()?; + encoder.finish(None)?; Ok(()) }); -- 2.39.2