From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 308AE1FF38F for ; Tue, 4 Jun 2024 11:35:23 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id A9FCCEB1E; Tue, 4 Jun 2024 11:35:51 +0200 (CEST) Date: Tue, 04 Jun 2024 11:35:15 +0200 From: Fabian =?iso-8859-1?q?Gr=FCnbichler?= To: Proxmox Backup Server development discussion References: <20240528094303.309806-1-c.ebner@proxmox.com> <20240528094303.309806-49-c.ebner@proxmox.com> In-Reply-To: <20240528094303.309806-49-c.ebner@proxmox.com> MIME-Version: 1.0 User-Agent: astroid/0.16.0 (https://github.com/astroidmail/astroid) Message-Id: <1717493343.bps2geb0tc.astroid@yuna.none> X-SPAM-LEVEL: Spam detection results: 0 AWL 0.058 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record T_SCC_BODY_TEXT_LINE -0.01 - URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. 
[create.rs, proxmox.com, mod.rs] Subject: Re: [pbs-devel] [PATCH v8 proxmox-backup 48/69] pxar: caching: add look-ahead cache X-BeenThere: pbs-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox Backup Server development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: Proxmox Backup Server development discussion Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: pbs-devel-bounces@lists.proxmox.com Sender: "pbs-devel" On May 28, 2024 11:42 am, Christian Ebner wrote: > Add a lookahead cache and the neccessary types to store the required > data and keep track of directory boundaries while traversing the > filesystem tree, in order to postpone a decision if to reuse or > reencode a given regular file with unchanged metadata. > > Signed-off-by: Christian Ebner > --- > changes since version 7: > - no changes > > changes since version 6: > - add PxarLookaheadCache and refactor some of the logic to be contained > within this patch > > pbs-client/src/pxar/create.rs | 2 +- > pbs-client/src/pxar/look_ahead_cache.rs | 165 ++++++++++++++++++++++++ > pbs-client/src/pxar/mod.rs | 1 + > 3 files changed, 167 insertions(+), 1 deletion(-) > create mode 100644 pbs-client/src/pxar/look_ahead_cache.rs > > diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs > index ac8827bb2..6127aa88f 100644 > --- a/pbs-client/src/pxar/create.rs > +++ b/pbs-client/src/pxar/create.rs > @@ -131,7 +131,7 @@ impl fmt::Display for ArchiveError { > } > > #[derive(Eq, PartialEq, Hash)] > -struct HardLinkInfo { > +pub(crate) struct HardLinkInfo { > st_dev: u64, > st_ino: u64, > } > diff --git a/pbs-client/src/pxar/look_ahead_cache.rs b/pbs-client/src/pxar/look_ahead_cache.rs > new file mode 100644 > index 000000000..539586271 > --- /dev/null > +++ b/pbs-client/src/pxar/look_ahead_cache.rs > @@ -0,0 +1,165 @@ > +use std::collections::HashSet; > +use std::ffi::CString; > +use 
std::ops::Range; > +use std::os::unix::io::OwnedFd; > +use std::path::PathBuf; > + > +use nix::sys::stat::FileStat; > + > +use pxar::encoder::PayloadOffset; > +use pxar::Metadata; > + > +use super::create::*; > + > +const DEFAULT_CACHE_SIZE: usize = 512; > + > +pub(crate) struct CacheEntryData { > + pub(crate) fd: OwnedFd, > + pub(crate) c_file_name: CString, > + pub(crate) stat: FileStat, > + pub(crate) metadata: Metadata, > + pub(crate) payload_offset: PayloadOffset, > +} > + > +pub(crate) enum CacheEntry { > + RegEntry(CacheEntryData), > + DirEntry(CacheEntryData), > + DirEnd, > +} > + > +pub(crate) struct PxarLookaheadCache { > + // Current state of the cache > + enabled: bool, > + // Cached entries > + entries: Vec<CacheEntry>, > + // Entries encountered having more than one link given by stat > + hardlinks: HashSet<HardLinkInfo>, > + // Payload range covered by the currently cached entries > + range: Range<u64>, > + // Possible held back last chunk from last flush, used for possible chunk continuation > + last_chunk: Option<ReusableDynamicEntry>, > + // Path when started caching > + start_path: PathBuf, > + // Number of entries with file descriptors > + fd_entries: usize, > + // Max number of entries with file descriptors > + cache_size: usize, > +} > + > +impl PxarLookaheadCache { > + pub(crate) fn new(size: Option<usize>) -> Self { > + Self { > + enabled: false, > + entries: Vec::new(), > + hardlinks: HashSet::new(), > + range: 0..0, > + last_chunk: None, > + start_path: PathBuf::new(), > + fd_entries: 0, > + cache_size: size.unwrap_or(DEFAULT_CACHE_SIZE), > + } > + } > + > + pub(crate) fn is_full(&self) -> bool { > + self.fd_entries >= self.cache_size > + } > + > + pub(crate) fn caching_enabled(&self) -> bool { > + self.enabled > + } > + > + pub(crate) fn insert( 2 out of 3 calls to this are preceded by the same call to update_start_path, so we could just add the path as a parameter here, inline that call, and drop update_start_path altogether, AFAICT? 
> + &mut self, > + fd: OwnedFd, > + c_file_name: CString, > + stat: FileStat, > + metadata: Metadata, > + payload_offset: PayloadOffset, > + ) { > + self.enabled = true; > + self.fd_entries += 1; > + if metadata.is_dir() { > + self.entries.push(CacheEntry::DirEntry(CacheEntryData { > + fd, > + c_file_name, > + stat, > + metadata, > + payload_offset, > + })) > + } else { > + self.entries.push(CacheEntry::RegEntry(CacheEntryData { > + fd, > + c_file_name, > + stat, > + metadata, > + payload_offset, > + })) > + } > + } > + > + pub(crate) fn insert_dir_end(&mut self) { > + self.entries.push(CacheEntry::DirEnd); > + } > + > + pub(crate) fn take_and_reset(&mut self) -> Vec<CacheEntry> { > + self.fd_entries = 0; > + self.enabled = false; > + self.start_path.clear(); start_path is cleared here, and take_and_reset is called > + self.clear_range(); > + std::mem::take(&mut self.entries) > + } > + > + pub(crate) fn update_start_path(&mut self, path: PathBuf) { > + self.start_path = path; > + } > + > + pub(crate) fn start_path(&self) -> &PathBuf { > + &self.start_path right after the only call to this, so take_and_reset could just take the path as well and return it, and we can drop this one here? > + } > + > + pub(crate) fn contains_hardlink(&self, info: &HardLinkInfo) -> bool { > + self.hardlinks.contains(info) > + } > + > + pub(crate) fn insert_hardlink(&mut self, info: HardLinkInfo) -> bool { > + self.hardlinks.insert(info) > + } > + > + pub(crate) fn range(&self) -> &Range<u64> { > + &self.range > + } > + > + pub(crate) fn update_range(&mut self, range: Range<u64>) { > + self.range = range; > + } > + > + pub(crate) fn clear_range(&mut self) { > + // keep end for possible continuation if cache has been cleared because > + // it was full, but further caching would be fine > + self.range = self.range.end..self.range.end > + } dangerous name: to me, "clear" always implies removing everything, especially since there is no doc comment on it that gives me such important information at the call site. 
but thankfully, this is only called once, and that call is a few lines above in take_and_reset, so maybe we can just inline it for now and not expose this to accidents? > + > + pub(crate) fn try_extend_range(&mut self, range: Range<u64>) -> bool { > + if self.range.end == 0 { > + // initialize first range to start and end with start of new range > + self.range.start = range.start; > + self.range.end = range.start; > + } > + > + // range continued, update end > + if self.range.end == range.start { > + self.range.end = range.end; > + return true; > + } > + > + false > + } > + > + pub(crate) fn take_last_chunk(&mut self) -> Option<ReusableDynamicEntry> { > + self.last_chunk.take() > + } > + > + pub(crate) fn update_last_chunk(&mut self, chunk: Option<ReusableDynamicEntry>) { > + self.last_chunk = chunk; > + } > +} > diff --git a/pbs-client/src/pxar/mod.rs b/pbs-client/src/pxar/mod.rs > index 5248a1956..334759df6 100644 > --- a/pbs-client/src/pxar/mod.rs > +++ b/pbs-client/src/pxar/mod.rs > @@ -50,6 +50,7 @@ > pub(crate) mod create; > pub(crate) mod dir_stack; > pub(crate) mod extract; > +pub(crate) mod look_ahead_cache; > pub(crate) mod metadata; > pub(crate) mod tools; > > -- > 2.39.2 > > > > _______________________________________________ > pbs-devel mailing list > pbs-devel@lists.proxmox.com > https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel > > > _______________________________________________ pbs-devel mailing list pbs-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel