From: Christian Ebner <c.ebner@proxmox.com>
To: Fabian Grünbichler, pbs-devel@lists.proxmox.com
Date: Tue, 9 Apr 2024 16:53:05 +0200
Subject: Re: [pbs-devel] [PATCH v3 proxmox-backup 47/58] client: pxar: add look-ahead caching

On 4/5/24 10:33, Fabian Grünbichler wrote:
> Quoting Christian Ebner (2024-03-28 13:36:56)
>> +    async fn cache_or_flush_entries<T: SeqWrite + Send>(
>> +        &mut self,
>> +        encoder: &mut Encoder<'_, T>,
>> +        previous_metadata_accessor: &mut Option<Directory<LocalDynamicReadAt<RemoteChunkReader>>>,
>> +        c_file_name: &CStr,
>> +        stat: &FileStat,
>> +        fd: OwnedFd,
>> +        metadata: &Metadata,
>> +    ) -> Result<(), Error> {
>> +        let file_name: &Path = OsStr::from_bytes(c_file_name.to_bytes()).as_ref();
>> +        let reusable = if let Some(accessor) = previous_metadata_accessor {
>> +            self.is_reusable_entry(accessor, file_name, stat, metadata)
>> +                .await?
>> +        } else {
>> +            None
>> +        };
>> +
>> +        let file_size = stat.st_size as u64;
>
> couldn't we get this via is_reusable?
>
>> +        if let Some(start_offset) = reusable {
>> +            if let Some(ref ref_payload_index) = self.previous_payload_index {
>> +                let payload_size = file_size + size_of::<pxar::format::Header>() as u64;
>
> or better yet, get this here directly ;)
>
>> +                let end_offset = start_offset + payload_size;
>
> or better yet, this one here ;)
>
>> +                let (indices, start_padding, end_padding) =
>> +                    lookup_dynamic_entries(ref_payload_index, start_offset..end_offset)?;
>
> or better yet, just return the Range in the payload archive?

:) Okay, yes.. changed `is_reusable_entry` to return the `Range` of the
payload in the payload archive, calculated based on the offset and size
as encoded in the archive, which are already available there.
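Distilled, the calculation that moved into `is_reusable_entry` amounts
to the following (sketch only; the free function and parameter names
are made up for illustration, in the real code this happens inside the
method using the values from the metadata archive):

use std::ops::Range;

// Given the payload start offset and the file size as encoded in the
// previous archive, add the payload header size once and return the
// half-open byte range covered in the payload archive.
fn payload_range(start_offset: u64, file_size: u64, header_size: u64) -> Range<u64> {
    let payload_size = file_size + header_size;
    start_offset..start_offset + payload_size
}

The caller then only sees an `Option<Range<u64>>` and no longer needs
to recompute `payload_size` and `end_offset` itself.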
>> +
>> +                let boundary = encoder.payload_position()?;
>> +                let offset =
>> +                    self.reused_chunks
>> +                        .insert(indices, boundary, start_padding, end_padding);
>> +
>> +                self.caching_enabled = true;
>> +                let cache_entry = CacheEntry::RegEntry(CacheEntryData::new(
>> +                    fd,
>> +                    c_file_name.into(),
>> +                    *stat,
>> +                    metadata.clone(),
>> +                    offset,
>> +                ));
>> +                self.cached_entries.push(cache_entry);
>> +
>> +                match self.reused_chunks.suggested() {
>> +                    Suggested::Reuse => self.flush_cached_to_archive(encoder, true, true).await?,
>> +                    Suggested::Reencode => {
>> +                        self.flush_cached_to_archive(encoder, false, true).await?
>> +                    }
>> +                    Suggested::CheckNext => {}
>> +                }
>> +
>> +                return Ok(());
>> +            }
>> +        }
>> +
>> +        self.flush_cached_to_archive(encoder, false, true).await?;
>> +        self.add_entry(encoder, previous_metadata_accessor, fd.as_raw_fd(), c_file_name, stat)
>> +            .await
>
> this part here is where I think we mishandle some edge cases, like mentioned in
> the ReusedChunks patch comments.. even keeping back the last chunk doesn't save
> us from losing some re-usable files sometimes..

I still need to have a closer look at what you mean exactly here... The
code path itself should be fine, I think, or am I missing your point?
It is rather the match against `suggested()` (now called `action`)
where a wrong decision might be made.
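For reference, the three outcomes that match distinguishes boil down to
the following (sketch, using the `Suggested` naming from this version;
the comments follow the flags passed to `flush_cached_to_archive` in
each arm):

// Decision handed back by the reused-chunks tracker after each cached entry.
enum Suggested {
    // enough re-usable data accumulated: flush the cache and inject the
    // reused chunks instead of re-encoding the file payloads
    Reuse,
    // reuse not worthwhile: flush the cache, re-encoding the payloads
    Reencode,
    // no decision possible yet: keep caching and check the next entry
    CheckNext,
}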
>> +        if let Some((padding, chunk)) = last_chunk {
>> +            // Make sure that we flush this chunk even on clear calls
>> +            self.reused_chunks.must_flush_first = true;
>
> might make sense to rename this one to "must_flush_first_chunk", else the other
> call sites might be interpreted as "must flush (all) chunks first"

Yes, renamed it to `must_flush_first_chunk` as you suggested.

>> +            let _offset = self
>> +                .reused_chunks
>> +                .insert(vec![chunk], injection_boundary, padding, 0);
>> +        }
>> +
>> +        Ok(())
>> +    }
>> +
>> +    fn clear_cached_chunks<T: SeqWrite + Send>(
>> +        &mut self,
>> +        encoder: &mut Encoder<'_, T>,
>> +    ) -> Result<(), Error> {
>> +        let reused_chunks = std::mem::take(&mut self.reused_chunks);
>> +
>> +        if !reused_chunks.must_flush_first {
>> +            return Ok(());
>> +        }
>> +
>> +        // First chunk was kept back to avoid duplication but needs to be injected
>> +        let injection_boundary = reused_chunks.start_boundary();
>> +        let payload_writer_position = encoder.payload_position()?.raw();
>> +
>> +        if !reused_chunks.chunks.is_empty() && injection_boundary.raw() != payload_writer_position {
>> +            bail!(
>> +                "encoder payload writer position out of sync: got {payload_writer_position}, expected {}",
>> +                injection_boundary.raw()
>> +            );
>> +        }
>> +
>> +        if let Some((padding, chunk)) = reused_chunks.chunks.first() {
>> +            let size = PayloadOffset::default().add(chunk.size());
>> +            log::debug!(
>> +                "Injecting chunk with {} padding (chunk size {})",
>> +                HumanByte::from(*padding),
>> +                HumanByte::from(chunk.size()),
>> +            );
>> +            let inject_chunks = InjectChunks {
>> +                boundary: injection_boundary.raw(),
>> +                chunks: vec![chunk.clone()],
>> +                size: size.raw() as usize,
>> +            };
>> +
>> +            self.total_injected_size += size.raw();
>> +            self.total_injected_count += 1;
>> +            if *padding > 0 {
>> +                self.partial_chunks_count += 1;
>> +            }
>> +
>> +            if let Some(boundary) = self.forced_boundaries.as_mut() {
>> +                let mut boundary = boundary.lock().unwrap();
>> +                boundary.push_back(inject_chunks);
>> +            } else {
>> +                bail!("missing injection queue");
>> +            };
>> +            encoder.advance(size)?;
>
> this part is basically the loop in flush_reused_chunks and could be
> de-duplicated in some fashion..

Yes, moved the common code into a new helper function
`inject_chunks_at_boundary`.
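Roughly the shape of the factored-out part (a sketch with trimmed
stand-in types; the actual helper stays a method on the archiver and
its exact signature may differ in v4):

use std::collections::VecDeque;
use std::sync::Mutex;

// Trimmed stand-ins for the real types, fields named after the code above.
#[derive(Clone)]
struct ReusableChunk { size: u64 }
struct InjectChunks { boundary: u64, chunks: Vec<ReusableChunk>, size: usize }

// Common tail of flush_reused_chunks and clear_cached_chunks: queue the
// given chunks for injection at `boundary` and return their total size,
// which the caller passes on to encoder.advance().
fn inject_chunks_at_boundary(
    queue: &Mutex<VecDeque<InjectChunks>>,
    boundary: u64,
    chunks: Vec<ReusableChunk>,
) -> u64 {
    let size: u64 = chunks.iter().map(|c| c.size).sum();
    queue.lock().unwrap().push_back(InjectChunks {
        boundary,
        chunks,
        size: size as usize,
    });
    size
}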