From: Stefan Hanreich <s.hanreich@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: Re: [pbs-devel] [PATCH proxmox-backup 2/2] debug cli: add 'compare-content' flag to `diff archive` command
Date: Thu, 1 Dec 2022 14:29:15 +0100 [thread overview]
Message-ID: <daa33e90-b48d-756b-4874-30029ab48327@proxmox.com> (raw)
In-Reply-To: <20221129141730.740199-3-l.wagner@proxmox.com>
On 11/29/22 15:17, Lukas Wagner wrote:
> When --compare-content is set, the command will compare the
> file content instead on relying on mtime to detect modified files.
>
> Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
> ---
> src/bin/proxmox_backup_debug/diff.rs | 141 +++++++++++++++++++++++++--
> 1 file changed, 135 insertions(+), 6 deletions(-)
>
> diff --git a/src/bin/proxmox_backup_debug/diff.rs b/src/bin/proxmox_backup_debug/diff.rs
> index d5a4a1fe..9b916760 100644
> --- a/src/bin/proxmox_backup_debug/diff.rs
> +++ b/src/bin/proxmox_backup_debug/diff.rs
> @@ -28,12 +28,15 @@ use pbs_tools::json::required_string_param;
> use pxar::accessor::ReadAt;
> use pxar::EntryKind;
> use serde_json::Value;
> +use tokio::io::AsyncReadExt;
>
> type ChunkDigest = [u8; 32];
> type FileEntry = pxar::accessor::aio::FileEntry<Arc<dyn ReadAt + Send + Sync>>;
> type Accessor = pxar::accessor::aio::Accessor<Arc<dyn ReadAt + Send + Sync>>;
> type Directory = pxar::accessor::aio::Directory<Arc<dyn ReadAt + Send + Sync>>;
>
> +const BUFFERSIZE: usize = 1024;
> +
> pub fn diff_commands() -> CommandLineInterface {
> let cmd_def = CliCommandMap::new().insert(
> "archive",
> @@ -79,18 +82,30 @@ pub fn diff_commands() -> CommandLineInterface {
> schema: KEYFD_SCHEMA,
> optional: true,
> },
> + "compare-content": {
> + optional: true,
> + type: bool,
> + description: "Compare file content rather than solely relying on mtime for detecting modified files.",
> + },
> }
> }
> )]
> /// Diff an archive in two snapshots. The command will output a list of added, modified and deleted files.
> -/// For modified files, only the file metadata (e.g. mtime, size, etc.) will be considered. The actual
> -/// file contents will not be compared.
> +/// For modified files, the file metadata (e.g. mode, uid, gid, size, etc.) will be considered. For detecting
> +/// modification of file content, only mtime will be used by default. If the --compare-content flag is provided,
> +/// mtime is ignored and file content will be compared.
> async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
> let repo = extract_repository_from_value(¶m)?;
> let snapshot_a = required_string_param(¶m, "prev-snapshot")?;
> let snapshot_b = required_string_param(¶m, "snapshot")?;
> let archive_name = required_string_param(¶m, "archive-name")?;
>
> + let compare_contents = match param.get("compare-content") {
> + Some(Value::Bool(value)) => *value,
> + Some(_) => bail!("invalid flag for compare-content"),
> + None => false,
> + };
> +
> let namespace = match param.get("ns") {
> Some(Value::String(ns)) => ns.parse()?,
> Some(_) => bail!("invalid namespace parameter"),
> @@ -120,7 +135,14 @@ async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
>
> if archive_name.ends_with(".pxar") {
> let file_name = format!("{}.didx", archive_name);
> - diff_archive(snapshot_a, snapshot_b, &file_name, &repo_params).await?;
> + diff_archive(
> + snapshot_a,
> + snapshot_b,
> + &file_name,
> + &repo_params,
> + compare_contents,
> + )
> + .await?;
> } else {
> bail!("Only .pxar files are supported");
> }
> @@ -133,6 +155,7 @@ async fn diff_archive(
> snapshot_b: &str,
> file_name: &str,
> repo_params: &RepoParams,
> + compare_contents: bool,
> ) -> Result<(), Error> {
> let (index_a, accessor_a) = open_dynamic_index(snapshot_a, file_name, repo_params).await?;
> let (index_b, accessor_b) = open_dynamic_index(snapshot_b, file_name, repo_params).await?;
> @@ -184,7 +207,7 @@ async fn diff_archive(
>
> // ... so we compare the file metadata/contents to narrow the selection down to files
> // which where *really* modified.
> - let modified_files = compare_files(potentially_modified).await?;
> + let modified_files = compare_files(potentially_modified, compare_contents).await?;
>
> show_file_list(&added_files, &deleted_files, &modified_files);
>
> @@ -352,11 +375,12 @@ fn visit_directory<'f, 'c>(
> /// Check if files were actually modified
> async fn compare_files<'a>(
> files: HashMap<&'a OsStr, (&'a FileEntry, &'a FileEntry)>,
> + compare_contents: bool,
> ) -> Result<HashMap<&'a OsStr, (&'a FileEntry, ChangedProperties)>, Error> {
> let mut modified_files = HashMap::new();
>
> for (path, (entry_a, entry_b)) in files {
> - if let Some(changed) = compare_file(entry_a, entry_b).await? {
> + if let Some(changed) = compare_file(entry_a, entry_b, compare_contents).await? {
> modified_files.insert(path, (entry_b, changed));
> }
> }
> @@ -367,6 +391,7 @@ async fn compare_files<'a>(
> async fn compare_file(
> file_a: &FileEntry,
> file_b: &FileEntry,
> + compare_contents: bool,
> ) -> Result<Option<ChangedProperties>, Error> {
> let mut changed = ChangedProperties::default();
>
> @@ -388,6 +413,15 @@ async fn compare_file(
> if size_a != size_b {
> changed.size = true;
> changed.content = true;
> + } else if compare_contents {
> + let content_identical = compare_file_contents(file_a, file_b).await?;
> + if content_identical && !changed.any_without_mtime() {
> + // If the content is identical and nothing, exluding mtime,
> + // has changed, we don't consider the entry as modified.
> + changed.mtime = false;
> + }
> +
> + changed.content = !content_identical;
> };
> }
> (EntryKind::Directory, EntryKind::Directory) => {}
> @@ -405,6 +439,44 @@ async fn compare_file(
> }
> }
>
> +async fn compare_file_contents(file_a: &FileEntry, file_b: &FileEntry) -> Result<bool, Error> {
> + let mut contents_a = file_a.contents().await?;
> + let mut contents_b = file_b.contents().await?;
> +
> + compare_readers(&mut contents_a, &mut contents_b).await
> +}
> +
> +async fn compare_readers<T>(reader_a: &mut T, reader_b: &mut T) -> Result<bool, Error>
> +where
> + T: AsyncReadExt + Unpin,
> +{
> + let mut buf_a = vec![0u8; BUFFERSIZE];
> + let mut buf_b = vec![0u8; BUFFERSIZE];
> +
> + loop {
> + // normally, a join! would be nice here, but this leads to
> + // some 'higher-order lifetime error' for which I'm not smart enough to solve.
> + // let (bytes_read_a, bytes_read_b) =
> + // tokio::try_join!(reader_a.read(&mut buf_a), reader_b.read(&mut buf_b))?;
> + let bytes_read_a = reader_a.read(&mut buf_a).await?;
> + let bytes_read_b = reader_b.read(&mut buf_b).await?;
> +
> + if bytes_read_a != bytes_read_b {
> + return Ok(false);
> + }
> +
> + if bytes_read_a == 0 {
> + break;
> + }
> +
> + if buf_a[..bytes_read_a] != buf_b[..bytes_read_b] {
> + return Ok(false);
> + }
> + }
> +
> + Ok(true)
> +}
> +
> #[derive(Copy, Clone, Default)]
> struct ChangedProperties {
> entry_type: bool,
> @@ -438,8 +510,11 @@ impl ChangedProperties {
> }
>
> fn any(&self) -> bool {
> + self.any_without_mtime() || self.mtime
> + }
> +
> + fn any_without_mtime(&self) -> bool {
> self.entry_type
> - || self.mtime
> || self.acl
> || self.xattrs
> || self.fcaps
> @@ -553,3 +628,57 @@ fn show_file_list(
> println!("{op} {entry_type:>2} {mode:>5} {uid:>6} {gid:>6} {size:>10} {mtime:11} {name}");
> }
> }
> +
> +#[cfg(test)]
> +mod tests {
> + use super::*;
> +
> + use std::{
> + io::Cursor,
> + pin::Pin,
> + task::{Context, Poll},
> + };
> + use tokio::io::AsyncRead;
> +
> + struct MockedAsyncReader(Cursor<Vec<u8>>);
> +
> + impl AsyncRead for MockedAsyncReader {
> + fn poll_read(
> + self: Pin<&mut Self>,
> + _cx: &mut Context<'_>,
> + read_buf: &mut tokio::io::ReadBuf<'_>,
> + ) -> std::task::Poll<std::io::Result<()>> {
very minor nit: Poll already gets imported above, so it could be removed
> + let mut buf = vec![0u8; 100];
> +
> + let res = std::io::Read::read(&mut self.get_mut().0, &mut buf);
> +
> + if let Ok(bytes) = res {
> + read_buf.put_slice(&buf[..bytes]);
> + }
> +
> + Poll::Ready(res.map(|_| ()))
> + }
> + }
> +
> + #[test]
> + fn test_do_compare_file_contents() {
> + fn compare(a: Vec<u8>, b: Vec<u8>) -> Result<bool, Error> {
> + let mut mock_a = MockedAsyncReader(Cursor::new(a));
> + let mut mock_b = MockedAsyncReader(Cursor::new(b));
> +
> + proxmox_async::runtime::block_on(compare_readers(&mut mock_a, &mut mock_b))
> + }
> +
> + assert!(matches!(compare(vec![0; 15], vec![0; 15]), Ok(true)));
> + assert!(matches!(compare(vec![0; 15], vec![0; 14]), Ok(false)));
> + assert!(matches!(compare(vec![0; 15], vec![1; 15]), Ok(false)));
> +
> + let mut buf = vec![1u8; 2 * BUFFERSIZE];
> + buf[BUFFERSIZE] = 0;
> + assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), Ok(false)));
> +
> + let mut buf = vec![1u8; 2 * BUFFERSIZE];
> + buf[2 * BUFFERSIZE - 1] = 0;
> + assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), Ok(false)));
> + }
> +}
next prev parent reply other threads:[~2022-12-01 13:29 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-29 14:17 [pbs-devel] [PATCH proxmox-backup 0/2] debug cli: improve output, optionally compare file content for `diff archive` Lukas Wagner
2022-11-29 14:17 ` [pbs-devel] [PATCH proxmox-backup 1/2] debug cli: show more file attributes for `diff archive` command Lukas Wagner
2022-11-29 14:17 ` [pbs-devel] [PATCH proxmox-backup 2/2] debug cli: add 'compare-content' flag to " Lukas Wagner
2022-12-01 13:29 ` Stefan Hanreich [this message]
2022-12-01 13:30 ` Stefan Hanreich
2022-11-30 16:27 ` [pbs-devel] [PATCH proxmox-backup 0/2] debug cli: improve output, optionally compare file content for `diff archive` Thomas Lamprecht
2022-12-01 7:30 ` Lukas Wagner
2022-12-01 8:02 ` Thomas Lamprecht
2022-12-01 13:29 ` Stefan Hanreich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=daa33e90-b48d-756b-4874-30029ab48327@proxmox.com \
--to=s.hanreich@proxmox.com \
--cc=pbs-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox