all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Lukas Wagner <l.wagner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH v2 proxmox-backup 2/3] debug cli: add 'compare-content' flag to `diff archive` command
Date: Mon,  5 Dec 2022 15:55:13 +0100	[thread overview]
Message-ID: <20221205145514.645111-3-l.wagner@proxmox.com> (raw)
In-Reply-To: <20221205145514.645111-1-l.wagner@proxmox.com>

When --compare-content is set, the command will compare the
file content instead on relying on mtime to detect modified files.

Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
---
Changes from v1:
  - Made `changed indicator for file content` a bit less confusing.
    For regular files, it is now only displayed if
      - the --comapare-content flag is set and
      - the file contents *actually* differ

  - Removed unnecessesary namespace prefix for std::task::Pin in unit tests

 src/bin/proxmox_backup_debug/diff.rs | 157 +++++++++++++++++++++++++--
 1 file changed, 145 insertions(+), 12 deletions(-)

diff --git a/src/bin/proxmox_backup_debug/diff.rs b/src/bin/proxmox_backup_debug/diff.rs
index d5a4a1fe..b5ecd721 100644
--- a/src/bin/proxmox_backup_debug/diff.rs
+++ b/src/bin/proxmox_backup_debug/diff.rs
@@ -28,12 +28,15 @@ use pbs_tools::json::required_string_param;
 use pxar::accessor::ReadAt;
 use pxar::EntryKind;
 use serde_json::Value;
+use tokio::io::AsyncReadExt;
 
 type ChunkDigest = [u8; 32];
 type FileEntry = pxar::accessor::aio::FileEntry<Arc<dyn ReadAt + Send + Sync>>;
 type Accessor = pxar::accessor::aio::Accessor<Arc<dyn ReadAt + Send + Sync>>;
 type Directory = pxar::accessor::aio::Directory<Arc<dyn ReadAt + Send + Sync>>;
 
+const BUFFERSIZE: usize = 1024;
+
 pub fn diff_commands() -> CommandLineInterface {
     let cmd_def = CliCommandMap::new().insert(
         "archive",
@@ -79,18 +82,30 @@ pub fn diff_commands() -> CommandLineInterface {
                 schema: KEYFD_SCHEMA,
                 optional: true,
             },
+            "compare-content": {
+                optional: true,
+                type: bool,
+                description: "Compare file content rather than solely relying on mtime for detecting modified files.",
+            },
         }
     }
 )]
 /// Diff an archive in two snapshots. The command will output a list of added, modified and deleted files.
-/// For modified files, only the file metadata (e.g. mtime, size, etc.) will be considered. The actual
-/// file contents will not be compared.
+/// For modified files, the file metadata (e.g. mode, uid, gid, size, etc.) will be considered. For detecting
+/// modification of file content, only mtime will be used by default. If the --compare-content flag is provided,
+/// mtime is ignored and file content will be compared.
 async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
     let repo = extract_repository_from_value(&param)?;
     let snapshot_a = required_string_param(&param, "prev-snapshot")?;
     let snapshot_b = required_string_param(&param, "snapshot")?;
     let archive_name = required_string_param(&param, "archive-name")?;
 
+    let compare_contents = match param.get("compare-content") {
+        Some(Value::Bool(value)) => *value,
+        Some(_) => bail!("invalid flag for compare-content"),
+        None => false,
+    };
+
     let namespace = match param.get("ns") {
         Some(Value::String(ns)) => ns.parse()?,
         Some(_) => bail!("invalid namespace parameter"),
@@ -120,7 +135,14 @@ async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
 
     if archive_name.ends_with(".pxar") {
         let file_name = format!("{}.didx", archive_name);
-        diff_archive(snapshot_a, snapshot_b, &file_name, &repo_params).await?;
+        diff_archive(
+            snapshot_a,
+            snapshot_b,
+            &file_name,
+            &repo_params,
+            compare_contents,
+        )
+        .await?;
     } else {
         bail!("Only .pxar files are supported");
     }
@@ -133,6 +155,7 @@ async fn diff_archive(
     snapshot_b: &str,
     file_name: &str,
     repo_params: &RepoParams,
+    compare_contents: bool,
 ) -> Result<(), Error> {
     let (index_a, accessor_a) = open_dynamic_index(snapshot_a, file_name, repo_params).await?;
     let (index_b, accessor_b) = open_dynamic_index(snapshot_b, file_name, repo_params).await?;
@@ -184,7 +207,7 @@ async fn diff_archive(
 
     // ... so we compare the file metadata/contents to narrow the selection down to files
     // which where *really* modified.
-    let modified_files = compare_files(potentially_modified).await?;
+    let modified_files = compare_files(potentially_modified, compare_contents).await?;
 
     show_file_list(&added_files, &deleted_files, &modified_files);
 
@@ -352,11 +375,12 @@ fn visit_directory<'f, 'c>(
 /// Check if files were actually modified
 async fn compare_files<'a>(
     files: HashMap<&'a OsStr, (&'a FileEntry, &'a FileEntry)>,
+    compare_contents: bool,
 ) -> Result<HashMap<&'a OsStr, (&'a FileEntry, ChangedProperties)>, Error> {
     let mut modified_files = HashMap::new();
 
     for (path, (entry_a, entry_b)) in files {
-        if let Some(changed) = compare_file(entry_a, entry_b).await? {
+        if let Some(changed) = compare_file(entry_a, entry_b, compare_contents).await? {
             modified_files.insert(path, (entry_b, changed));
         }
     }
@@ -367,6 +391,7 @@ async fn compare_files<'a>(
 async fn compare_file(
     file_a: &FileEntry,
     file_b: &FileEntry,
+    compare_contents: bool,
 ) -> Result<Option<ChangedProperties>, Error> {
     let mut changed = ChangedProperties::default();
 
@@ -385,10 +410,22 @@ async fn compare_file(
             changed.content = a.major != b.major || a.minor != b.minor
         }
         (EntryKind::File { size: size_a, .. }, EntryKind::File { size: size_b, .. }) => {
-            if size_a != size_b {
-                changed.size = true;
-                changed.content = true;
-            };
+            changed.size = size_a != size_b;
+
+            if compare_contents {
+                if changed.size {
+                    changed.content = true;
+                } else {
+                    let content_identical = compare_file_contents(file_a, file_b).await?;
+                    if content_identical && !changed.any_without_mtime() {
+                        // If the content is identical and nothing, exluding mtime,
+                        // has changed, we don't consider the entry as modified.
+                        changed.mtime = false;
+                    }
+
+                    changed.content = !content_identical;
+                }
+            }
         }
         (EntryKind::Directory, EntryKind::Directory) => {}
         (EntryKind::Socket, EntryKind::Socket) => {}
@@ -405,6 +442,44 @@ async fn compare_file(
     }
 }
 
+async fn compare_file_contents(file_a: &FileEntry, file_b: &FileEntry) -> Result<bool, Error> {
+    let mut contents_a = file_a.contents().await?;
+    let mut contents_b = file_b.contents().await?;
+
+    compare_readers(&mut contents_a, &mut contents_b).await
+}
+
+async fn compare_readers<T>(reader_a: &mut T, reader_b: &mut T) -> Result<bool, Error>
+where
+    T: AsyncReadExt + Unpin,
+{
+    let mut buf_a = vec![0u8; BUFFERSIZE];
+    let mut buf_b = vec![0u8; BUFFERSIZE];
+
+    loop {
+        // normally, a join! would be nice here, but this leads to
+        // some 'higher-order lifetime error' for which I'm not smart enough to solve.
+        // let (bytes_read_a, bytes_read_b) =
+        //     tokio::try_join!(reader_a.read(&mut buf_a), reader_b.read(&mut buf_b))?;
+        let bytes_read_a = reader_a.read(&mut buf_a).await?;
+        let bytes_read_b = reader_b.read(&mut buf_b).await?;
+
+        if bytes_read_a != bytes_read_b {
+            return Ok(false);
+        }
+
+        if bytes_read_a == 0 {
+            break;
+        }
+
+        if buf_a[..bytes_read_a] != buf_b[..bytes_read_b] {
+            return Ok(false);
+        }
+    }
+
+    Ok(true)
+}
+
 #[derive(Copy, Clone, Default)]
 struct ChangedProperties {
     entry_type: bool,
@@ -438,8 +513,11 @@ impl ChangedProperties {
     }
 
     fn any(&self) -> bool {
+        self.any_without_mtime() || self.mtime
+    }
+
+    fn any_without_mtime(&self) -> bool {
         self.entry_type
-            || self.mtime
             || self.acl
             || self.xattrs
             || self.fcaps
@@ -475,9 +553,10 @@ fn format_filesize(entry: &FileEntry, changed: bool) -> String {
 fn format_mtime(entry: &FileEntry, changed: bool) -> String {
     let mtime = &entry.metadata().stat.mtime;
 
-    let format = if changed { "*%F %T" } else { " %F %T" };
+    let mut format = change_indicator(changed).to_owned();
+    format.push_str("%F %T");
 
-    proxmox_time::strftime_local(format, mtime.secs).unwrap_or_default()
+    proxmox_time::strftime_local(&format, mtime.secs).unwrap_or_default()
 }
 
 fn format_mode(entry: &FileEntry, changed: bool) -> String {
@@ -553,3 +632,57 @@ fn show_file_list(
         println!("{op} {entry_type:>2} {mode:>5} {uid:>6} {gid:>6} {size:>10} {mtime:11} {name}");
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::{
+        io::Cursor,
+        pin::Pin,
+        task::{Context, Poll},
+    };
+    use tokio::io::{AsyncRead, ReadBuf};
+
+    struct MockedAsyncReader(Cursor<Vec<u8>>);
+
+    impl AsyncRead for MockedAsyncReader {
+        fn poll_read(
+            self: Pin<&mut Self>,
+            _cx: &mut Context<'_>,
+            read_buf: &mut ReadBuf<'_>,
+        ) -> Poll<std::io::Result<()>> {
+            let mut buf = vec![0u8; 100];
+
+            let res = std::io::Read::read(&mut self.get_mut().0, &mut buf);
+
+            if let Ok(bytes) = res {
+                read_buf.put_slice(&buf[..bytes]);
+            }
+
+            Poll::Ready(res.map(|_| ()))
+        }
+    }
+
+    #[test]
+    fn test_do_compare_file_contents() {
+        fn compare(a: Vec<u8>, b: Vec<u8>) -> Result<bool, Error> {
+            let mut mock_a = MockedAsyncReader(Cursor::new(a));
+            let mut mock_b = MockedAsyncReader(Cursor::new(b));
+
+            proxmox_async::runtime::block_on(compare_readers(&mut mock_a, &mut mock_b))
+        }
+
+        assert!(matches!(compare(vec![0; 15], vec![0; 15]), Ok(true)));
+        assert!(matches!(compare(vec![0; 15], vec![0; 14]), Ok(false)));
+        assert!(matches!(compare(vec![0; 15], vec![1; 15]), Ok(false)));
+
+        let mut buf = vec![1u8; 2 * BUFFERSIZE];
+        buf[BUFFERSIZE] = 0;
+        assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), Ok(false)));
+
+        let mut buf = vec![1u8; 2 * BUFFERSIZE];
+        buf[2 * BUFFERSIZE - 1] = 0;
+        assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), Ok(false)));
+    }
+}
-- 
2.30.2





  parent reply	other threads:[~2022-12-05 14:56 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-12-05 14:55 [pbs-devel] [PATCH v2 proxmox-backup 0/3] debug cli: improve output, optionally compare file content for `diff archive` Lukas Wagner
2022-12-05 14:55 ` [pbs-devel] [PATCH v2 proxmox-backup 1/3] debug cli: show more file attributes for `diff archive` command Lukas Wagner
2022-12-05 14:55 ` Lukas Wagner [this message]
2022-12-06 11:39   ` [pbs-devel] [PATCH v2 proxmox-backup 2/3] debug cli: add 'compare-content' flag to " Wolfgang Bumiller
2022-12-06 14:44     ` Lukas Wagner
2022-12-06 15:01       ` Wolfgang Bumiller
2022-12-05 14:55 ` [pbs-devel] [PATCH v2 proxmox-backup 3/3] debug cli: add colored output " Lukas Wagner
2022-12-06 11:49   ` Wolfgang Bumiller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221205145514.645111-3-l.wagner@proxmox.com \
    --to=l.wagner@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal