public inbox for pbs-devel@lists.proxmox.com
 help / color / mirror / Atom feed
From: Stefan Hanreich <s.hanreich@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: Re: [pbs-devel] [PATCH proxmox-backup 2/2] debug cli: add 'compare-content' flag to `diff archive` command
Date: Thu, 1 Dec 2022 14:30:55 +0100	[thread overview]
Message-ID: <d183e391-3387-44b0-53f9-7218e0e18948@proxmox.com> (raw)
In-Reply-To: <daa33e90-b48d-756b-4874-30029ab48327@proxmox.com>

On 12/1/22 14:29, Stefan Hanreich wrote:
>
>
> On 11/29/22 15:17, Lukas Wagner wrote:
>> When --compare-content is set, the command will compare the
>> file content instead on relying on mtime to detect modified files.
>>
>> Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
>> ---
>>   src/bin/proxmox_backup_debug/diff.rs | 141 +++++++++++++++++++++++++--
>>   1 file changed, 135 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/bin/proxmox_backup_debug/diff.rs 
>> b/src/bin/proxmox_backup_debug/diff.rs
>> index d5a4a1fe..9b916760 100644
>> --- a/src/bin/proxmox_backup_debug/diff.rs
>> +++ b/src/bin/proxmox_backup_debug/diff.rs
>> @@ -28,12 +28,15 @@ use pbs_tools::json::required_string_param;
>>   use pxar::accessor::ReadAt;
>>   use pxar::EntryKind;
>>   use serde_json::Value;
>> +use tokio::io::AsyncReadExt;
>>     type ChunkDigest = [u8; 32];
>>   type FileEntry = pxar::accessor::aio::FileEntry<Arc<dyn ReadAt + 
>> Send + Sync>>;
>>   type Accessor = pxar::accessor::aio::Accessor<Arc<dyn ReadAt + Send 
>> + Sync>>;
>>   type Directory = pxar::accessor::aio::Directory<Arc<dyn ReadAt + 
>> Send + Sync>>;
>>   +const BUFFERSIZE: usize = 1024;
>> +
>>   pub fn diff_commands() -> CommandLineInterface {
>>       let cmd_def = CliCommandMap::new().insert(
>>           "archive",
>> @@ -79,18 +82,30 @@ pub fn diff_commands() -> CommandLineInterface {
>>                   schema: KEYFD_SCHEMA,
>>                   optional: true,
>>               },
>> +            "compare-content": {
>> +                optional: true,
>> +                type: bool,
>> +                description: "Compare file content rather than 
>> solely relying on mtime for detecting modified files.",
>> +            },
>>           }
>>       }
>>   )]
>>   /// Diff an archive in two snapshots. The command will output a 
>> list of added, modified and deleted files.
>> -/// For modified files, only the file metadata (e.g. mtime, size, 
>> etc.) will be considered. The actual
>> -/// file contents will not be compared.
>> +/// For modified files, the file metadata (e.g. mode, uid, gid, 
>> size, etc.) will be considered. For detecting
>> +/// modification of file content, only mtime will be used by 
>> default. If the --compare-content flag is provided,
>> +/// mtime is ignored and file content will be compared.
>>   async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
>>       let repo = extract_repository_from_value(&param)?;
>>       let snapshot_a = required_string_param(&param, "prev-snapshot")?;
>>       let snapshot_b = required_string_param(&param, "snapshot")?;
>>       let archive_name = required_string_param(&param, "archive-name")?;
>>   +    let compare_contents = match param.get("compare-content") {
>> +        Some(Value::Bool(value)) => *value,
>> +        Some(_) => bail!("invalid flag for compare-content"),
>> +        None => false,
>> +    };
>> +
>>       let namespace = match param.get("ns") {
>>           Some(Value::String(ns)) => ns.parse()?,
>>           Some(_) => bail!("invalid namespace parameter"),
>> @@ -120,7 +135,14 @@ async fn diff_archive_cmd(param: Value) -> 
>> Result<(), Error> {
>>         if archive_name.ends_with(".pxar") {
>>           let file_name = format!("{}.didx", archive_name);
>> -        diff_archive(snapshot_a, snapshot_b, &file_name, 
>> &repo_params).await?;
>> +        diff_archive(
>> +            snapshot_a,
>> +            snapshot_b,
>> +            &file_name,
>> +            &repo_params,
>> +            compare_contents,
>> +        )
>> +        .await?;
>>       } else {
>>           bail!("Only .pxar files are supported");
>>       }
>> @@ -133,6 +155,7 @@ async fn diff_archive(
>>       snapshot_b: &str,
>>       file_name: &str,
>>       repo_params: &RepoParams,
>> +    compare_contents: bool,
>>   ) -> Result<(), Error> {
>>       let (index_a, accessor_a) = open_dynamic_index(snapshot_a, 
>> file_name, repo_params).await?;
>>       let (index_b, accessor_b) = open_dynamic_index(snapshot_b, 
>> file_name, repo_params).await?;
>> @@ -184,7 +207,7 @@ async fn diff_archive(
>>         // ... so we compare the file metadata/contents to narrow the 
>> selection down to files
>>       // which where *really* modified.
>> -    let modified_files = compare_files(potentially_modified).await?;
>> +    let modified_files = compare_files(potentially_modified, 
>> compare_contents).await?;
>>         show_file_list(&added_files, &deleted_files, &modified_files);
>>   @@ -352,11 +375,12 @@ fn visit_directory<'f, 'c>(
>>   /// Check if files were actually modified
>>   async fn compare_files<'a>(
>>       files: HashMap<&'a OsStr, (&'a FileEntry, &'a FileEntry)>,
>> +    compare_contents: bool,
>>   ) -> Result<HashMap<&'a OsStr, (&'a FileEntry, ChangedProperties)>, 
>> Error> {
>>       let mut modified_files = HashMap::new();
>>         for (path, (entry_a, entry_b)) in files {
>> -        if let Some(changed) = compare_file(entry_a, entry_b).await? {
>> +        if let Some(changed) = compare_file(entry_a, entry_b, 
>> compare_contents).await? {
>>               modified_files.insert(path, (entry_b, changed));
>>           }
>>       }
>> @@ -367,6 +391,7 @@ async fn compare_files<'a>(
>>   async fn compare_file(
>>       file_a: &FileEntry,
>>       file_b: &FileEntry,
>> +    compare_contents: bool,
>>   ) -> Result<Option<ChangedProperties>, Error> {
>>       let mut changed = ChangedProperties::default();
>>   @@ -388,6 +413,15 @@ async fn compare_file(
>>               if size_a != size_b {
>>                   changed.size = true;
>>                   changed.content = true;
>> +            } else if compare_contents {
>> +                let content_identical = 
>> compare_file_contents(file_a, file_b).await?;
>> +                if content_identical && !changed.any_without_mtime() {
>> +                    // If the content is identical and nothing, 
>> exluding mtime,
>> +                    // has changed, we don't consider the entry as 
>> modified.
>> +                    changed.mtime = false;
>> +                }
>> +
>> +                changed.content = !content_identical;
>>               };
>>           }
>>           (EntryKind::Directory, EntryKind::Directory) => {}
>> @@ -405,6 +439,44 @@ async fn compare_file(
>>       }
>>   }
>>   +async fn compare_file_contents(file_a: &FileEntry, file_b: 
>> &FileEntry) -> Result<bool, Error> {
>> +    let mut contents_a = file_a.contents().await?;
>> +    let mut contents_b = file_b.contents().await?;
>> +
>> +    compare_readers(&mut contents_a, &mut contents_b).await
>> +}
>> +
>> +async fn compare_readers<T>(reader_a: &mut T, reader_b: &mut T) -> 
>> Result<bool, Error>
>> +where
>> +    T: AsyncReadExt + Unpin,
>> +{
>> +    let mut buf_a = vec![0u8; BUFFERSIZE];
>> +    let mut buf_b = vec![0u8; BUFFERSIZE];
>> +
>> +    loop {
>> +        // normally, a join! would be nice here, but this leads to
>> +        // some 'higher-order lifetime error' for which I'm not 
>> smart enough to solve.
>> +        // let (bytes_read_a, bytes_read_b) =
>> +        //     tokio::try_join!(reader_a.read(&mut buf_a), 
>> reader_b.read(&mut buf_b))?;
>> +        let bytes_read_a = reader_a.read(&mut buf_a).await?;
>> +        let bytes_read_b = reader_b.read(&mut buf_b).await?;
>> +
>> +        if bytes_read_a != bytes_read_b {
>> +            return Ok(false);
>> +        }
>> +
>> +        if bytes_read_a == 0 {
>> +            break;
>> +        }
>> +
>> +        if buf_a[..bytes_read_a] != buf_b[..bytes_read_b] {
>> +            return Ok(false);
>> +        }
>> +    }
>> +
>> +    Ok(true)
>> +}
>> +
>>   #[derive(Copy, Clone, Default)]
>>   struct ChangedProperties {
>>       entry_type: bool,
>> @@ -438,8 +510,11 @@ impl ChangedProperties {
>>       }
>>         fn any(&self) -> bool {
>> +        self.any_without_mtime() || self.mtime
>> +    }
>> +
>> +    fn any_without_mtime(&self) -> bool {
>>           self.entry_type
>> -            || self.mtime
>>               || self.acl
>>               || self.xattrs
>>               || self.fcaps
>> @@ -553,3 +628,57 @@ fn show_file_list(
>>           println!("{op} {entry_type:>2} {mode:>5} {uid:>6} {gid:>6} 
>> {size:>10} {mtime:11} {name}");
>>       }
>>   }
>> +
>> +#[cfg(test)]
>> +mod tests {
>> +    use super::*;
>> +
>> +    use std::{
>> +        io::Cursor,
>> +        pin::Pin,
>> +        task::{Context, Poll},
>> +    };
>> +    use tokio::io::AsyncRead;
>> +
>> +    struct MockedAsyncReader(Cursor<Vec<u8>>);
>> +
>> +    impl AsyncRead for MockedAsyncReader {
>> +        fn poll_read(
>> +            self: Pin<&mut Self>,
>> +            _cx: &mut Context<'_>,
>> +            read_buf: &mut tokio::io::ReadBuf<'_>,
>> +        ) -> std::task::Poll<std::io::Result<()>> {
>
> very minor nit: Poll already gets imported above, so it could be removed
I mean std::task:: of course
>
>> +            let mut buf = vec![0u8; 100];
>> +
>> +            let res = std::io::Read::read(&mut self.get_mut().0, 
>> &mut buf);
>> +
>> +            if let Ok(bytes) = res {
>> +                read_buf.put_slice(&buf[..bytes]);
>> +            }
>> +
>> +            Poll::Ready(res.map(|_| ()))
>> +        }
>> +    }
>> +
>> +    #[test]
>> +    fn test_do_compare_file_contents() {
>> +        fn compare(a: Vec<u8>, b: Vec<u8>) -> Result<bool, Error> {
>> +            let mut mock_a = MockedAsyncReader(Cursor::new(a));
>> +            let mut mock_b = MockedAsyncReader(Cursor::new(b));
>> +
>> + proxmox_async::runtime::block_on(compare_readers(&mut mock_a, &mut 
>> mock_b))
>> +        }
>> +
>> +        assert!(matches!(compare(vec![0; 15], vec![0; 15]), Ok(true)));
>> +        assert!(matches!(compare(vec![0; 15], vec![0; 14]), 
>> Ok(false)));
>> +        assert!(matches!(compare(vec![0; 15], vec![1; 15]), 
>> Ok(false)));
>> +
>> +        let mut buf = vec![1u8; 2 * BUFFERSIZE];
>> +        buf[BUFFERSIZE] = 0;
>> +        assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), 
>> Ok(false)));
>> +
>> +        let mut buf = vec![1u8; 2 * BUFFERSIZE];
>> +        buf[2 * BUFFERSIZE - 1] = 0;
>> +        assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), 
>> Ok(false)));
>> +    }
>> +}
>
>
> _______________________________________________
> pbs-devel mailing list
> pbs-devel@lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
>
>




  reply	other threads:[~2022-12-01 13:31 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-29 14:17 [pbs-devel] [PATCH proxmox-backup 0/2] debug cli: improve output, optionally compare file content for `diff archive` Lukas Wagner
2022-11-29 14:17 ` [pbs-devel] [PATCH proxmox-backup 1/2] debug cli: show more file attributes for `diff archive` command Lukas Wagner
2022-11-29 14:17 ` [pbs-devel] [PATCH proxmox-backup 2/2] debug cli: add 'compare-content' flag to " Lukas Wagner
2022-12-01 13:29   ` Stefan Hanreich
2022-12-01 13:30     ` Stefan Hanreich [this message]
2022-11-30 16:27 ` [pbs-devel] [PATCH proxmox-backup 0/2] debug cli: improve output, optionally compare file content for `diff archive` Thomas Lamprecht
2022-12-01  7:30   ` Lukas Wagner
2022-12-01  8:02     ` Thomas Lamprecht
2022-12-01 13:29 ` Stefan Hanreich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d183e391-3387-44b0-53f9-7218e0e18948@proxmox.com \
    --to=s.hanreich@proxmox.com \
    --cc=pbs-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal