From: Lukas Wagner <l.wagner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH v2 proxmox-backup 1/1] fix #3828: proxmox_backup_debug: Introduce `diff archive` subcommand.
Date: Fri, 28 Oct 2022 12:01:43 +0200
Message-ID: <20221028100143.9035-2-l.wagner@proxmox.com>
In-Reply-To: <20221028100143.9035-1-l.wagner@proxmox.com>
This new subcommand compares a .pxar archive between two different
snapshots and prints a list of added/modified/deleted file entries.
Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
---
Changes from v1:
- Removed now-unnecessary GoodbyeTable match arms, since the bug in the
pxar crate has been fixed.
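Example invocation (repository and snapshot paths below are placeholders):

  proxmox-backup-debug diff archive \
    "vm/100/2022-10-01T00:00:00Z" "vm/100/2022-10-02T00:00:00Z" \
    root.pxar --repository backup-user@pbs@localhost:datastore

Each changed file is printed on its own line, prefixed with A (added),
D (deleted) or M (modified).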
docs/proxmox-backup-debug/description.rst | 3 +
src/bin/proxmox-backup-debug.rs | 3 +-
src/bin/proxmox_backup_debug/diff.rs | 441 ++++++++++++++++++++++
src/bin/proxmox_backup_debug/mod.rs | 1 +
4 files changed, 447 insertions(+), 1 deletion(-)
create mode 100644 src/bin/proxmox_backup_debug/diff.rs
diff --git a/docs/proxmox-backup-debug/description.rst b/docs/proxmox-backup-debug/description.rst
index 2e5f35fe..8b28957e 100644
--- a/docs/proxmox-backup-debug/description.rst
+++ b/docs/proxmox-backup-debug/description.rst
@@ -1,6 +1,9 @@
Implements debugging functionality to inspect Proxmox Backup datastore
files, verify the integrity of chunks.
+The 'diff' subcommand allows comparing .pxar archives between two
+arbitrary snapshots. A list of added/modified/deleted files will be displayed.
+
Also contains an 'api' subcommand where arbitrary api paths can be called
(get/create/set/delete) as well as display their parameters (usage) and
their child-links (ls).
diff --git a/src/bin/proxmox-backup-debug.rs b/src/bin/proxmox-backup-debug.rs
index c8ea0539..a3589c16 100644
--- a/src/bin/proxmox-backup-debug.rs
+++ b/src/bin/proxmox-backup-debug.rs
@@ -12,7 +12,8 @@ fn main() {
let cmd_def = CliCommandMap::new()
.insert("inspect", inspect::inspect_commands())
.insert("recover", recover::recover_commands())
- .insert("api", api::api_commands());
+ .insert("api", api::api_commands())
+ .insert("diff", diff::diff_commands());
let uid = nix::unistd::Uid::current();
let username = match nix::unistd::User::from_uid(uid) {
diff --git a/src/bin/proxmox_backup_debug/diff.rs b/src/bin/proxmox_backup_debug/diff.rs
new file mode 100644
index 00000000..de20409d
--- /dev/null
+++ b/src/bin/proxmox_backup_debug/diff.rs
@@ -0,0 +1,441 @@
+use std::collections::{HashMap, HashSet};
+use std::ffi::{OsStr, OsString};
+use std::iter::FromIterator;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use anyhow::{bail, Context as AnyhowContext, Error};
+use futures::future::BoxFuture;
+use futures::FutureExt;
+
+use proxmox_router::cli::{CliCommand, CliCommandMap, CommandLineInterface};
+use proxmox_schema::api;
+
+use pbs_api_types::{BackupNamespace, BackupPart};
+use pbs_client::tools::key_source::{
+ crypto_parameters, format_key_source, get_encryption_key_password, KEYFD_SCHEMA,
+};
+use pbs_client::tools::{
+ complete_archive_name, complete_group_or_snapshot, connect, extract_repository_from_value,
+ REPO_URL_SCHEMA,
+};
+use pbs_client::{BackupReader, BackupRepository, RemoteChunkReader};
+use pbs_config::key_config::decrypt_key;
+use pbs_datastore::dynamic_index::{BufferedDynamicReader, DynamicIndexReader, LocalDynamicReadAt};
+use pbs_datastore::index::IndexFile;
+use pbs_tools::crypt_config::CryptConfig;
+use pbs_tools::json::required_string_param;
+use pxar::accessor::ReadAt;
+use pxar::EntryKind;
+use serde_json::Value;
+
+type ChunkDigest = [u8; 32];
+type FileEntry = pxar::accessor::aio::FileEntry<Arc<dyn ReadAt + Send + Sync>>;
+type Accessor = pxar::accessor::aio::Accessor<Arc<dyn ReadAt + Send + Sync>>;
+type Directory = pxar::accessor::aio::Directory<Arc<dyn ReadAt + Send + Sync>>;
+
+pub fn diff_commands() -> CommandLineInterface {
+ let cmd_def = CliCommandMap::new().insert(
+ "archive",
+ CliCommand::new(&API_METHOD_DIFF_ARCHIVE_CMD)
+ .arg_param(&["prev-snapshot", "snapshot", "archive-name"])
+ .completion_cb("prev-snapshot", complete_group_or_snapshot)
+ .completion_cb("snapshot", complete_group_or_snapshot)
+ .completion_cb("archive-name", complete_archive_name),
+ );
+
+ cmd_def.into()
+}
+
+#[api(
+ input: {
+ properties: {
+ "ns": {
+ type: BackupNamespace,
+ optional: true,
+ },
+ "prev-snapshot": {
+ description: "Path for the first snapshot.",
+ type: String,
+ },
+ "snapshot": {
+ description: "Path for the second snapshot.",
+ type: String,
+ },
+ "archive-name": {
+ description: "Name of the .pxar archive",
+ type: String,
+ },
+ "repository": {
+ optional: true,
+ schema: REPO_URL_SCHEMA,
+ },
+ "keyfile": {
+ optional: true,
+ type: String,
+ description: "Path to encryption key.",
+ },
+ "keyfd": {
+ schema: KEYFD_SCHEMA,
+ optional: true,
+ },
+ }
+ }
+)]
+/// Diff an archive in two snapshots. The command will output a list of added, modified and deleted files.
+/// For modified files, only the file metadata (mtime, size, etc.) will be considered. The actual
+/// file contents will not be compared.
+async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
+ let repo = extract_repository_from_value(&param)?;
+ let snapshot_a = required_string_param(&param, "prev-snapshot")?;
+ let snapshot_b = required_string_param(&param, "snapshot")?;
+ let archive_name = required_string_param(&param, "archive-name")?;
+
+ let namespace = match param.get("ns") {
+ Some(Value::String(ns)) => ns.parse()?,
+ Some(_) => bail!("invalid namespace parameter"),
+ None => BackupNamespace::root(),
+ };
+
+ let crypto = crypto_parameters(&param)?;
+
+ let crypt_config = match crypto.enc_key {
+ None => None,
+ Some(key) => {
+ let (key, _created, _fingerprint) = decrypt_key(&key.key, &get_encryption_key_password)
+ .map_err(|err| {
+ log::error!("{}", format_key_source(&key.source, "encryption"));
+ err
+ })?;
+ let crypt_config = CryptConfig::new(key)?;
+ Some(Arc::new(crypt_config))
+ }
+ };
+
+ let repo_params = RepoParams {
+ repo,
+ crypt_config,
+ namespace,
+ };
+
+ if archive_name.ends_with(".pxar") {
+ let file_name = format!("{}.didx", archive_name);
+ diff_archive(snapshot_a, snapshot_b, &file_name, &repo_params).await?;
+ } else {
+ bail!("Only .pxar files are supported");
+ }
+
+ Ok(())
+}
+
+async fn diff_archive(
+ snapshot_a: &str,
+ snapshot_b: &str,
+ file_name: &str,
+ repo_params: &RepoParams,
+) -> Result<(), Error> {
+ let (index_a, accessor_a) = open_dynamic_index(snapshot_a, file_name, repo_params).await?;
+ let (index_b, accessor_b) = open_dynamic_index(snapshot_b, file_name, repo_params).await?;
+
+ // vecs of chunk digests, in their correct order
+ let chunks_a = chunk_digests_for_index(&index_a);
+ let chunks_b = chunk_digests_for_index(&index_b);
+
+ // sets of chunk digests, because we want to perform set operations
+ let chunk_set_a: HashSet<&ChunkDigest> = HashSet::from_iter(chunks_a.iter().copied());
+ let chunk_set_b: HashSet<&ChunkDigest> = HashSet::from_iter(chunks_b.iter().copied());
+
+ // Symmetric difference between both sets -
+ // content stored in those chunks was either added, modified or deleted
+ let chunk_sym_diff: HashSet<&ChunkDigest> = chunk_set_a
+ .symmetric_difference(&chunk_set_b)
+ .copied()
+ .collect();
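+
+ // For example: if chunks_a = [c1, c2, c3] and chunks_b = [c1, c4, c3],
+ // their symmetric difference is {c2, c4}. Any file whose byte range only
+ // touches c1 or c3 is serialized identically in both archives and can
+ // be skipped.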
+
+ // Figure out which files are stored in which chunks
+ let files_in_a = files_in_chunk_set(&chunks_a, &accessor_a, &index_a, &chunk_sym_diff).await?;
+ let files_in_b = files_in_chunk_set(&chunks_b, &accessor_b, &index_b, &chunk_sym_diff).await?;
+
+ // If file in A but not in B --> deleted
+ let deleted_files: HashMap<&OsStr, &FileEntry> = files_in_a
+ .iter()
+ .filter(|(path, _)| !files_in_b.contains_key(*path))
+ .map(|(path, entry)| (path.as_os_str(), entry))
+ .collect();
+
+ // If file in B but not in A --> added
+ let added_files: HashMap<&OsStr, &FileEntry> = files_in_b
+ .iter()
+ .filter(|(path, _)| !files_in_a.contains_key(*path))
+ .map(|(path, entry)| (path.as_os_str(), entry))
+ .collect();
+
+ // If file is present in both snapshots, it *might* be modified, but does not have to be.
+ // If another, unmodified file resides in the same chunk as an actually modified one,
+ // it will also show up as modified here...
+ let potentially_modified: HashMap<&OsStr, &FileEntry> = files_in_a
+ .iter()
+ .filter(|(path, _)| files_in_b.contains_key(*path))
+ .map(|(path, entry)| (path.as_os_str(), entry))
+ .collect();
+
+ // ... so we compare the file metadata/contents to narrow the selection down to files
+ // which were *really* modified.
+ let modified_files = compare_files(&files_in_a, &files_in_b, potentially_modified).await?;
+
+ show_file_list(&added_files, &deleted_files, &modified_files);
+
+ Ok(())
+}
+
+struct RepoParams {
+ repo: BackupRepository,
+ crypt_config: Option<Arc<CryptConfig>>,
+ namespace: BackupNamespace,
+}
+
+async fn open_dynamic_index(
+ snapshot: &str,
+ archive_name: &str,
+ params: &RepoParams,
+) -> Result<(DynamicIndexReader, Accessor), Error> {
+ let backup_reader = create_backup_reader(snapshot, params).await?;
+
+ let (manifest, _) = backup_reader.download_manifest().await?;
+ manifest.check_fingerprint(params.crypt_config.as_ref().map(Arc::as_ref))?;
+
+ let index = backup_reader
+ .download_dynamic_index(&manifest, archive_name)
+ .await?;
+ let most_used = index.find_most_used_chunks(8);
+
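+ // Download the index a second time: the first instance is consumed by
+ // the BufferedDynamicReader below, while this copy is returned to the
+ // caller for chunk-offset lookups.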
+ let lookup_index = backup_reader
+ .download_dynamic_index(&manifest, archive_name)
+ .await?;
+
+ let file_info = manifest.lookup_file_info(archive_name)?;
+ let chunk_reader = RemoteChunkReader::new(
+ backup_reader.clone(),
+ params.crypt_config.clone(),
+ file_info.chunk_crypt_mode(),
+ most_used,
+ );
+
+ let reader = BufferedDynamicReader::new(index, chunk_reader);
+ let archive_size = reader.archive_size();
+ let reader: Arc<dyn ReadAt + Send + Sync> = Arc::new(LocalDynamicReadAt::new(reader));
+ let accessor = Accessor::new(reader, archive_size).await?;
+
+ Ok((lookup_index, accessor))
+}
+
+async fn create_backup_reader(
+ snapshot: &str,
+ params: &RepoParams,
+) -> Result<Arc<BackupReader>, Error> {
+ let backup_dir = match snapshot.parse::<BackupPart>()? {
+ BackupPart::Dir(dir) => dir,
+ BackupPart::Group(_group) => {
+ bail!("A full snapshot path must be provided.");
+ }
+ };
+ let client = connect(&params.repo)?;
+ let backup_reader = BackupReader::start(
+ client,
+ params.crypt_config.clone(),
+ params.repo.store(),
+ &params.namespace,
+ &backup_dir,
+ false,
+ )
+ .await?;
+ Ok(backup_reader)
+}
+
+/// Get a list of chunk digests for an index file.
+fn chunk_digests_for_index(index: &dyn IndexFile) -> Vec<&ChunkDigest> {
+ let mut all_chunks = Vec::new();
+
+ for i in 0..index.index_count() {
+ let digest = index
+ .index_digest(i)
+ .expect("Invalid chunk index - index corrupted?");
+ all_chunks.push(digest);
+ }
+
+ all_chunks
+}
+
+/// Compute which files are contained in a given chunk set.
+async fn files_in_chunk_set<'c, 'f>(
+ chunk_list: &[&'c ChunkDigest],
+ accessor: &'f Accessor,
+ index: &'f DynamicIndexReader,
+ chunk_set: &HashSet<&'c ChunkDigest>,
+) -> Result<HashMap<OsString, FileEntry>, Error> {
+ let path = PathBuf::new();
+ let root = accessor.open_root().await?;
+
+ visit_directory(&root, index, &path, chunk_list, chunk_set).await
+}
+
+/// Recursively visit directories in a .pxar archive and collect all file
+/// entries whose byte range touches any chunk in the given chunk set.
+fn visit_directory<'f, 'c>(
+ directory: &'f Directory,
+ index: &'f DynamicIndexReader,
+ path: &'f Path,
+ chunk_list: &'f [&'c ChunkDigest],
+ chunk_diff: &'f HashSet<&'c ChunkDigest>,
+) -> BoxFuture<'f, Result<HashMap<OsString, FileEntry>, Error>> {
+ async move {
+ let mut entries: HashMap<OsString, FileEntry> = HashMap::new();
+
+ let mut iter = directory.read_dir();
+
+ while let Some(entry) = iter.next().await {
+ let entry = entry?.decode_entry().await?;
+ let range = &entry.entry_range_info().entry_range;
+
+ let first_chunk = index
+ .chunk_from_offset(range.start)
+ .context("Invalid offset")?
+ .0;
+ let last_chunk = index
+ .chunk_from_offset(range.end)
+ .context("Invalid offset")?
+ .0;
+
+ if entry.is_dir() {
+ let new_dir = entry.enter_directory().await?;
+
+ for chunk_index in first_chunk..=last_chunk {
+ // Check if any chunk of the serialized directory is in the
+ // set of modified chunks (symmetric difference).
+ // If not, we can skip the directory entirely and save a lot of time.
+
+ let digest = chunk_list.get(chunk_index).context("Invalid chunk index")?;
+
+ if chunk_diff.contains(digest) {
+ let dir_path = path.join(entry.file_name());
+
+ entries.extend(
+ visit_directory(&new_dir, index, &dir_path, chunk_list, chunk_diff)
+ .await?
+ .into_iter(),
+ );
+ break;
+ }
+ }
+ }
+
+ let file_path = path.join(entry.file_name());
+
+ for chunk_index in first_chunk..=last_chunk {
+ let digest = chunk_list.get(chunk_index).context("Invalid chunk index")?;
+
+ if chunk_diff.contains(digest) {
+ entries.insert(file_path.into_os_string(), entry);
+ break;
+ }
+ }
+ }
+
+ Ok(entries)
+ }
+ .boxed()
+}
+
+/// Check if files were actually modified
+async fn compare_files<'a>(
+ entries_a: &HashMap<OsString, FileEntry>,
+ entries_b: &HashMap<OsString, FileEntry>,
+ files: HashMap<&'a OsStr, &'a FileEntry>,
+) -> Result<HashMap<&'a OsStr, &'a FileEntry>, Error> {
+ let mut modified_files = HashMap::new();
+
+ for (path, entry) in files {
+ let file_a = entries_a.get(path).context("File entry not in map")?;
+ let file_b = entries_b.get(path).context("File entry not in map")?;
+
+ if !compare_file(file_a, file_b).await {
+ modified_files.insert(path, entry);
+ }
+ }
+
+ Ok(modified_files)
+}
+
+async fn compare_file(file_a: &FileEntry, file_b: &FileEntry) -> bool {
+ if file_a.metadata() != file_b.metadata() {
+ // Check if mtime, permissions, ACLs, etc. have changed - if they have changed, we consider
+ // the file as modified.
+ return false;
+ }
+
+ match (file_a.kind(), file_b.kind()) {
+ (EntryKind::Symlink(a), EntryKind::Symlink(b)) => {
+ // Check whether the link target has changed.
+ a.as_os_str() == b.as_os_str()
+ }
+ (EntryKind::Hardlink(a), EntryKind::Hardlink(b)) => {
+ // Check whether the link target has changed.
+ a.as_os_str() == b.as_os_str()
+ }
+ (EntryKind::Device(a), EntryKind::Device(b)) => a.major == b.major && a.minor == b.minor,
+ (EntryKind::Socket, EntryKind::Socket) => true,
+ (EntryKind::Fifo, EntryKind::Fifo) => true,
+ (EntryKind::File { size: size_a, .. }, EntryKind::File { size: size_b, .. }) => {
+ // At this point we know that all metadata including mtime is
+ // the same. To speed things up, we consider the files as equal if they also have
+ // the same size.
+ // If one were completely paranoid, one could compare the actual file contents,
+ // but this decreases performance drastically.
+ size_a == size_b
+ }
+ (EntryKind::Directory, EntryKind::Directory) => true,
+ (_, _) => false, // Kind has changed, so we of course consider it modified.
+ }
+}
+
+/// Display a sorted list of added, modified, deleted files.
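+///
+/// Output is one line per file: an operation marker ("A", "D", "M"), an
+/// entry-type marker (e.g. "f" for regular files, "d" for directories),
+/// and the path - for example "A f /etc/hostname" (path is illustrative).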
+fn show_file_list(
+ added: &HashMap<&OsStr, &FileEntry>,
+ deleted: &HashMap<&OsStr, &FileEntry>,
+ modified: &HashMap<&OsStr, &FileEntry>,
+) {
+ let mut all: Vec<&OsStr> = Vec::new();
+
+ all.extend(added.keys());
+ all.extend(deleted.keys());
+ all.extend(modified.keys());
+
+ all.sort();
+
+ for file in all {
+ let (op, entry) = if let Some(entry) = added.get(file) {
+ ("A", *entry)
+ } else if let Some(entry) = deleted.get(file) {
+ ("D", *entry)
+ } else if let Some(entry) = modified.get(file) {
+ ("M", *entry)
+ } else {
+ unreachable!();
+ };
+
+ let entry_kind = match entry.kind() {
+ EntryKind::Symlink(_) => "l",
+ EntryKind::Hardlink(_) => "h",
+ EntryKind::Device(_) => "c/b",
+ EntryKind::Socket => "s",
+ EntryKind::Fifo => "p",
+ EntryKind::File { .. } => "f",
+ EntryKind::Directory => "d",
+ _ => " ",
+ };
+
+ println!("{} {} {}", op, entry_kind, file.to_string_lossy());
+ }
+}
diff --git a/src/bin/proxmox_backup_debug/mod.rs b/src/bin/proxmox_backup_debug/mod.rs
index 31bc68c3..0495c565 100644
--- a/src/bin/proxmox_backup_debug/mod.rs
+++ b/src/bin/proxmox_backup_debug/mod.rs
@@ -6,6 +6,7 @@ use std::{
};
pub mod api;
+pub mod diff;
pub mod inspect;
pub mod recover;
--
2.30.2
Thread overview: 4+ messages
2022-10-28 10:01 [pbs-devel] [PATCH v2 proxmox-backup 0/1] " Lukas Wagner
2022-10-28 10:01 ` Lukas Wagner [this message]
2022-11-23 10:26 ` [pbs-devel] applied: [PATCH v2 proxmox-backup 1/1] " w.bumiller
2022-11-23 15:46 ` [pbs-devel] " Thomas Lamprecht