From mboxrd@z Thu Jan  1 00:00:00 1970
From: Lukas Wagner <l.wagner@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Thu, 27 Oct 2022 14:28:06 +0200
Message-Id: <20221027122806.79851-3-l.wagner@proxmox.com>
In-Reply-To: <20221027122806.79851-1-l.wagner@proxmox.com>
References: <20221027122806.79851-1-l.wagner@proxmox.com>
X-Mailer: git-send-email 2.30.2
Subject: [pbs-devel] [PATCH proxmox-backup 1/1] fix #3828: proxmox_backup_debug: Introduce `diff archive` subcommand

This new subcommand compares a pxar archive in two different snapshots
and prints a list of added/modified/deleted file entries.

Signed-off-by: Lukas Wagner <l.wagner@proxmox.com>
---
 docs/proxmox-backup-debug/description.rst |   3 +
 src/bin/proxmox-backup-debug.rs           |   3 +-
 src/bin/proxmox_backup_debug/diff.rs      | 456 ++++++++++++++++++++++
 src/bin/proxmox_backup_debug/mod.rs       |   1 +
 4 files changed, 462 insertions(+), 1 deletion(-)
 create mode 100644 src/bin/proxmox_backup_debug/diff.rs

diff --git a/docs/proxmox-backup-debug/description.rst b/docs/proxmox-backup-debug/description.rst
index 2e5f35fe..8b28957e 100644
--- a/docs/proxmox-backup-debug/description.rst
+++ b/docs/proxmox-backup-debug/description.rst
@@ -1,6 +1,9 @@
 Implements debugging functionality to inspect Proxmox Backup datastore
 files, verify the integrity of chunks.
 
+The 'diff' subcommand allows comparing the .pxar archives of two
+arbitrary snapshots. A list of added/modified/deleted files is displayed.
+
 Also contains an 'api' subcommand where arbitrary api paths can be called
 (get/create/set/delete) as well as display their parameters (usage) and
 their child-links (ls).
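For a quick impression, a hypothetical invocation of the new command (the
repository, snapshot paths and archive name below are made-up placeholders):

    proxmox-backup-debug diff archive \
        "vm/100/2022-10-26T10:00:00Z" \
        "vm/100/2022-10-27T10:00:00Z" \
        root.pxar --repository localhost:datastore
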
diff --git a/src/bin/proxmox-backup-debug.rs b/src/bin/proxmox-backup-debug.rs
index c8ea0539..a3589c16 100644
--- a/src/bin/proxmox-backup-debug.rs
+++ b/src/bin/proxmox-backup-debug.rs
@@ -12,7 +12,8 @@ fn main() {
     let cmd_def = CliCommandMap::new()
         .insert("inspect", inspect::inspect_commands())
         .insert("recover", recover::recover_commands())
-        .insert("api", api::api_commands());
+        .insert("api", api::api_commands())
+        .insert("diff", diff::diff_commands());
 
     let uid = nix::unistd::Uid::current();
 
     let username = match nix::unistd::User::from_uid(uid) {
diff --git a/src/bin/proxmox_backup_debug/diff.rs b/src/bin/proxmox_backup_debug/diff.rs
new file mode 100644
index 00000000..9b72bb15
--- /dev/null
+++ b/src/bin/proxmox_backup_debug/diff.rs
@@ -0,0 +1,456 @@
+use std::collections::{HashMap, HashSet};
+use std::ffi::{OsStr, OsString};
+use std::iter::FromIterator;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use anyhow::{bail, Context as AnyhowContext, Error};
+use futures::future::BoxFuture;
+use futures::FutureExt;
+
+use proxmox_router::cli::{CliCommand, CliCommandMap, CommandLineInterface};
+use proxmox_schema::api;
+
+use pbs_api_types::{BackupNamespace, BackupPart};
+use pbs_client::tools::key_source::{
+    crypto_parameters, format_key_source, get_encryption_key_password, KEYFD_SCHEMA,
+};
+use pbs_client::tools::{
+    complete_archive_name, complete_group_or_snapshot, connect, extract_repository_from_value,
+    REPO_URL_SCHEMA,
+};
+use pbs_client::{BackupReader, BackupRepository, RemoteChunkReader};
+use pbs_config::key_config::decrypt_key;
+use pbs_datastore::dynamic_index::{BufferedDynamicReader, DynamicIndexReader, LocalDynamicReadAt};
+use pbs_datastore::index::IndexFile;
+use pbs_tools::crypt_config::CryptConfig;
+use pbs_tools::json::required_string_param;
+use pxar::accessor::ReadAt;
+use pxar::EntryKind;
+use serde_json::Value;
+
+type ChunkDigest = [u8; 32];
+type FileEntry = pxar::accessor::aio::FileEntry<Arc<dyn ReadAt + Send + Sync>>;
+type Accessor = pxar::accessor::aio::Accessor<Arc<dyn ReadAt + Send + Sync>>;
+type Directory = pxar::accessor::aio::Directory<Arc<dyn ReadAt + Send + Sync>>;
+
+pub fn diff_commands() -> CommandLineInterface {
+    let cmd_def = CliCommandMap::new().insert(
+        "archive",
+        CliCommand::new(&API_METHOD_DIFF_ARCHIVE_CMD)
+            .arg_param(&["prev-snapshot", "snapshot", "archive-name"])
+            .completion_cb("prev-snapshot", complete_group_or_snapshot)
+            .completion_cb("snapshot", complete_group_or_snapshot)
+            .completion_cb("archive-name", complete_archive_name),
+    );
+
+    cmd_def.into()
+}
+
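+// The #[api] macro below generates the API_METHOD_DIFF_ARCHIVE_CMD descriptor
+// that diff_commands() above registers as the `archive` subcommand.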
+#[api(
+    input: {
+        properties: {
+            "ns": {
+                type: BackupNamespace,
+                optional: true,
+            },
+            "prev-snapshot": {
+                description: "Path for the first snapshot.",
+                type: String,
+            },
+            "snapshot": {
+                description: "Path for the second snapshot.",
+                type: String,
+            },
+            "archive-name": {
+                description: "Name of the .pxar archive",
+                type: String,
+            },
+            "repository": {
+                optional: true,
+                schema: REPO_URL_SCHEMA,
+            },
+            "keyfile": {
+                optional: true,
+                type: String,
+                description: "Path to encryption key.",
+            },
+            "keyfd": {
+                schema: KEYFD_SCHEMA,
+                optional: true,
+            },
+        }
+    }
+)]
+/// Diff an archive in two snapshots. The command outputs a list of added,
+/// modified and deleted files. For modified files, only the file metadata
+/// (e.g. mtime, size, etc.) is considered; the actual file contents are
+/// not compared.
+async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
+    let repo = extract_repository_from_value(&param)?;
+    let snapshot_a = required_string_param(&param, "prev-snapshot")?;
+    let snapshot_b = required_string_param(&param, "snapshot")?;
+    let archive_name = required_string_param(&param, "archive-name")?;
+
+    let namespace = match param.get("ns") {
+        Some(Value::String(ns)) => ns.parse()?,
+        Some(_) => bail!("invalid namespace parameter"),
+        None => BackupNamespace::root(),
+    };
+
+    let crypto = crypto_parameters(&param)?;
+
+    let crypt_config = match crypto.enc_key {
+        None => None,
+        Some(key) => {
+            let (key, _created, _fingerprint) = decrypt_key(&key.key, &get_encryption_key_password)
+                .map_err(|err| {
+                    log::error!("{}", format_key_source(&key.source, "encryption"));
+                    err
+                })?;
+            let crypt_config = CryptConfig::new(key)?;
+            Some(Arc::new(crypt_config))
+        }
+    };
+
+    let repo_params = RepoParams {
+        repo,
+        crypt_config,
+        namespace,
+    };
+
+    if archive_name.ends_with(".pxar") {
+        let file_name = format!("{}.didx", archive_name);
+        diff_archive(snapshot_a, snapshot_b, &file_name, &repo_params).await?;
+    } else {
+        bail!("Only .pxar files are supported");
+    }
+
+    Ok(())
+}
+
+async fn diff_archive(
+    snapshot_a: &str,
+    snapshot_b: &str,
+    file_name: &str,
+    repo_params: &RepoParams,
+) -> Result<(), Error> {
+    let (index_a, accessor_a) = open_dynamic_index(snapshot_a, file_name, repo_params).await?;
+    let (index_b, accessor_b) = open_dynamic_index(snapshot_b, file_name, repo_params).await?;
+
+    // Vecs of chunk digests, in their correct order
+    let chunks_a = chunk_digests_for_index(&index_a);
+    let chunks_b = chunk_digests_for_index(&index_b);
+
+    // Sets of chunk digests, because we want to perform set operations
+    let chunk_set_a: HashSet<&ChunkDigest> = HashSet::from_iter(chunks_a.iter().copied());
+    let chunk_set_b: HashSet<&ChunkDigest> = HashSet::from_iter(chunks_b.iter().copied());
+
+    // Symmetric difference between both sets -
+    // content stored in those chunks was either added, modified or deleted
+    let chunk_sym_diff: HashSet<&ChunkDigest> = chunk_set_a
+        .symmetric_difference(&chunk_set_b)
+        .copied()
+        .collect();
+
+    // Figure out which files are stored in which chunks
+    let files_in_a = files_in_chunk_set(&chunks_a, &accessor_a, &index_a, &chunk_sym_diff).await?;
+    let files_in_b = files_in_chunk_set(&chunks_b, &accessor_b, &index_b, &chunk_sym_diff).await?;
+
+    // If a file is in A but not in B --> deleted
+    let deleted_files: HashMap<&OsStr, &FileEntry> = files_in_a
+        .iter()
+        .filter(|(path, _)| !files_in_b.contains_key(*path))
+        .map(|(path, entry)| (path.as_os_str(), entry))
+        .collect();
+
+    // If a file is in B but not in A --> added
+    let added_files: HashMap<&OsStr, &FileEntry> = files_in_b
+        .iter()
+        .filter(|(path, _)| !files_in_a.contains_key(*path))
+        .map(|(path, entry)| (path.as_os_str(), entry))
+        .collect();
+
+    // If a file is present in both snapshots, it *might* be modified, but does not have to be.
+    // If another, unmodified file resides in the same chunk as an actually modified one,
+    // it will also show up as modified here...
+    let potentially_modified: HashMap<&OsStr, &FileEntry> = files_in_a
+        .iter()
+        .filter(|(path, _)| files_in_b.contains_key(*path))
+        .map(|(path, entry)| (path.as_os_str(), entry))
+        .collect();
+
+    // ... so we compare the file metadata (and sizes) to narrow the selection down to
+    // files which were *really* modified.
+    let modified_files = compare_files(&files_in_a, &files_in_b, potentially_modified).await?;
+
+    show_file_list(&added_files, &deleted_files, &modified_files);
+
+    Ok(())
+}
+
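The heart of the function above is the symmetric difference of the two
chunk-digest sets. A minimal, self-contained sketch of just that step (plain
u8 values stand in for the 32-byte chunk digests; this snippet is an
illustration, not part of the patch):

    use std::collections::HashSet;

    fn main() {
        let chunks_a: HashSet<u8> = [1, 2, 3].into_iter().collect();
        let chunks_b: HashSet<u8> = [2, 3, 4].into_iter().collect();

        // Chunks appearing in exactly one of the two snapshots; only files
        // touching at least one of these chunks can have changed.
        let sym_diff: HashSet<u8> = chunks_a
            .symmetric_difference(&chunks_b)
            .copied()
            .collect();

        let expected: HashSet<u8> = [1, 4].into_iter().collect();
        assert_eq!(sym_diff, expected);
    }
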
+struct RepoParams {
+    repo: BackupRepository,
+    crypt_config: Option<Arc<CryptConfig>>,
+    namespace: BackupNamespace,
+}
+
+async fn open_dynamic_index(
+    snapshot: &str,
+    archive_name: &str,
+    params: &RepoParams,
+) -> Result<(DynamicIndexReader, Accessor), Error> {
+    let backup_reader = create_backup_reader(snapshot, params).await?;
+
+    let (manifest, _) = backup_reader.download_manifest().await?;
+    manifest.check_fingerprint(params.crypt_config.as_ref().map(Arc::as_ref))?;
+
+    let index = backup_reader
+        .download_dynamic_index(&manifest, archive_name)
+        .await?;
+    let most_used = index.find_most_used_chunks(8);
+
+    let lookup_index = backup_reader
+        .download_dynamic_index(&manifest, archive_name)
+        .await?;
+
+    let file_info = manifest.lookup_file_info(archive_name)?;
+    let chunk_reader = RemoteChunkReader::new(
+        backup_reader.clone(),
+        params.crypt_config.clone(),
+        file_info.chunk_crypt_mode(),
+        most_used,
+    );
+
+    let reader = BufferedDynamicReader::new(index, chunk_reader);
+    let archive_size = reader.archive_size();
+    let reader: Arc<dyn ReadAt + Send + Sync> = Arc::new(LocalDynamicReadAt::new(reader));
+    let accessor = Accessor::new(reader, archive_size).await?;
+
+    Ok((lookup_index, accessor))
+}
+
+async fn create_backup_reader(
+    snapshot: &str,
+    params: &RepoParams,
+) -> Result<Arc<BackupReader>, Error> {
+    let backup_dir = match snapshot.parse::<BackupPart>()? {
+        BackupPart::Dir(dir) => dir,
+        BackupPart::Group(_group) => {
+            bail!("A full snapshot path must be provided.");
+        }
+    };
+    let client = connect(&params.repo)?;
+    let backup_reader = BackupReader::start(
+        client,
+        params.crypt_config.clone(),
+        params.repo.store(),
+        &params.namespace,
+        &backup_dir,
+        false,
+    )
+    .await?;
+    Ok(backup_reader)
+}
+
+/// Get a list of chunk digests for an index file.
+fn chunk_digests_for_index(index: &dyn IndexFile) -> Vec<&ChunkDigest> {
+    let mut all_chunks = Vec::new();
+
+    for i in 0..index.index_count() {
+        let digest = index
+            .index_digest(i)
+            .expect("Invalid chunk index - index corrupted?");
+        all_chunks.push(digest);
+    }
+
+    all_chunks
+}
+
+/// Compute which files are contained in a given chunk set.
+async fn files_in_chunk_set<'c, 'f>(
+    chunk_list: &[&'c ChunkDigest],
+    accessor: &'f Accessor,
+    index: &'f DynamicIndexReader,
+    chunk_set: &HashSet<&'c ChunkDigest>,
+) -> Result<HashMap<OsString, FileEntry>, Error> {
+    let path = PathBuf::new();
+    let root = accessor.open_root().await?;
+
+    visit_directory(&root, index, &path, chunk_list, chunk_set).await
+}
+
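+// Note: visit_directory() calls itself recursively. A recursive `async fn` is
+// not allowed in Rust, because the compiler-generated future type would be
+// infinitely nested, so the function returns a BoxFuture and boxes its async
+// body via `.boxed()` instead.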
+/// Recursively visit directories in a .pxar archive and build a map
+/// "file path --> entry" of all files stored in the given chunk set.
+fn visit_directory<'f, 'c>(
+    directory: &'f Directory,
+    index: &'f DynamicIndexReader,
+    path: &'f Path,
+    chunk_list: &'f [&'c ChunkDigest],
+    chunk_diff: &'f HashSet<&'c ChunkDigest>,
+) -> BoxFuture<'f, Result<HashMap<OsString, FileEntry>, Error>> {
+    async move {
+        let mut entries: HashMap<OsString, FileEntry> = HashMap::new();
+
+        let mut iter = directory.read_dir();
+
+        while let Some(entry) = iter.next().await {
+            let entry = entry?.decode_entry().await?;
+            let range = &entry.entry_range_info().entry_range;
+
+            let first_chunk = index
+                .chunk_from_offset(range.start)
+                .context("Invalid offset")?
+                .0;
+            let last_chunk = index
+                .chunk_from_offset(range.end)
+                .context("Invalid offset")?
+                .0;
+
+            if entry.is_dir() {
+                let new_dir = entry.enter_directory().await?;
+
+                for chunk_index in first_chunk..=last_chunk {
+                    // Check if any chunk of the serialized directory is in the
+                    // set of modified chunks (the symmetric difference).
+                    // If not, we can skip the directory entirely and save a lot of time.
+                    let digest = chunk_list.get(chunk_index).context("Invalid chunk index")?;
+
+                    if chunk_diff.get(digest).is_some() {
+                        let dir_path = path.join(entry.file_name());
+
+                        entries.extend(
+                            visit_directory(&new_dir, index, &dir_path, chunk_list, chunk_diff)
+                                .await?
+                                .into_iter(),
+                        );
+                        break;
+                    }
+                }
+            }
+
+            let file_path = path.join(entry.file_name());
+
+            for chunk_index in first_chunk..=last_chunk {
+                let digest = chunk_list.get(chunk_index).context("Invalid chunk index")?;
+
+                if chunk_diff.get(digest).is_some() {
+                    entries.insert(file_path.into_os_string(), entry);
+                    break;
+                }
+            }
+        }
+
+        Ok(entries)
+    }
+    .boxed()
+}
+
+/// Check if files were actually modified.
+async fn compare_files<'a>(
+    entries_a: &HashMap<OsString, FileEntry>,
+    entries_b: &HashMap<OsString, FileEntry>,
+    files: HashMap<&'a OsStr, &'a FileEntry>,
+) -> Result<HashMap<&'a OsStr, &'a FileEntry>, Error> {
+    let mut modified_files = HashMap::new();
+
+    for (path, entry) in files {
+        let p = path.to_os_string();
+        let file_a = entries_a.get(&p).context("File entry not in map")?;
+        let file_b = entries_b.get(&p).context("File entry not in map")?;
+
+        if !compare_file(file_a, file_b).await {
+            modified_files.insert(path, entry);
+        }
+    }
+
+    Ok(modified_files)
+}
+
+async fn compare_file(file_a: &FileEntry, file_b: &FileEntry) -> bool {
+    if file_a.metadata() != file_b.metadata() {
+        // mtime, permissions, ACLs, etc. have changed - we consider
+        // the file modified.
+        return false;
+    }
+
+    match (file_a.kind(), file_b.kind()) {
+        (EntryKind::Symlink(a), EntryKind::Symlink(b)) => {
+            // Check whether the link target has changed.
+            a.as_os_str() == b.as_os_str()
+        }
+        (EntryKind::Hardlink(a), EntryKind::Hardlink(b)) => {
+            // Check whether the link target has changed.
+            a.as_os_str() == b.as_os_str()
+        }
+        (EntryKind::Device(a), EntryKind::Device(b)) => a.major == b.major && a.minor == b.minor,
+        (EntryKind::Socket, EntryKind::Socket) => true,
+        (EntryKind::Fifo, EntryKind::Fifo) => true,
+        (EntryKind::GoodbyeTable, EntryKind::GoodbyeTable) => {
+            // For some reason, .kind() returns GoodbyeTable for FIFOs/sockets - is this a bug?
+            // This match arm can be removed once this is fixed.
+            true
+        }
+        (EntryKind::File { size: size_a, .. }, EntryKind::File { size: size_b, .. }) => {
+            // At this point we know that all metadata, including mtime, is the same.
+            // To speed things up, we consider the files equal if they also have the
+            // same size. If one were completely paranoid, one could compare the actual
+            // file contents, but this decreases performance drastically.
+            size_a == size_b
+        }
+        (EntryKind::Directory, EntryKind::Directory) => true,
+        (_, _) => false, // The kind has changed, so we of course consider the file modified.
+    }
+}
+
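The output of show_file_list() below then looks roughly like this (one line
per entry: operation, entry kind, path; the paths are invented for
illustration):

    A f /root/added-file.txt
    D l /etc/deleted-symlink
    M f /var/log/syslog
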
+/// Display a sorted list of added, modified and deleted files.
+fn show_file_list(
+    added: &HashMap<&OsStr, &FileEntry>,
+    deleted: &HashMap<&OsStr, &FileEntry>,
+    modified: &HashMap<&OsStr, &FileEntry>,
+) {
+    let mut all: Vec<&OsStr> = Vec::new();
+
+    all.extend(added.keys());
+    all.extend(deleted.keys());
+    all.extend(modified.keys());
+
+    all.sort();
+
+    for file in all {
+        let (op, entry) = if let Some(entry) = added.get(file) {
+            ("A", *entry)
+        } else if let Some(entry) = deleted.get(file) {
+            ("D", *entry)
+        } else if let Some(entry) = modified.get(file) {
+            ("M", *entry)
+        } else {
+            unreachable!();
+        };
+
+        let entry_kind = match entry.kind() {
+            EntryKind::Symlink(_) => "l",
+            EntryKind::Hardlink(_) => "h",
+            EntryKind::Device(_) => "c/b",
+            EntryKind::Socket => "s",
+            EntryKind::Fifo => "p",
+            EntryKind::File { .. } => "f",
+            EntryKind::Directory => "d",
+            EntryKind::GoodbyeTable => {
+                // For some reason, .kind() returns GoodbyeTable for FIFOs/sockets - is this a bug?
+                // This match arm can be removed once this is fixed.
+                if entry.metadata().is_fifo() {
+                    "p"
+                } else if entry.metadata().is_socket() {
+                    "s"
+                } else {
+                    panic!("GoodbyeTable entry that is not a FIFO/socket");
+                }
+            }
+        };
+
+        println!("{} {} {}", op, entry_kind, file.to_string_lossy());
+    }
+}
diff --git a/src/bin/proxmox_backup_debug/mod.rs b/src/bin/proxmox_backup_debug/mod.rs
index 31bc68c3..0495c565 100644
--- a/src/bin/proxmox_backup_debug/mod.rs
+++ b/src/bin/proxmox_backup_debug/mod.rs
@@ -6,6 +6,7 @@ use std::{
 };
 
 pub mod api;
+pub mod diff;
 pub mod inspect;
 pub mod recover;
-- 
2.30.2