From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Mon, 04 Nov 2024 12:51:12 +0100
From: Fabian Grünbichler
To: Proxmox Backup Server development discussion
Subject: Re: [pbs-devel] [PATCH proxmox-backup v2 1/3] fix #3786: api: add resync-corrupt option to sync jobs
Message-Id: <1730717237.hc5rhqwdje.astroid@yuna.none>
In-Reply-To: <20241018090909.103952-2-g.goller@proxmox.com>
References: <20241018090909.103952-1-g.goller@proxmox.com>
 <20241018090909.103952-2-g.goller@proxmox.com>

this doesn't really do what it says on the tin, see below.

On October 18, 2024 11:09 am, Gabriel Goller wrote:
> This option allows us to "fix" corrupt snapshots (and/or their chunks)
> by pulling them from another remote. When traversing the remote
> snapshots, we check if it exists locally, and if it is, we check if the
> last verification of it failed. If the local snapshot is broken and the
> `resync-corrupt` option is turned on, we pull in the remote snapshot,
> overwriting the local one.
>
> This is very useful and has been requested a lot, as there is currently
> no way to "fix" corrupt chunks/snapshots even if the user has a healthy
> version of it on their offsite instance.
>
> Originally-by: Shannon Sterz
> Signed-off-by: Gabriel Goller
> ---
>  pbs-api-types/src/jobs.rs         | 10 ++++++
>  pbs-datastore/src/backup_info.rs  | 13 +++++++-
>  src/api2/config/sync.rs           |  4 +++
>  src/api2/pull.rs                  |  9 +++++-
>  src/bin/proxmox-backup-manager.rs |  4 +--
>  src/server/pull.rs                | 52 ++++++++++++++++++++++++++-----
>  6 files changed, 80 insertions(+), 12 deletions(-)
>
> diff --git a/pbs-api-types/src/jobs.rs b/pbs-api-types/src/jobs.rs
> index 868702bc059e..58f739ad00b5 100644
> --- a/pbs-api-types/src/jobs.rs
> +++ b/pbs-api-types/src/jobs.rs
> @@ -498,6 +498,10 @@ pub const TRANSFER_LAST_SCHEMA: Schema =
>          .minimum(1)
>          .schema();
>
> +pub const RESYNC_CORRUPT_SCHEMA: Schema =
> +    BooleanSchema::new("If the verification failed for a local snapshot, try to pull it again.")
> +        .schema();
> +
>  #[api(
>      properties: {
>          id: {
> @@ -552,6 +556,10 @@ pub const TRANSFER_LAST_SCHEMA: Schema =
>              schema: TRANSFER_LAST_SCHEMA,
>              optional: true,
>          },
> +        "resync-corrupt": {
> +            schema: RESYNC_CORRUPT_SCHEMA,
> +            optional: true,
> +        }
>      }
>  )]
>  #[derive(Serialize, Deserialize, Clone, Updater, PartialEq)]
> @@ -585,6 +593,8 @@ pub struct SyncJobConfig {
>      pub limit: RateLimitConfig,
>      #[serde(skip_serializing_if = "Option::is_none")]
>      pub transfer_last: Option,
> +    #[serde(skip_serializing_if = "Option::is_none")]
> +    pub resync_corrupt: Option,
>  }
>
>  impl SyncJobConfig {
> diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
> index 414ec878d01a..c86fbb7568ab 100644
> --- a/pbs-datastore/src/backup_info.rs
> +++ b/pbs-datastore/src/backup_info.rs
> @@ -8,7 +8,8 @@ use anyhow::{bail, format_err, Error};
>  use proxmox_sys::fs::{lock_dir_noblock, replace_file, CreateOptions};
>
>  use pbs_api_types::{
> -    Authid, BackupNamespace, BackupType, GroupFilter, BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
> +    Authid, BackupNamespace, BackupType, GroupFilter, SnapshotVerifyState, VerifyState,
> +    BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
>  };
>  use pbs_config::{open_backup_lockfile, BackupLockGuard};
>
> @@ -583,6 +584,16 @@ impl BackupDir {
>
>          Ok(())
>      }
> +
> +    /// Load the verify state from the manifest.
> +    pub fn verify_state(&self) -> Result {
> +        self.load_manifest().and_then(|(m, _)| {
> +            let verify = m.unprotected["verify_state"].clone();
> +            serde_json::from_value::(verify)
> +                .map(|svs| svs.state)
> +                .map_err(Into::into)
> +        })
> +    }

wouldn't it make more sense to have this as a getter for an optional
SnapshotVerifyState on the BackupManifest?

then it could go into its own commit, other call sites that load the
verify state from a manifest could be adapted to it, and then this
commit can also start using it?

also see the comment further below about how the current implementation
is very noisy if snapshots are newly synced as opposed to resynced..
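something like this, maybe (rough, untested sketch - the helper name and
the Option-in-Result shape are just a suggestion, not a drop-in patch):

    // pbs-datastore/src/manifest.rs (sketch)
    use anyhow::Error;
    use pbs_api_types::SnapshotVerifyState;

    impl BackupManifest {
        /// Verify state stored in the manifest's unprotected section,
        /// or None if this snapshot has never been verified.
        pub fn verify_state(&self) -> Result<Option<SnapshotVerifyState>, Error> {
            let verify = self.unprotected["verify_state"].clone();
            if verify.is_null() {
                // never verified -> not an error, just no state
                return Ok(None);
            }
            Ok(Some(serde_json::from_value(verify)?))
        }
    }

the BackupDir helper above could then just forward to this getter after
load_manifest(), and callers can distinguish "never verified" from an
actual load/deserialization error.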
>  }
>
>  impl AsRef for BackupDir {
> diff --git a/src/api2/config/sync.rs b/src/api2/config/sync.rs
> index 6fdc69a9e645..fa9db92f3d11 100644
> --- a/src/api2/config/sync.rs
> +++ b/src/api2/config/sync.rs
> @@ -368,6 +368,9 @@ pub fn update_sync_job(
>      if let Some(transfer_last) = update.transfer_last {
>          data.transfer_last = Some(transfer_last);
>      }
> +    if let Some(resync_corrupt) = update.resync_corrupt {
> +        data.resync_corrupt = Some(resync_corrupt);
> +    }
>
>      if update.limit.rate_in.is_some() {
>          data.limit.rate_in = update.limit.rate_in;
> @@ -527,6 +530,7 @@ acl:1:/remote/remote1/remotestore1:write@pbs:RemoteSyncOperator
>          ns: None,
>          owner: Some(write_auth_id.clone()),
>          comment: None,
> +        resync_corrupt: None,
>          remove_vanished: None,
>          max_depth: None,
>          group_filter: None,
> diff --git a/src/api2/pull.rs b/src/api2/pull.rs
> index e733c9839e3a..0d4be0e2d228 100644
> --- a/src/api2/pull.rs
> +++ b/src/api2/pull.rs
> @@ -10,7 +10,7 @@ use pbs_api_types::{
>      Authid, BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
>      GROUP_FILTER_LIST_SCHEMA, NS_MAX_DEPTH_REDUCED_SCHEMA, PRIV_DATASTORE_BACKUP,
>      PRIV_DATASTORE_PRUNE, PRIV_REMOTE_READ, REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA,
> -    TRANSFER_LAST_SCHEMA,
> +    RESYNC_CORRUPT_SCHEMA, TRANSFER_LAST_SCHEMA,
>  };
>  use pbs_config::CachedUserInfo;
>  use proxmox_human_byte::HumanByte;
> @@ -89,6 +89,7 @@ impl TryFrom<&SyncJobConfig> for PullParameters {
>              sync_job.group_filter.clone(),
>              sync_job.limit.clone(),
>              sync_job.transfer_last,
> +            sync_job.resync_corrupt,
>          )
>      }
>  }
> @@ -240,6 +241,10 @@ pub fn do_sync_job(
>                  schema: TRANSFER_LAST_SCHEMA,
>                  optional: true,
>              },
> +            "resync-corrupt": {
> +                schema: RESYNC_CORRUPT_SCHEMA,
> +                optional: true,
> +            },
>          },
>      },
>      access: {
> @@ -264,6 +269,7 @@ async fn pull(
>      group_filter: Option>,
>      limit: RateLimitConfig,
>      transfer_last: Option,
> +    resync_corrupt: Option,
>      rpcenv: &mut dyn RpcEnvironment,
>  ) -> Result {
>      let auth_id: Authid = rpcenv.get_auth_id().unwrap().parse()?;
> @@ -301,6 +307,7 @@ async fn pull(
>          group_filter,
>          limit,
>          transfer_last,
> +        resync_corrupt,
>      )?;
>
>      // fixme: set to_stdout to false?
> diff --git a/src/bin/proxmox-backup-manager.rs b/src/bin/proxmox-backup-manager.rs
> index 420e96665662..38a1cf0f5881 100644
> --- a/src/bin/proxmox-backup-manager.rs
> +++ b/src/bin/proxmox-backup-manager.rs
> @@ -14,8 +14,8 @@ use pbs_api_types::percent_encoding::percent_encode_component;
>  use pbs_api_types::{
>      BackupNamespace, GroupFilter, RateLimitConfig, SyncJobConfig, DATASTORE_SCHEMA,
>      GROUP_FILTER_LIST_SCHEMA, IGNORE_VERIFIED_BACKUPS_SCHEMA, NS_MAX_DEPTH_SCHEMA,
> -    REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, TRANSFER_LAST_SCHEMA, UPID_SCHEMA,
> -    VERIFICATION_OUTDATED_AFTER_SCHEMA,
> +    REMOTE_ID_SCHEMA, REMOVE_VANISHED_BACKUPS_SCHEMA, RESYNC_CORRUPT_SCHEMA, TRANSFER_LAST_SCHEMA,
> +    UPID_SCHEMA, VERIFICATION_OUTDATED_AFTER_SCHEMA,
>  };
>  use pbs_client::{display_task_log, view_task_result};
>  use pbs_config::sync;
> diff --git a/src/server/pull.rs b/src/server/pull.rs
> index 3117f7d2c960..b2dd15d9d6db 100644
> --- a/src/server/pull.rs
> +++ b/src/server/pull.rs
> @@ -7,12 +7,14 @@ use std::sync::{Arc, Mutex};
>  use std::time::SystemTime;
>
>  use anyhow::{bail, format_err, Error};
> +use nom::combinator::verify;

I think this snuck in ;)

>  use proxmox_human_byte::HumanByte;
>  use tracing::info;
>
>  use pbs_api_types::{
>      print_store_and_ns, Authid, BackupDir, BackupGroup, BackupNamespace, GroupFilter, Operation,
> -    RateLimitConfig, Remote, MAX_NAMESPACE_DEPTH, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_BACKUP,
> +    RateLimitConfig, Remote, VerifyState, MAX_NAMESPACE_DEPTH, PRIV_DATASTORE_AUDIT,
> +    PRIV_DATASTORE_BACKUP,
>  };
>  use pbs_client::BackupRepository;
>  use pbs_config::CachedUserInfo;
> @@ -55,6 +57,8 @@ pub(crate) struct PullParameters {
>      group_filter: Vec,
>      /// How many snapshots should be transferred at most (taking the newest N snapshots)
>      transfer_last: Option,
> +    /// Whether to re-sync corrupted snapshots
> +    resync_corrupt: bool,
>  }
>
>  impl PullParameters {
> @@ -72,12 +76,14 @@ impl PullParameters {
>          group_filter: Option>,
>          limit: RateLimitConfig,
>          transfer_last: Option,
> +        resync_corrupt: Option,
>      ) -> Result {
>          if let Some(max_depth) = max_depth {
>              ns.check_max_depth(max_depth)?;
>              remote_ns.check_max_depth(max_depth)?;
>          };
>          let remove_vanished = remove_vanished.unwrap_or(false);
> +        let resync_corrupt = resync_corrupt.unwrap_or(false);
>
>          let source: Arc = if let Some(remote) = remote {
>              let (remote_config, _digest) = pbs_config::remote::config()?;
> @@ -116,6 +122,7 @@ impl PullParameters {
>              max_depth,
>              group_filter,
>              transfer_last,
> +            resync_corrupt,
>          })
>      }
>  }
> @@ -175,9 +182,10 @@ async fn pull_index_chunks(
>              target.cond_touch_chunk(&info.digest, false)
>          })?;
>          if chunk_exists {
> -            //info!("chunk {} exists {}", pos, hex::encode(digest));
> +            //info!("chunk exists {}", hex::encode(info.digest));

this

>              return Ok::<_, Error>(());
>          }
> +

and this as well?

>          //info!("sync {} chunk {}", pos, hex::encode(digest));
>          let chunk = chunk_reader.read_raw_chunk(&info.digest).await?;
>          let raw_size = chunk.raw_size() as usize;
> @@ -325,13 +333,15 @@ async fn pull_single_archive<'a>(
>  /// - (Re)download the manifest
>  /// -- if it matches, only download log and treat snapshot as already synced
>  /// - Iterate over referenced files
> -/// -- if file already exists, verify contents
> +/// -- if file already exists, verify contents or pull again if last
> +///    verification failed and `resync_corrupt` is true
>  /// -- if not, pull it from the remote
>  /// - Download log if not already existing
>  async fn pull_snapshot<'a>(
>      reader: Arc,
>      snapshot: &'a pbs_datastore::BackupDir,
>      downloaded_chunks: Arc>>,
> +    resync_corrupt: bool,
>  ) -> Result {
>      let mut sync_stats = SyncStats::default();
>      let mut manifest_name = snapshot.full_path();
> @@ -352,6 +362,14 @@ async fn pull_snapshot<'a>(
>          return Ok(sync_stats);
>      }
>

I think this part here is somewhat wrong ordering-wise, or at least,
unnecessarily expensive.. if resync_corrupt is enabled, we want to (in
this order!)

- check the local snapshot for corruption, if it exists
- if it is corrupt, we proceed with resyncing
- if not, we only proceed with resyncing if it is the last snapshot in
  this group, and return early otherwise

that way, we avoid redownloading all the manifests (see the sketch right
below this list)..

but see further below for another issue with the current
implementation..
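i.e., roughly something like this (untested sketch - needs_resync() and
the is_last_in_group flag are made-up plumbing that pull_group() would
have to provide, and verify_state() is the hypothetical BackupManifest
getter suggested above):

    use pbs_api_types::VerifyState;
    use pbs_datastore::BackupDir;

    /// Decide purely from *local* state whether an existing snapshot
    /// should be (re-)synced, without fetching the remote manifest.
    fn needs_resync(
        snapshot: &BackupDir,
        resync_corrupt: bool,
        is_last_in_group: bool,
    ) -> bool {
        if !resync_corrupt {
            // unchanged behaviour: only the newest snapshot gets re-synced
            return is_last_in_group;
        }
        let corrupt = snapshot
            .load_manifest()
            .ok()
            .and_then(|(manifest, _)| manifest.verify_state().ok().flatten())
            .map(|svs| svs.state == VerifyState::Failed)
            .unwrap_or(false);
        // corrupt snapshots are always re-synced, everything else only if
        // it is the newest snapshot in the group
        corrupt || is_last_in_group
    }

pull_snapshot() (or pull_snapshot_from()) could call this before
re-downloading the manifest and return early with the stats collected so
far if it comes back false.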
> //info!("sync {} chunk {}", pos, hex::encode(digest)); > let chunk = chunk_reader.read_raw_chunk(&info.digest).await?; > let raw_size = chunk.raw_size() as usize; > @@ -325,13 +333,15 @@ async fn pull_single_archive<'a>( > /// - (Re)download the manifest > /// -- if it matches, only download log and treat snapshot as already synced > /// - Iterate over referenced files > -/// -- if file already exists, verify contents > +/// -- if file already exists, verify contents or pull again if last > +/// verification failed and `resync_corrupt` is true > /// -- if not, pull it from the remote > /// - Download log if not already existing > async fn pull_snapshot<'a>( > reader: Arc, > snapshot: &'a pbs_datastore::BackupDir, > downloaded_chunks: Arc>>, > + resync_corrupt: bool, > ) -> Result { > let mut sync_stats = SyncStats::default(); > let mut manifest_name = snapshot.full_path(); > @@ -352,6 +362,14 @@ async fn pull_snapshot<'a>( > return Ok(sync_stats); > } > I think this part here is somewhat wrong ordering wise, or at least, unnecessarily expensive.. if resync_corrupt is enabled, we want to (in this order!) - check the local snapshot for corruption, if it exists - if it is corrupt, we proceed with resyncing - if not, we only proceed with resyncing if it is the last snapshot in this group, and return early otherwise that way, we avoid redownloading all the manifests.. but see further below for another issue with the current implementation.. > + let must_resync_existing = resync_corrupt > + && snapshot > + .verify_state() > + .inspect_err(|err| { > + tracing::error!("Failed to check verification state of snapshot: {err:?}") 2024-11-04T12:34:57+01:00: Failed to check verification state of snapshot: unable to load blob '"/tank/pbs/ns/foobar/ns/test/ns/another_test/vm/900/2023-04-06T14:36:00Z/index.json.blob"' - No such file or directory (os error 2) this seems to be very noisy for newly synced snapshots, because the helper is implemented on BackupInfo instead of on BackupManifest.. > + }) > + .is_ok_and(|state| state == VerifyState::Failed); > + > if manifest_name.exists() { > let manifest_blob = proxmox_lang::try_block!({ > let mut manifest_file = std::fs::File::open(&manifest_name).map_err(|err| { > @@ -365,7 +383,7 @@ async fn pull_snapshot<'a>( > format_err!("unable to read local manifest {manifest_name:?} - {err}") > })?; > > - if manifest_blob.raw_data() == tmp_manifest_blob.raw_data() { > + if manifest_blob.raw_data() == tmp_manifest_blob.raw_data() && !must_resync_existing { > if !client_log_name.exists() { > reader.try_download_client_log(&client_log_name).await?; > }; > @@ -377,11 +395,18 @@ async fn pull_snapshot<'a>( > > let manifest = BackupManifest::try_from(tmp_manifest_blob)?; > > + if must_resync_existing { > + info!( > + "re-syncing snapshot {} due to bad verification result", > + snapshot.dir() > + ); > + } > + > for item in manifest.files() { > let mut path = snapshot.full_path(); > path.push(&item.filename); > > - if path.exists() { > + if !must_resync_existing && path.exists() { > match ArchiveType::from_path(&item.filename)? 
>          if last_sync_time > dir.time {
>              already_synced_skip_info.update(dir.time);
>              return false;
>          }
> @@ -566,7 +596,13 @@ async fn pull_group(
>              .source
>              .reader(source_namespace, &from_snapshot)
>              .await?;
> -        let result = pull_snapshot_from(reader, &to_snapshot, downloaded_chunks.clone()).await;
> +        let result = pull_snapshot_from(
> +            reader,
> +            &to_snapshot,
> +            downloaded_chunks.clone(),
> +            params.resync_corrupt,
> +        )
> +        .await;
>
>          progress.done_snapshots = pos as u64 + 1;
>          info!("percentage done: {progress}");
> -- 
> 2.39.5

_______________________________________________
pbs-devel mailing list
pbs-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel