Date: Tue, 02 Aug 2022 12:07:57 +0200
From: Fabian Grünbichler
To: Proxmox Backup Server development discussion <pbs-devel@lists.proxmox.com>
Subject: Re: [pbs-devel] [RFC PATCH 2/4] fix #3786: server/datastore: add deep sync parameter to pull sync jobs
Message-Id: <1659431993.7yfmdrmz5o.astroid@nora.none>
In-Reply-To: <20220615082040.96959-3-s.sterz@proxmox.com>
References: <20220615082040.96959-1-s.sterz@proxmox.com> <20220615082040.96959-3-s.sterz@proxmox.com>

On June 15, 2022 10:20 am, Stefan Sterz wrote:
> Signed-off-by: Stefan Sterz
> ---
>  pbs-datastore/src/backup_info.rs | 22 +++++++++++++++++++++-
>  src/server/pull.rs               | 28 ++++++++++++++++++----------
>  2 files changed, 39 insertions(+), 11 deletions(-)
>
> diff --git a/pbs-datastore/src/backup_info.rs b/pbs-datastore/src/backup_info.rs
> index 10320a35..89461c66 100644
> --- a/pbs-datastore/src/backup_info.rs
> +++ b/pbs-datastore/src/backup_info.rs
> @@ -9,7 +9,8 @@ use anyhow::{bail, format_err, Error};
>  use proxmox_sys::fs::{lock_dir_noblock, replace_file, CreateOptions};
>
>  use pbs_api_types::{
> -    Authid, BackupNamespace, BackupType, GroupFilter, BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
> +    Authid, BackupNamespace, BackupType, GroupFilter, SnapshotVerifyState, VerifyState,
> +    BACKUP_DATE_REGEX, BACKUP_FILE_REGEX,
>  };
>  use pbs_config::{open_backup_lockfile, BackupLockGuard};
>
> @@ -544,6 +545,25 @@ impl BackupDir {
>
>          Ok(())
>      }
> +
> +    /// Returns true if the last verification of the snapshot failed and false otherwise.
> +    ///
> +    /// Note that a snapshot that has not been verified will also return false.
> +    pub fn is_corrupt(&self) -> bool {
> +        let mut to_return = false;
> +
> +        let _ = self.update_manifest(|m| {
> +            let verify = m.unprotected["verify_state"].clone();
> +
> +            if let Ok(verify) = serde_json::from_value::<SnapshotVerifyState>(verify) {
> +                if verify.state == VerifyState::Failed {
> +                    to_return = true;
> +                }
> +            }
> +        });
> +
> +        to_return
> +    }

I am not sure whether this is the right place for this helper (mainly
because of the assumption that something named "is_corrupt" in such a
high-level place is allowed to return "false" in the error path..)

maybe move it to the pull code as an internal helper there for now?

alternatively, something like this might be more appropriate if we think
we'll add more places where we just want to base actions on "definitely
corrupt/ok/.. last time we checked":

pub fn verify_state(&self) -> Option<VerifyState> {
    self.load_manifest().ok().and_then(|(m, _)| {
        let verify = m.unprotected["verify_state"].clone();
        serde_json::from_value::<SnapshotVerifyState>(verify).ok().map(|svs| svs.state)
    })
}

or the same slightly adapted and wrapped in Result if we want to handle
no verify state differently from broken verify state/failure to load
manifest.

no locking just for reading, shorter code, more widely-usable return
value => pull code can then still map that so everything except
Some(Failed) means it's okay ;)

or, given that the current patch only checks this when a manifest is
already available, it could also move to the manifest to remove the
outer layer of error-source ;) but see below for other potential changes
that might make this nil again.
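
to illustrate that Option<VerifyState> variant: the pull code could wrap
the mapping in a tiny helper and everything else only deals with a plain
bool (untested sketch, `needs_resync` is a made-up name, not part of
this patch):

// sketch only: true if we must re-sync an existing snapshot because its
// last recorded verification failed. everything except Some(Failed),
// including a missing or unreadable verify_state, counts as "ok".
fn needs_resync(params: &PullParameters, snapshot: &pbs_datastore::BackupDir) -> bool {
    params.deep_sync && matches!(snapshot.verify_state(), Some(VerifyState::Failed))
}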
> }
>
> impl AsRef for BackupDir {
> diff --git a/src/server/pull.rs b/src/server/pull.rs
> index 6778c66b..767b394c 100644
> --- a/src/server/pull.rs
> +++ b/src/server/pull.rs
> @@ -57,7 +57,7 @@ pub struct PullParameters {
>      /// How many levels of sub-namespaces to pull (0 == no recursion, None == maximum recursion)
>      max_depth: Option,
>      /// Whether to re-sync corrupted snapshots
> -    _deep_sync: bool,
> +    deep_sync: bool,
>      /// Filters for reducing the pull scope
>      group_filter: Option>,
>      /// Rate limits for all transfers from `remote`
> @@ -111,7 +111,7 @@ impl PullParameters {
>              owner,
>              remove_vanished,
>              max_depth,
> -            _deep_sync: deep_sync,
> +            deep_sync,
>              group_filter,
>              limit,
>          })
> @@ -371,6 +371,7 @@ async fn pull_snapshot(
>      worker: &WorkerTask,
>      reader: Arc,
>      snapshot: &pbs_datastore::BackupDir,
> +    params: &PullParameters,
>      downloaded_chunks: Arc>>,

this could just be a `force` or `force-corrupt` boolean? (either meaning
"force re-sync if corrupt" if the check stays here, or "force re-sync
because corrupt" if the check moves up a layer)

>  ) -> Result<(), Error> {
>      let mut manifest_name = snapshot.full_path();
> @@ -437,7 +438,10 @@ async fn pull_snapshot(
>          let mut path = snapshot.full_path();
>          path.push(&item.filename);
>
> -        if path.exists() {
> +        // if a snapshot could not be verified, the index file will stay the same, but it'll point
> +        // to at least one corrupted chunk. hence, skip this check if the last verification job
> +        // failed and we are running a deep sync.
> +        if !(params.deep_sync && snapshot.is_corrupt()) && path.exists() {

this decision could be made in pull_snapshot_from (only for already
existing snapshots), setting force accordingly. or it could stay here,
and be called on the manifest directly. in any case it only needs to be
called once, not once for each referenced file.
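
e.g., if it stays in pull_snapshot, something along these lines would
evaluate it once per snapshot instead of once per referenced file
(rough, untested sketch, loop body heavily trimmed):

// hoisted out of the per-file loop; could equally be derived from the
// already-loaded manifest instead of going through the snapshot again
let resync_corrupt = params.deep_sync && snapshot.is_corrupt();

for item in manifest.files() {
    let mut path = snapshot.full_path();
    path.push(&item.filename);

    // only short-circuit on already existing files if we are not re-syncing
    if !resync_corrupt && path.exists() {
        // ... existing "file already there and unchanged" handling ...
    }
    // ... download and verify the file as before ...
}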
>              match archive_type(&item.filename)? {
>                  ArchiveType::DynamicIndex => {
>                      let index = DynamicIndexReader::open(&path)?;
> @@ -513,6 +517,7 @@ async fn pull_snapshot_from(
>      worker: &WorkerTask,
>      reader: Arc,
>      snapshot: &pbs_datastore::BackupDir,
> +    params: &PullParameters,
>      downloaded_chunks: Arc>>,
>  ) -> Result<(), Error> {
>      let (_path, is_new, _snap_lock) = snapshot
> @@ -522,7 +527,7 @@ async fn pull_snapshot_from(
>      if is_new {
>          task_log!(worker, "sync snapshot {}", snapshot.dir());
>
> -        if let Err(err) = pull_snapshot(worker, reader, snapshot, downloaded_chunks).await {
> +        if let Err(err) = pull_snapshot(worker, reader, snapshot, params, downloaded_chunks).await {

this would then pass false, since a new snapshot would never need to be
force-resynced ;)

>              if let Err(cleanup_err) = snapshot.datastore().remove_backup_dir(
>                  snapshot.backup_ns(),
>                  snapshot.as_ref(),
> @@ -535,7 +540,7 @@
>          task_log!(worker, "sync snapshot {} done", snapshot.dir());
>      } else {
>          task_log!(worker, "re-sync snapshot {}", snapshot.dir());
> -        pull_snapshot(worker, reader, snapshot, downloaded_chunks).await?;
> +        pull_snapshot(worker, reader, snapshot, params, downloaded_chunks).await?;

here we could decide whether to force-resync based on verify state and
deep_sync flag, or just pass the deep_sync flag and leave the corruption
check in pull_snapshot (see the sketch at the very end of this mail).

>          task_log!(worker, "re-sync snapshot {} done", snapshot.dir());
>      }
>
> @@ -666,10 +671,12 @@ async fn pull_group(
>
>          remote_snapshots.insert(snapshot.time);
>
> -        if let Some(last_sync_time) = last_sync {
> -            if last_sync_time > snapshot.time {
> -                skip_info.update(snapshot.time);
> -                continue;
> +        if !params.deep_sync {

and this here would be where we could handle the second "re-sync missing
snapshots" part, if we want to ;) it would likely require pulling up the
snapshot lock somewhere here, and passing is_new to pull_snapshot_from?

> +            if let Some(last_sync_time) = last_sync {
> +                if last_sync_time > snapshot.time {
> +                    skip_info.update(snapshot.time);
> +                    continue;
> +                }
>              }
>          }
>
> @@ -699,7 +706,8 @@ async fn pull_group(
>
>          let snapshot = params.store.backup_dir(target_ns.clone(), snapshot)?;
>
> -        let result = pull_snapshot_from(worker, reader, &snapshot, downloaded_chunks.clone()).await;
> +        let result =
> +            pull_snapshot_from(worker, reader, &snapshot, params, downloaded_chunks.clone()).await;
>
>          progress.done_snapshots = pos as u64 + 1;
>          task_log!(worker, "percentage done: {}", progress);
> --
> 2.30.2
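
and just to make the "decide in pull_snapshot_from" variant from above
concrete - a very rough, untested sketch (error cleanup in the is_new
branch omitted), with pull_snapshot() taking the suggested force-corrupt
bool instead of the whole PullParameters:

if is_new {
    task_log!(worker, "sync snapshot {}", snapshot.dir());
    // a snapshot that does not exist locally yet cannot be locally corrupt,
    // so there is nothing to force here
    pull_snapshot(worker, reader, snapshot, false, downloaded_chunks).await?;
} else {
    task_log!(worker, "re-sync snapshot {}", snapshot.dir());
    // only force a full re-sync if deep_sync is set and the last verification
    // of the local snapshot failed
    let force_corrupt = params.deep_sync && snapshot.is_corrupt();
    pull_snapshot(worker, reader, snapshot, force_corrupt, downloaded_chunks).await?;
}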