From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <dietmar@proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id 9795760174
 for <pbs-devel@lists.proxmox.com>; Fri, 16 Oct 2020 08:34:14 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id 83F8C17F05
 for <pbs-devel@lists.proxmox.com>; Fri, 16 Oct 2020 08:34:14 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com
 [212.186.127.180])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS id 65D0717EF7
 for <pbs-devel@lists.proxmox.com>; Fri, 16 Oct 2020 08:34:13 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1])
 by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 292C945D2D
 for <pbs-devel@lists.proxmox.com>; Fri, 16 Oct 2020 08:34:13 +0200 (CEST)
Date: Fri, 16 Oct 2020 08:33:54 +0200 (CEST)
From: Dietmar Maurer <dietmar@proxmox.com>
To: Proxmox Backup Server development discussion <pbs-devel@lists.proxmox.com>, 
 Stefan Reiter <s.reiter@proxmox.com>
Message-ID: <1528426253.279.1602830035538@webmail.proxmox.com>
In-Reply-To: <20201015104916.21170-3-s.reiter@proxmox.com>
References: <20201015104916.21170-1-s.reiter@proxmox.com>
 <20201015104916.21170-3-s.reiter@proxmox.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 7bit
X-Priority: 3
Importance: Normal
X-Mailer: Open-Xchange Mailer v7.10.4-Rev11
X-Originating-Client: open-xchange-appsuite
X-SPAM-LEVEL: Spam detection results:  0
 AWL 0.064 Adjusted score from AWL reputation of From: address
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 RCVD_IN_DNSWL_MED        -2.3 Sender listed at https://www.dnswl.org/,
 medium trust
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
 URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See
 http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more
 information. [manifest.rs, verify.rs, proxmox.com, datastore.rs,
 environment.rs]
Subject: Re: [pbs-devel] [PATCH v2 proxmox-backup 2/4] datastore: add
 manifest locking
X-BeenThere: pbs-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Backup Server development discussion
 <pbs-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pbs-devel/>
List-Post: <mailto:pbs-devel@lists.proxmox.com>
List-Help: <mailto:pbs-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Fri, 16 Oct 2020 06:34:14 -0000

comments inline:

> On 10/15/2020 12:49 PM Stefan Reiter <s.reiter@proxmox.com> wrote:
> 
>  
> Avoid races when updating manifest data by flocking a lock file.
> update_manifest is used to ensure updates always happen with the lock
> held.
> 
> Snapshot deletion also acquires the lock, so it cannot interfere with an
> outstanding manifest write.
> 
> Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
> ---
> 
> v2:
> * Use separate manifest lock file
> * Change store_manifest to update_manifest to force consumers to use correct
>   locking
> * Don't hold lock across verify, reload manifest at the end
> * Update comments
> 
>  src/api2/admin/datastore.rs    |  8 +++----
>  src/api2/backup/environment.rs | 13 ++++------
>  src/backup/datastore.rs        | 43 +++++++++++++++++++++++++++++-----
>  src/backup/manifest.rs         |  1 +
>  src/backup/verify.rs           |  9 +++----
>  5 files changed, 50 insertions(+), 24 deletions(-)
> 
> diff --git a/src/api2/admin/datastore.rs b/src/api2/admin/datastore.rs
> index de39cd1b..4f15c1cd 100644
> --- a/src/api2/admin/datastore.rs
> +++ b/src/api2/admin/datastore.rs
> @@ -1481,11 +1481,9 @@ fn set_notes(
>      let allowed = (user_privs & PRIV_DATASTORE_READ) != 0;
>      if !allowed { check_backup_owner(&datastore, backup_dir.group(), &userid)?; }
>  
> -    let (mut manifest, _) = datastore.load_manifest(&backup_dir)?;
> -
> -    manifest.unprotected["notes"] = notes.into();
> -
> -    datastore.store_manifest(&backup_dir, manifest)?;
> +    datastore.update_manifest(&backup_dir,|manifest| {
> +        manifest.unprotected["notes"] = notes.into();
> +    }).map_err(|err| format_err!("unable to update manifest blob - {}", err))?;
>  
>      Ok(())
>  }
> diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
> index f00c2cd3..c4f166df 100644
> --- a/src/api2/backup/environment.rs
> +++ b/src/api2/backup/environment.rs
> @@ -472,16 +472,11 @@ impl BackupEnvironment {
>              bail!("backup does not contain valid files (file count == 0)");
>          }
>  
> -        // check manifest
> -        let (mut manifest, _) = self.datastore.load_manifest(&self.backup_dir)
> -            .map_err(|err| format_err!("unable to load manifest blob - {}", err))?;
> -
> +        // check for valid manifest and store stats
>          let stats = serde_json::to_value(state.backup_stat)?;
> -
> -        manifest.unprotected["chunk_upload_stats"] = stats;
> -
> -        self.datastore.store_manifest(&self.backup_dir, manifest)
> -            .map_err(|err| format_err!("unable to store manifest blob - {}", err))?;
> +        self.datastore.update_manifest(&self.backup_dir, |manifest| {
> +            manifest.unprotected["chunk_upload_stats"] = stats;
> +        }).map_err(|err| format_err!("unable to update manifest blob - {}", err))?;
>  
>          if let Some(base) = &self.last_backup {
>              let path = self.datastore.snapshot_path(&base.backup_dir);
> diff --git a/src/backup/datastore.rs b/src/backup/datastore.rs
> index ca8ca438..7b2bee7e 100644
> --- a/src/backup/datastore.rs
> +++ b/src/backup/datastore.rs
> @@ -3,17 +3,19 @@ use std::io::{self, Write};
>  use std::path::{Path, PathBuf};
>  use std::sync::{Arc, Mutex};
>  use std::convert::TryFrom;
> +use std::time::Duration;
> +use std::fs::File;
>  
>  use anyhow::{bail, format_err, Error};
>  use lazy_static::lazy_static;
>  
> -use proxmox::tools::fs::{replace_file, CreateOptions};
> +use proxmox::tools::fs::{replace_file, CreateOptions, open_file_locked};
>  
>  use super::backup_info::{BackupGroup, BackupDir};
>  use super::chunk_store::ChunkStore;
>  use super::dynamic_index::{DynamicIndexReader, DynamicIndexWriter};
>  use super::fixed_index::{FixedIndexReader, FixedIndexWriter};
> -use super::manifest::{MANIFEST_BLOB_NAME, CLIENT_LOG_BLOB_NAME, BackupManifest};
> +use super::manifest::{MANIFEST_BLOB_NAME, MANIFEST_LOCK_NAME, CLIENT_LOG_BLOB_NAME, BackupManifest};
>  use super::index::*;
>  use super::{DataBlob, ArchiveType, archive_type};
>  use crate::config::datastore;
> @@ -235,9 +237,10 @@ impl DataStore {
>  
>          let full_path = self.snapshot_path(backup_dir);
>  
> -        let _guard;
> +        let (_guard, _manifest_guard);
>          if !force {
>              _guard = lock_dir_noblock(&full_path, "snapshot", "possibly running or in use")?;
> +            _manifest_guard = self.lock_manifest(backup_dir);

I think this is unnecessary. A manifest update (update_manifest) should not block remove_backup_dir.
What exactly is this lock needed for here?

>          }
>  
>          // Acquire lock and keep it during remove operation, so there's no
> @@ -665,8 +668,27 @@ impl DataStore {
>              digest_str,
>              err,
>          ))
> -     }
> +    }
>  
> +    fn lock_manifest(
> +        &self,
> +        backup_dir: &BackupDir,
> +    ) -> Result<File, Error> {
> +        let mut path = self.base_path();
> +        path.push(backup_dir.relative_path());
> +        path.push(&MANIFEST_LOCK_NAME);
> +
> +        // update_manifest should never take a long time, so if someone else has
> +        // the lock we can simply block a bit and should get it soon
> +        open_file_locked(&path, Duration::from_secs(5), true)
> +            .map_err(|err| {
> +                format_err!(
> +                    "unable to acquire manifest lock {:?} - {}", &path, err
> +                )
> +            })
> +    }
> +
> +    /// Load the manifest without a lock. Must not be written back.
>      pub fn load_manifest(
>          &self,
>          backup_dir: &BackupDir,
> @@ -677,11 +699,19 @@ impl DataStore {
>          Ok((manifest, raw_size))
>      }
>  
> -    pub fn store_manifest(
> +    /// Update the manifest of the specified snapshot. Never write a manifest directly,
> +    /// only use this method - anything else may break locking guarantees.
> +    pub fn update_manifest(
>          &self,
>          backup_dir: &BackupDir,
> -        manifest: BackupManifest,
> +        update_fn: impl FnOnce(&mut BackupManifest),
>      ) -> Result<(), Error> {

It should not be possible to update anything outside the "unprotected" property.

> +
> +        let _guard = self.lock_manifest(backup_dir)?;
> +        let (mut manifest, _) = self.load_manifest(&backup_dir)?;
> +
> +        update_fn(&mut manifest);
> +
>          let manifest = serde_json::to_value(manifest)?;
>          let manifest = serde_json::to_string_pretty(&manifest)?;
>          let blob = DataBlob::encode(manifest.as_bytes(), None, true)?;
> @@ -691,6 +721,7 @@ impl DataStore {
>          path.push(backup_dir.relative_path());
>          path.push(MANIFEST_BLOB_NAME);
>  
> +        // atomic replace invalidates flock - no other writes past this point!
>          replace_file(&path, raw_data, CreateOptions::new())?;
>  
>          Ok(())
> diff --git a/src/backup/manifest.rs b/src/backup/manifest.rs
> index 609cc998..51980a07 100644
> --- a/src/backup/manifest.rs
> +++ b/src/backup/manifest.rs
> @@ -8,6 +8,7 @@ use ::serde::{Deserialize, Serialize};
>  use crate::backup::{BackupDir, CryptMode, CryptConfig};
>  
>  pub const MANIFEST_BLOB_NAME: &str = "index.json.blob";
> +pub const MANIFEST_LOCK_NAME: &str = ".index.json.lck";
>  pub const CLIENT_LOG_BLOB_NAME: &str = "client.log.blob";
>  
>  mod hex_csum {
> diff --git a/src/backup/verify.rs b/src/backup/verify.rs
> index 05b6ba86..ea3fa760 100644
> --- a/src/backup/verify.rs
> +++ b/src/backup/verify.rs
> @@ -300,7 +300,7 @@ pub fn verify_backup_dir(
>          return Ok(true);
>      }
>  
> -    let mut manifest = match datastore.load_manifest(&backup_dir) {
> +    let manifest = match datastore.load_manifest(&backup_dir) {
>          Ok((manifest, _)) => manifest,
>          Err(err) => {
>              task_log!(
> @@ -367,9 +367,10 @@ pub fn verify_backup_dir(
>          state: verify_result,
>          upid,
>      };
> -    manifest.unprotected["verify_state"] = serde_json::to_value(verify_state)?;
> -    datastore.store_manifest(&backup_dir, manifest)
> -        .map_err(|err| format_err!("unable to store manifest blob - {}", err))?;
> +    let verify_state = serde_json::to_value(verify_state)?;
> +    datastore.update_manifest(&backup_dir, |manifest| {
> +        manifest.unprotected["verify_state"] = verify_state;
> +    }).map_err(|err| format_err!("unable to update manifest blob - {}", err))?;
>  
>      Ok(error_count == 0)
>  }
> -- 
> 2.20.1
> 
> 
> 
> _______________________________________________
> pbs-devel mailing list
> pbs-devel@lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel