From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <d.csapak@proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id 1563471599
 for <pbs-devel@lists.proxmox.com>; Wed, 18 May 2022 13:25:29 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id 011502BC93
 for <pbs-devel@lists.proxmox.com>; Wed, 18 May 2022 13:24:59 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com
 [94.136.29.106])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256)
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS id 22E412BC88
 for <pbs-devel@lists.proxmox.com>; Wed, 18 May 2022 13:24:57 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1])
 by proxmox-new.maurer-it.com (Proxmox) with ESMTP id ECB9042A34
 for <pbs-devel@lists.proxmox.com>; Wed, 18 May 2022 13:24:56 +0200 (CEST)
From: Dominik Csapak <d.csapak@proxmox.com>
To: pbs-devel@lists.proxmox.com
Date: Wed, 18 May 2022 13:24:55 +0200
Message-Id: <20220518112455.2393668-1-d.csapak@proxmox.com>
X-Mailer: git-send-email 2.30.2
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement
 consistency tuning for datastores
X-BeenThere: pbs-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox Backup Server development discussion
 <pbs-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pbs-devel/>
List-Post: <mailto:pbs-devel@lists.proxmox.com>
List-Help: <mailto:pbs-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel>, 
 <mailto:pbs-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Wed, 18 May 2022 11:25:29 -0000

currently, we don't (f)sync on chunk insertion (or at any point after
that), which can lead to broken chunks in case of e.g. an unexpected
power loss. To fix that, offer a tuning option for datastores that
controls the level of syncing it does:

* None (old default): same as the current state, no (f)syncs done at any point
* Filesystem (new default): at the end of a backup, the datastore issues
  a syncfs(2) on the filesystem of the datastore
* File: issues an fsync(2) on each chunk as it gets inserted
  (using our 'replace_file' helper); see the sketch below
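
for context, a minimal standalone sketch of the two primitives behind the
'file' and 'filesystem' modes (plain std + libc; the actual patch goes
through proxmox_sys::fs::replace_file and DataStore::syncfs instead, so
this is only an illustration):

use std::fs::File;
use std::os::unix::io::AsRawFd;

// 'file' mode: make each chunk file durable as soon as it is written.
// File::sync_all() wraps fsync(2) on Linux.
fn sync_chunk(file: &File) -> std::io::Result<()> {
    file.sync_all()
}

// 'filesystem' mode: flush the whole filesystem containing the
// datastore once, e.g. at the end of a backup. syncfs(2) accepts any
// fd on that filesystem, here an open handle on the base directory.
fn sync_datastore_fs(base_dir: &File) -> std::io::Result<()> {
    // SAFETY: syncfs(2) only reads the fd, no memory safety concerns
    if unsafe { libc::syncfs(base_dir.as_raw_fd()) } < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}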

a small benchmark showed the following (times in mm:ss):
setup: virtual PBS, 4 cores, 8GiB memory, ext4 on a spinning disk

size                 none    filesystem  file
2GiB (fits in RAM)   00:13   00:41       01:00
33GiB                05:21   05:31       13:45

so if the backup fits into memory, there is a large difference between
all modes (as expected), but as soon as the data exceeds the memory
size, the gap between not syncing and syncing the filesystem at the end
becomes much smaller.

I also tested on an NVMe, but there the syncs made basically no difference

Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
it would be nice if somebody else could try to reproduce the benchmarks
on different setups, to verify (or disprove) my findings
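
for reference, a sketch of how the new option would be parsed from the
datastore's tuning property string (this mirrors the parsing pattern in
the patch below; the concrete spelling 'consistency=file' is my reading
of the serde/api attributes, so treat the key and values as assumptions):

use anyhow::Error;
use pbs_api_types::{Consistency, DatastoreTuning};

fn parse_tuning(tuning: Option<&str>) -> Result<DatastoreTuning, Error> {
    // same pattern as in datastore.rs: parse the property string
    // against the api schema, then deserialize into the struct
    let value = DatastoreTuning::API_SCHEMA
        .parse_property_string(tuning.unwrap_or(""))?;
    Ok(serde_json::from_value(value)?)
}

fn main() -> Result<(), Error> {
    let tuning = parse_tuning(Some("chunk-order=inode,consistency=file"))?;
    assert_eq!(tuning.consistency, Some(Consistency::File));
    Ok(())
}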

 pbs-api-types/src/datastore.rs   | 14 +++++++++++
 pbs-datastore/src/chunk_store.rs | 34 ++++++++++----------------
 pbs-datastore/src/datastore.rs   | 42 +++++++++++++++++++++++++++++---
 src/api2/backup/environment.rs   |  2 ++
 src/api2/config/datastore.rs     |  9 +++++--
 5 files changed, 75 insertions(+), 26 deletions(-)

diff --git a/pbs-api-types/src/datastore.rs b/pbs-api-types/src/datastore.rs
index e2bf70aa..2536b8ff 100644
--- a/pbs-api-types/src/datastore.rs
+++ b/pbs-api-types/src/datastore.rs
@@ -214,6 +214,19 @@ pub enum ChunkOrder {
     Inode,
 }
 
+#[api]
+#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+/// The level of consistency to ensure (Default = "filesystem")
+pub enum Consistency {
+    /// No special consistency operation to sync the fs or files to disk
+    None,
+    /// Each file (e.g. chunks) will be fsync'd to the disk after writing.
+    File,
+    /// At the end of some operations (e.g. Backup), the underlying filesystem will be synced.
+    Filesystem,
+}
+
 #[api(
     properties: {
         "chunk-order": {
@@ -228,6 +241,7 @@ pub enum ChunkOrder {
 pub struct DatastoreTuning {
     /// Iterate chunks in this order
     pub chunk_order: Option<ChunkOrder>,
+    pub consistency: Option<Consistency>,
 }
 
 pub const DATASTORE_TUNING_STRING_SCHEMA: Schema = StringSchema::new("Datastore tuning options")
diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
index 5bfe9fac..58541c0f 100644
--- a/pbs-datastore/src/chunk_store.rs
+++ b/pbs-datastore/src/chunk_store.rs
@@ -1,4 +1,3 @@
-use std::io::Write;
 use std::os::unix::io::AsRawFd;
 use std::path::{Path, PathBuf};
 use std::sync::{Arc, Mutex};
@@ -22,6 +21,7 @@ pub struct ChunkStore {
     chunk_dir: PathBuf,
     mutex: Mutex<()>,
     locker: Option<Arc<Mutex<ProcessLocker>>>,
+    sync_chunks: bool,
 }
 
 // TODO: what about sysctl setting vm.vfs_cache_pressure (0 - 100) ?
@@ -68,6 +68,7 @@ impl ChunkStore {
             chunk_dir: PathBuf::new(),
             mutex: Mutex::new(()),
             locker: None,
+            sync_chunks: false,
         }
     }
 
@@ -88,6 +89,7 @@ impl ChunkStore {
         uid: nix::unistd::Uid,
         gid: nix::unistd::Gid,
         worker: Option<&dyn WorkerTaskContext>,
+        sync_chunks: bool,
     ) -> Result<Self, Error>
     where
         P: Into<PathBuf>,
@@ -144,7 +146,7 @@ impl ChunkStore {
             }
         }
 
-        Self::open(name, base)
+        Self::open(name, base, sync_chunks)
     }
 
     fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
@@ -153,7 +155,7 @@ impl ChunkStore {
         lockfile_path
     }
 
-    pub fn open<P: Into<PathBuf>>(name: &str, base: P) -> Result<Self, Error> {
+    pub fn open<P: Into<PathBuf>>(name: &str, base: P, sync_chunks: bool) -> Result<Self, Error> {
         let base: PathBuf = base.into();
 
         if !base.is_absolute() {
@@ -176,6 +178,7 @@ impl ChunkStore {
             chunk_dir,
             locker: Some(locker),
             mutex: Mutex::new(()),
+            sync_chunks,
         })
     }
 
@@ -461,21 +464,10 @@ impl ChunkStore {
             }
         }
 
-        let mut tmp_path = chunk_path.clone();
-        tmp_path.set_extension("tmp");
-
-        let mut file = std::fs::File::create(&tmp_path).map_err(|err| {
-            format_err!("creating chunk on store '{name}' failed for {digest_str} - {err}")
-        })?;
-
-        file.write_all(raw_data).map_err(|err| {
-            format_err!("writing chunk on store '{name}' failed for {digest_str} - {err}")
-        })?;
-
-        if let Err(err) = std::fs::rename(&tmp_path, &chunk_path) {
-            if std::fs::remove_file(&tmp_path).is_err() { /* ignore */ }
-            bail!("atomic rename on store '{name}' failed for chunk {digest_str} - {err}");
-        }
+        proxmox_sys::fs::replace_file(chunk_path, raw_data, CreateOptions::new(), self.sync_chunks)
+            .map_err(|err| {
+                format_err!("inserting chunk on store '{name}' failed for {digest_str} - {err}")
+            })?;
 
         drop(lock);
 
@@ -532,13 +524,13 @@ fn test_chunk_store1() {
 
     if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
 
-    let chunk_store = ChunkStore::open("test", &path);
+    let chunk_store = ChunkStore::open("test", &path, false);
     assert!(chunk_store.is_err());
 
     let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
         .unwrap()
         .unwrap();
-    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None).unwrap();
+    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None, false).unwrap();
 
     let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
         .build()
@@ -550,7 +542,7 @@ fn test_chunk_store1() {
     let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
     assert!(exists);
 
-    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None);
+    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None, false);
     assert!(chunk_store.is_err());
 
     if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index 5af8a295..20b02c70 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -18,7 +18,7 @@ use proxmox_sys::WorkerTaskContext;
 use proxmox_sys::{task_log, task_warn};
 
 use pbs_api_types::{
-    Authid, BackupNamespace, BackupType, ChunkOrder, DataStoreConfig, DatastoreTuning,
+    Authid, BackupNamespace, BackupType, ChunkOrder, Consistency, DataStoreConfig, DatastoreTuning,
     GarbageCollectionStatus, HumanByte, Operation, UPID,
 };
 use pbs_config::ConfigVersionCache;
@@ -61,6 +61,7 @@ pub struct DataStoreImpl {
     chunk_order: ChunkOrder,
     last_generation: usize,
     last_update: i64,
+    sync_fs: bool,
 }
 
 impl DataStoreImpl {
@@ -75,6 +76,7 @@ impl DataStoreImpl {
             chunk_order: ChunkOrder::None,
             last_generation: 0,
             last_update: 0,
+            sync_fs: false,
         })
     }
 }
@@ -154,7 +156,16 @@ impl DataStore {
             }
         }
 
-        let chunk_store = ChunkStore::open(name, &config.path)?;
+        let tuning: DatastoreTuning = serde_json::from_value(
+            DatastoreTuning::API_SCHEMA
+                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
+        )?;
+
+        let chunk_store = ChunkStore::open(
+            name,
+            &config.path,
+            tuning.consistency == Some(Consistency::File),
+        )?;
         let datastore = DataStore::with_store_and_config(chunk_store, config, generation, now)?;
 
         let datastore = Arc::new(datastore);
@@ -197,7 +208,15 @@ impl DataStore {
     ) -> Result<Arc<Self>, Error> {
         let name = config.name.clone();
 
-        let chunk_store = ChunkStore::open(&name, &config.path)?;
+        let tuning: DatastoreTuning = serde_json::from_value(
+            DatastoreTuning::API_SCHEMA
+                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
+        )?;
+        let chunk_store = ChunkStore::open(
+            &name,
+            &config.path,
+            tuning.consistency == Some(Consistency::File),
+        )?;
         let inner = Arc::new(Self::with_store_and_config(chunk_store, config, 0, 0)?);
 
         if let Some(operation) = operation {
@@ -233,6 +252,8 @@ impl DataStore {
                 .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
         )?;
         let chunk_order = tuning.chunk_order.unwrap_or(ChunkOrder::Inode);
+        let sync_fs =
+            tuning.consistency.unwrap_or(Consistency::Filesystem) == Consistency::Filesystem;
 
         Ok(DataStoreImpl {
             chunk_store: Arc::new(chunk_store),
@@ -242,6 +263,7 @@ impl DataStore {
             chunk_order,
             last_generation,
             last_update,
+            sync_fs,
         })
     }
 
@@ -1257,4 +1279,18 @@ impl DataStore {
         todo!("split out the namespace");
     }
     */
+
+    /// Syncs the filesystem of the datastore if 'sync_fs' is enabled. Uses syncfs(2).
+    pub fn syncfs(&self) -> Result<(), Error> {
+        if !self.inner.sync_fs {
+            return Ok(());
+        }
+        let file = std::fs::File::open(self.base_path())?;
+        let fd = file.as_raw_fd();
+        log::info!("syncing filesystem");
+        if unsafe { libc::syncfs(fd) } < 0 {
+            bail!("error during syncfs: {}", std::io::Error::last_os_error());
+        }
+        Ok(())
+    }
 }
diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index 8c1c42db..ecde3394 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -623,6 +623,8 @@ impl BackupEnvironment {
             }
         }
 
+        self.datastore.syncfs()?;
+
         // marks the backup as successful
         state.finished = true;
 
diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
index 28342c2c..62f9c901 100644
--- a/src/api2/config/datastore.rs
+++ b/src/api2/config/datastore.rs
@@ -11,8 +11,8 @@ use proxmox_section_config::SectionConfigData;
 use proxmox_sys::WorkerTaskContext;
 
 use pbs_api_types::{
-    Authid, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DATASTORE_SCHEMA,
-    PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
+    Authid, Consistency, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DatastoreTuning,
+    DATASTORE_SCHEMA, PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
     PROXMOX_CONFIG_DIGEST_SCHEMA,
 };
 use pbs_config::BackupLockGuard;
@@ -70,6 +70,10 @@ pub(crate) fn do_create_datastore(
 ) -> Result<(), Error> {
     let path: PathBuf = datastore.path.clone().into();
 
+    let tuning: DatastoreTuning = serde_json::from_value(
+        DatastoreTuning::API_SCHEMA
+            .parse_property_string(datastore.tuning.as_deref().unwrap_or(""))?,
+    )?;
     let backup_user = pbs_config::backup_user()?;
     let _store = ChunkStore::create(
         &datastore.name,
@@ -77,6 +81,7 @@ pub(crate) fn do_create_datastore(
         backup_user.uid,
         backup_user.gid,
         worker,
+        tuning.consistency == Some(Consistency::File),
     )?;
 
     config.set_data(&datastore.name, "datastore", &datastore)?;
-- 
2.30.2