public inbox for pbs-devel@lists.proxmox.com
* [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement consitency tuning for datastores
@ 2022-05-18 11:24 Dominik Csapak
  2022-05-19 13:35 ` Fabian Grünbichler
  2022-05-20  7:07 ` Thomas Lamprecht
  0 siblings, 2 replies; 4+ messages in thread
From: Dominik Csapak @ 2022-05-18 11:24 UTC (permalink / raw)
  To: pbs-devel

currently, we don't (f)sync on chunk insertion (or at any point after
that), which can lead to broken chunks in case of e.g. an unexpected
powerloss. To fix that, offer a tuning option for datastores that
controls the level of syncs it does:

* None (old default): same as current state, no (f)syncs done at any point
* Filesystem (new default): at the end of a backup, the datastore issues
  a syncfs(2) to the filesystem of the datastore
* File: issues an fsync on each chunk as they get inserted
  (using our 'replace_file' helper)

a small benchmark showed the following (times in mm:ss):
setup: virtual pbs, 4 cores, 8GiB memory, ext4 on spinner

size                none    filesystem  file
2GiB (fits in ram)   00:13   0:41        01:00
33GiB                05:21   05:31       13:45

so if the backup fits in memory, there is a large difference between all
of the modes (expected), but as soon as it exceeds the memory size,
the difference between not syncing and syncing the fs at the end becomes
much smaller.

i also tested on an nvme, but there the syncs basically made no difference

Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
it would be nice if anybody else tries to recreate the benchmarks on
different setups, to verify (or disprove) my findings

 pbs-api-types/src/datastore.rs   | 14 +++++++++++
 pbs-datastore/src/chunk_store.rs | 34 ++++++++++----------------
 pbs-datastore/src/datastore.rs   | 42 +++++++++++++++++++++++++++++---
 src/api2/backup/environment.rs   |  2 ++
 src/api2/config/datastore.rs     |  9 +++++--
 5 files changed, 75 insertions(+), 26 deletions(-)
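
As a reading aid: stripped down to the underlying system calls, the two new
sync levels roughly correspond to the following sketch (illustration only, not
the code from the diff below - the actual patch goes through the
'replace_file' helper and a syncfs() method on the datastore):

use std::os::unix::io::AsRawFd;
use std::path::Path;

// 'file' level: fsync(2) each chunk file right after writing it
fn sync_single_chunk(file: &std::fs::File) -> std::io::Result<()> {
    file.sync_all()
}

// 'filesystem' level: a single syncfs(2) on the datastore's filesystem,
// issued once at the end of a backup
fn sync_datastore_fs(base: &Path) -> std::io::Result<()> {
    let dir = std::fs::File::open(base)?;
    if unsafe { libc::syncfs(dir.as_raw_fd()) } < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}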

diff --git a/pbs-api-types/src/datastore.rs b/pbs-api-types/src/datastore.rs
index e2bf70aa..2536b8ff 100644
--- a/pbs-api-types/src/datastore.rs
+++ b/pbs-api-types/src/datastore.rs
@@ -214,6 +214,19 @@ pub enum ChunkOrder {
     Inode,
 }
 
+#[api]
+#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+/// The level of consitency to ensure (Default = "filesystem")
+pub enum Consistency {
+    /// No special consistency operation to sync the fs or files to disk
+    None,
+    /// Each file (e.g. chunks) will be fsync'd to the disk after writing.
+    File,
+    /// At the end of some operations (e.g. Backup), the underlying filesystem will be synced.
+    Filesystem,
+}
+
 #[api(
     properties: {
         "chunk-order": {
@@ -228,6 +241,7 @@ pub enum ChunkOrder {
 pub struct DatastoreTuning {
     /// Iterate chunks in this order
     pub chunk_order: Option<ChunkOrder>,
+    pub consistency: Option<Consistency>,
 }
 
 pub const DATASTORE_TUNING_STRING_SCHEMA: Schema = StringSchema::new("Datastore tuning options")
diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
index 5bfe9fac..58541c0f 100644
--- a/pbs-datastore/src/chunk_store.rs
+++ b/pbs-datastore/src/chunk_store.rs
@@ -1,4 +1,3 @@
-use std::io::Write;
 use std::os::unix::io::AsRawFd;
 use std::path::{Path, PathBuf};
 use std::sync::{Arc, Mutex};
@@ -22,6 +21,7 @@ pub struct ChunkStore {
     chunk_dir: PathBuf,
     mutex: Mutex<()>,
     locker: Option<Arc<Mutex<ProcessLocker>>>,
+    sync_chunks: bool,
 }
 
 // TODO: what about sysctl setting vm.vfs_cache_pressure (0 - 100) ?
@@ -68,6 +68,7 @@ impl ChunkStore {
             chunk_dir: PathBuf::new(),
             mutex: Mutex::new(()),
             locker: None,
+            sync_chunks: false,
         }
     }
 
@@ -88,6 +89,7 @@ impl ChunkStore {
         uid: nix::unistd::Uid,
         gid: nix::unistd::Gid,
         worker: Option<&dyn WorkerTaskContext>,
+        sync_chunks: bool,
     ) -> Result<Self, Error>
     where
         P: Into<PathBuf>,
@@ -144,7 +146,7 @@ impl ChunkStore {
             }
         }
 
-        Self::open(name, base)
+        Self::open(name, base, sync_chunks)
     }
 
     fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
@@ -153,7 +155,7 @@ impl ChunkStore {
         lockfile_path
     }
 
-    pub fn open<P: Into<PathBuf>>(name: &str, base: P) -> Result<Self, Error> {
+    pub fn open<P: Into<PathBuf>>(name: &str, base: P, sync_chunks: bool) -> Result<Self, Error> {
         let base: PathBuf = base.into();
 
         if !base.is_absolute() {
@@ -176,6 +178,7 @@ impl ChunkStore {
             chunk_dir,
             locker: Some(locker),
             mutex: Mutex::new(()),
+            sync_chunks,
         })
     }
 
@@ -461,21 +464,10 @@ impl ChunkStore {
             }
         }
 
-        let mut tmp_path = chunk_path.clone();
-        tmp_path.set_extension("tmp");
-
-        let mut file = std::fs::File::create(&tmp_path).map_err(|err| {
-            format_err!("creating chunk on store '{name}' failed for {digest_str} - {err}")
-        })?;
-
-        file.write_all(raw_data).map_err(|err| {
-            format_err!("writing chunk on store '{name}' failed for {digest_str} - {err}")
-        })?;
-
-        if let Err(err) = std::fs::rename(&tmp_path, &chunk_path) {
-            if std::fs::remove_file(&tmp_path).is_err() { /* ignore */ }
-            bail!("atomic rename on store '{name}' failed for chunk {digest_str} - {err}");
-        }
+        proxmox_sys::fs::replace_file(chunk_path, raw_data, CreateOptions::new(), self.sync_chunks)
+            .map_err(|err| {
+                format_err!("inserting chunk on store '{name}' failed for {digest_str} - {err}")
+            })?;
 
         drop(lock);
 
@@ -532,13 +524,13 @@ fn test_chunk_store1() {
 
     if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
 
-    let chunk_store = ChunkStore::open("test", &path);
+    let chunk_store = ChunkStore::open("test", &path, false);
     assert!(chunk_store.is_err());
 
     let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
         .unwrap()
         .unwrap();
-    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None).unwrap();
+    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None, false).unwrap();
 
     let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
         .build()
@@ -550,7 +542,7 @@ fn test_chunk_store1() {
     let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
     assert!(exists);
 
-    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None);
+    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None, false);
     assert!(chunk_store.is_err());
 
     if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index 5af8a295..20b02c70 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -18,7 +18,7 @@ use proxmox_sys::WorkerTaskContext;
 use proxmox_sys::{task_log, task_warn};
 
 use pbs_api_types::{
-    Authid, BackupNamespace, BackupType, ChunkOrder, DataStoreConfig, DatastoreTuning,
+    Authid, BackupNamespace, BackupType, ChunkOrder, Consistency, DataStoreConfig, DatastoreTuning,
     GarbageCollectionStatus, HumanByte, Operation, UPID,
 };
 use pbs_config::ConfigVersionCache;
@@ -61,6 +61,7 @@ pub struct DataStoreImpl {
     chunk_order: ChunkOrder,
     last_generation: usize,
     last_update: i64,
+    sync_fs: bool,
 }
 
 impl DataStoreImpl {
@@ -75,6 +76,7 @@ impl DataStoreImpl {
             chunk_order: ChunkOrder::None,
             last_generation: 0,
             last_update: 0,
+            sync_fs: false,
         })
     }
 }
@@ -154,7 +156,16 @@ impl DataStore {
             }
         }
 
-        let chunk_store = ChunkStore::open(name, &config.path)?;
+        let tuning: DatastoreTuning = serde_json::from_value(
+            DatastoreTuning::API_SCHEMA
+                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
+        )?;
+
+        let chunk_store = ChunkStore::open(
+            name,
+            &config.path,
+            tuning.consistency == Some(Consistency::File),
+        )?;
         let datastore = DataStore::with_store_and_config(chunk_store, config, generation, now)?;
 
         let datastore = Arc::new(datastore);
@@ -197,7 +208,15 @@ impl DataStore {
     ) -> Result<Arc<Self>, Error> {
         let name = config.name.clone();
 
-        let chunk_store = ChunkStore::open(&name, &config.path)?;
+        let tuning: DatastoreTuning = serde_json::from_value(
+            DatastoreTuning::API_SCHEMA
+                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
+        )?;
+        let chunk_store = ChunkStore::open(
+            &name,
+            &config.path,
+            tuning.consistency == Some(Consistency::File),
+        )?;
         let inner = Arc::new(Self::with_store_and_config(chunk_store, config, 0, 0)?);
 
         if let Some(operation) = operation {
@@ -233,6 +252,8 @@ impl DataStore {
                 .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
         )?;
         let chunk_order = tuning.chunk_order.unwrap_or(ChunkOrder::Inode);
+        let sync_fs =
+            tuning.consistency.unwrap_or(Consistency::Filesystem) == Consistency::Filesystem;
 
         Ok(DataStoreImpl {
             chunk_store: Arc::new(chunk_store),
@@ -242,6 +263,7 @@ impl DataStore {
             chunk_order,
             last_generation,
             last_update,
+            sync_fs,
         })
     }
 
@@ -1257,4 +1279,18 @@ impl DataStore {
         todo!("split out the namespace");
     }
     */
+
+    /// Syncs the filesystem of the datastore if 'sync_fs' is enabled. Uses syncfs(2).
+    pub fn syncfs(&self) -> Result<(), Error> {
+        if !self.inner.sync_fs {
+            return Ok(());
+        }
+        let file = std::fs::File::open(self.base_path())?;
+        let fd = file.as_raw_fd();
+        log::info!("syncinc filesystem");
+        if unsafe { libc::syncfs(fd) } < 0 {
+            bail!("error during syncfs: {}", std::io::Error::last_os_error());
+        }
+        Ok(())
+    }
 }
diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index 8c1c42db..ecde3394 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -623,6 +623,8 @@ impl BackupEnvironment {
             }
         }
 
+        self.datastore.syncfs()?;
+
         // marks the backup as successful
         state.finished = true;
 
diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
index 28342c2c..62f9c901 100644
--- a/src/api2/config/datastore.rs
+++ b/src/api2/config/datastore.rs
@@ -11,8 +11,8 @@ use proxmox_section_config::SectionConfigData;
 use proxmox_sys::WorkerTaskContext;
 
 use pbs_api_types::{
-    Authid, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DATASTORE_SCHEMA,
-    PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
+    Authid, Consistency, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DatastoreTuning,
+    DATASTORE_SCHEMA, PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
     PROXMOX_CONFIG_DIGEST_SCHEMA,
 };
 use pbs_config::BackupLockGuard;
@@ -70,6 +70,10 @@ pub(crate) fn do_create_datastore(
 ) -> Result<(), Error> {
     let path: PathBuf = datastore.path.clone().into();
 
+    let tuning: DatastoreTuning = serde_json::from_value(
+        DatastoreTuning::API_SCHEMA
+            .parse_property_string(datastore.tuning.as_deref().unwrap_or(""))?,
+    )?;
     let backup_user = pbs_config::backup_user()?;
     let _store = ChunkStore::create(
         &datastore.name,
@@ -77,6 +81,7 @@ pub(crate) fn do_create_datastore(
         backup_user.uid,
         backup_user.gid,
         worker,
+        tuning.consistency == Some(Consistency::File),
     )?;
 
     config.set_data(&datastore.name, "datastore", &datastore)?;
-- 
2.30.2






* Re: [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement consitency tuning for datastores
  2022-05-18 11:24 [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement consitency tuning for datastores Dominik Csapak
@ 2022-05-19 13:35 ` Fabian Grünbichler
  2022-05-19 13:49   ` Dominik Csapak
  2022-05-20  7:07 ` Thomas Lamprecht
  1 sibling, 1 reply; 4+ messages in thread
From: Fabian Grünbichler @ 2022-05-19 13:35 UTC (permalink / raw)
  To: Proxmox Backup Server development discussion

On May 18, 2022 1:24 pm, Dominik Csapak wrote:
> currently, we don't (f)sync on chunk insertion (or at any point after
> that), which can lead to broken chunks in case of e.g. an unexpected
> powerloss. To fix that, offer a tuning option for datastores that
> controls the level of syncs it does:
> 
> * None (old default): same as current state, no (f)syncs done at any point
> * Filesystem (new default): at the end of a backup, the datastore issues
>   a syncfs(2) to the filesystem of the datastore
> * File: issues an fsync on each chunk as they get inserted
>   (using our 'replace_file' helper)
> 
> a small benchmark showed the following (times in mm:ss):
> setup: virtual pbs, 4 cores, 8GiB memory, ext4 on spinner
> 
> size                none    filesystem  file
> 2GiB (fits in ram)   00:13   0:41        01:00
> 33GiB                05:21   05:31       13:45
> 
> so if the backup fits in memory, there is a large difference between all
> of the modes (expected), but as soon as it exceeds the memory size,
> the difference between not syncing and syncing the fs at the end becomes
> much smaller.
> 
> i also tested on an nvme, but there the syncs basically made no difference
> 
> Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
> ---
> it would be nice if anybody else tries to recreate the benchmarks on
> different setups, to verify (or disprove) my findings

FWIW:

randfile on tmpfs as source, backed up as fidx
randfile regenerated for every run, PBS restarted for every run

PBS in VM (8GB ram, disks on zvols on spinner + special + log), datastore on ext4:

SIZE: 4096 MODE: none Duration: 22.51s
SIZE: 4096 MODE: filesystem Duration: 28.11s
SIZE: 4096 MODE: file Duration: 54.47s

SIZE: 16384 MODE: none Duration: 202.42s
SIZE: 16384 MODE: filesystem Duration: 275.36s
SIZE: 16384 MODE: file Duration: 311.97s

same VM, datastore on single-disk ZFS pool:

SIZE: 1024 MODE: none Duration: 5.03s
SIZE: 1024 MODE: file Duration: 22.91s
SIZE: 1024 MODE: filesystem Duration: 15.57s

SIZE: 4096 MODE: none Duration: 41.02s
SIZE: 4096 MODE: file Duration: 135.94s
SIZE: 4096 MODE: filesystem Duration: 146.88s

SIZE: 16384 MODE: none Duration: 336.10s
rest ended in tears cause of restricted resources in the VM

PBS baremetal same as source, datastore on ZFS on spinner+special+log:

SIZE: 1024 MODE: none Duration: 4.90s
SIZE: 1024 MODE: file Duration: 4.92s
SIZE: 1024 MODE: filesystem Duration: 4.94s

SIZE: 4096 MODE: none Duration: 19.56s
SIZE: 4096 MODE: file Duration: 31.67s
SIZE: 4096 MODE: filesystem Duration: 38.54s

SIZE: 16384 MODE: none Duration: 189.77s
SIZE: 16384 MODE: file Duration: 178.81s
SIZE: 16384 MODE: filesystem Duration: 159.26s

^^ this is rather unexpected, I suspect something messed with the 'none' 
case here, so I re-ran it:

SIZE: 1024 MODE: none Duration: 4.90s
SIZE: 1024 MODE: file Duration: 4.92s
SIZE: 1024 MODE: filesystem Duration: 4.98s
SIZE: 4096 MODE: none Duration: 19.77s
SIZE: 4096 MODE: file Duration: 19.68s
SIZE: 4096 MODE: filesystem Duration: 19.61s
SIZE: 16384 MODE: none Duration: 133.93s
SIZE: 16384 MODE: file Duration: 146.88s
SIZE: 16384 MODE: filesystem Duration: 152.94s

and once more with ~30GB (ARC is just 16G):

SIZE: 30000 MODE: none Duration: 368.58s
SIZE: 30000 MODE: file Duration: 292.05s  (!!!)
SIZE: 30000 MODE: filesystem Duration: 431.73s

repeated once more:

SIZE: 30000 MODE: none Duration: 419.75s
SIZE: 30000 MODE: file Duration: 302.73s
SIZE: 30000 MODE: filesystem Duration: 409.07s

so.. rather weird? possible noisy measurements though, as this is on my 
workstation ;)

PBS baremetal same as source, datastore on ZFS on NVME (no surprises 
there):

SIZE: 1024 MODE: file Duration: 4.92s
SIZE: 1024 MODE: filesystem Duration: 4.95s
SIZE: 1024 MODE: none Duration: 4.96s

SIZE: 4096 MODE: file Duration: 19.69s
SIZE: 4096 MODE: filesystem Duration: 19.78s
SIZE: 4096 MODE: none Duration: 19.67s

SIZE: 16384 MODE: file Duration: 81.39s
SIZE: 16384 MODE: filesystem Duration: 78.86s
SIZE: 16384 MODE: none Duration: 78.38s

SIZE: 30000 MODE: none Duration: 142.65s
SIZE: 30000 MODE: file Duration: 143.43s
SIZE: 30000 MODE: filesystem Duration: 143.15s





* Re: [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement consitency tuning for datastores
  2022-05-19 13:35 ` Fabian Grünbichler
@ 2022-05-19 13:49   ` Dominik Csapak
  0 siblings, 0 replies; 4+ messages in thread
From: Dominik Csapak @ 2022-05-19 13:49 UTC (permalink / raw)
  To: Proxmox Backup Server development discussion, Fabian Grünbichler
  Cc: Wolfgang Bumiller, Thomas Lamprecht, Dietmar Maurer

On 5/19/22 15:35, Fabian Grünbichler wrote:
> On May 18, 2022 1:24 pm, Dominik Csapak wrote:
>> currently, we don't (f)sync on chunk insertion (or at any point after
>> that), which can lead to broken chunks in case of e.g. an unexpected
>> powerloss. To fix that, offer a tuning option for datastores that
>> controls the level of syncs it does:
>>
>> * None (old default): same as current state, no (f)syncs done at any point
>> * Filesystem (new default): at the end of a backup, the datastore issues
>>    a syncfs(2) to the filesystem of the datastore
>> * File: issues an fsync on each chunk as they get inserted
>>    (using our 'replace_file' helper)
>>
>> a small benchmark showed the following (times in mm:ss):
>> setup: virtual pbs, 4 cores, 8GiB memory, ext4 on spinner
>>
>> size                none    filesystem  file
>> 2GiB (fits in ram)   00:13   0:41        01:00
>> 33GiB                05:21   05:31       13:45
>>
>> so if the backup fits in memory, there is a large difference between all
>> of the modes (expected), but as soon as it exceeds the memory size,
>> the difference between not syncing and syncing the fs at the end becomes
>> much smaller.
>>
>> i also tested on an nvme, but there the syncs basically made no difference
>>
>> Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
>> ---
>> it would be nice if anybody else tries to recreate the benchmarks on
>> different setups, to verify (or disprove) my findings
> 
> FWIW:
> 
> randfile on tmpfs as source, backed up as fidx
> randfile regenerated for every run, PBS restarted for every run
> 
> PBS in VM (8GB ram, disks on zvols on spinner + special + log), datastore on ext4:
> 
> SIZE: 4096 MODE: none Duration: 22.51s
> SIZE: 4096 MODE: filesystem Duration: 28.11s
> SIZE: 4096 MODE: file Duration: 54.47s
> 
> SIZE: 16384 MODE: none Duration: 202.42s
> SIZE: 16384 MODE: filesystem Duration: 275.36s
> SIZE: 16384 MODE: file Duration: 311.97s
> 
> same VM, datastore on single-disk ZFS pool:
> 
> SIZE: 1024 MODE: none Duration: 5.03s
> SIZE: 1024 MODE: file Duration: 22.91s
> SIZE: 1024 MODE: filesystem Duration: 15.57s
> 
> SIZE: 4096 MODE: none Duration: 41.02s
> SIZE: 4096 MODE: file Duration: 135.94s
> SIZE: 4096 MODE: filesystem Duration: 146.88s
> 
> SIZE: 16384 MODE: none Duration: 336.10s
> rest ended in tears cause of restricted resources in the VM
> 
> PBS baremetal same as source, datastore on ZFS on spinner+special+log:
> 
> SIZE: 1024 MODE: none Duration: 4.90s
> SIZE: 1024 MODE: file Duration: 4.92s
> SIZE: 1024 MODE: filesystem Duration: 4.94s
> 
> SIZE: 4096 MODE: none Duration: 19.56s
> SIZE: 4096 MODE: file Duration: 31.67s
> SIZE: 4096 MODE: filesystem Duration: 38.54s
> 
> SIZE: 16384 MODE: none Duration: 189.77s
> SIZE: 16384 MODE: file Duration: 178.81s
> SIZE: 16384 MODE: filesystem Duration: 159.26s
> 
> ^^ this is rather unexpected, I suspect something messed with the 'none'
> case here, so I re-ran it:
> 
> SIZE: 1024 MODE: none Duration: 4.90s
> SIZE: 1024 MODE: file Duration: 4.92s
> SIZE: 1024 MODE: filesystem Duration: 4.98s
> SIZE: 4096 MODE: none Duration: 19.77s
> SIZE: 4096 MODE: file Duration: 19.68s
> SIZE: 4096 MODE: filesystem Duration: 19.61s
> SIZE: 16384 MODE: none Duration: 133.93s
> SIZE: 16384 MODE: file Duration: 146.88s
> SIZE: 16384 MODE: filesystem Duration: 152.94s
> 
> and once more with ~30GB (ARC is just 16G):
> 
> SIZE: 30000 MODE: none Duration: 368.58s
> SIZE: 30000 MODE: file Duration: 292.05s  (!!!)
> SIZE: 30000 MODE: filesystem Duration: 431.73s
> 
> repeated once more:
> 
> SIZE: 30000 MODE: none Duration: 419.75s
> SIZE: 30000 MODE: file Duration: 302.73s
> SIZE: 30000 MODE: filesystem Duration: 409.07s
> 
> so.. rather weird? possible noisy measurements though, as this is on my
> workstation ;)
> 
> PBS baremetal same as source, datastore on ZFS on NVME (no surprises
> there):
> 
> SIZE: 1024 MODE: file Duration: 4.92s
> SIZE: 1024 MODE: filesystem Duration: 4.95s
> SIZE: 1024 MODE: none Duration: 4.96s
> 
> SIZE: 4096 MODE: file Duration: 19.69s
> SIZE: 4096 MODE: filesystem Duration: 19.78s
> SIZE: 4096 MODE: none Duration: 19.67s
> 
> SIZE: 16384 MODE: file Duration: 81.39s
> SIZE: 16384 MODE: filesystem Duration: 78.86s
> SIZE: 16384 MODE: none Duration: 78.38s
> 
> SIZE: 30000 MODE: none Duration: 142.65s
> SIZE: 30000 MODE: file Duration: 143.43s
> SIZE: 30000 MODE: filesystem Duration: 143.15s
> 
> 
> _______________________________________________
> pbs-devel mailing list
> pbs-devel@lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
> 
> 

so what i gather from this benchmark is that on ext4, many fsyncs are more expensive than
a single syncfs, but on zfs it's very close together, leaning toward many fsyncs
being faster? (aside from the case where fsync is faster than not syncing at all ???)

in any case, doing some kind of syncing *will* slow down backups one way or another
(leaving the weird zfs case aside for the moment). so the question is if we
make one of the new modes the default or not...

i'd put them in there even if we leave the default, so the admin can decide how much
crash-consistency the pbs has.

any other input? @wolfgang, @thomas, @dietmar?





* Re: [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement consitency tuning for datastores
  2022-05-18 11:24 [pbs-devel] [RFC PATCH proxmox-backup] datastore: implement consitency tuning for datastores Dominik Csapak
  2022-05-19 13:35 ` Fabian Grünbichler
@ 2022-05-20  7:07 ` Thomas Lamprecht
  1 sibling, 0 replies; 4+ messages in thread
From: Thomas Lamprecht @ 2022-05-20  7:07 UTC (permalink / raw)
  To: Proxmox Backup Server development discussion, Dominik Csapak

On 18/05/2022 13:24, Dominik Csapak wrote:
> currently, we don't (f)sync on chunk insertion (or at any point after
> that), which can lead to broken chunks in case of e.g. an unexpected
> powerloss.

"For most filesystem this is limited by the dirty write back time,
which defaults to 30s"

https://www.kernel.org/doc/html/latest/admin-guide/sysctl/vm.html#dirty-writeback-centisecs
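
(a quick sketch to check those knobs on a given host - values are in
centiseconds, so e.g. 3000 corresponds to the ~30s mentioned above:)

fn main() -> std::io::Result<()> {
    for knob in ["dirty_expire_centisecs", "dirty_writeback_centisecs"] {
        let value = std::fs::read_to_string(format!("/proc/sys/vm/{knob}"))?;
        println!("{knob} = {}", value.trim());
    }
    Ok(())
}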

> To fix that, offer a tuning option for datastores that
> controls the level of syncs it does:
> 
> * None (old default): same as current state, no (f)syncs done at any point
> * Filesystem (new default): at the end of a backup, the datastore issues
>   a syncfs(2) to the filesystem of the datastore
> * File: issues an fsync on each chunk as they get inserted
>   (using our 'replace_file' helper)
> 
> a small benchmark showed the following (times in mm:ss):
> setup: virtual pbs, 4 cores, 8GiB memory, ext4 on spinner
> 
> size                none    filesystem  file
> 2GiB (fits in ram)   00:13   0:41        01:00
> 33GiB                05:21   05:31       13:45
> 
> so if the backup fits in memory, there is a large difference between all
> of the modes (expected), but as soon as it exceeds the memory size,
> the difference between not syncing and syncing the fs at the end becomes
> much smaller.
> 
> i also tested on an nvme, but there the syncs basically made no difference
> 
> Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
> ---
> it would be nice if anybody else tries to recreate the benchmarks on
> different setups, to verify (or disprove) my findings
> 
>  pbs-api-types/src/datastore.rs   | 14 +++++++++++
>  pbs-datastore/src/chunk_store.rs | 34 ++++++++++----------------
>  pbs-datastore/src/datastore.rs   | 42 +++++++++++++++++++++++++++++---
>  src/api2/backup/environment.rs   |  2 ++
>  src/api2/config/datastore.rs     |  9 +++++--
>  5 files changed, 75 insertions(+), 26 deletions(-)
> 
> diff --git a/pbs-api-types/src/datastore.rs b/pbs-api-types/src/datastore.rs
> index e2bf70aa..2536b8ff 100644
> --- a/pbs-api-types/src/datastore.rs
> +++ b/pbs-api-types/src/datastore.rs
> @@ -214,6 +214,19 @@ pub enum ChunkOrder {
>      Inode,
>  }
>  
> +#[api]
> +#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]

// are those really all required?!

> +#[serde(rename_all = "lowercase")]
> +/// The level of consitency to ensure (Default = "filesystem")

s/consitency/consistency/

And please separate adding this from changing the default into two patches.

> +pub enum Consistency {

This is such an overly generic name, it could be applied to anything...
Maybe call it what it is:

DatastoreFSyncLevel

That way you can also re-word the doc comments more specifically, as we'll
only touch chunks for fsync and only trigger syncfs after a backup; otherwise
users may question where else it's used, assuming only a few examples are named
here.

> +    /// No special consistency operation to sync the fs or files to disk
> +    None,

I would extend the doc comment for above:

/// No special fsync or syncfs calls are triggered. The system default dirty write back
/// mechanism ensures that data gets flushed eventually via the `dirty_expire_centisecs`
/// and `dirty_writeback_centisecs` kernel sysctls, defaulting to ~ 30s.
///
/// This mode provides generally the best performance, as all write back can happen async,
/// which reduces IO pressure.
/// But it may cause losing data on powerloss or system crash, especially without any uninterruptible power supply

and potentially rename it to:
SystemDefault,

but not too hard feelings on that one.

> +    /// Each file (e.g. chunks) will be fsync'd to the disk after writing.

side note of something I just became aware of: this causes a longer runtime, at least in
general, which means, assuming that the powerloss/crash probability is equally distributed,
that it has an increased exposure to such events. Not that I think that it will matter
much in practice, as such things cannot be predicted anyway and can happen anytime.

But, 

/// Triggers an fsync after writing any chunk on the datastore. While this can slow down
/// backups significantly, depending on the underlying file system and storage used, it
/// ensures fine-grained consistency. But in practice there are no benefits over the
/// file system level sync, so you should prefer that one, as on most systems the file level
/// one is slower and causes more IO pressure compared to the file system level one.

> +    File,
> +    /// At the end of some operations (e.g. Backup), the underlying filesystem will be synced.

/// Trigger a filesystem-wide sync after all backup data got written but before finishing the
/// task. This guarantees that every finished backup is fully written back to storage
/// while reducing the impact of doing so on most file systems.
///
/// Note that the underlying storage device still needs to protect itself against powerloss
/// by flushing its internal ephemeral caches to the permanent storage layer.

(albeit the latter is probably something for the docs, which are completely missing here).
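
Pulling the above together, the renamed enum could then look roughly like this
(sketch only, doc comments abbreviated, same derives/imports as the hunk above
assumed):

#[api]
#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
/// The fsync level of the datastore (Default = "filesystem")
pub enum DatastoreFSyncLevel {
    /// No special fsync or syncfs calls; the kernel's dirty write-back
    /// mechanism flushes data eventually (~30s by default)
    None,
    /// fsync(2) each chunk file right after it got written
    File,
    /// syncfs(2) the datastore's filesystem once all backup data got written,
    /// before finishing the backup task
    Filesystem,
}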

> +    Filesystem,
> +}
> +
>  #[api(
>      properties: {
>          "chunk-order": {
> @@ -228,6 +241,7 @@ pub enum ChunkOrder {
>  pub struct DatastoreTuning {
>      /// Iterate chunks in this order
>      pub chunk_order: Option<ChunkOrder>,
> +    pub consistency: Option<Consistency>,
>  }
>  
>  pub const DATASTORE_TUNING_STRING_SCHEMA: Schema = StringSchema::new("Datastore tuning options")
> diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
> index 5bfe9fac..58541c0f 100644
> --- a/pbs-datastore/src/chunk_store.rs
> +++ b/pbs-datastore/src/chunk_store.rs
> @@ -1,4 +1,3 @@
> -use std::io::Write;
>  use std::os::unix::io::AsRawFd;
>  use std::path::{Path, PathBuf};
>  use std::sync::{Arc, Mutex};
> @@ -22,6 +21,7 @@ pub struct ChunkStore {
>      chunk_dir: PathBuf,
>      mutex: Mutex<()>,
>      locker: Option<Arc<Mutex<ProcessLocker>>>,
> +    sync_chunks: bool,

can we avoid the boolean and re-use the actual type, which is much more telling,
especially if passed directly.

new(foo, bar, false) vs. new(foo, bar, FSyncLevel::)
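
i.e. something along these lines (sketch with assumed names, building on the
DatastoreFSyncLevel enum sketched above; not the posted patch):

struct ChunkStoreSketch {
    sync_level: DatastoreFSyncLevel,
}

impl ChunkStoreSketch {
    fn open(_name: &str, _base: &std::path::Path, sync_level: DatastoreFSyncLevel) -> Self {
        Self { sync_level }
    }

    fn chunk_needs_fsync(&self) -> bool {
        // the bool is derived where it is actually needed (insert_chunk),
        // not at every open()/create() call site
        self.sync_level == DatastoreFSyncLevel::File
    }
}

// call sites then read
//     ChunkStore::open("test", &path, DatastoreFSyncLevel::File)
// instead of
//     ChunkStore::open("test", &path, true)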

>  }
>  
>  // TODO: what about sysctl setting vm.vfs_cache_pressure (0 - 100) ?
> @@ -68,6 +68,7 @@ impl ChunkStore {
>              chunk_dir: PathBuf::new(),
>              mutex: Mutex::new(()),
>              locker: None,
> +            sync_chunks: false,
>          }
>      }
>  
> @@ -88,6 +89,7 @@ impl ChunkStore {
>          uid: nix::unistd::Uid,
>          gid: nix::unistd::Gid,
>          worker: Option<&dyn WorkerTaskContext>,
> +        sync_chunks: bool,
>      ) -> Result<Self, Error>
>      where
>          P: Into<PathBuf>,
> @@ -144,7 +146,7 @@ impl ChunkStore {
>              }
>          }
>  
> -        Self::open(name, base)
> +        Self::open(name, base, sync_chunks)
>      }
>  
>      fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
> @@ -153,7 +155,7 @@ impl ChunkStore {
>          lockfile_path
>      }
>  
> -    pub fn open<P: Into<PathBuf>>(name: &str, base: P) -> Result<Self, Error> {
> +    pub fn open<P: Into<PathBuf>>(name: &str, base: P, sync_chunks: bool) -> Result<Self, Error> {
>          let base: PathBuf = base.into();
>  
>          if !base.is_absolute() {
> @@ -176,6 +178,7 @@ impl ChunkStore {
>              chunk_dir,
>              locker: Some(locker),
>              mutex: Mutex::new(()),
> +            sync_chunks,
>          })
>      }
>  
> @@ -461,21 +464,10 @@ impl ChunkStore {
>              }
>          }
>  
> -        let mut tmp_path = chunk_path.clone();
> -        tmp_path.set_extension("tmp");
> -
> -        let mut file = std::fs::File::create(&tmp_path).map_err(|err| {
> -            format_err!("creating chunk on store '{name}' failed for {digest_str} - {err}")
> -        })?;
> -
> -        file.write_all(raw_data).map_err(|err| {
> -            format_err!("writing chunk on store '{name}' failed for {digest_str} - {err}")
> -        })?;
> -
> -        if let Err(err) = std::fs::rename(&tmp_path, &chunk_path) {
> -            if std::fs::remove_file(&tmp_path).is_err() { /* ignore */ }
> -            bail!("atomic rename on store '{name}' failed for chunk {digest_str} - {err}");
> -        }
> +        proxmox_sys::fs::replace_file(chunk_path, raw_data, CreateOptions::new(), self.sync_chunks)
> +            .map_err(|err| {
> +                format_err!("inserting chunk on store '{name}' failed for {digest_str} - {err}")
> +            })?;

this is missing the fsync on the parent directory inode to ensure that
the new file insertion there is actually on disk...

An upfront patch switching just to replace_file could reduce noise too, but
no hard feelings on that.
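
A minimal sketch of that missing piece (assuming 'chunk_dir' is the .chunks
subdirectory the new chunk got renamed into; not part of the posted patch):

use std::path::Path;

fn fsync_parent_dir(chunk_dir: &Path) -> std::io::Result<()> {
    // fsync(2) on the directory fd flushes the directory entry for the
    // freshly renamed chunk file, not just the file contents
    let dir = std::fs::File::open(chunk_dir)?;
    dir.sync_all()
}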

>  
>          drop(lock);
>  
> @@ -532,13 +524,13 @@ fn test_chunk_store1() {
>  
>      if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
>  
> -    let chunk_store = ChunkStore::open("test", &path);
> +    let chunk_store = ChunkStore::open("test", &path, false);
>      assert!(chunk_store.is_err());
>  
>      let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
>          .unwrap()
>          .unwrap();
> -    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None).unwrap();
> +    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None, false).unwrap();
>  
>      let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
>          .build()
> @@ -550,7 +542,7 @@ fn test_chunk_store1() {
>      let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
>      assert!(exists);
>  
> -    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None);
> +    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None, false);
>      assert!(chunk_store.is_err());
>  
>      if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
> diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
> index 5af8a295..20b02c70 100644
> --- a/pbs-datastore/src/datastore.rs
> +++ b/pbs-datastore/src/datastore.rs
> @@ -18,7 +18,7 @@ use proxmox_sys::WorkerTaskContext;
>  use proxmox_sys::{task_log, task_warn};
>  
>  use pbs_api_types::{
> -    Authid, BackupNamespace, BackupType, ChunkOrder, DataStoreConfig, DatastoreTuning,
> +    Authid, BackupNamespace, BackupType, ChunkOrder, Consistency, DataStoreConfig, DatastoreTuning,
>      GarbageCollectionStatus, HumanByte, Operation, UPID,
>  };
>  use pbs_config::ConfigVersionCache;
> @@ -61,6 +61,7 @@ pub struct DataStoreImpl {
>      chunk_order: ChunkOrder,
>      last_generation: usize,
>      last_update: i64,
> +    sync_fs: bool,
>  }
>  
>  impl DataStoreImpl {
> @@ -75,6 +76,7 @@ impl DataStoreImpl {
>              chunk_order: ChunkOrder::None,
>              last_generation: 0,
>              last_update: 0,
> +            sync_fs: false,
>          })
>      }
>  }
> @@ -154,7 +156,16 @@ impl DataStore {
>              }
>          }
>  
> -        let chunk_store = ChunkStore::open(name, &config.path)?;
> +        let tuning: DatastoreTuning = serde_json::from_value(
> +            DatastoreTuning::API_SCHEMA
> +                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
> +        )?;
> +
> +        let chunk_store = ChunkStore::open(
> +            name,
> +            &config.path,
> +            tuning.consistency == Some(Consistency::File),
> +        )?;
>          let datastore = DataStore::with_store_and_config(chunk_store, config, generation, now)?;
>  
>          let datastore = Arc::new(datastore);
> @@ -197,7 +208,15 @@ impl DataStore {
>      ) -> Result<Arc<Self>, Error> {
>          let name = config.name.clone();
>  
> -        let chunk_store = ChunkStore::open(&name, &config.path)?;
> +        let tuning: DatastoreTuning = serde_json::from_value(
> +            DatastoreTuning::API_SCHEMA
> +                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
> +        )?;
> +        let chunk_store = ChunkStore::open(
> +            &name,
> +            &config.path,
> +            tuning.consistency == Some(Consistency::File),
> +        )?;
>          let inner = Arc::new(Self::with_store_and_config(chunk_store, config, 0, 0)?);
>  
>          if let Some(operation) = operation {
> @@ -233,6 +252,8 @@ impl DataStore {
>                  .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
>          )?;
>          let chunk_order = tuning.chunk_order.unwrap_or(ChunkOrder::Inode);
> +        let sync_fs =
> +            tuning.consistency.unwrap_or(Consistency::Filesystem) == Consistency::Filesystem;
>  
>          Ok(DataStoreImpl {
>              chunk_store: Arc::new(chunk_store),
> @@ -242,6 +263,7 @@ impl DataStore {
>              chunk_order,
>              last_generation,
>              last_update,
> +            sync_fs,
>          })
>      }
>  
> @@ -1257,4 +1279,18 @@ impl DataStore {
>          todo!("split out the namespace");
>      }
>      */
> +
> +    /// Syncs the filesystem of the datastore if 'sync_fs' is enabled. Uses syncfs(2).
> +    pub fn syncfs(&self) -> Result<(), Error> {
> +        if !self.inner.sync_fs {
> +            return Ok(());
> +        }
> +        let file = std::fs::File::open(self.base_path())?;
> +        let fd = file.as_raw_fd();
> +        log::info!("syncinc filesystem");

s/syncinc/syncing/ and please add the base path so that one can tell which datastore's filesystem is being synced

> +        if unsafe { libc::syncfs(fd) } < 0 {
> +            bail!("error during syncfs: {}", std::io::Error::last_os_error());
> +        }
> +        Ok(())
> +    }
>  }
> diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
> index 8c1c42db..ecde3394 100644
> --- a/src/api2/backup/environment.rs
> +++ b/src/api2/backup/environment.rs
> @@ -623,6 +623,8 @@ impl BackupEnvironment {
>              }
>          }
>  
> +        self.datastore.syncfs()?;

I find it rather ugly to handle the if inside this fn; from reading the code it
would seem like we always trigger a syncfs..
At least rename it then to reflect that
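
e.g. either rename it to something like 'syncfs_if_enabled', or move the
decision to the caller so the conditional is visible there (sketch, the
'fsync_level()' accessor is made up):

// in finish_backup():
if self.datastore.fsync_level() == DatastoreFSyncLevel::Filesystem {
    self.datastore.syncfs()?;
}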

> +
>          // marks the backup as successful
>          state.finished = true;
>  
> diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
> index 28342c2c..62f9c901 100644
> --- a/src/api2/config/datastore.rs
> +++ b/src/api2/config/datastore.rs
> @@ -11,8 +11,8 @@ use proxmox_section_config::SectionConfigData;
>  use proxmox_sys::WorkerTaskContext;
>  
>  use pbs_api_types::{
> -    Authid, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DATASTORE_SCHEMA,
> -    PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
> +    Authid, Consistency, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DatastoreTuning,
> +    DATASTORE_SCHEMA, PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
>      PROXMOX_CONFIG_DIGEST_SCHEMA,
>  };
>  use pbs_config::BackupLockGuard;
> @@ -70,6 +70,10 @@ pub(crate) fn do_create_datastore(
>  ) -> Result<(), Error> {
>      let path: PathBuf = datastore.path.clone().into();
>  
> +    let tuning: DatastoreTuning = serde_json::from_value(
> +        DatastoreTuning::API_SCHEMA
> +            .parse_property_string(datastore.tuning.as_deref().unwrap_or(""))?,
> +    )?;
>      let backup_user = pbs_config::backup_user()?;
>      let _store = ChunkStore::create(
>          &datastore.name,
> @@ -77,6 +81,7 @@ pub(crate) fn do_create_datastore(
>          backup_user.uid,
>          backup_user.gid,
>          worker,
> +        tuning.consistency == Some(Consistency::File),
>      )?;
>  
>      config.set_data(&datastore.name, "datastore", &datastore)?;





