From: Dominik Csapak <d.csapak@proxmox.com>
To: pbs-devel@lists.proxmox.com
Subject: [pbs-devel] [PATCH proxmox-backup v2 3/5] datastore: implement sync-level tuning for datastores
Date: Fri, 7 Oct 2022 13:54:47 +0200
Message-Id: <20221007115449.3562604-4-d.csapak@proxmox.com>
In-Reply-To: <20221007115449.3562604-1-d.csapak@proxmox.com>
References: <20221007115449.3562604-1-d.csapak@proxmox.com>
X-Mailer: git-send-email 2.30.2
List-Id: Proxmox Backup Server development discussion

Currently we don't (f)sync on chunk insertion (or at any point after that),
which can lead to broken chunks in case of e.g. an unexpected power loss.
To fix that, offer a tuning option for datastores that controls the level
of syncing:

* None (default): same as the current state, no (f)syncs are done at any point
* Filesystem: at the end of a backup, the datastore issues a syncfs(2) on the
  filesystem of the datastore
* File: issues an fsync on each chunk as it gets inserted (using our
  'replace_file' helper) and an fsync on the directory handle

A small benchmark showed the following (times in mm:ss).
Setup: virtual PBS, 4 cores, 8GiB memory, ext4 on a spinner.

  size                 none    filesystem  file
  2GiB (fits in RAM)   00:13   00:41       01:00
  33GiB                05:21   05:31       13:45

So if the backup fits in memory, there is a large difference between all of
the modes (expected), but as soon as it exceeds the memory size, the
difference between not syncing and syncing the filesystem at the end becomes
much smaller.

I also tested on an NVMe, but there the syncs made basically no difference.

Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
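Note (illustration only, not part of the patch): for readers who want the two
sync strategies described above without digging through the diff, here is a
minimal, self-contained sketch. The function names are made up, only std plus
the libc crate are assumed, and error handling is kept simple; the actual
implementation lives in chunk_store.rs and datastore.rs below.

    use std::fs::{self, File};
    use std::io::Write;
    use std::os::unix::io::AsRawFd;
    use std::path::Path;

    /// "File" level: write the chunk to a temporary file, fsync it, rename it
    /// into place, then fsync the containing directory so the rename itself
    /// is durable.
    fn insert_chunk_synced(chunk_path: &Path, data: &[u8]) -> std::io::Result<()> {
        let dir = chunk_path.parent().expect("chunk path has a parent");
        let tmp_path = chunk_path.with_extension("tmp");

        let mut tmp = File::create(&tmp_path)?;
        tmp.write_all(data)?;
        tmp.sync_all()?; // fsync(2) on the chunk file itself

        fs::rename(&tmp_path, chunk_path)?;
        File::open(dir)?.sync_all()?; // fsync(2) on the directory handle

        Ok(())
    }

    /// "Filesystem" level: a single syncfs(2) on the datastore's filesystem
    /// at the end of a backup, instead of per-chunk fsyncs.
    fn sync_datastore_fs(base_path: &Path) -> std::io::Result<()> {
        let file = File::open(base_path)?;
        if unsafe { libc::syncfs(file.as_raw_fd()) } < 0 {
            return Err(std::io::Error::last_os_error());
        }
        Ok(())
    }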
 pbs-api-types/src/datastore.rs   | 37 ++++++++++++++++++++
 pbs-datastore/src/chunk_store.rs | 59 +++++++++++++++++++++++++++-----
 pbs-datastore/src/datastore.rs   | 39 ++++++++++++++++++---
 src/api2/backup/environment.rs   |  2 ++
 src/api2/config/datastore.rs     |  9 +++--
 5 files changed, 131 insertions(+), 15 deletions(-)

diff --git a/pbs-api-types/src/datastore.rs b/pbs-api-types/src/datastore.rs
index 0af11b33..15ea80cd 100644
--- a/pbs-api-types/src/datastore.rs
+++ b/pbs-api-types/src/datastore.rs
@@ -168,6 +168,42 @@ pub enum ChunkOrder {
     Inode,
 }
 
+#[api]
+#[derive(PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+/// The level of syncing that is done when writing into a datastore.
+pub enum DatastoreFSyncLevel {
+    /// No special fsync or syncfs calls are triggered. The system default dirty write back
+    /// mechanism ensures that data gets flushed eventually via the `dirty_writeback_centisecs`
+    /// and `dirty_expire_centisecs` kernel sysctls, defaulting to ~ 30s.
+    ///
+    /// This mode generally provides the best performance, as all write back can happen async,
+    /// which reduces IO pressure.
+    /// But it may cause losing data on power loss or system crash without any uninterruptible
+    /// power supply.
+    None,
+    /// Triggers an fsync after writing any chunk on the datastore. While this can slow down
+    /// backups significantly, depending on the underlying file system and storage used, it
+    /// will ensure fine-grained consistency. Depending on the exact setup, there might be no
+    /// benefits over the file system level sync, so if the setup allows it, you should prefer
+    /// that one. Despite the possible negative impact on performance, it's the most consistent
+    /// mode.
+    File,
+    /// Trigger a filesystem-wide sync after all backup data got written but before finishing the
+    /// task. This ensures that every finished backup is fully written back to storage
+    /// while reducing the impact on many file systems in contrast to the file level sync.
+    /// Depending on the setup, it might have a negative impact on unrelated write operations
+    /// of the underlying filesystem, but it is generally a good compromise between performance
+    /// and consistency.
+    Filesystem,
+}
+
+impl Default for DatastoreFSyncLevel {
+    fn default() -> Self {
+        DatastoreFSyncLevel::None
+    }
+}
+
 #[api(
     properties: {
         "chunk-order": {
@@ -182,6 +218,7 @@ pub enum ChunkOrder {
 pub struct DatastoreTuning {
     /// Iterate chunks in this order
     pub chunk_order: Option<ChunkOrder>,
+    pub sync_level: Option<DatastoreFSyncLevel>,
 }
 
 pub const DATASTORE_TUNING_STRING_SCHEMA: Schema = StringSchema::new("Datastore tuning options")
diff --git a/pbs-datastore/src/chunk_store.rs b/pbs-datastore/src/chunk_store.rs
index 145d2c7e..a8582485 100644
--- a/pbs-datastore/src/chunk_store.rs
+++ b/pbs-datastore/src/chunk_store.rs
@@ -4,7 +4,7 @@ use std::sync::{Arc, Mutex};
 
 use anyhow::{bail, format_err, Error};
 
-use pbs_api_types::GarbageCollectionStatus;
+use pbs_api_types::{DatastoreFSyncLevel, GarbageCollectionStatus};
 use proxmox_sys::fs::{create_dir, create_path, file_type_from_file_stat, CreateOptions};
 use proxmox_sys::process_locker::{
     ProcessLockExclusiveGuard, ProcessLockSharedGuard, ProcessLocker,
@@ -21,6 +21,7 @@ pub struct ChunkStore {
     chunk_dir: PathBuf,
     mutex: Mutex<()>,
     locker: Option<Arc<Mutex<ProcessLocker>>>,
+    sync_level: DatastoreFSyncLevel,
 }
 
 // TODO: what about sysctl setting vm.vfs_cache_pressure (0 - 100) ?
@@ -67,6 +68,7 @@ impl ChunkStore {
             chunk_dir: PathBuf::new(),
             mutex: Mutex::new(()),
             locker: None,
+            sync_level: Default::default(),
         }
     }
 
@@ -87,6 +89,7 @@ impl ChunkStore {
         uid: nix::unistd::Uid,
         gid: nix::unistd::Gid,
         worker: Option<&dyn WorkerTaskContext>,
+        sync_level: DatastoreFSyncLevel,
     ) -> Result<Self, Error>
     where
         P: Into<PathBuf>,
@@ -143,7 +146,7 @@ impl ChunkStore {
             }
         }
 
-        Self::open(name, base)
+        Self::open(name, base, sync_level)
     }
 
     fn lockfile_path<P: Into<PathBuf>>(base: P) -> PathBuf {
@@ -157,7 +160,11 @@ impl ChunkStore {
     /// Note that this must be used with care, as it's dangerous to create two instances on the
     /// same base path, as closing the underlying ProcessLocker drops all locks from this process
     /// on the lockfile (even if separate FDs)
-    pub(crate) fn open<P: Into<PathBuf>>(name: &str, base: P) -> Result<Self, Error> {
+    pub(crate) fn open<P: Into<PathBuf>>(
+        name: &str,
+        base: P,
+        sync_level: DatastoreFSyncLevel,
+    ) -> Result<Self, Error> {
         let base: PathBuf = base.into();
 
         if !base.is_absolute() {
@@ -180,6 +187,7 @@ impl ChunkStore {
             chunk_dir,
             locker: Some(locker),
             mutex: Mutex::new(()),
+            sync_level,
         })
     }
 
@@ -460,9 +468,27 @@ impl ChunkStore {
             }
         }
 
-        proxmox_sys::fs::replace_file(chunk_path, raw_data, CreateOptions::new(), false).map_err(
-            |err| format_err!("inserting chunk on store '{name}' failed for {digest_str} - {err}"),
-        )?;
+        let chunk_dir_path = chunk_path
+            .parent()
+            .ok_or_else(|| format_err!("unable to get chunk dir"))?
+            .to_owned();
+
+        proxmox_sys::fs::replace_file(
+            chunk_path,
+            raw_data,
+            CreateOptions::new(),
+            self.sync_level == DatastoreFSyncLevel::File,
+        )
+        .map_err(|err| {
+            format_err!("inserting chunk on store '{name}' failed for {digest_str} - {err}")
+        })?;
+
+        if self.sync_level == DatastoreFSyncLevel::File {
+            // fsync dir handle to persist the tmp rename
+            let dir = std::fs::File::open(chunk_dir_path)?;
+            nix::unistd::fsync(dir.as_raw_fd())
+                .map_err(|err| format_err!("fsync failed: {err}"))?;
+        }
 
         drop(lock);
 
@@ -519,13 +545,21 @@ fn test_chunk_store1() {
 
     if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
 
-    let chunk_store = ChunkStore::open("test", &path);
+    let chunk_store = ChunkStore::open("test", &path, DatastoreFSyncLevel::None);
     assert!(chunk_store.is_err());
 
     let user = nix::unistd::User::from_uid(nix::unistd::Uid::current())
         .unwrap()
        .unwrap();
-    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None).unwrap();
+    let chunk_store = ChunkStore::create(
+        "test",
+        &path,
+        user.uid,
+        user.gid,
+        None,
+        DatastoreFSyncLevel::None,
+    )
+    .unwrap();
 
     let (chunk, digest) = crate::data_blob::DataChunkBuilder::new(&[0u8, 1u8])
         .build()
@@ -537,7 +571,14 @@ fn test_chunk_store1() {
     let (exists, _) = chunk_store.insert_chunk(&chunk, &digest).unwrap();
     assert!(exists);
 
-    let chunk_store = ChunkStore::create("test", &path, user.uid, user.gid, None);
+    let chunk_store = ChunkStore::create(
+        "test",
+        &path,
+        user.uid,
+        user.gid,
+        None,
+        DatastoreFSyncLevel::None,
+    );
     assert!(chunk_store.is_err());
 
     if let Err(_e) = std::fs::remove_dir_all(".testdir") { /* ignore */ }
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index a539ddc5..da7bdf87 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -18,8 +18,8 @@ use proxmox_sys::WorkerTaskContext;
 use proxmox_sys::{task_log, task_warn};
 
 use pbs_api_types::{
-    Authid, BackupNamespace, BackupType, ChunkOrder, DataStoreConfig, DatastoreTuning,
-    GarbageCollectionStatus, HumanByte, Operation, UPID,
+    Authid, BackupNamespace, BackupType, ChunkOrder, DataStoreConfig, DatastoreFSyncLevel,
+    DatastoreTuning, GarbageCollectionStatus, HumanByte, Operation, UPID,
 };
 
 use crate::backup_info::{BackupDir, BackupGroup};
@@ -59,6 +59,7 @@ pub struct DataStoreImpl {
     verify_new: bool,
     chunk_order: ChunkOrder,
     last_digest: Option<[u8; 32]>,
+    sync_level: DatastoreFSyncLevel,
 }
 
 impl DataStoreImpl {
@@ -72,6 +73,7 @@ impl DataStoreImpl {
             verify_new: false,
             chunk_order: ChunkOrder::None,
             last_digest: None,
+            sync_level: Default::default(),
         })
     }
 }
@@ -151,7 +153,15 @@ impl DataStore {
             }
             Arc::clone(&datastore.chunk_store)
         } else {
-            Arc::new(ChunkStore::open(name, &config.path)?)
+            let tuning: DatastoreTuning = serde_json::from_value(
+                DatastoreTuning::API_SCHEMA
+                    .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
+            )?;
+            Arc::new(ChunkStore::open(
+                name,
+                &config.path,
+                tuning.sync_level.unwrap_or_default(),
+            )?)
         };
 
         let datastore = DataStore::with_store_and_config(chunk_store, config, Some(digest))?;
@@ -207,7 +217,12 @@ impl DataStore {
     ) -> Result<Arc<Self>, Error> {
         let name = config.name.clone();
 
-        let chunk_store = ChunkStore::open(&name, &config.path)?;
+        let tuning: DatastoreTuning = serde_json::from_value(
+            DatastoreTuning::API_SCHEMA
+                .parse_property_string(config.tuning.as_deref().unwrap_or(""))?,
+        )?;
+        let chunk_store =
+            ChunkStore::open(&name, &config.path, tuning.sync_level.unwrap_or_default())?;
         let inner = Arc::new(Self::with_store_and_config(
             Arc::new(chunk_store),
             config,
@@ -254,6 +269,7 @@ impl DataStore {
             verify_new: config.verify_new.unwrap_or(false),
             chunk_order,
             last_digest,
+            sync_level: tuning.sync_level.unwrap_or_default(),
         })
     }
 
@@ -1294,4 +1310,19 @@ impl DataStore {
         todo!("split out the namespace");
     }
     */
+
+    /// Syncs the filesystem of the datastore if 'sync_level' is set to
+    /// [`DatastoreFSyncLevel::Filesystem`]. Uses syncfs(2).
+    pub fn try_ensure_sync_level(&self) -> Result<(), Error> {
+        if self.inner.sync_level != DatastoreFSyncLevel::Filesystem {
+            return Ok(());
+        }
+        let file = std::fs::File::open(self.base_path())?;
+        let fd = file.as_raw_fd();
+        log::info!("syncing filesystem");
+        if unsafe { libc::syncfs(fd) } < 0 {
+            bail!("error during syncfs: {}", std::io::Error::last_os_error());
+        }
+        Ok(())
+    }
 }
diff --git a/src/api2/backup/environment.rs b/src/api2/backup/environment.rs
index e9a5cbc8..4f07f9b4 100644
--- a/src/api2/backup/environment.rs
+++ b/src/api2/backup/environment.rs
@@ -623,6 +623,8 @@ impl BackupEnvironment {
             }
         }
 
+        self.datastore.try_ensure_sync_level()?;
+
         // marks the backup as successful
         state.finished = true;
 
diff --git a/src/api2/config/datastore.rs b/src/api2/config/datastore.rs
index 2d769722..4269bc30 100644
--- a/src/api2/config/datastore.rs
+++ b/src/api2/config/datastore.rs
@@ -11,8 +11,8 @@ use proxmox_section_config::SectionConfigData;
 use proxmox_sys::WorkerTaskContext;
 
 use pbs_api_types::{
-    Authid, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DATASTORE_SCHEMA,
-    PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
+    Authid, DataStoreConfig, DataStoreConfigUpdater, DatastoreNotify, DatastoreTuning,
+    DATASTORE_SCHEMA, PRIV_DATASTORE_ALLOCATE, PRIV_DATASTORE_AUDIT, PRIV_DATASTORE_MODIFY,
     PROXMOX_CONFIG_DIGEST_SCHEMA,
 };
 use pbs_config::BackupLockGuard;
@@ -70,6 +70,10 @@ pub(crate) fn do_create_datastore(
 ) -> Result<(), Error> {
     let path: PathBuf = datastore.path.clone().into();
 
+    let tuning: DatastoreTuning = serde_json::from_value(
+        DatastoreTuning::API_SCHEMA
+            .parse_property_string(datastore.tuning.as_deref().unwrap_or(""))?,
+    )?;
     let backup_user = pbs_config::backup_user()?;
     let _store = ChunkStore::create(
         &datastore.name,
@@ -77,6 +81,7 @@ pub(crate) fn do_create_datastore(
         backup_user.uid,
         backup_user.gid,
         worker,
+        tuning.sync_level.unwrap_or_default(),
     )?;
 
     config.set_data(&datastore.name, "datastore", &datastore)?;
-- 
2.30.2
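Usage note (not part of the patch): with the property string schema added
here, the sync level should be configurable through the datastore's tuning
parameter. Based on the field name sync_level and the lowercase serde renaming
of the enum variants (none, file, filesystem), something along the lines of

  proxmox-backup-manager datastore update <name> --tuning 'sync-level=filesystem'

is the expected way to enable the filesystem-level sync on an existing
datastore; the exact property name follows from the schema in this patch.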