From: Fabian Ebner
To: pbs-devel@lists.proxmox.com
Date: Wed, 5 Aug 2020 09:23:15 +0200
Subject: Re: [pbs-devel] [PATCH proxmox-backup 4/7] prune: also check backup snapshot locks
In-Reply-To: <20200804104205.29540-5-s.reiter@proxmox.com>
References: <20200804104205.29540-1-s.reiter@proxmox.com> <20200804104205.29540-5-s.reiter@proxmox.com>
List-Id: Proxmox Backup Server development discussion

On 04.08.20 12:42, Stefan Reiter wrote:
> ...to avoid pruning running backups or backups used as base for others.
> 
> Unfinished backups that have no lock are considered broken and will
> always be pruned.
> 
> The ignore_locks parameter is used for testing, since flocks are not
> available in test environment.
> 
> Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
> ---
> 
> This one I'm a bit scared of: The prune logic is astonishingly confusing IMHO,
> but I hope I worked out how this makes the most sense.
> 
> I left my long-form comments in, even though they might seem a bit excessive -
> they helped me make sense of the situation, and might help in the future as
> well. But if it's too much, feel free to shorten/remove them.
> 
>  src/api2/admin/datastore.rs     |  2 +-
>  src/backup/prune.rs             | 48 +++++++++++++++++++++++----------
>  src/bin/proxmox-backup-proxy.rs |  2 +-
>  tests/prune.rs                  |  6 +++--
>  4 files changed, 40 insertions(+), 18 deletions(-)
> 
> diff --git a/src/api2/admin/datastore.rs b/src/api2/admin/datastore.rs
> index 97fd9093..6194c9c7 100644
> --- a/src/api2/admin/datastore.rs
> +++ b/src/api2/admin/datastore.rs
> @@ -621,7 +621,7 @@ fn prune(
>  
>      let list = group.list_backups(&datastore.base_path())?;
>  
> -    let mut prune_info = compute_prune_info(list, &prune_options)?;
> +    let mut prune_info = compute_prune_info(list, &prune_options, false)?;
>  
>      prune_info.reverse(); // delete older snapshots first
>  
> diff --git a/src/backup/prune.rs b/src/backup/prune.rs
> index f7a87c5a..1bc5b9e8 100644
> --- a/src/backup/prune.rs
> +++ b/src/backup/prune.rs
> @@ -45,26 +45,45 @@ fn mark_selections, &BackupInfo) -> String> (
>      }
>  }
> 
> -fn remove_incomplete_snapshots(
> +fn mark_incomplete_and_in_use_snapshots(
>      mark: &mut HashMap,
>      list: &Vec,
> +    ignore_locks: bool,
>  ) {
> 
> -    let mut keep_unfinished = true;
> +    let mut first_finished = true;
>      for info in list.iter() {
> -        // backup is considered unfinished if there is no manifest
> -        if info.is_finished() {
> -            // There is a new finished backup, so there is no need
> -            // to keep older unfinished backups.
> -            keep_unfinished = false;
> -        } else {
> -            let backup_id = info.backup_dir.relative_path();
> -            if keep_unfinished { // keep first unfinished
> -                mark.insert(backup_id, PruneMark::KeepPartial);
> -            } else {
> +        let backup_id = info.backup_dir.relative_path();
> +        let finished = info.is_finished();
> +        if ignore_locks || info.lock().is_ok() {
> +            if !finished {
> +                // incomplete backup, but we can lock it, so it's not running -
> +                // always remove to clean up broken backups
>                  mark.insert(backup_id, PruneMark::Remove);
>              }
> -            keep_unfinished = false;
> +            // if locking succeeds and the backup is complete, let the regular
> +            // marking logic figure it out
> +            // note that we don't need to keep any locks either - any newly
> +            // started backup can only ever use the latest finished backup as
> +            // base, which is kept anyway (since we always keep the latest, and
> +            // this function already filters out any unfinished ones)
> +        } else {
> +            if finished {
> +                // if the backup is finished but locking fails, we have to keep
> +                // it, however, if it's the latest finished one, it will be kept
> +                // anyway, so don't mark it at all
> +                if !first_finished {
> +                    mark.insert(backup_id, PruneMark::Keep);
> +                }
> +            } else {
> +                // ...if it's incomplete, and locking fails, that probably means
> +                // it's currently running, so keep it, but do not include it in
> +                // mark computation
> +                mark.insert(backup_id, PruneMark::KeepPartial);
> +            };
> +        }
> +        if first_finished && finished {
> +            first_finished = false;
>          }
>      }
>  }

Is there any downside to treating all finished backups where the lock
can't be acquired the same way, i.e. always marking them as Keep? Not
treating first_finished differently might save a few lines.

Another thing: by marking finished backups where we cannot acquire the
lock as Keep, they will count towards the covered time periods in the
prune selection. This might cause the following, IMHO unwanted,
behavior:

Say there are two finished backups A and B for a given month (and they
are not the first finished one). A is about to be removed with forget.
While A is being removed, the prune operation starts, say with
--keep-monthly, and it cannot acquire the lock on A. But the prune
selection considers the month to be already covered, because A is
marked as Keep, so it will mark B with Remove. In the end there is no
backup left for that month.
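To make the scenario a bit more concrete, here is a small self-contained
toy model of that situation (Mark, Snapshot and select_monthly are
invented for illustration, this is not the actual prune.rs code):

// Toy model of the scenario above. NOT the real prune.rs logic --
// `Mark`, `Snapshot` and `select_monthly` are made up for illustration.
use std::collections::{BTreeMap, HashSet};

#[derive(Debug, Clone, Copy, PartialEq)]
enum Mark { Keep, KeepPartial, Remove }

struct Snapshot { id: &'static str, month: &'static str }

// Simplified "keep-monthly" selection: walk newest to oldest and keep the
// first backup of every month that is not already covered by a `Keep` mark.
// Pre-set marks (from the locking pass) are left untouched.
fn select_monthly(
    list: &[Snapshot],
    mut marks: BTreeMap<&'static str, Mark>,
) -> BTreeMap<&'static str, Mark> {
    // months already covered by anything pre-marked as `Keep`
    let mut covered: HashSet<&'static str> = list
        .iter()
        .filter(|s| marks.get(s.id) == Some(&Mark::Keep))
        .map(|s| s.month)
        .collect();

    for snap in list {
        if marks.contains_key(snap.id) {
            continue; // already decided by the locking pass
        }
        let mark = if covered.insert(snap.month) { Mark::Keep } else { Mark::Remove };
        marks.insert(snap.id, mark);
    }
    marks
}

fn main() {
    // newest first: B and A are both finished backups of the same month
    let list = [
        Snapshot { id: "B", month: "2020-07" },
        Snapshot { id: "A", month: "2020-07" },
    ];

    // A is being forgotten concurrently, so prune cannot lock it and
    // pre-marks it as Keep: the month counts as covered and B gets Remove,
    // so once the forget on A finishes the month has no backup left.
    let mut pre = BTreeMap::new();
    pre.insert("A", Mark::Keep);
    println!("{:?}", select_monthly(&list, pre)); // {"A": Keep, "B": Remove}

    // With KeepPartial, A does not count towards coverage and B is kept.
    let mut pre = BTreeMap::new();
    pre.insert("A", Mark::KeepPartial);
    println!("{:?}", select_monthly(&list, pre)); // {"A": KeepPartial, "B": Keep}
}

The first run ends with no backup kept for 2020-07 once the concurrent
forget on A completes; the second run keeps B as the monthly backup.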
Marking these backups with KeepPartial would work around the issue,
because those are not counted towards covering time periods. Of course
the name doesn't really fit, maybe we should introduce an
Ignore/Protected mark?

I haven't thought too much about it yet, but what about selecting
normally and only removing the ones where we have the locks, warning
about the others? (A rough sketch of what I mean is at the end of this
mail, after the quoted diff.)

> @@ -173,13 +192,14 @@ impl PruneOptions {
>  pub fn compute_prune_info(
>      mut list: Vec,
>      options: &PruneOptions,
> +    ignore_locks: bool,
>  ) -> Result, Error> {
>  
>      let mut mark = HashMap::new();
>  
>      BackupInfo::sort_list(&mut list, false);
>  
> -    remove_incomplete_snapshots(&mut mark, &list);
> +    mark_incomplete_and_in_use_snapshots(&mut mark, &list, ignore_locks);
>  
>      if let Some(keep_last) = options.keep_last {
>          mark_selections(&mut mark, &list, keep_last as usize, |_local_time, info| {
> diff --git a/src/bin/proxmox-backup-proxy.rs b/src/bin/proxmox-backup-proxy.rs
> index dc0d16d2..694e4b60 100644
> --- a/src/bin/proxmox-backup-proxy.rs
> +++ b/src/bin/proxmox-backup-proxy.rs
> @@ -441,7 +441,7 @@ async fn schedule_datastore_prune() {
>              let groups = BackupGroup::list_groups(&base_path)?;
>              for group in groups {
>                  let list = group.list_backups(&base_path)?;
> -                let mut prune_info = compute_prune_info(list, &prune_options)?;
> +                let mut prune_info = compute_prune_info(list, &prune_options, false)?;
>                  prune_info.reverse(); // delete older snapshots first
>  
>                  worker.log(format!("Starting prune on store \"{}\" group \"{}/{}\"",
> diff --git a/tests/prune.rs b/tests/prune.rs
> index d9758ea7..923a94fd 100644
> --- a/tests/prune.rs
> +++ b/tests/prune.rs
> @@ -9,7 +9,8 @@ fn get_prune_list(
>      options: &PruneOptions,
>  ) -> Vec {
>  
> -    let mut prune_info = compute_prune_info(list, options).unwrap();
> +    // ignore locks, would always fail in test environment
> +    let mut prune_info = compute_prune_info(list, options, true).unwrap();
>  
>      prune_info.reverse();
>  
> @@ -38,7 +39,8 @@ fn create_info(
>          files.push(String::from(MANIFEST_BLOB_NAME));
>      }
>  
> -    BackupInfo { backup_dir, files }
> +    // note: base_path is only used for locking, which we ignore here anyway
> +    BackupInfo { backup_dir, files, base_path: PathBuf::from("/tmp/pbs-test") }
>  }
>  
>  #[test]
> 
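P.S.: regarding the "select normally, only remove what we can lock" idea
above, this is roughly what I have in mind (try_lock_snapshot,
remove_snapshot and PruneEntry are invented placeholders, not our actual
API):

// Rough sketch of "select normally, only remove what we can lock, warn
// about the rest". `try_lock_snapshot`, `remove_snapshot` and `PruneEntry`
// are made-up stand-ins, not the actual datastore API.

struct SnapshotLockGuard; // stand-in for the real per-snapshot flock guard

fn try_lock_snapshot(_path: &str) -> Result<SnapshotLockGuard, String> {
    // the real implementation would try a non-blocking flock here
    Ok(SnapshotLockGuard)
}

fn remove_snapshot(_path: &str) -> Result<(), String> {
    // the real implementation would delete the snapshot directory here
    Ok(())
}

struct PruneEntry { path: String, keep: bool }

fn prune_with_warnings(prune_info: &[PruneEntry]) {
    for entry in prune_info {
        if entry.keep {
            continue;
        }
        match try_lock_snapshot(&entry.path) {
            Ok(_guard) => {
                // lock is held until the end of this block, i.e. for the
                // whole removal
                if let Err(err) = remove_snapshot(&entry.path) {
                    eprintln!("removing {} failed: {}", entry.path, err);
                }
            }
            Err(err) => {
                // snapshot is in use (running backup, forget, base of a new
                // backup): leave it alone and warn instead of failing or
                // silently reshuffling the selection
                eprintln!("skipping {}: cannot lock snapshot ({})", entry.path, err);
            }
        }
    }
}

fn main() {
    let prune_info = vec![
        PruneEntry { path: "vm/100/2020-07-01T00:00:00Z".into(), keep: false },
        PruneEntry { path: "vm/100/2020-08-01T00:00:00Z".into(), keep: true },
    ];
    prune_with_warnings(&prune_info);
}

That way a concurrent forget or backup never changes which snapshots the
selection decides to keep; it only delays the removal of the ones we
cannot touch right now, and the user gets told about it in the task log.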