From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from gate001.proxmox.com (gate001.proxmox.com [45.144.208.40]) by lore.proxmox.com (Postfix) with ESMTPS id 8E29C1FF141 for ; Tue, 30 Jun 2026 13:50:01 +0200 (CEST) Received: from gate001.proxmox.com (localhost.localdomain [127.0.0.1]) by gate001.proxmox.com (Proxmox) with ESMTP id 5F2EF21417; Tue, 30 Jun 2026 13:50:01 +0200 (CEST) Message-ID: <9d099f5a-3121-4617-be7d-b3aa5e2c6c81@proxmox.com> Date: Tue, 30 Jun 2026 13:49:22 +0200 MIME-Version: 1.0 User-Agent: Mozilla Thunderbird From: Fiona Ebner Subject: Re: [PATCH storage v7 1/4] lvm: saferemove: zero out volumes range by range To: Lukas Sichert , pve-devel@lists.proxmox.com References: <20260616101323.24981-1-l.sichert@proxmox.com> <20260616101323.24981-2-l.sichert@proxmox.com> Content-Language: en-US In-Reply-To: <20260616101323.24981-2-l.sichert@proxmox.com> Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 7bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1782820150919 X-SPAM-LEVEL: Spam detection results: 0 DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment (newer systems) SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: LMEFGABTE6GCYPTQEEO2GRR5YCJGCSAQ X-Message-ID-Hash: LMEFGABTE6GCYPTQEEO2GRR5YCJGCSAQ X-MailFrom: f.ebner@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Am 16.06.26 um 12:12 PM schrieb Lukas Sichert: > 'saferemove' currently uses different full-volume zero-out paths: > `blkdiscard --zeroout` for devices with write-zeroes support and > `cstream` otherwise. This makes progress and throttling inconsistent and > does not allow future discard cleanup to be interleaved with zeroing. On > thin-provisioned backing storage, zeroing the whole LV before discarding > it can also force unnecessary allocation. > > Move zeroing into an explicit range loop. Use BLKZEROOUT when supported, > capped to the device limit, and fall back to manual zero writes > otherwise. Keep progress and throttling in the shared loop, and keep the > renamed LV if zeroing fails so cleanup can be retried. > > Signed-off-by: Lukas Sichert > --- > src/PVE/Storage/LVMPlugin.pm | 168 ++++++++++++++++++++++++----------- > 1 file changed, 117 insertions(+), 51 deletions(-) > > diff --git a/src/PVE/Storage/LVMPlugin.pm b/src/PVE/Storage/LVMPlugin.pm > index 443d292..f0a7a80 100644 > --- a/src/PVE/Storage/LVMPlugin.pm > +++ b/src/PVE/Storage/LVMPlugin.pm > @@ -13,11 +13,16 @@ use PVE::Tools qw(run_command file_read_firstline trim); > > use PVE::Storage::Common; > use PVE::Storage::Plugin; > +use PVE::RESTEnvironment qw(log_warn); Style nit: please adhere to the style guide for module ordering: https://pve.proxmox.com/wiki/Perl_Style_Guide#Module_Dependencies > > use base qw(PVE::Storage::Plugin); > > # lvm helper functions > > +use constant { > + BLKZEROOUT => 0x127f, > +}; > + > my $ignore_no_medium_warnings = sub { > my $line = shift; > # ignore those, most of the time they're from (virtual) IPMI/iKVM devices > @@ -279,6 +284,13 @@ sub lvm_list_volumes { > return $lvs; > } > > +my sub blockdev_ioctl_range { > + my ($fh, $ioctl_name, $ioctl, $offset, $length) = @_; > + > + my $range = pack('QQ', $offset, $length); > + ioctl($fh, $ioctl, $range) or die "$ioctl_name failed - $!\n"; > +} > + > my sub free_lvm_volumes_locked { > my ($class, $scfg, $storeid, $volnames) = @_; > > @@ -304,49 +316,97 @@ my sub free_lvm_volumes_locked { > file_read_firstline("$sysdir/queue/write_zeroes_max_bytes") // 0; > ($write_zeroes_max_bytes) = $write_zeroes_max_bytes =~ m/^(\d+)$/; #untaint > > + my $size = file_read_firstline("$sysdir/size") // 0; > + ($size) = $size =~ m/^(\d+)$/; # untaint Shouldn't we rather die when the size cannot be read or parsed? > + $size *= 512; # sysfs size is in 512-byte sectors > + > + my $zeroout_variant = 'blkzeroout'; > + > + # If the storage does not support write_zeroes fall back to writing zeroes manually using > + # syswrite. Otherwise if the storage supports write_zeroes but stepsize is too big, reduce the stepsize to Style nit: line too long > + # the maximum supported by the storage. > if ($write_zeroes_max_bytes == 0) { > - # If the storage does not support 'write zeroes', we fallback to cstream. > - # wipe throughput up to 10MB/s by default; may be overwritten with saferemove_throughput > - my $throughput = '-10485760'; > - if ($scfg->{saferemove_throughput}) { > - $throughput = $scfg->{saferemove_throughput}; > - } > + print "falling back to syswrite to zero-out '$lvmpath'\n"; Nit: maybe also mention the why, e.g. "WRITE_ZEROES operation not supported, falling back ..." > + $stepsize = 1024 * 1024; # 1 MiB > + $zeroout_variant = 'syswrite'; > + } elsif ($stepsize > $write_zeroes_max_bytes) { > + print "reduce stepsize to the maximum supported by the storage:" > + . " $write_zeroes_max_bytes bytes\n"; > + $stepsize = $write_zeroes_max_bytes; > + } > + my $zeroes = "\0" x $stepsize; > + my $throughput = -1; The old default was 10 MiB/s, we should continue using it for the syswrite case. For the BLKZEROOUT case, we can use unlimited as a default. > + if ($scfg->{saferemove_throughput}) { > + # use abs as legacy cstream accepted negative values > + $throughput = abs($scfg->{saferemove_throughput}); > + } > > - my $cmd = [ > - '/usr/bin/cstream', > - '-i', > - '/dev/zero', > - '-o', > - $lvmpath, > - '-T', > - '10', > - '-v', > - '1', > - '-b', > - '1048576', > - '-t', > - "$throughput", > - ]; > - eval { > - run_command( > - $cmd, > - errmsg => "zero out finished (note: 'No space left on device' is ok here)", > - ); > - }; > - warn $@ if $@; > - } else { > - # If the storage supports write_zeroes but stepsize is too big, reduce the stepsize to > - # the maximum supported by the storage. > - if ($write_zeroes_max_bytes > 0 && $stepsize > $write_zeroes_max_bytes) { > - print "reduce stepsize to the maximum supported by the storage:" > - . " $write_zeroes_max_bytes bytes\n"; > + open(my $fh, '+<', $lvmpath) or die "can't open '$lvmpath' - $!\n"; > > - $stepsize = $write_zeroes_max_bytes; > - } > + #eval block, so that filehandle is closed even if something fails below > + eval { > + my $start = time(); > + my $written_total = 0; > + my $lastprint = -1; > + for (my $offset = 0; $offset < $size; $offset += $stepsize) { > + > + if ($offset + $stepsize > $size) { > + $stepsize = $size - $offset; > + } > + > + if ($zeroout_variant eq 'blkzeroout') { > + eval { > + blockdev_ioctl_range($fh, 'BLKZEROOUT', BLKZEROOUT, $offset, $stepsize); > + }; > + if ($@) { > + die "blkzeroout failed: $@"; I would also log offset and length of the request, so that a failure can be analyzed better in practice. > + } > + } elsif ($zeroout_variant eq 'syswrite') { > + # if the $offset is 0, sysseek can return 0, therefore use // to only > + # throw an error, if it returns undef > + sysseek($fh, $offset, 0) // die "sysseek failed: $!\n"; You can import the constant for the WHENCE from Fcntl like is done in PVE::File, then it's more readable than 0. > > - my $cmd = ['blkdiscard', $lvmpath, '-v', '--zeroout', '--step', "${stepsize}"]; > - eval { run_command($cmd); }; > - warn $@ if $@; > + my $written = syswrite($fh, $zeroes, $stepsize) > + // die "syswrite failed: $!\n"; > + > + if ($written != $stepsize) { > + die "short syswrite: wrote $written of $stepsize bytes\n"; We should re-attempt to write the rest after a short write. I'd only die if there was really no progress at all, i.e. $written == 0. > + } > + } > + $written_total += $stepsize; > + > + my $curr_time = time(); > + if (($curr_time - $lastprint) >= 3) { > + my $percent_finished = 100 * $written_total / $size; > + my $written_gb = $written_total / (1024**3); > + my $size_gb = $size / (1024**3); You can use render_bytes() and render_duration() from PVE::Format > + my $curr_seconds = $curr_time - $start; > + printf( > + "zeroed out %.2f GiB of %.2f GiB (%.2f%%) using %s in %d seconds\n", > + $written_gb, > + $size_gb, > + $percent_finished, > + $zeroout_variant, > + $curr_seconds, > + ); > + $lastprint = $curr_time; > + } > + > + if ($throughput > 0) { > + my $expected_elapsed = $written_total / $throughput; > + my $actual_elapsed = $curr_time - $start; > + my $delay = $expected_elapsed - $actual_elapsed; > + if ($delay > 0) { > + sleep($delay); > + } > + } > + } > + }; > + # close filehandle before throwing an error > + my $err = $@; > + close($fh); > + if ($err) { > + die "zeroing out failed: $err"; > } > }; > > @@ -368,18 +428,24 @@ my sub free_lvm_volumes_locked { > errmsg => "can't refresh LV '$lvmpath' to zero-out its data", > ); > > - $secure_delete_cmd->($lvmpath); > - > - $class->cluster_lock_storage( > - $storeid, > - $scfg->{shared}, > - undef, > - sub { > - my $cmd = ['/sbin/lvremove', '-f', "$vg/del-$name"]; > - run_command($cmd, errmsg => "lvremove '$vg/del-$name' error"); > - }, > - ); > - print "successfully removed volume $name ($vg/del-$name)\n"; > + my $err = undef; > + eval { $secure_delete_cmd->($lvmpath); }; > + $err = $@ if $@; > + > + if (!$err) { Style nit: you could switch the branches and just write eval { ... }; if (my $err = $@) { > + $class->cluster_lock_storage( > + $storeid, > + $scfg->{shared}, > + undef, > + sub { > + my $cmd = ['/sbin/lvremove', '-f', "$vg/del-$name"]; > + run_command($cmd, errmsg => "lvremove '$vg/del-$name' error"); > + }, > + ); > + print "successfully removed volume $name ($vg/del-$name)\n"; > + } else { > + log_warn("$vg/del-$name is not being removed: $err"); > + } > } > }; >