From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from gate001.proxmox.com (gate001.proxmox.com [45.144.208.40]) by lore.proxmox.com (Postfix) with ESMTPS id 5A06A1FF141 for ; Tue, 30 Jun 2026 16:17:34 +0200 (CEST) Received: from gate001.proxmox.com (localhost.localdomain [127.0.0.1]) by gate001.proxmox.com (Proxmox) with ESMTP id 64E952141C; Tue, 30 Jun 2026 16:17:33 +0200 (CEST) Mime-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset=UTF-8 Date: Tue, 30 Jun 2026 16:16:56 +0200 Message-Id: Subject: Re: [PATCH storage v7 1/4] lvm: saferemove: zero out volumes range by range From: "Lukas Sichert" To: "Fiona Ebner" , References: <20260616101323.24981-1-l.sichert@proxmox.com> <20260616101323.24981-2-l.sichert@proxmox.com> <9d099f5a-3121-4617-be7d-b3aa5e2c6c81@proxmox.com> In-Reply-To: <9d099f5a-3121-4617-be7d-b3aa5e2c6c81@proxmox.com> X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1782829003840 X-SPAM-LEVEL: Spam detection results: 0 DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment (newer systems) SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: BL2YSP2BUSKX2FF4YPQ4HDAYX6J6HGO2 X-Message-ID-Hash: BL2YSP2BUSKX2FF4YPQ4HDAYX6J6HGO2 X-MailFrom: l.sichert@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: On 2026-06-30 13:49, Fiona Ebner wrote: > Am 16.06.26 um 12:12 PM schrieb Lukas Sichert: >> 'saferemove' currently uses different full-volume zero-out paths: >> `blkdiscard --zeroout` 
for devices with write-zeroes support and >> `cstream` otherwise. This makes progress and throttling inconsistent and >> does not allow future discard cleanup to be interleaved with zeroing. On >> thin-provisioned backing storage, zeroing the whole LV before discarding >> it can also force unnecessary allocation. >>=20 >> Move zeroing into an explicit range loop. Use BLKZEROOUT when supported, >> capped to the device limit, and fall back to manual zero writes >> otherwise. Keep progress and throttling in the shared loop, and keep the >> renamed LV if zeroing fails so cleanup can be retried. >>=20 >> Signed-off-by: Lukas Sichert >> --- >> src/PVE/Storage/LVMPlugin.pm | 168 ++++++++++++++++++++++++----------- >> 1 file changed, 117 insertions(+), 51 deletions(-) >>=20 >> diff --git a/src/PVE/Storage/LVMPlugin.pm b/src/PVE/Storage/LVMPlugin.pm >> index 443d292..f0a7a80 100644 >> --- a/src/PVE/Storage/LVMPlugin.pm >> +++ b/src/PVE/Storage/LVMPlugin.pm >> @@ -13,11 +13,16 @@ use PVE::Tools qw(run_command file_read_firstline tr= im); >> =20 >> use PVE::Storage::Common; >> use PVE::Storage::Plugin; >> +use PVE::RESTEnvironment qw(log_warn); > > Style nit: please adhere to the style guide for module ordering: > https://pve.proxmox.com/wiki/Perl_Style_Guide#Module_Dependencies > >> =20 >> use base qw(PVE::Storage::Plugin); >> =20 >> # lvm helper functions >> =20 >> +use constant { >> + BLKZEROOUT =3D> 0x127f, >> +}; >> + >> my $ignore_no_medium_warnings =3D sub { >> my $line =3D shift; >> # ignore those, most of the time they're from (virtual) IPMI/iKVM d= evices >> @@ -279,6 +284,13 @@ sub lvm_list_volumes { >> return $lvs; >> } >> =20 >> +my sub blockdev_ioctl_range { >> + my ($fh, $ioctl_name, $ioctl, $offset, $length) =3D @_; >> + >> + my $range =3D pack('QQ', $offset, $length); >> + ioctl($fh, $ioctl, $range) or die "$ioctl_name failed - $!\n"; >> +} >> + >> my sub free_lvm_volumes_locked { >> my ($class, $scfg, $storeid, $volnames) =3D @_; >> =20 >> @@ -304,49 +316,97 
@@ my sub free_lvm_volumes_locked { >> file_read_firstline("$sysdir/queue/write_zeroes_max_bytes")= // 0; >> ($write_zeroes_max_bytes) =3D $write_zeroes_max_bytes =3D~ m/^(= \d+)$/; #untaint >> =20 >> + my $size =3D file_read_firstline("$sysdir/size") // 0; >> + ($size) =3D $size =3D~ m/^(\d+)$/; # untaint > > Shouldn't we rather die when the size cannot be read or parsed? > >> + $size *=3D 512; # sysfs size is in 512-byte sectors >> + >> + my $zeroout_variant =3D 'blkzeroout'; >> + >> + # If the storage does not support write_zeroes fall back to wri= ting zeroes manually using >> + # syswrite. Otherwise if the storage supports write_zeroes but = stepsize is too big, reduce the stepsize to > > Style nit: line too long > >> + # the maximum supported by the storage. >> if ($write_zeroes_max_bytes =3D=3D 0) { >> - # If the storage does not support 'write zeroes', we fallba= ck to cstream. >> - # wipe throughput up to 10MB/s by default; may be overwritt= en with saferemove_throughput >> - my $throughput =3D '-10485760'; >> - if ($scfg->{saferemove_throughput}) { >> - $throughput =3D $scfg->{saferemove_throughput}; >> - } >> + print "falling back to syswrite to zero-out '$lvmpath'\n"; > > Nit: maybe also mention the why, e.g. "WRITE_ZEROES operation not > supported, falling back ..." > >> + $stepsize =3D 1024 * 1024; # 1 MiB >> + $zeroout_variant =3D 'syswrite'; >> + } elsif ($stepsize > $write_zeroes_max_bytes) { >> + print "reduce stepsize to the maximum supported by the stor= age:" >> + . " $write_zeroes_max_bytes bytes\n"; >> + $stepsize =3D $write_zeroes_max_bytes; >> + } >> + my $zeroes =3D "\0" x $stepsize; >> + my $throughput =3D -1; > > The old default was 10 MiB/s, we should continue using it for the > syswrite case. For the BLKZEROOUT case, we can use unlimited as a default.
> >> + if ($scfg->{saferemove_throughput}) { >> + # use abs as legacy cstream accepted negative values >> + $throughput =3D abs($scfg->{saferemove_throughput}); >> + } >> =20 >> - my $cmd =3D [ >> - '/usr/bin/cstream', >> - '-i', >> - '/dev/zero', >> - '-o', >> - $lvmpath, >> - '-T', >> - '10', >> - '-v', >> - '1', >> - '-b', >> - '1048576', >> - '-t', >> - "$throughput", >> - ]; >> - eval { >> - run_command( >> - $cmd, >> - errmsg =3D> "zero out finished (note: 'No space lef= t on device' is ok here)", >> - ); >> - }; >> - warn $@ if $@; >> - } else { >> - # If the storage supports write_zeroes but stepsize is too = big, reduce the stepsize to >> - # the maximum supported by the storage. >> - if ($write_zeroes_max_bytes > 0 && $stepsize > $write_zeroe= s_max_bytes) { >> - print "reduce stepsize to the maximum supported by the = storage:" >> - . " $write_zeroes_max_bytes bytes\n"; >> + open(my $fh, '+<', $lvmpath) or die "can't open '$lvmpath' - $!= \n"; >> =20 >> - $stepsize =3D $write_zeroes_max_bytes; >> - } >> + #eval block, so that filehandle is closed even if something fai= ls below >> + eval { >> + my $start =3D time(); >> + my $written_total =3D 0; >> + my $lastprint =3D -1; >> + for (my $offset =3D 0; $offset < $size; $offset +=3D $steps= ize) { >> + >> + if ($offset + $stepsize > $size) { >> + $stepsize =3D $size - $offset; >> + } >> + >> + if ($zeroout_variant eq 'blkzeroout') { >> + eval { >> + blockdev_ioctl_range($fh, 'BLKZEROOUT', BLKZERO= OUT, $offset, $stepsize); >> + }; >> + if ($@) { >> + die "blkzeroout failed: $@"; > > I would also log offset and length of the request, so that a failure can > be analyzed better in practice. 
> >> + } >> + } elsif ($zeroout_variant eq 'syswrite') { >> + # if the $offset is 0, sysseek can return 0, theref= ore use // to only >> + # throw an error, if it returns undef >> + sysseek($fh, $offset, 0) // die "sysseek failed: $!= \n"; > > You can import the constant for the WHENCE from Fcntl like is done in > PVE::File, then it's more readable than 0. > >> =20 >> - my $cmd =3D ['blkdiscard', $lvmpath, '-v', '--zeroout', '--= step', "${stepsize}"]; >> - eval { run_command($cmd); }; >> - warn $@ if $@; >> + my $written =3D syswrite($fh, $zeroes, $stepsize) >> + // die "syswrite failed: $!\n"; >> + >> + if ($written !=3D $stepsize) { >> + die "short syswrite: wrote $written of $stepsiz= e bytes\n"; > > We should re-attempt to write the rest after a short write. I'd only die > if there was really no progress at all, i.e. $written =3D=3D 0. > >> + } >> + } >> + $written_total +=3D $stepsize; >> + >> + my $curr_time =3D time(); >> + if (($curr_time - $lastprint) >=3D 3) { >> + my $percent_finished =3D 100 * $written_total / $si= ze; >> + my $written_gb =3D $written_total / (1024**3); >> + my $size_gb =3D $size / (1024**3); > > You can use render_bytes() and render_duration() from PVE::Format > >> + my $curr_seconds =3D $curr_time - $start; >> + printf( >> + "zeroed out %.2f GiB of %.2f GiB (%.2f%%) using= %s in %d seconds\n", >> + $written_gb, >> + $size_gb, >> + $percent_finished, >> + $zeroout_variant, >> + $curr_seconds, >> + ); >> + $lastprint =3D $curr_time; >> + } >> + >> + if ($throughput > 0) { >> + my $expected_elapsed =3D $written_total / $throughp= ut; >> + my $actual_elapsed =3D $curr_time - $start; >> + my $delay =3D $expected_elapsed - $actual_elapsed; >> + if ($delay > 0) { >> + sleep($delay); >> + } >> + } >> + } >> + }; >> + # close filehandle before throwing an error >> + my $err =3D $@; >> + close($fh); >> + if ($err) { >> + die "zeroing out failed: $err"; >> } >> }; >> =20 >> @@ -368,18 +428,24 @@ my sub free_lvm_volumes_locked { >> errmsg =3D> 
"can't refresh LV '$lvmpath' to zero-out it= s data", >> ); >> =20 >> - $secure_delete_cmd->($lvmpath); >> - >> - $class->cluster_lock_storage( >> - $storeid, >> - $scfg->{shared}, >> - undef, >> - sub { >> - my $cmd =3D ['/sbin/lvremove', '-f', "$vg/del-$name= "]; >> - run_command($cmd, errmsg =3D> "lvremove '$vg/del-$n= ame' error"); >> - }, >> - ); >> - print "successfully removed volume $name ($vg/del-$name)\n"= ; >> + my $err =3D undef; >> + eval { $secure_delete_cmd->($lvmpath); }; >> + $err =3D $@ if $@; >> + >> + if (!$err) { > > Style nit: you could switch the branches and just write > eval { ... }; > if (my $err =3D $@) { Yes, for this patch alone the inverted form would be nicer. I kept the separate '$err' variable intentionally because patch 2/4 adds a discard-onl= y path that reuses the same error handling. That avoids reshuffling this block again in the follow-up patch. > >> + $class->cluster_lock_storage( >> + $storeid, >> + $scfg->{shared}, >> + undef, >> + sub { >> + my $cmd =3D ['/sbin/lvremove', '-f', "$vg/del-$= name"]; >> + run_command($cmd, errmsg =3D> "lvremove '$vg/de= l-$name' error"); >> + }, >> + ); >> + print "successfully removed volume $name ($vg/del-$name= )\n"; >> + } else { >> + log_warn("$vg/del-$name is not being removed: $err"); >> + } >> } >> }; >> =20