From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pve-devel-bounces@lists.proxmox.com>
Received: from gate001.proxmox.com (gate001.proxmox.com [45.144.208.40])
	by lore.proxmox.com (Postfix) with ESMTPS id 5A06A1FF141
	for <inbox@lore.proxmox.com>; Tue, 30 Jun 2026 16:17:34 +0200 (CEST)
Received: from gate001.proxmox.com (localhost.localdomain [127.0.0.1])
	by gate001.proxmox.com (Proxmox) with ESMTP id 64E952141C;
	Tue, 30 Jun 2026 16:17:33 +0200 (CEST)
Mime-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset=UTF-8
Date: Tue, 30 Jun 2026 16:16:56 +0200
Message-Id: <DJMG539MX7J8.25O7YMZEYEE3M@proxmox.com>
Subject: Re: [PATCH storage v7 1/4] lvm: saferemove: zero out volumes range
 by range
From: "Lukas Sichert" <l.sichert@proxmox.com>
To: "Fiona Ebner" <f.ebner@proxmox.com>, <pve-devel@lists.proxmox.com>
References: <20260616101323.24981-1-l.sichert@proxmox.com>
 <20260616101323.24981-2-l.sichert@proxmox.com>
 <9d099f5a-3121-4617-be7d-b3aa5e2c6c81@proxmox.com>
In-Reply-To: <9d099f5a-3121-4617-be7d-b3aa5e2c6c81@proxmox.com>
X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2
X-Bm-Transport-Timestamp: 1782829003840
X-SPAM-LEVEL: Spam detection results:  0
	DMARC_MISSING             0.1 Missing DMARC policy
	KAM_DMARC_STATUS         0.01 Test Rule for DKIM or SPF Failure with Strict
 Alignment (newer systems)
	SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
	SPF_PASS               -0.001 SPF: sender matches SPF record
Message-ID-Hash: BL2YSP2BUSKX2FF4YPQ4HDAYX6J6HGO2
X-Message-ID-Hash: BL2YSP2BUSKX2FF4YPQ4HDAYX6J6HGO2
X-MailFrom: l.sichert@proxmox.com
X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop;
 banned-address; emergency; member-moderation; nonmember-moderation;
 administrivia; implicit-dest; max-recipients; max-size; news-moderation;
 no-subject; digests; suspicious-header
X-Mailman-Version: 3.3.10
Precedence: list
List-Id: Proxmox VE development discussion <pve-devel.lists.proxmox.com>
List-Help: <mailto:pve-devel-request@lists.proxmox.com?subject=help>
List-Owner: <mailto:pve-devel-owner@lists.proxmox.com>
List-Post: <mailto:pve-devel@lists.proxmox.com>
List-Subscribe: <mailto:pve-devel-join@lists.proxmox.com>
List-Unsubscribe: <mailto:pve-devel-leave@lists.proxmox.com>

On 2026-06-30 13:49, Fiona Ebner <f.ebner@proxmox.com> wrote:

> Am 16.06.26 um 12:12 PM schrieb Lukas Sichert:
>> 'saferemove' currently uses different full-volume zero-out paths:
>> `blkdiscard --zeroout` for devices with write-zeroes support and
>> `cstream` otherwise. This makes progress and throttling inconsistent and
>> does not allow future discard cleanup to be interleaved with zeroing. On
>> thin-provisioned backing storage, zeroing the whole LV before discarding
>> it can also force unnecessary allocation.
>>=20
>> Move zeroing into an explicit range loop. Use BLKZEROOUT when supported,
>> capped to the device limit, and fall back to manual zero writes
>> otherwise. Keep progress and throttling in the shared loop, and keep the
>> renamed LV if zeroing fails so cleanup can be retried.
>>=20
>> Signed-off-by: Lukas Sichert <l.sichert@proxmox.com>
>> ---
>>  src/PVE/Storage/LVMPlugin.pm | 168 ++++++++++++++++++++++++-----------
>>  1 file changed, 117 insertions(+), 51 deletions(-)
>>=20
>> diff --git a/src/PVE/Storage/LVMPlugin.pm b/src/PVE/Storage/LVMPlugin.pm
>> index 443d292..f0a7a80 100644
>> --- a/src/PVE/Storage/LVMPlugin.pm
>> +++ b/src/PVE/Storage/LVMPlugin.pm
>> @@ -13,11 +13,16 @@ use PVE::Tools qw(run_command file_read_firstline tr=
im);
>> =20
>>  use PVE::Storage::Common;
>>  use PVE::Storage::Plugin;
>> +use PVE::RESTEnvironment qw(log_warn);
>
> Style nit: please adhere to the style guide for module ordering:
> https://pve.proxmox.com/wiki/Perl_Style_Guide#Module_Dependencies
>
>> =20
>>  use base qw(PVE::Storage::Plugin);
>> =20
>>  # lvm helper functions
>> =20
>> +use constant {
>> +    BLKZEROOUT =3D> 0x127f,
>> +};
>> +
>>  my $ignore_no_medium_warnings =3D sub {
>>      my $line =3D shift;
>>      # ignore those, most of the time they're from (virtual) IPMI/iKVM d=
evices
>> @@ -279,6 +284,13 @@ sub lvm_list_volumes {
>>      return $lvs;
>>  }
>> =20
>> +my sub blockdev_ioctl_range {
>> +    my ($fh, $ioctl_name, $ioctl, $offset, $length) =3D @_;
>> +
>> +    my $range =3D pack('QQ', $offset, $length);
>> +    ioctl($fh, $ioctl, $range) or die "$ioctl_name failed - $!\n";
>> +}
>> +
>>  my sub free_lvm_volumes_locked {
>>      my ($class, $scfg, $storeid, $volnames) =3D @_;
>> =20
>> @@ -304,49 +316,97 @@ my sub free_lvm_volumes_locked {
>>              file_read_firstline("$sysdir/queue/write_zeroes_max_bytes")=
 // 0;
>>          ($write_zeroes_max_bytes) =3D $write_zeroes_max_bytes =3D~ m/^(=
\d+)$/; #untaint
>> =20
>> +        my $size =3D file_read_firstline("$sysdir/size") // 0;
>> +        ($size) =3D $size =3D~ m/^(\d+)$/; # untaint
>
> Shouldn't we rather die when the size cannot be read or parsed?
>
>> +        $size *=3D 512; # sysfs size is in 512-byte sectors
>> +
>> +        my $zeroout_variant =3D 'blkzeroout';
>> +
>> +        # If the storage does not support write_zeroes fall back to wri=
ting zeroes manually using
>> +        # syswrite. Otherwise if the storage supports write_zeroes but =
stepsize is too big, reduce the stepsize to
>
> Style nit: line too long
>
>> +        # the maximum supported by the storage.
>>          if ($write_zeroes_max_bytes =3D=3D 0) {
>> -            # If the storage does not support 'write zeroes', we fallba=
ck to cstream.
>> -            # wipe throughput up to 10MB/s by default; may be overwritt=
en with saferemove_throughput
>> -            my $throughput =3D '-10485760';
>> -            if ($scfg->{saferemove_throughput}) {
>> -                $throughput =3D $scfg->{saferemove_throughput};
>> -            }
>> +            print "falling back to syswrite to zero-out '$lvmpath'\n";
>
> Nit: maybe also mention the why, e.g. "WRITE_ZEROES operation not
> supported, falling back ..."
>
>> +            $stepsize =3D 1024 * 1024; # 1 MiB
>> +            $zeroout_variant =3D 'syswrite';
>> +        } elsif ($stepsize > $write_zeroes_max_bytes) {
>> +            print "reduce stepsize to the maximum supported by the stor=
age:"
>> +                . " $write_zeroes_max_bytes bytes\n";
>> +            $stepsize =3D $write_zeroes_max_bytes;
>> +        }
>> +        my $zeroes =3D "\0" x $stepsize;
>> +        my $throughput =3D -1;
>
> The old default was 10 MiB/s, we should continue using it for the
> syswrite case. For the BLKZEROOUT case, we can use unlimited as a default=
.
>
>> +        if ($scfg->{saferemove_throughput}) {
>> +            # use abs as legacy cstream accepted negative values
>> +            $throughput =3D abs($scfg->{saferemove_throughput});
>> +        }
>> =20
>> -            my $cmd =3D [
>> -                '/usr/bin/cstream',
>> -                '-i',
>> -                '/dev/zero',
>> -                '-o',
>> -                $lvmpath,
>> -                '-T',
>> -                '10',
>> -                '-v',
>> -                '1',
>> -                '-b',
>> -                '1048576',
>> -                '-t',
>> -                "$throughput",
>> -            ];
>> -            eval {
>> -                run_command(
>> -                    $cmd,
>> -                    errmsg =3D> "zero out finished (note: 'No space lef=
t on device' is ok here)",
>> -                );
>> -            };
>> -            warn $@ if $@;
>> -        } else {
>> -            # If the storage supports write_zeroes but stepsize is too =
big, reduce the stepsize to
>> -            # the maximum supported by the storage.
>> -            if ($write_zeroes_max_bytes > 0 && $stepsize > $write_zeroe=
s_max_bytes) {
>> -                print "reduce stepsize to the maximum supported by the =
storage:"
>> -                    . " $write_zeroes_max_bytes bytes\n";
>> +        open(my $fh, '+<', $lvmpath) or die "can't open '$lvmpath' - $!=
\n";
>> =20
>> -                $stepsize =3D $write_zeroes_max_bytes;
>> -            }
>> +        #eval block, so that filehandle is closed even if something fai=
ls below
>> +        eval {
>> +            my $start =3D time();
>> +            my $written_total =3D 0;
>> +            my $lastprint =3D -1;
>> +            for (my $offset =3D 0; $offset < $size; $offset +=3D $steps=
ize) {
>> +
>> +                if ($offset + $stepsize > $size) {
>> +                    $stepsize =3D $size - $offset;
>> +                }
>> +
>> +                if ($zeroout_variant eq 'blkzeroout') {
>> +                    eval {
>> +                        blockdev_ioctl_range($fh, 'BLKZEROOUT', BLKZERO=
OUT, $offset, $stepsize);
>> +                    };
>> +                    if ($@) {
>> +                        die "blkzeroout failed: $@";
>
> I would also log offset and length of the request, so that a failure can
> be analyzed better in practice.
>
>> +                    }
>> +                } elsif ($zeroout_variant eq 'syswrite') {
>> +                    # if the $offset is 0, sysseek can return 0, theref=
ore use // to only
>> +                    # throw an error, if it returns undef
>> +                    sysseek($fh, $offset, 0) // die "sysseek failed: $!=
\n";
>
> You can import the constant for the WHENCE from Fcntl like is done in
> PVE::File, then it's more readable than 0.
>
>> =20
>> -            my $cmd =3D ['blkdiscard', $lvmpath, '-v', '--zeroout', '--=
step', "${stepsize}"];
>> -            eval { run_command($cmd); };
>> -            warn $@ if $@;
>> +                    my $written =3D syswrite($fh, $zeroes, $stepsize)
>> +                        // die "syswrite failed: $!\n";
>> +
>> +                    if ($written !=3D $stepsize) {
>> +                        die "short syswrite: wrote $written of $stepsiz=
e bytes\n";
>
> We should re-attempt to write the rest after a short write. I'd only die
> if there was really no progress at all, i.e. $written =3D=3D 0.
>
>> +                    }
>> +                }
>> +                $written_total +=3D $stepsize;
>> +
>> +                my $curr_time =3D time();
>> +                if (($curr_time - $lastprint) >=3D 3) {
>> +                    my $percent_finished =3D 100 * $written_total / $si=
ze;
>> +                    my $written_gb =3D $written_total / (1024**3);
>> +                    my $size_gb =3D $size / (1024**3);
>
> You can use render_bytes() and render_duration() from PVE::Format
>
>> +                    my $curr_seconds =3D $curr_time - $start;
>> +                    printf(
>> +                        "zeroed out %.2f GiB of %.2f GiB (%.2f%%) using=
 %s in %d seconds\n",
>> +                        $written_gb,
>> +                        $size_gb,
>> +                        $percent_finished,
>> +                        $zeroout_variant,
>> +                        $curr_seconds,
>> +                    );
>> +                    $lastprint =3D $curr_time;
>> +                }
>> +
>> +                if ($throughput > 0) {
>> +                    my $expected_elapsed =3D $written_total / $throughp=
ut;
>> +                    my $actual_elapsed =3D $curr_time - $start;
>> +                    my $delay =3D $expected_elapsed - $actual_elapsed;
>> +                    if ($delay > 0) {
>> +                        sleep($delay);
>> +                    }
>> +                }
>> +            }
>> +        };
>> +        # close filehandle before throwing an error
>> +        my $err =3D $@;
>> +        close($fh);
>> +        if ($err) {
>> +            die "zeroing out failed: $err";
>>          }
>>      };
>> =20
>> @@ -368,18 +428,24 @@ my sub free_lvm_volumes_locked {
>>                  errmsg =3D> "can't refresh LV '$lvmpath' to zero-out it=
s data",
>>              );
>> =20
>> -            $secure_delete_cmd->($lvmpath);
>> -
>> -            $class->cluster_lock_storage(
>> -                $storeid,
>> -                $scfg->{shared},
>> -                undef,
>> -                sub {
>> -                    my $cmd =3D ['/sbin/lvremove', '-f', "$vg/del-$name=
"];
>> -                    run_command($cmd, errmsg =3D> "lvremove '$vg/del-$n=
ame' error");
>> -                },
>> -            );
>> -            print "successfully removed volume $name ($vg/del-$name)\n"=
;
>> +            my $err =3D undef;
>> +            eval { $secure_delete_cmd->($lvmpath); };
>> +            $err =3D $@ if $@;
>> +
>> +            if (!$err) {
>
> Style nit: you could switch the branches and just write
> eval { ... };
> if (my $err =3D $@) {

Yes, for this patch alone the inverted form would be nicer. I kept the
separate '$err' variable intentionally because patch 2/4 adds a
discard-only path that reuses the same error handling. Keeping it this
way avoids reshuffling this block again in the follow-up patch.

>
>> +                $class->cluster_lock_storage(
>> +                    $storeid,
>> +                    $scfg->{shared},
>> +                    undef,
>> +                    sub {
>> +                        my $cmd =3D ['/sbin/lvremove', '-f', "$vg/del-$=
name"];
>> +                        run_command($cmd, errmsg =3D> "lvremove '$vg/de=
l-$name' error");
>> +                    },
>> +                );
>> +                print "successfully removed volume $name ($vg/del-$name=
)\n";
>> +            } else {
>> +                log_warn("$vg/del-$name is not being removed: $err");
>> +            }
>>          }
>>      };
>> =20