From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pve-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
	by lore.proxmox.com (Postfix) with ESMTPS id BE9361FF172
	for <inbox@lore.proxmox.com>; Tue,  1 Apr 2025 15:50:54 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id BAD5D33BA0;
	Tue,  1 Apr 2025 15:50:42 +0200 (CEST)
Date: Tue, 1 Apr 2025 15:50:37 +0200 (CEST)
From: =?UTF-8?Q?Fabian_Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
To: Proxmox VE development discussion <pve-devel@lists.proxmox.com>
Message-ID: <1614620193.3974.1743515437162@webmail.proxmox.com>
In-Reply-To: <mailman.943.1741688960.293.pve-devel@lists.proxmox.com>
References: <20250311102905.2680524-1-alexandre.derumier@groupe-cyllene.com>
 <mailman.943.1741688960.293.pve-devel@lists.proxmox.com>
MIME-Version: 1.0
X-Priority: 3
Importance: Normal
X-Mailer: Open-Xchange Mailer v7.10.6-Rev75
X-Originating-Client: open-xchange-appsuite
X-SPAM-LEVEL: Spam detection results:  0
 AWL 0.046 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING             0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
Subject: Re: [pve-devel] [PATCH v4 pve-storage 1/5] qcow2: add external
 snapshot support
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion <pve-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pve-devel/>
List-Post: <mailto:pve-devel@lists.proxmox.com>
List-Help: <mailto:pve-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=subscribe>
Reply-To: Proxmox VE development discussion <pve-devel@lists.proxmox.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: pve-devel-bounces@lists.proxmox.com
Sender: "pve-devel" <pve-devel-bounces@lists.proxmox.com>

> Alexandre Derumier via pve-devel <pve-devel@lists.proxmox.com> hat am 11.03.2025 11:28 CET geschrieben:

some sort of description here would be great ;)

> ---
>  src/PVE/Storage.pm           |   4 +-
>  src/PVE/Storage/DirPlugin.pm |   1 +
>  src/PVE/Storage/Plugin.pm    | 232 +++++++++++++++++++++++++++++------
>  3 files changed, 196 insertions(+), 41 deletions(-)
> 
> diff --git a/src/PVE/Storage.pm b/src/PVE/Storage.pm
> index 3b4f041..79e5c3a 100755
> --- a/src/PVE/Storage.pm
> +++ b/src/PVE/Storage.pm
> @@ -1002,7 +1002,7 @@ sub unmap_volume {
>  }
>  
>  sub vdisk_alloc {
> -    my ($cfg, $storeid, $vmid, $fmt, $name, $size) = @_;
> +    my ($cfg, $storeid, $vmid, $fmt, $name, $size, $backing) = @_;
>  
>      die "no storage ID specified\n" if !$storeid;
>  
> @@ -1025,7 +1025,7 @@ sub vdisk_alloc {
>      # lock shared storage
>      return $plugin->cluster_lock_storage($storeid, $scfg->{shared}, undef, sub {
>  	my $old_umask = umask(umask|0037);
> -	my $volname = eval { $plugin->alloc_image($storeid, $scfg, $vmid, $fmt, $name, $size) };
> +	my $volname = eval { $plugin->alloc_image($storeid, $scfg, $vmid, $fmt, $name, $size, $backing) };
>  	my $err = $@;
>  	umask $old_umask;
>  	die $err if $err;
> diff --git a/src/PVE/Storage/DirPlugin.pm b/src/PVE/Storage/DirPlugin.pm
> index fb23e0a..1cd7ac3 100644
> --- a/src/PVE/Storage/DirPlugin.pm
> +++ b/src/PVE/Storage/DirPlugin.pm
> @@ -81,6 +81,7 @@ sub options {
>  	is_mountpoint => { optional => 1 },
>  	bwlimit => { optional => 1 },
>  	preallocation => { optional => 1 },
> +	snapext => { optional => 1 },
>     };
>  }
>  
> diff --git a/src/PVE/Storage/Plugin.pm b/src/PVE/Storage/Plugin.pm
> index 65cf43f..d7f485f 100644
> --- a/src/PVE/Storage/Plugin.pm
> +++ b/src/PVE/Storage/Plugin.pm
> @@ -216,6 +216,11 @@ my $defaultData = {
>  	    maximum => 65535,
>  	    optional => 1,
>  	},
> +        'snapext' => {
> +	    type => 'boolean',
> +	    description => 'enable external snapshot.',
> +	    optional => 1,
> +        },
>      },
>  };
>  
> @@ -716,7 +721,11 @@ sub filesystem_path {
>  
>      my $dir = $class->get_subdir($scfg, $vtype);
>  
> -    $dir .= "/$vmid" if $vtype eq 'images';
> +    if ($scfg->{snapext} && $snapname) {
> +	$name = $class->get_snap_volname($volname, $snapname);
> +    } else {
> +	$dir .= "/$vmid" if $vtype eq 'images';
> +    }

this is a bit weird, as it mixes volnames (with the `$vmid/` prefix) and names (without), it's only called twice in this patch, and this here already has $volname parsed, so could we maybe let get_snap_volname take and return the $name part without the dir?

>  
>      my $path = "$dir/$name";
>  
> @@ -873,7 +882,7 @@ sub clone_image {
>  }
>  
>  sub alloc_image {
> -    my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size) = @_;
> +    my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size, $backing) = @_;

this extends the storage API, so it should actually do that.. and probably $backing should not be an arbitrary path, but something that is resolved locally?

>  
>      my $imagedir = $class->get_subdir($scfg, 'images');
>      $imagedir .= "/$vmid";
> @@ -901,17 +910,11 @@ sub alloc_image {
>  	umask $old_umask;
>  	die $err if $err;
>      } else {
> -	my $cmd = ['/usr/bin/qemu-img', 'create'];
> -
> -	my $prealloc_opt = preallocation_cmd_option($scfg, $fmt);
> -	push @$cmd, '-o', $prealloc_opt if defined($prealloc_opt);
>  
> -	push @$cmd, '-f', $fmt, $path, "${size}K";
> -
> -	eval { run_command($cmd, errmsg => "unable to create image"); };
> +	eval { qemu_img_create($scfg, $fmt, $size, $path, $backing) };
>  	if ($@) {
>  	    unlink $path;
> -	    rmdir $imagedir;
> +	    rmdir $imagedir if !$backing;

don't think this is needed, rmdir will fail if the dir isn't empty anyway..

>  	    die "$@";
>  	}
>      }
> @@ -955,6 +958,50 @@ sub free_image {
>  # TODO taken from PVE/QemuServer/Drive.pm, avoiding duplication would be nice
>  my @checked_qemu_img_formats = qw(raw cow qcow qcow2 qed vmdk cloop);
>  
> +sub qemu_img_create {
> +    my ($scfg, $fmt, $size, $path, $backing) = @_;
> +
> +    my $cmd = ['/usr/bin/qemu-img', 'create'];
> +
> +    my $options = [];
> +
> +    if($backing) {
> +	push @$cmd, '-b', $backing, '-F', 'qcow2';
> +	push @$options, 'extended_l2=on','cluster_size=128k';
> +    };
> +    push @$options, preallocation_cmd_option($scfg, $fmt);
> +    push @$cmd, '-o', join(',', @$options) if @$options > 0;
> +    push @$cmd, '-f', $fmt, $path;
> +    push @$cmd, "${size}K" if !$backing;

is this because it will automatically take the size from the backing image?

> +
> +    run_command($cmd, errmsg => "unable to create image");
> +}
> +
> +sub qemu_img_info {
> +    my ($filename, $file_format, $timeout, $follow_backing_files) = @_;
> +
> +    my $cmd = ['/usr/bin/qemu-img', 'info', '--output=json', $filename];
> +    push $cmd->@*, '-f', $file_format if $file_format;
> +    push $cmd->@*, '--backing-chain' if $follow_backing_files;
> +
> +    my $json = '';
> +    my $err_output = '';
> +    eval {
> +        run_command($cmd,
> +            timeout => $timeout,
> +            outfunc => sub { $json .= shift },
> +            errfunc => sub { $err_output .= shift . "\n"},
> +        );
> +    };
> +    warn $@ if $@;
> +    if ($err_output) {
> +        # if qemu did not output anything to stdout we die with stderr as an error
> +        die $err_output if !$json;
> +        # otherwise we warn about it and try to parse the json
> +        warn $err_output;
> +    }
> +    return $json;
> +}
>  # set $untrusted if the file in question might be malicious since it isn't
>  # created by our stack
>  # this makes certain checks fatal, and adds extra checks for known problems like
> @@ -1018,25 +1065,9 @@ sub file_size_info {
>  	warn "file_size_info: '$filename': falling back to 'raw' from unknown format '$file_format'\n";
>  	$file_format = 'raw';
>      }
> -    my $cmd = ['/usr/bin/qemu-img', 'info', '--output=json', $filename];
> -    push $cmd->@*, '-f', $file_format if $file_format;
>  
> -    my $json = '';
> -    my $err_output = '';
> -    eval {
> -	run_command($cmd,
> -	    timeout => $timeout,
> -	    outfunc => sub { $json .= shift },
> -	    errfunc => sub { $err_output .= shift . "\n"},
> -	);
> -    };
> -    warn $@ if $@;
> -    if ($err_output) {
> -	# if qemu did not output anything to stdout we die with stderr as an error
> -	die $err_output if !$json;
> -	# otherwise we warn about it and try to parse the json
> -	warn $err_output;
> -    }
> +    my $json = qemu_img_info($filename, $file_format, $timeout);
> +
>      if (!$json) {
>  	die "failed to query file information with qemu-img\n" if $untrusted;
>  	# skip decoding if there was no output, e.g. if there was a timeout.
> @@ -1162,11 +1193,29 @@ sub volume_snapshot {
>  
>      die "can't snapshot this image format\n" if $volname !~ m/\.(qcow2|qed)$/;
>  
> -    my $path = $class->filesystem_path($scfg, $volname);
> +    if($scfg->{snapext}) {
> +
> +	my $path = $class->path($scfg, $volname, $storeid);
> +	my $snappath = $class->path($scfg, $volname, $storeid, $snap);
> +	#rename current volume to snap volume
> +	die "snapshot volume $snappath already exist\n" if -e $snappath;
> +	rename($path, $snappath) if -e $path;

this is still looking weird.. I don't think it makes sense interface wise to allow snapshotting a volume that doesn't even exist..

> +
> +	my ($vtype, $name, $vmid, undef, undef, $isBase, $format) =
> +	    $class->parse_volname($volname);
> +
> +	$class->alloc_image($storeid, $scfg, $vmid, 'qcow2', $name, undef, $snappath);
> +	if ($@) {
> +	    eval { $class->free_image($storeid, $scfg, $volname, 0) };
> +	    warn $@ if $@;

missing cleanup - this should undo the rename from above

> +	}
>  
> -    my $cmd = ['/usr/bin/qemu-img', 'snapshot','-c', $snap, $path];
> +    } else {
>  
> -    run_command($cmd);
> +	my $path = $class->filesystem_path($scfg, $volname);
> +	my $cmd = ['/usr/bin/qemu-img', 'snapshot','-c', $snap, $path];
> +	run_command($cmd);
> +    }
>  
>      return undef;
>  }
> @@ -1177,6 +1226,21 @@ sub volume_snapshot {
>  sub volume_rollback_is_possible {
>      my ($class, $scfg, $storeid, $volname, $snap, $blockers) = @_;
>  
> +    if ($scfg->{snapext}) {
> +	#technically, we could manage multibranch, we it need lot more work for snapshot delete
> +	#we need to implemente block-stream from deleted snapshot to all others child branchs
> +	#when online, we need to do a transaction for multiple disk when delete the last snapshot
> +	#and need to merge in current running file
> +
> +	my $snappath = $class->path($scfg, $volname, $storeid, $snap);
> +	my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> +	my $parentsnap = $snapshots->{current}->{parent};

wouldn't it be enough to check that this equals $snap?

> +
> +	return 1 if $snapshots->{$parentsnap}->{file} eq $snappath;
> +
> +	die "can't rollback, '$snap' is not most recent snapshot on '$volname'\n";
> +    }
> +
>      return 1;
>  }
>  
> @@ -1187,9 +1251,15 @@ sub volume_snapshot_rollback {
>  
>      my $path = $class->filesystem_path($scfg, $volname);
>  
> -    my $cmd = ['/usr/bin/qemu-img', 'snapshot','-a', $snap, $path];
> -
> -    run_command($cmd);
> +    if ($scfg->{snapext}) {
> +	#simply delete the current snapshot and recreate it
> +	my $path = $class->filesystem_path($scfg, $volname);
> +	unlink($path);
> +	$class->volume_snapshot($scfg, $storeid, $volname, $snap);

instead of volume_snapshot, this could simply call alloc_image with the backing file? then volume_snapshot could always rename and always cleanup properly..

> +    } else {
> +	my $cmd = ['/usr/bin/qemu-img', 'snapshot','-a', $snap, $path];
> +	run_command($cmd);
> +    }
>  
>      return undef;
>  }
> @@ -1201,13 +1271,49 @@ sub volume_snapshot_delete {
>  
>      return 1 if $running;
>  
> +    my $cmd = "";
>      my $path = $class->filesystem_path($scfg, $volname);
>  
> -    $class->deactivate_volume($storeid, $scfg, $volname, $snap, {});
> +    if ($scfg->{snapext}) {
> +
> +	my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> +	my $snappath = $snapshots->{$snap}->{file};
> +	die "volume $snappath is missing" if !-e $snappath;
>  
> -    my $cmd = ['/usr/bin/qemu-img', 'snapshot','-d', $snap, $path];
> +	my $parentsnap = $snapshots->{$snap}->{parent};
> +	my $childsnap = $snapshots->{$snap}->{child};
>  
> -    run_command($cmd);
> +	my $parentpath = $snapshots->{$parentsnap}->{file} if $parentsnap;
> +	my $childpath = $snapshots->{$childsnap}->{file} if $childsnap;

my $foo = .. if ...; 

is forbidden in our code ;) but I think we always need to have a childsnap anyway, right?

so we could simply check for that, and then switch around the two branches below so that one of them can do

if (my $parentsnap = ...) {
...
} else {
...
}

> +
> +	#if first snapshot,as it should be bigger,  we merge child, and rename the snapshot to child
> +	if(!$parentsnap) {
> +	    print"commit $childpath\n";
> +	    $cmd = ['/usr/bin/qemu-img', 'commit', $childpath];

we could provide `-d` here to skip emptying $childpath since we rename over it anyway below..

> +	    eval { run_command($cmd) };
> +	    if ($@) {
> +		die "error commiting $childpath to $parentpath; $@\n";

this is wrong, there is no $parentpath.. we are committing into $snappath

> +	    }
> +	    print"rename $snappath to $childpath\n";
> +	    rename($snappath, $childpath);

what if this fails?

> +	} else {
> +	    #we rebase the child image on the parent as new backing image

should we extend this to make it clear what this means? it means copying any parts of $snap that are not in $parent and not yet overwritten by $child into $child, right?

so how expensive this is depends on:
- how many changes are between $parent and $snap (increases cost)
- how many of those are overwritten by changes between $snap and $child (decreases cost)

> +	    die "missing parentsnap snapshot to rebase child $childpath\n" if !$parentpath;

how can this happen? if there is a parentsnap there must be a parentpath as well?

> +	    $cmd = ['/usr/bin/qemu-img', 'rebase', '-b', $parentpath, '-F', 'qcow2', '-f', 'qcow2', $childpath];
> +	    eval { run_command($cmd) };
> +	    if ($@) {
> +		die "error rebase $childpath from $parentpath; $@\n";
> +	    }
> +	    #delete the snapshot
> +	    unlink($snappath);
> +	}
> +
> +    } else {
> +	$class->deactivate_volume($storeid, $scfg, $volname, $snap, {});
> +
> +	$cmd = ['/usr/bin/qemu-img', 'snapshot','-d', $snap, $path];
> +	run_command($cmd);
> +    }
>  
>      return undef;
>  }
> @@ -1246,7 +1352,7 @@ sub volume_has_feature {
>  	    current => { qcow2 => 1, raw => 1, vmdk => 1 },
>  	},
>  	rename => {
> -	    current => {qcow2 => 1, raw => 1, vmdk => 1},
> +	    current => { qcow2 => 1, raw => 1, vmdk => 1},
>  	},
>      };
>  
> @@ -1481,7 +1587,37 @@ sub status {
>  sub volume_snapshot_info {
>      my ($class, $scfg, $storeid, $volname) = @_;
>  
> -    die "volume_snapshot_info is not implemented for $class";
> +    my $path = $class->filesystem_path($scfg, $volname);
> +
> +    my $backing_chain = 1;
> +    my $json = qemu_img_info($path, undef, 10, $backing_chain);
> +    die "failed to query file information with qemu-img\n" if !$json;
> +    my $snapshots = eval { decode_json($json) };

missing error handling for json decoding..

> +
> +    my $info = {};
> +    my $order = 0;
> +    for my $snap (@$snapshots) {
> +
> +	my $snapfile = $snap->{filename};
> +	my $snapname = parse_snapname($snapfile);
> +	$snapname = 'current' if !$snapname;
> +	my $snapvolname = $class->get_snap_volname($volname, $snapname);
> +
> +	$info->{$snapname}->{order} = $order;
> +	$info->{$snapname}->{file}= $snapfile;
> +	$info->{$snapname}->{volname} = $snapvolname;
> +	$info->{$snapname}->{volid} = "$storeid:$snapvolname";
> +	$info->{$snapname}->{ext} = 1;
> +
> +	my $parentfile = $snap->{'backing-filename'};
> +	if ($parentfile) {
> +	    my $parentname = parse_snapname($parentfile);
> +	    $info->{$snapname}->{parent} = $parentname;
> +	    $info->{$parentname}->{child} = $snapname;
> +	}
> +	$order++;
> +    }
> +    return $info;
>  }
>  
>  sub activate_storage {
> @@ -1867,4 +2003,22 @@ sub config_aware_base_mkdir {
>      }
>  }
>  
> +sub get_snap_volname {
> +    my ($class, $volname, $snapname) = @_;
> +
> +    my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) = $class->parse_volname($volname);
> +    $name = !$snapname || $snapname eq 'current' ? $volname : "$vmid/snap-$snapname-$name";

other way round would be better to group by volume first IMHO ($vmid/snap-$name-$snapname), as this is similar to how we encode snapshots often on the storage level (volume@snap). we also need to have some delimiter between snapshot and volume name that is not allowed in either (hard for volname since basically everything but '/' goes, but snapshots have a restricted character set (configid, which means alphanumeric, hyphen and underscore), so we could use something like '.' as delimiter? or we switch to directories and do $vmid/snap/$snap/$name?)

> +    return $name;
> +}
> +
> +sub parse_snapname {
> +    my ($name) = @_;
> +
> +    my $basename = basename($name);
> +    if ($basename =~ m/^snap-(.*)-vm(.*)$/) {

this is not strict enough, see above

> +	return $1;
> +    }
> +    return undef;
> +}
> +
>  1;
> -- 
> 2.39.5


_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel