public inbox for pve-devel@lists.proxmox.com
 help / color / mirror / Atom feed
From: "Fabian Grünbichler" <f.gruenbichler@proxmox.com>
To: Proxmox VE development discussion <pve-devel@lists.proxmox.com>
Subject: Re: [pve-devel] [PATCH v4 pve-storage 1/5] qcow2: add external snapshot support
Date: Tue, 1 Apr 2025 15:50:37 +0200 (CEST)	[thread overview]
Message-ID: <1614620193.3974.1743515437162@webmail.proxmox.com> (raw)
In-Reply-To: <mailman.943.1741688960.293.pve-devel@lists.proxmox.com>

> Alexandre Derumier via pve-devel <pve-devel@lists.proxmox.com> hat am 11.03.2025 11:28 CET geschrieben:

some sort of description here would be great ;)

> ---
>  src/PVE/Storage.pm           |   4 +-
>  src/PVE/Storage/DirPlugin.pm |   1 +
>  src/PVE/Storage/Plugin.pm    | 232 +++++++++++++++++++++++++++++------
>  3 files changed, 196 insertions(+), 41 deletions(-)
> 
> diff --git a/src/PVE/Storage.pm b/src/PVE/Storage.pm
> index 3b4f041..79e5c3a 100755
> --- a/src/PVE/Storage.pm
> +++ b/src/PVE/Storage.pm
> @@ -1002,7 +1002,7 @@ sub unmap_volume {
>  }
>  
>  sub vdisk_alloc {
> -    my ($cfg, $storeid, $vmid, $fmt, $name, $size) = @_;
> +    my ($cfg, $storeid, $vmid, $fmt, $name, $size, $backing) = @_;
>  
>      die "no storage ID specified\n" if !$storeid;
>  
> @@ -1025,7 +1025,7 @@ sub vdisk_alloc {
>      # lock shared storage
>      return $plugin->cluster_lock_storage($storeid, $scfg->{shared}, undef, sub {
>  	my $old_umask = umask(umask|0037);
> -	my $volname = eval { $plugin->alloc_image($storeid, $scfg, $vmid, $fmt, $name, $size) };
> +	my $volname = eval { $plugin->alloc_image($storeid, $scfg, $vmid, $fmt, $name, $size, $backing) };
>  	my $err = $@;
>  	umask $old_umask;
>  	die $err if $err;
> diff --git a/src/PVE/Storage/DirPlugin.pm b/src/PVE/Storage/DirPlugin.pm
> index fb23e0a..1cd7ac3 100644
> --- a/src/PVE/Storage/DirPlugin.pm
> +++ b/src/PVE/Storage/DirPlugin.pm
> @@ -81,6 +81,7 @@ sub options {
>  	is_mountpoint => { optional => 1 },
>  	bwlimit => { optional => 1 },
>  	preallocation => { optional => 1 },
> +	snapext => { optional => 1 },
>     };
>  }
>  
> diff --git a/src/PVE/Storage/Plugin.pm b/src/PVE/Storage/Plugin.pm
> index 65cf43f..d7f485f 100644
> --- a/src/PVE/Storage/Plugin.pm
> +++ b/src/PVE/Storage/Plugin.pm
> @@ -216,6 +216,11 @@ my $defaultData = {
>  	    maximum => 65535,
>  	    optional => 1,
>  	},
> +        'snapext' => {
> +	    type => 'boolean',
> +	    description => 'enable external snapshot.',
> +	    optional => 1,
> +        },
>      },
>  };
>  
> @@ -716,7 +721,11 @@ sub filesystem_path {
>  
>      my $dir = $class->get_subdir($scfg, $vtype);
>  
> -    $dir .= "/$vmid" if $vtype eq 'images';
> +    if ($scfg->{snapext} && $snapname) {
> +	$name = $class->get_snap_volname($volname, $snapname);
> +    } else {
> +	$dir .= "/$vmid" if $vtype eq 'images';
> +    }

this is a bit weird, as it mixes volnames (with the `$vmid/` prefix) and names (without), it's only called twice in this patch, and this here already has $volname parsed, so could we maybe let get_snap_volname take and return the $name part without the dir?

>  
>      my $path = "$dir/$name";
>  
> @@ -873,7 +882,7 @@ sub clone_image {
>  }
>  
>  sub alloc_image {
> -    my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size) = @_;
> +    my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size, $backing) = @_;

this extends the storage API, so it should actually do that.. and probably $backing should not be an arbitrary path, but something that is resolved locally?

>  
>      my $imagedir = $class->get_subdir($scfg, 'images');
>      $imagedir .= "/$vmid";
> @@ -901,17 +910,11 @@ sub alloc_image {
>  	umask $old_umask;
>  	die $err if $err;
>      } else {
> -	my $cmd = ['/usr/bin/qemu-img', 'create'];
> -
> -	my $prealloc_opt = preallocation_cmd_option($scfg, $fmt);
> -	push @$cmd, '-o', $prealloc_opt if defined($prealloc_opt);
>  
> -	push @$cmd, '-f', $fmt, $path, "${size}K";
> -
> -	eval { run_command($cmd, errmsg => "unable to create image"); };
> +	eval { qemu_img_create($scfg, $fmt, $size, $path, $backing) };
>  	if ($@) {
>  	    unlink $path;
> -	    rmdir $imagedir;
> +	    rmdir $imagedir if !$backing;

don't think this is needed, rmdir will fail if the dir isn't empty anyway..

>  	    die "$@";
>  	}
>      }
> @@ -955,6 +958,50 @@ sub free_image {
>  # TODO taken from PVE/QemuServer/Drive.pm, avoiding duplication would be nice
>  my @checked_qemu_img_formats = qw(raw cow qcow qcow2 qed vmdk cloop);
>  
> +sub qemu_img_create {
> +    my ($scfg, $fmt, $size, $path, $backing) = @_;
> +
> +    my $cmd = ['/usr/bin/qemu-img', 'create'];
> +
> +    my $options = [];
> +
> +    if($backing) {
> +	push @$cmd, '-b', $backing, '-F', 'qcow2';
> +	push @$options, 'extended_l2=on','cluster_size=128k';
> +    };
> +    push @$options, preallocation_cmd_option($scfg, $fmt);
> +    push @$cmd, '-o', join(',', @$options) if @$options > 0;
> +    push @$cmd, '-f', $fmt, $path;
> +    push @$cmd, "${size}K" if !$backing;

is this because it will automatically take the size from the backing image?

> +
> +    run_command($cmd, errmsg => "unable to create image");
> +}
> +
> +sub qemu_img_info {
> +    my ($filename, $file_format, $timeout, $follow_backing_files) = @_;
> +
> +    my $cmd = ['/usr/bin/qemu-img', 'info', '--output=json', $filename];
> +    push $cmd->@*, '-f', $file_format if $file_format;
> +    push $cmd->@*, '--backing-chain' if $follow_backing_files;
> +
> +    my $json = '';
> +    my $err_output = '';
> +    eval {
> +        run_command($cmd,
> +            timeout => $timeout,
> +            outfunc => sub { $json .= shift },
> +            errfunc => sub { $err_output .= shift . "\n"},
> +        );
> +    };
> +    warn $@ if $@;
> +    if ($err_output) {
> +        # if qemu did not output anything to stdout we die with stderr as an error
> +        die $err_output if !$json;
> +        # otherwise we warn about it and try to parse the json
> +        warn $err_output;
> +    }
> +    return $json;
> +}
>  # set $untrusted if the file in question might be malicious since it isn't
>  # created by our stack
>  # this makes certain checks fatal, and adds extra checks for known problems like
> @@ -1018,25 +1065,9 @@ sub file_size_info {
>  	warn "file_size_info: '$filename': falling back to 'raw' from unknown format '$file_format'\n";
>  	$file_format = 'raw';
>      }
> -    my $cmd = ['/usr/bin/qemu-img', 'info', '--output=json', $filename];
> -    push $cmd->@*, '-f', $file_format if $file_format;
>  
> -    my $json = '';
> -    my $err_output = '';
> -    eval {
> -	run_command($cmd,
> -	    timeout => $timeout,
> -	    outfunc => sub { $json .= shift },
> -	    errfunc => sub { $err_output .= shift . "\n"},
> -	);
> -    };
> -    warn $@ if $@;
> -    if ($err_output) {
> -	# if qemu did not output anything to stdout we die with stderr as an error
> -	die $err_output if !$json;
> -	# otherwise we warn about it and try to parse the json
> -	warn $err_output;
> -    }
> +    my $json = qemu_img_info($filename, $file_format, $timeout);
> +
>      if (!$json) {
>  	die "failed to query file information with qemu-img\n" if $untrusted;
>  	# skip decoding if there was no output, e.g. if there was a timeout.
> @@ -1162,11 +1193,29 @@ sub volume_snapshot {
>  
>      die "can't snapshot this image format\n" if $volname !~ m/\.(qcow2|qed)$/;
>  
> -    my $path = $class->filesystem_path($scfg, $volname);
> +    if($scfg->{snapext}) {
> +
> +	my $path = $class->path($scfg, $volname, $storeid);
> +	my $snappath = $class->path($scfg, $volname, $storeid, $snap);
> +	#rename current volume to snap volume
> +	die "snapshot volume $snappath already exist\n" if -e $snappath;
> +	rename($path, $snappath) if -e $path;

this is still looking weird.. I don't think it makes sense interface wise to allow snapshotting a volume that doesn't even exist..

> +
> +	my ($vtype, $name, $vmid, undef, undef, $isBase, $format) =
> +	    $class->parse_volname($volname);
> +
> +	$class->alloc_image($storeid, $scfg, $vmid, 'qcow2', $name, undef, $snappath);
> +	if ($@) {
> +	    eval { $class->free_image($storeid, $scfg, $volname, 0) };
> +	    warn $@ if $@;

missing cleanup - this should undo the rename from above

> +	}
>  
> -    my $cmd = ['/usr/bin/qemu-img', 'snapshot','-c', $snap, $path];
> +    } else {
>  
> -    run_command($cmd);
> +	my $path = $class->filesystem_path($scfg, $volname);
> +	my $cmd = ['/usr/bin/qemu-img', 'snapshot','-c', $snap, $path];
> +	run_command($cmd);
> +    }
>  
>      return undef;
>  }
> @@ -1177,6 +1226,21 @@ sub volume_snapshot {
>  sub volume_rollback_is_possible {
>      my ($class, $scfg, $storeid, $volname, $snap, $blockers) = @_;
>  
> +    if ($scfg->{snapext}) {
> +	#technically, we could manage multibranch, we it need lot more work for snapshot delete
> +	#we need to implemente block-stream from deleted snapshot to all others child branchs
> +	#when online, we need to do a transaction for multiple disk when delete the last snapshot
> +	#and need to merge in current running file
> +
> +	my $snappath = $class->path($scfg, $volname, $storeid, $snap);
> +	my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> +	my $parentsnap = $snapshots->{current}->{parent};

wouldn't it be enough to check that this equals $snap?

> +
> +	return 1 if $snapshots->{$parentsnap}->{file} eq $snappath;
> +
> +	die "can't rollback, '$snap' is not most recent snapshot on '$volname'\n";
> +    }
> +
>      return 1;
>  }
>  
> @@ -1187,9 +1251,15 @@ sub volume_snapshot_rollback {
>  
>      my $path = $class->filesystem_path($scfg, $volname);
>  
> -    my $cmd = ['/usr/bin/qemu-img', 'snapshot','-a', $snap, $path];
> -
> -    run_command($cmd);
> +    if ($scfg->{snapext}) {
> +	#simply delete the current snapshot and recreate it
> +	my $path = $class->filesystem_path($scfg, $volname);
> +	unlink($path);
> +	$class->volume_snapshot($scfg, $storeid, $volname, $snap);

instead of volume_snapshot, this could simply call alloc_image with the backing file? then volume_snapshot could always rename and always cleanup properly..

> +    } else {
> +	my $cmd = ['/usr/bin/qemu-img', 'snapshot','-a', $snap, $path];
> +	run_command($cmd);
> +    }
>  
>      return undef;
>  }
> @@ -1201,13 +1271,49 @@ sub volume_snapshot_delete {
>  
>      return 1 if $running;
>  
> +    my $cmd = "";
>      my $path = $class->filesystem_path($scfg, $volname);
>  
> -    $class->deactivate_volume($storeid, $scfg, $volname, $snap, {});
> +    if ($scfg->{snapext}) {
> +
> +	my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> +	my $snappath = $snapshots->{$snap}->{file};
> +	die "volume $snappath is missing" if !-e $snappath;
>  
> -    my $cmd = ['/usr/bin/qemu-img', 'snapshot','-d', $snap, $path];
> +	my $parentsnap = $snapshots->{$snap}->{parent};
> +	my $childsnap = $snapshots->{$snap}->{child};
>  
> -    run_command($cmd);
> +	my $parentpath = $snapshots->{$parentsnap}->{file} if $parentsnap;
> +	my $childpath = $snapshots->{$childsnap}->{file} if $childsnap;

my $foo = .. if ...; 

is forbidden in our code ;) but I think we always need to have a childsnap anyway, right?

so we could simply check for that, and then switch around the two branches below so that one of them can do

if (my $parentsnap = ...) {
...
} else {
...
}

> +
> +	#if first snapshot,as it should be bigger,  we merge child, and rename the snapshot to child
> +	if(!$parentsnap) {
> +	    print"commit $childpath\n";
> +	    $cmd = ['/usr/bin/qemu-img', 'commit', $childpath];

we could provide `-d` here to skip emptying $childpath since we rename over it anyway below..

> +	    eval { run_command($cmd) };
> +	    if ($@) {
> +		die "error commiting $childpath to $parentpath; $@\n";

this is wrong, there is no $parentpath.. we are committing into $snappath

> +	    }
> +	    print"rename $snappath to $childpath\n";
> +	    rename($snappath, $childpath);

what if this fails?

> +	} else {
> +	    #we rebase the child image on the parent as new backing image

should we extend this to make it clear what this means? it means copying any parts of $snap that are not in $parent and not yet overwritten by $child into $child, right?

so how expensive this is depends on:
- how many changes are between $parent and $snap (increases cost)
- how many of those are overwritten by changes between $snap and $child (decreases cost)

> +	    die "missing parentsnap snapshot to rebase child $childpath\n" if !$parentpath;

how can this happen? if there is a parentsnap there must be a parentpath as well?

> +	    $cmd = ['/usr/bin/qemu-img', 'rebase', '-b', $parentpath, '-F', 'qcow2', '-f', 'qcow2', $childpath];
> +	    eval { run_command($cmd) };
> +	    if ($@) {
> +		die "error rebase $childpath from $parentpath; $@\n";
> +	    }
> +	    #delete the snapshot
> +	    unlink($snappath);
> +	}
> +
> +    } else {
> +	$class->deactivate_volume($storeid, $scfg, $volname, $snap, {});
> +
> +	$cmd = ['/usr/bin/qemu-img', 'snapshot','-d', $snap, $path];
> +	run_command($cmd);
> +    }
>  
>      return undef;
>  }
> @@ -1246,7 +1352,7 @@ sub volume_has_feature {
>  	    current => { qcow2 => 1, raw => 1, vmdk => 1 },
>  	},
>  	rename => {
> -	    current => {qcow2 => 1, raw => 1, vmdk => 1},
> +	    current => { qcow2 => 1, raw => 1, vmdk => 1},
>  	},
>      };
>  
> @@ -1481,7 +1587,37 @@ sub status {
>  sub volume_snapshot_info {
>      my ($class, $scfg, $storeid, $volname) = @_;
>  
> -    die "volume_snapshot_info is not implemented for $class";
> +    my $path = $class->filesystem_path($scfg, $volname);
> +
> +    my $backing_chain = 1;
> +    my $json = qemu_img_info($path, undef, 10, $backing_chain);
> +    die "failed to query file information with qemu-img\n" if !$json;
> +    my $snapshots = eval { decode_json($json) };

missing error handling for json decoding..

> +
> +    my $info = {};
> +    my $order = 0;
> +    for my $snap (@$snapshots) {
> +
> +	my $snapfile = $snap->{filename};
> +	my $snapname = parse_snapname($snapfile);
> +	$snapname = 'current' if !$snapname;
> +	my $snapvolname = $class->get_snap_volname($volname, $snapname);
> +
> +	$info->{$snapname}->{order} = $order;
> +	$info->{$snapname}->{file}= $snapfile;
> +	$info->{$snapname}->{volname} = $snapvolname;
> +	$info->{$snapname}->{volid} = "$storeid:$snapvolname";
> +	$info->{$snapname}->{ext} = 1;
> +
> +	my $parentfile = $snap->{'backing-filename'};
> +	if ($parentfile) {
> +	    my $parentname = parse_snapname($parentfile);
> +	    $info->{$snapname}->{parent} = $parentname;
> +	    $info->{$parentname}->{child} = $snapname;
> +	}
> +	$order++;
> +    }
> +    return $info;
>  }
>  
>  sub activate_storage {
> @@ -1867,4 +2003,22 @@ sub config_aware_base_mkdir {
>      }
>  }
>  
> +sub get_snap_volname {
> +    my ($class, $volname, $snapname) = @_;
> +
> +    my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) = $class->parse_volname($volname);
> +    $name = !$snapname || $snapname eq 'current' ? $volname : "$vmid/snap-$snapname-$name";

other way round would be better to group by volume first IMHO ($vmid/snap-$name-$snapname), as this is similar to how we encode snapshots often on the storage level (volume@snap). we also need to have some delimiter between snapshot and volume name that is not allowed in either (hard for volname since basically everything but '/' goes, but snapshots have a restricted character set (configid, which means alphanumeric, hyphen and underscore), so we could use something like '.' as delimiter? or we switch to directories and do $vmid/snap/$snap/$name?)

> +    return $name;
> +}
> +
> +sub parse_snapname {
> +    my ($name) = @_;
> +
> +    my $basename = basename($name);
> +    if ($basename =~ m/^snap-(.*)-vm(.*)$/) {

this is not strict enough, see above

> +	return $1;
> +    }
> +    return undef;
> +}
> +
>  1;
> -- 
> 2.39.5


_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


  reply	other threads:[~2025-04-01 13:50 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20250311102905.2680524-1-alexandre.derumier@groupe-cyllene.com>
2025-03-11 10:28 ` [pve-devel] [PATCH v4 pve-qemu 1/1] add block-commit-replaces option patch Alexandre Derumier via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 qemu-server 01/11] blockdev: cmdline: convert drive to blockdev syntax Alexandre Derumier via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 pve-storage 1/5] qcow2: add external snapshot support Alexandre Derumier via pve-devel
2025-04-01 13:50   ` Fabian Grünbichler [this message]
2025-04-02  8:01     ` DERUMIER, Alexandre via pve-devel
     [not found]     ` <0e2cd118f35aa8d4c410d362fea1a1b366df1570.camel@groupe-cyllene.com>
2025-04-02  8:28       ` Fabian Grünbichler
2025-04-03  4:27         ` DERUMIER, Alexandre via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 qemu-server 02/11] blockdev : convert qemu_driveadd && qemu_drivedel Alexandre Derumier via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 pve-storage 2/5] lvmplugin: add qcow2 snapshot Alexandre Derumier via pve-devel
2025-04-01 13:50   ` Fabian Grünbichler
2025-03-11 10:28 ` [pve-devel] [PATCH v4 qemu-server 03/11] replace qemu_block_set_io_throttle with qom-set throttlegroup limits Alexandre Derumier via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 pve-storage 3/5] storage: vdisk_free: remove external snapshots Alexandre Derumier via pve-devel
2025-04-01 13:50   ` Fabian Grünbichler
2025-04-07 11:02     ` DERUMIER, Alexandre via pve-devel
2025-04-07 11:29     ` DERUMIER, Alexandre via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 qemu-server 04/11] blockdev: vm_devices_list : fix block-query Alexandre Derumier via pve-devel
2025-04-02  8:10   ` Fabian Grünbichler
2025-04-11 17:32     ` DERUMIER, Alexandre via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 pve-storage 4/5] lvm: lvrename helper: allow path Alexandre Derumier via pve-devel
2025-04-01 13:50   ` Fabian Grünbichler
2025-03-11 10:28 ` [pve-devel] [PATCH v4 qemu-server 05/11] blockdev: convert cdrom media eject/insert Alexandre Derumier via pve-devel
2025-03-11 10:28 ` [pve-devel] [PATCH v4 pve-storage 5/5] lvm: add lvremove helper Alexandre Derumier via pve-devel
2025-04-01 13:50   ` Fabian Grünbichler
2025-03-11 10:29 ` [pve-devel] [PATCH v4 qemu-server 06/11] blockdev: block_resize: convert to blockdev Alexandre Derumier via pve-devel
2025-03-11 10:29 ` [pve-devel] [PATCH v4 qemu-server 07/11] blockdev: nbd_export: block-export-add : use drive-$id for nodename Alexandre Derumier via pve-devel
2025-03-11 10:29 ` [pve-devel] [PATCH v4 qemu-server 08/11] blockdev: convert drive_mirror to blockdev_mirror Alexandre Derumier via pve-devel
2025-03-11 10:29 ` [pve-devel] [PATCH v4 qemu-server 09/11] blockdev: change aio on target if io_uring is not default Alexandre Derumier via pve-devel
2025-03-11 10:29 ` [pve-devel] [PATCH v4 qemu-server 10/11] blockdev: add backing_chain support Alexandre Derumier via pve-devel
2025-04-02  8:10   ` Fabian Grünbichler
2025-03-11 10:29 ` [pve-devel] [PATCH v4 qemu-server 11/11] qcow2: add external snapshot support Alexandre Derumier via pve-devel
2025-04-02  8:10   ` Fabian Grünbichler
2025-04-03  4:51     ` DERUMIER, Alexandre via pve-devel
2025-04-04 11:31     ` DERUMIER, Alexandre via pve-devel
     [not found]     ` <3e516016a970e52e5a1014dbcd6cf9507581da74.camel@groupe-cyllene.com>
2025-04-04 11:37       ` Fabian Grünbichler
2025-04-04 13:02         ` DERUMIER, Alexandre via pve-devel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1614620193.3974.1743515437162@webmail.proxmox.com \
    --to=f.gruenbichler@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal