From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pve-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9])
	by lore.proxmox.com (Postfix) with ESMTPS id 8FAA01FF16E
	for <inbox@lore.proxmox.com>; Mon, 31 Mar 2025 15:22:58 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id 51C54544C;
	Mon, 31 Mar 2025 15:20:48 +0200 (CEST)
From: Fiona Ebner <f.ebner@proxmox.com>
To: pve-devel@lists.proxmox.com
Date: Mon, 31 Mar 2025 15:20:04 +0200
Message-Id: <20250331132020.105324-22-f.ebner@proxmox.com>
X-Mailer: git-send-email 2.39.5
In-Reply-To: <20250331132020.105324-1-f.ebner@proxmox.com>
References: <20250331132020.105324-1-f.ebner@proxmox.com>
MIME-Version: 1.0
X-SPAM-LEVEL: Spam detection results:  0
 AWL -0.039 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING             0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
Subject: [pve-devel] [PATCH qemu-server v6 21/37] backup: implement backup
 for external providers
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion <pve-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pve-devel/>
List-Post: <mailto:pve-devel@lists.proxmox.com>
List-Help: <mailto:pve-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=subscribe>
Reply-To: Proxmox VE development discussion <pve-devel@lists.proxmox.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: pve-devel-bounces@lists.proxmox.com
Sender: "pve-devel" <pve-devel-bounces@lists.proxmox.com>

The state of the VM's disk images at the time the backup is started is
preserved via a snapshot-access block node. Old data is moved to the
fleecing image when new guest writes come in. The snapshot-access
block node, as well as the associated bitmap in case of incremental
backup, will be made available to the external provider. Both are
exported via NBD. For the 'nbd' mechanism, the NBD socket path is
passed to the provider. For the 'file-handle' mechanism, the NBD
export is made accessible via a file handle and the bitmap information
is made available via a $next_dirty_region->() function. The
'file-handle' mechanism requires the 'nbdinfo' and 'nbdfuse' binaries.

The provider can indicate that it wants to do an incremental backup by
returning the bitmap ID that was used for a previous backup and it
will then be told if the bitmap was newly created (either first backup
or old bitmap was invalid) or if the bitmap can be reused.

The provider then reads the parts of the NBD or virtual file it needs,
either the full disk for full backup, or the dirty parts according to
the bitmap for incremental backup. The bitmap has to be respected:
reads from other parts of the image will return an error. After backing
up each part of the disk, it should be discarded in the export to
avoid unnecessary space usage in the fleecing image (requires the
storage underlying the fleecing image to support discard too).

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---

Changes in v6:
* Drop outdated call to backup_get_task_size().

 PVE/VZDump/QemuServer.pm | 399 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 397 insertions(+), 2 deletions(-)

diff --git a/PVE/VZDump/QemuServer.pm b/PVE/VZDump/QemuServer.pm
index 65f01791..5e959de4 100644
--- a/PVE/VZDump/QemuServer.pm
+++ b/PVE/VZDump/QemuServer.pm
@@ -3,12 +3,15 @@ package PVE::VZDump::QemuServer;
 use strict;
 use warnings;
 
+use Fcntl qw(:mode);
 use File::Basename;
-use File::Path;
+use File::Path qw(make_path remove_tree);
+use File::stat qw();
 use IO::File;
 use IPC::Open3;
 use JSON;
 use POSIX qw(EINTR EAGAIN);
+use Time::HiRes qw(usleep);
 
 use PVE::Cluster qw(cfs_read_file);
 use PVE::INotify;
@@ -20,7 +23,7 @@ use PVE::QMPClient;
 use PVE::Storage::Plugin;
 use PVE::Storage::PBSPlugin;
 use PVE::Storage;
-use PVE::Tools;
+use PVE::Tools qw(run_command);
 use PVE::VZDump;
 use PVE::Format qw(render_duration render_bytes);
 
@@ -30,6 +33,7 @@ use PVE::QemuServer::Drive qw(checked_volume_format);
 use PVE::QemuServer::Helpers;
 use PVE::QemuServer::Machine;
 use PVE::QemuServer::Monitor qw(mon_cmd);
+use PVE::QemuServer::QMPHelpers;
 
 use base qw (PVE::VZDump::Plugin);
 
@@ -284,6 +288,8 @@ sub archive {
 
     if ($self->{vzdump}->{opts}->{pbs}) {
 	$self->archive_pbs($task, $vmid);
+    } elsif ($self->{vzdump}->{'backup-provider'}) {
+	$self->archive_external($task, $vmid);
     } else {
 	$self->archive_vma($task, $vmid, $filename, $comp);
     }
@@ -1148,11 +1154,90 @@ sub snapshot {
     # nothing to do
 }
 
+# Close each handle in the $file_handles array reference; failures only produce a warning.
+my sub cleanup_file_handles {
+    my ($self, $file_handles) = @_;
+    for my $fh ($file_handles->@*) {
+	$self->log('warn', "unable to close file handle - $!") if !close($fh);
+    }
+}
+
+# Lazily unmount the nbdfuse mounts tracked in $info and wait up to 5 seconds for them to vanish.
+my sub cleanup_nbd_mounts {
+    my ($self, $info) = @_;
+    for my $mount_point (keys $info->%*) {
+	my $pid_file = delete($info->{$mount_point}->{'pid-file'}); # unset if setup failed early
+	unlink($pid_file) or $self->log('warn', "unable to unlink '$pid_file' - $!") if $pid_file;
+	# Do a lazy unmount, because the target might still be busy even if the file handle was
+	# already closed.
+	eval { run_command(['fusermount', '-z', '-u', $mount_point ]); };
+	if (my $err = $@) {
+	    delete $info->{$mount_point};
+	    $self->log('warn', "unable to unmount NBD backup source '$mount_point' - $err");
+	}
+    }
+
+    # Wait for the unmount before cleaning up child PIDs to avoid 'nbdfuse' processes being
+    # interrupted by the signals issued there.
+    my $wait_limit = 50; # 5 seconds
+    for (my $waited = 0; $waited < $wait_limit && scalar(keys $info->%*); $waited++) {
+	for my $mount_point (keys $info->%*) {
+	    next if -e $info->{$mount_point}->{'virtual-file'}; # still mounted, check again later
+	    eval { remove_tree($mount_point); }; # only touch the directory once the mount is gone
+	    delete($info->{$mount_point});
+	}
+	usleep(100_000) if scalar(keys $info->%*); # no need to wait if nothing is left
+    }
+    # just informational, remaining child processes will be killed afterwards
+    $self->loginfo("unable to gracefully cleanup NBD fuse mounts") if scalar(keys $info->%*) != 0;
+}
+
+# Collect the child processes in $cpids (PID => 1): SIGTERM first, SIGKILL after 5 seconds.
+my sub cleanup_child_processes {
+    my ($self, $cpids) = @_;
+
+    my $wait_limit = 5; # seconds to wait after SIGTERM before resorting to SIGKILL
+    for (my $waited = 0; $waited <= $wait_limit; $waited++) {
+	for my $cpid (keys $cpids->%*) {
+	    # -1 means the child is already gone. Such a PID must be dropped too, because it might
+	    # have been reused and signaling it could then hit an unrelated process.
+	    delete($cpids->{$cpid}) if waitpid($cpid, POSIX::WNOHANG) != 0;
+	}
+	return if !scalar(keys $cpids->%*); # avoids sleeping when nothing is left to wait for
+	kill(15, keys $cpids->%*) if $waited == 0;
+	kill(9, keys $cpids->%*) if $waited == $wait_limit;
+	sleep 1;
+    }
+
+    # last attempt to collect after SIGKILL, anything still left is only reported
+    for my $cpid (keys $cpids->%*) {
+	delete($cpids->{$cpid}) if waitpid($cpid, POSIX::WNOHANG) != 0;
+    }
+    $self->log('warn', "unable to collect child process '$_'") for keys $cpids->%*;
+}
+
 sub cleanup {
     my ($self, $task, $vmid) = @_;
 
     # If VM was started only for backup, it is already stopped now.
     if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
+	if ($task->{cleanup}->{'nbd-stop'}) {
+	    eval { PVE::QemuServer::QMPHelpers::nbd_stop($vmid); };
+	    $self->logerr($@) if $@;
+	}
+
+	if (my $info = $task->{cleanup}->{'backup-access-teardown'}) {
+	    my $params = {
+		'target-id' => $info->{'target-id'},
+		timeout => 60,
+		success => $info->{success} ? JSON::true : JSON::false,
+	    };
+
+	    $self->loginfo("tearing down backup-access");
+	    eval { mon_cmd($vmid, "backup-access-teardown", $params->%*) };
+	    $self->logerr($@) if $@;
+	}
+
 	$detach_tpmstate_drive->($task, $vmid);
 	detach_fleecing_images($task->{disks}, $vmid) if $task->{'use-fleecing'};
     }
@@ -1162,6 +1247,316 @@ sub cleanup {
     if ($self->{qmeventd_fh}) {
 	close($self->{qmeventd_fh});
     }
+
+    cleanup_file_handles($self, $task->{cleanup}->{'file-handles'})
+	if $task->{cleanup}->{'file-handles'};
+
+    cleanup_nbd_mounts($self, $task->{cleanup}->{'nbd-mounts'})
+	if $task->{cleanup}->{'nbd-mounts'};
+
+    cleanup_child_processes($self, $task->{cleanup}->{'child-pids'})
+	if $task->{cleanup}->{'child-pids'};
+
+    if (my $dir = $task->{'backup-access-root-dir'}) {
+	eval { remove_tree($dir) };
+	$self->log('warn', "unable to cleanup directory $dir - $@") if $@;
+    }
+}
+
+# Provide the NBD export of $device_name as a virtual file via nbdfuse. Returns an O_DIRECT file
+# handle for it and an iterator yielding the next dirty (offset, length) tuple, or nothing once
+# exhausted. Child processes, mounts and handles are registered in $task->{cleanup}.
+my sub virtual_file_backup_prepare {
+    my ($self, $vmid, $task, $device_name, $size, $nbd_path, $bitmap_name) = @_;
+
+    my $cleanup = $task->{cleanup};
+
+    my $nbd_uri = "nbd+unix:///${device_name}?socket=${nbd_path}";
+
+    my ($error_fh, $next_dirty_region);
+
+    # If there is no dirty bitmap, it can be treated as if there's a full dirty one. The output of
+    # nbdinfo is a list of tuples with offset, length, type, description. The first bit of 'type' is
+    # set when the bitmap is dirty, see QEMU's docs/interop/nbd.txt
+    if ($bitmap_name) {
+	my $input = IO::File->new();
+	my $info = IO::File->new();
+	$error_fh = IO::File->new();
+	my $nbdinfo_cmd = ["nbdinfo", $nbd_uri, "--map=qemu:dirty-bitmap:${bitmap_name}"];
+	my $cpid = open3($input, $info, $error_fh, $nbdinfo_cmd->@*)
+	    or die "failed to spawn nbdinfo child - $!\n";
+	$cleanup->{'child-pids'}->{$cpid} = 1;
+
+	$next_dirty_region = sub {
+	    my ($offset, $length, $type);
+	    do {
+		my $line = <$info>;
+		return if !defined($line); # EOF
+		# Require whitespace between the fields, so that a malformed line cannot be split
+		# up into bogus values via backtracking.
+		die "unexpected output from nbdinfo - $line\n"
+		    if $line !~ m/^\s*(\d+)\s+(\d+)\s+(\d+)/; # also untaints
+		($offset, $length, $type) = ($1, $2, $3);
+	    } while (($type & 0x1) == 0); # not dirty
+	    return ($offset, $length);
+	};
+    } else {
+	my $done = 0;
+	$next_dirty_region = sub {
+	    return if $done;
+	    $done = 1;
+	    return (0, $size);
+	};
+    }
+
+    my $mount_point = $task->{'backup-access-root-dir'}
+	."/${vmid}-nbd.backup-access.${device_name}.$$";
+    make_path($mount_point) or die "unable to create directory $mount_point\n";
+    $cleanup->{'nbd-mounts'}->{$mount_point} = {};
+
+    # Note that nbdfuse requires "$dir/$file". A single name would be treated as a dir and the file
+    # would be named "$dir/nbd" then
+    my $virtual_file = "${mount_point}/${device_name}";
+    $cleanup->{'nbd-mounts'}->{$mount_point}->{'virtual-file'} = $virtual_file;
+
+    my $pid_file = "${mount_point}.pid";
+    PVE::Tools::file_set_contents($pid_file, '', 0600);
+    $cleanup->{'nbd-mounts'}->{$mount_point}->{'pid-file'} = $pid_file;
+
+    my $cpid = fork() // die "fork failed: $!\n";
+    if (!$cpid) {
+	# By default, access will be restricted to the current user, because the allow_other fuse
+	# mount option is not used.
+	eval {
+	    run_command(
+		["nbdfuse", '--pidfile', $pid_file, $virtual_file, $nbd_uri],
+		logfunc => sub { $self->loginfo("nbdfuse '$virtual_file': $_[0]") },
+	    );
+	};
+	if (my $err = $@) {
+	    eval { $self->loginfo($err); };
+	    POSIX::_exit(1);
+	}
+	POSIX::_exit(0);
+    }
+    $cleanup->{'child-pids'}->{$cpid} = 1;
+
+    # Non-empty PID file contents are taken as the sign that nbdfuse has set up the virtual file.
+    my $virtual_file_ready = 0;
+    for (my $waited = 0; $waited < 30; $waited++) { # 3 seconds
+	$virtual_file_ready = 1 if PVE::Tools::file_read_firstline($pid_file);
+	last if $virtual_file_ready;
+	usleep(100_000);
+    }
+    die "timeout setting up virtual file '$virtual_file'\n" if !$virtual_file_ready;
+
+    $self->loginfo("provided NBD export as a virtual file '$virtual_file'");
+
+    # NOTE O_DIRECT, because each block should be read exactly once and also because fuse will try
+    # to read ahead otherwise, which would produce warning messages if the next block is not
+    # mapped/allocated for the NBD export in case of incremental backup. Open as writable to support
+    # discard.
+    my $fh = IO::File->new($virtual_file, O_RDWR | O_DIRECT)
+	or die "unable to open backup source '$virtual_file' - $!\n";
+    push $cleanup->{'file-handles'}->@*, $fh;
+
+    return ($fh, $next_dirty_region);
+}
+
+# Build the per-device volume info hash for the backup provider from the list returned by the
+# 'backup-access-setup' QMP command: bitmap mode, size and the mechanism-specific access details.
+my sub backup_access_to_volume_info {
+    my ($self, $vmid, $task, $backup_access_info, $mechanism, $nbd_path) = @_;
+
+    my $bitmap_action_to_status = {
+	'not-used' => 'none',
+	'not-used-removed' => 'none',
+	'new' => 'new',
+	'used' => 'reuse',
+	'invalid' => 'new',
+    };
+
+    my $volumes = {};
+
+    for my $info ($backup_access_info->@*) {
+	my $bitmap_status = 'none';
+	my $bitmap_name;
+	if (my $bitmap_action = $info->{'bitmap-action'}) {
+	    $bitmap_status = $bitmap_action_to_status->{$bitmap_action}
+		or die "got unexpected bitmap action '$bitmap_action'\n";
+	    $bitmap_name = $info->{'bitmap-name'} or die "bitmap-name is not present\n";
+	}
+
+	my ($device, $size) = $info->@{qw(device size)};
+	$volumes->{$device}->{'bitmap-mode'} = $bitmap_status;
+	$volumes->{$device}->{size} = $size;
+
+	if ($mechanism eq 'file-handle') {
+	    my ($fh, $next_dirty_region) = virtual_file_backup_prepare(
+		$self, $vmid, $task, $device, $size, $nbd_path, $bitmap_name);
+	    $volumes->{$device}->{'file-handle'} = $fh;
+	    $volumes->{$device}->{'next-dirty-region'} = $next_dirty_region;
+	} elsif ($mechanism eq 'nbd') {
+	    $volumes->{$device}->{'nbd-path'} = $nbd_path;
+	    $volumes->{$device}->{'bitmap-name'} = $bitmap_name;
+	} else {
+	    die "internal error - unknown mechanism '$mechanism'\n";
+	}
+    }
+
+    return $volumes;
+}
+
+# Back up the VM via an external backup provider: set up snapshot access (requires fleecing),
+# export the devices via NBD and pass the volume info to the provider's backup_vm() method.
+sub archive_external {
+    my ($self, $task, $vmid) = @_;
+
+    $task->{'backup-access-root-dir'} = "/run/qemu-server/${vmid}.backup-access.$$/";
+    make_path($task->{'backup-access-root-dir'})
+	or die "unable to create directory $task->{'backup-access-root-dir'}\n";
+    chmod(0700, $task->{'backup-access-root-dir'})
+	or die "unable to chmod directory $task->{'backup-access-root-dir'}\n";
+
+    my $guest_config = PVE::Tools::file_get_contents("$task->{tmpdir}/qemu-server.conf");
+    my $firewall_file = "$task->{tmpdir}/qemu-server.fw";
+
+    my $opts = $self->{vzdump}->{opts};
+    my $backup_provider = $self->{vzdump}->{'backup-provider'};
+
+    $self->loginfo("starting external backup via " . $backup_provider->provider_name());
+
+    # get list early so we die on unknown drive types before doing anything
+    my $devlist = _get_task_devlist($task);
+
+    $self->enforce_vm_running_for_backup($vmid);
+    $self->{qmeventd_fh} = PVE::QemuServer::register_qmeventd_handle($vmid);
+
+    eval {
+	$SIG{INT} = $SIG{TERM} = $SIG{QUIT} = $SIG{HUP} = $SIG{PIPE} = sub {
+	    die "interrupted by signal\n";
+	};
+
+	my $qemu_support = mon_cmd($vmid, "query-proxmox-support");
+
+	if (!$qemu_support->{'backup-access-api'}) {
+	    die "backup access API required for external provider backup is not supported by"
+		." the running QEMU version. Please make sure you've installed the latest"
+		." version and the VM has been restarted.\n";
+	}
+
+	$attach_tpmstate_drive->($self, $task, $vmid);
+
+	my $is_template = PVE::QemuConfig->is_template($self->{vmlist}->{$vmid});
+
+	my $fleecing = check_and_prepare_fleecing(
+	    $self, $vmid, $opts->{fleecing}, $task->{disks}, $is_template, $qemu_support, 1);
+	die "cannot setup backup access without fleecing\n" if !$fleecing;
+
+	$task->{'use-fleecing'} = 1;
+
+	my $target_id = "snapshot-access:$opts->{storage}";
+
+	my ($mechanism, $bitmap_name) = $backup_provider->backup_get_mechanism($vmid, 'qemu');
+	die "mechanism '$mechanism' requested by backup provider is not supported for VMs\n"
+	    if $mechanism ne 'file-handle' && $mechanism ne 'nbd';
+
+	$self->loginfo("using backup mechanism '$mechanism'");
+
+	if ($mechanism eq 'file-handle') {
+	    # For mechanism 'file-handle', the nbdfuse binary is required. Also, the bitmap needs
+	    # to be passed to the provider. The bitmap cannot be dumped via QMP and doing it via
+	    # qemu-img is experimental, so use nbdinfo. Both are in libnbd-bin.
+	    die "need '$_' binary from package libnbd-bin\n"
+		for grep { !-e "/usr/bin/$_" } qw(nbdfuse nbdinfo);
+	}
+
+	my $params = {
+	    'target-id' => $target_id,
+	    devlist => $devlist,
+	    timeout => 60,
+	};
+
+	if ($bitmap_name) {
+	    # prepend storage ID so different providers can never cause clashes
+	    $bitmap_name = "$opts->{storage}-" . $bitmap_name;
+	    $params->{'bitmap-name'} = $bitmap_name;
+	}
+
+	my $fs_frozen = $self->qga_fs_freeze($task, $vmid);
+
+	$self->loginfo("setting up snapshot-access for backup");
+
+	$task->{cleanup}->{'backup-access-teardown'} = { 'target-id' => $target_id, success => 0 };
+
+	my $backup_access_info = eval { mon_cmd($vmid, "backup-access-setup", $params->%*) };
+	my $qmperr = $@;
+
+	if ($fs_frozen) {
+	    $self->qga_fs_thaw($vmid);
+	}
+
+	die $qmperr if $qmperr;
+
+	$self->resume_vm_after_job_start($task, $vmid);
+
+	my $bitmap_info = mon_cmd($vmid, 'query-pbs-bitmap-info');
+	for my $info (sort { $a->{drive} cmp $b->{drive} } $bitmap_info->@*) {
+	    my $text = $bitmap_action_to_human->($self, $info);
+	    my $drive = $info->{drive};
+	    $drive =~ s/^drive-//; # for consistency
+	    $self->loginfo("$drive: dirty-bitmap status: $text");
+	}
+
+	$self->loginfo("starting NBD server");
+
+	my $nbd_path = "$task->{'backup-access-root-dir'}/${vmid}-nbd.backup-access";
+	mon_cmd(
+	    $vmid, "nbd-server-start", addr => { type => 'unix', data => { path => $nbd_path } } );
+	$task->{cleanup}->{'nbd-stop'} = 1;
+
+	for my $info ($backup_access_info->@*) {
+	    $self->loginfo("adding NBD export for $info->{device}");
+
+	    my $export_params = {
+		id => $info->{device},
+		'node-name' => $info->{'node-name'},
+		writable => JSON::true, # for discard
+		type => "nbd",
+		name => $info->{device}, # NBD export name
+	    };
+
+	    if ($info->{'bitmap-name'}) {
+		$export_params->{bitmaps} = [{
+		    node => $info->{'bitmap-node-name'},
+		    name => $info->{'bitmap-name'},
+		}];
+	    }
+
+	    mon_cmd($vmid, "block-export-add", $export_params->%*);
+	}
+
+	my $volumes = backup_access_to_volume_info(
+	    $self, $vmid, $task, $backup_access_info, $mechanism, $nbd_path);
+
+	my $param = {};
+	$param->{'bandwidth-limit'} = $opts->{bwlimit} * 1024 if $opts->{bwlimit};
+	$param->{'firewall-config'} = PVE::Tools::file_get_contents($firewall_file)
+	    if -e $firewall_file;
+
+	$backup_provider->backup_vm($vmid, $guest_config, $volumes, $param);
+    };
+    my $err = $@;
+
+    if ($err) {
+	$self->logerr($err);
+	$self->resume_vm_after_job_start($task, $vmid);
+    } else {
+	$task->{cleanup}->{'backup-access-teardown'}->{success} = 1;
+    }
+    $self->restore_vm_power_state($vmid);
+
+    die $err if $err;
+}
 
 1;
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel