From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <pve-devel-bounces@lists.proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
	by lore.proxmox.com (Postfix) with ESMTPS id B645B1FF162
	for <inbox@lore.proxmox.com>; Mon,  5 May 2025 14:58:19 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
	by firstgate.proxmox.com (Proxmox) with ESMTP id AEA5C1C449;
	Mon,  5 May 2025 14:58:02 +0200 (CEST)
From: Fiona Ebner <f.ebner@proxmox.com>
To: pve-devel@lists.proxmox.com
Date: Mon,  5 May 2025 14:57:20 +0200
Message-Id: <20250505125724.75620-8-f.ebner@proxmox.com>
X-Mailer: git-send-email 2.39.5
In-Reply-To: <20250505125724.75620-1-f.ebner@proxmox.com>
References: <20250505125724.75620-1-f.ebner@proxmox.com>
MIME-Version: 1.0
X-SPAM-LEVEL: Spam detection results:  0
 AWL -0.036 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING             0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
Subject: [pve-devel] [PATCH qemu-server 07/11] agent: implement fsfreeze
 helper to better handle lost commands
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion <pve-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pve-devel/>
List-Post: <mailto:pve-devel@lists.proxmox.com>
List-Help: <mailto:pve-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=subscribe>
Reply-To: Proxmox VE development discussion <pve-devel@lists.proxmox.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: pve-devel-bounces@lists.proxmox.com
Sender: "pve-devel" <pve-devel-bounces@lists.proxmox.com>

As reported in enterprise support, it can happen that a guest agent
command is read, but then the guest agent never sends an answer,
because the service in the guest is stopped/killed, for example, if a
guest reboot happens before the command can be successfully executed.
This is usually not problematic, but the fsfreeze-freeze command has a
timeout of 1 hour, so the guest agent socket would be blocked for that
amount of time, waiting on a command that is not being executed
anymore.

Use a lower timeout for the fsfreeze-freeze command, and issue an
fsfreeze-status command afterwards, which will return immediately if
the fsfreeze-freeze command already finished, and which will be queued
if not. This is used as a proxy to determine whether the
fsfreeze-freeze command is still running and to check whether it was
successful. This way, the time the socket is blocked after a "lost
command" is at most 10 minutes.

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
 PVE/QMPClient.pm         |  4 ++++
 PVE/QemuConfig.pm        |  5 ++--
 PVE/QemuServer.pm        |  3 ++-
 PVE/QemuServer/Agent.pm  | 51 ++++++++++++++++++++++++++++++++++++++++
 PVE/VZDump/QemuServer.pm |  5 ++--
 5 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/PVE/QMPClient.pm b/PVE/QMPClient.pm
index 0f08e678..b3340885 100644
--- a/PVE/QMPClient.pm
+++ b/PVE/QMPClient.pm
@@ -110,6 +110,8 @@ sub cmd {
 	} elsif ($cmd->{execute} =~ m/^(eject|change)/) {
 	    $timeout = 60; # note: cdrom mount command is slow
 	} elsif ($cmd->{execute} eq 'guest-fsfreeze-freeze') {
+	    # consider using the guest_fsfreeze() helper in Agent.pm
+	    #
 	    # freeze syncs all guest FS, if we kill it it stays in an unfreezable
 	    # locked state with high probability, so use an generous timeout
 	    $timeout = 60*60; # 1 hour
@@ -146,6 +148,7 @@ sub cmd {
     if (defined($queue_info->{error})) {
 	die "VM $vmid qmp command '$cmd->{execute}' failed - $queue_info->{error}" if !$noerr;
 	$result = { error => $queue_info->{error} };
+	$result->{'error-is-timeout'} = 1 if $queue_info->{'error-is-timeout'};
     }
 
     return $result;
@@ -467,6 +470,7 @@ sub mux_timeout {
 
     if (my $queue_info = &$lookup_queue_info($self, $fh)) {
 	$queue_info->{error} = "got timeout\n";
+	$queue_info->{'error-is-timeout'} = 1;
 	$self->{mux}->inbuffer($fh, ''); # clear to avoid warnings
     }
 
diff --git a/PVE/QemuConfig.pm b/PVE/QemuConfig.pm
index 2609542c..e941f093 100644
--- a/PVE/QemuConfig.pm
+++ b/PVE/QemuConfig.pm
@@ -8,6 +8,7 @@ use Scalar::Util qw(blessed);
 use PVE::AbstractConfig;
 use PVE::INotify;
 use PVE::JSONSchema;
+use PVE::QemuServer::Agent;
 use PVE::QemuServer::CPUConfig;
 use PVE::QemuServer::Drive;
 use PVE::QemuServer::Helpers;
@@ -291,8 +292,8 @@ sub __snapshot_freeze {
 	eval { mon_cmd($vmid, "guest-fsfreeze-thaw"); };
 	warn "guest-fsfreeze-thaw problems - $@" if $@;
     } else {
-	eval { mon_cmd($vmid, "guest-fsfreeze-freeze"); };
-	warn "guest-fsfreeze-freeze problems - $@" if $@;
+	eval { PVE::QemuServer::Agent::guest_fsfreeze($vmid); };
+	warn $@ if $@;
     }
 }
 
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index 577959a4..317c09f2 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -51,6 +51,7 @@ use PVE::Tools qw(run_command file_read_firstline file_get_contents dir_glob_for
 use PVE::QMPClient;
 use PVE::QemuConfig;
 use PVE::QemuConfig::NoWrite;
+use PVE::QemuServer::Agent;
 use PVE::QemuServer::Helpers qw(config_aware_timeout min_version kvm_user_version windows_version);
 use PVE::QemuServer::Cloudinit;
 use PVE::QemuServer::CGroup;
@@ -8242,7 +8243,7 @@ sub qemu_drive_mirror_monitor {
 		    my $agent_running = $qga && qga_check_running($vmid);
 		    if ($agent_running) {
 			print "freeze filesystem\n";
-			eval { mon_cmd($vmid, "guest-fsfreeze-freeze"); };
+			eval { PVE::QemuServer::Agent::guest_fsfreeze($vmid); };
 			warn $@ if $@;
 		    } else {
 			print "suspend vm\n";
diff --git a/PVE/QemuServer/Agent.pm b/PVE/QemuServer/Agent.pm
index 41e615aa..ef36a6a8 100644
--- a/PVE/QemuServer/Agent.pm
+++ b/PVE/QemuServer/Agent.pm
@@ -119,4 +119,55 @@ sub qemu_exec_status {
     return $res;
 }
 
+# Freeze the guest file systems of VM $vmid via the guest agent. Returns normally on success and
+# dies if the freeze reports an error, the status query fails, or the status is not 'frozen'.
+#
+# A guest agent command can be read without ever being answered, because the service in the guest
+# is stopped/killed, for example, if a guest reboot happens before the command can be executed.
+# This is usually not problematic, but the fsfreeze-freeze command has a timeout of 1 hour,
+# so the guest agent socket would be blocked for that long, waiting on a command that is not being
+# executed anymore. Hence, use a lower timeout for fsfreeze-freeze and issue an fsfreeze-status
+# command afterwards, which returns immediately if the freeze already finished and is queued if
+# not. This serves as a proxy to determine whether the freeze is still running and whether it was
+# successful. This way, a "lost command" blocks the socket for at most 10 minutes.
+sub guest_fsfreeze {
+    my ($vmid) = @_;
+
+    my $timeout = 10 * 60; # seconds, applied to each agent command below
+
+    my $result = eval { # $result stays undef if mon_cmd dies; $@ is intentionally ignored
+	PVE::QemuServer::Monitor::mon_cmd($vmid, 'guest-fsfreeze-freeze', timeout => $timeout);
+    };
+    if ($result && ref($result) eq 'HASH' && $result->{error}) { # error hash was returned
+	my $error = $result->{error}->{desc} // 'unknown';
+	die "unable to freeze guest fs - $error\n";
+    } elsif (defined($result)) {
+	return; # command successful
+    }
+
+    my $status; # only defined once a status query returned without timing out
+    eval { # failures in here get a common prefix when re-thrown below
+	my ($i, $last_iteration) = (0, 5); # at most 5 status queries
+	while ($i < $last_iteration && !defined($status)) {
+	    print "still waiting on guest fs freeze\n";
+	    $i++;
+
+	    $status = PVE::QemuServer::Monitor::mon_cmd(
+		$vmid, 'guest-fsfreeze-status', timeout => $timeout, noerr => 1);
+
+	    if ($status && ref($status) eq 'HASH' && $status->{'error-is-timeout'}) {
+		$status = undef; # timed out, let the loop query again
+	    } else {
+		check_agent_error($status, 'unknown error'); # NOTE(review): presumably dies on error
+	    }
+	}
+	if (!defined($status)) { # all queries timed out; the total includes the freeze attempt
+	    die "timeout after " . ($timeout * ($last_iteration + 1) / 60) . " minutes\n";
+	}
+    };
+    die "querying status after freezing guest fs failed - $@" if $@;
+
+    die "unable to freeze guest fs - unexpected status '$status'\n" if $status ne 'frozen';
+}
+
 1;
diff --git a/PVE/VZDump/QemuServer.pm b/PVE/VZDump/QemuServer.pm
index 10514f75..b686da84 100644
--- a/PVE/VZDump/QemuServer.pm
+++ b/PVE/VZDump/QemuServer.pm
@@ -29,6 +29,7 @@ use PVE::Format qw(render_duration render_bytes);
 
 use PVE::QemuConfig;
 use PVE::QemuServer;
+use PVE::QemuServer::Agent;
 use PVE::QemuServer::Drive qw(checked_volume_format);
 use PVE::QemuServer::Helpers;
 use PVE::QemuServer::Machine;
@@ -1069,10 +1070,10 @@ sub qga_fs_freeze {
     }
 
     $self->loginfo("issuing guest-agent 'fs-freeze' command");
-    eval { mon_cmd($vmid, "guest-fsfreeze-freeze") };
+    eval { PVE::QemuServer::Agent::guest_fsfreeze($vmid); };
     $self->logerr($@) if $@;
 
-    return 1; # even on mon command error, ensure we always thaw again
+    return 1; # even on error, ensure we always thaw again
 }
 
 # only call if fs_freeze return 1
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel