From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 345B11FF141 for ; Tue, 19 May 2026 16:39:24 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id EDB74AF96; Tue, 19 May 2026 16:39:21 +0200 (CEST) From: Daniel Kral To: pve-devel@lists.proxmox.com Subject: [PATCH ha-manager 2/2] make idle LRMs resolve leftover moving HA resources while disarmed Date: Tue, 19 May 2026 16:38:36 +0200 Message-ID: <20260519143842.382324-3-d.kral@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260519143842.382324-1-d.kral@proxmox.com> References: <20260519143842.382324-1-d.kral@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1779201511862 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.075 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: NICZROMIIK7VJT3DL7YAFS7JICHGC3Q7 X-Message-ID-Hash: NICZROMIIK7VJT3DL7YAFS7JICHGC3Q7 X-MailFrom: d.kral@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: If there are HA resources, which are in transient states that defer the disarming process, but their LRMs are already in idle state and disarmed mode, these 
LRMs will not properly resolve the transient states of these HA resources as assumed by the HA Manager. For HA resources that are still moving, this leaves the HA Manager stuck in a loop: it keeps deferring the disarming process to wait for an LRM response for these moving HA resources, which will never come because the LRM is idle. Therefore, allow the LRM to become active in disarm mode if any HA resources on the LRM's node are in one of these transient states, and make sure that the LRM only processes the disarm-deferring HA resources while the LRM is active. Signed-off-by: Daniel Kral --- src/PVE/HA/LRM.pm | 19 ++++++++++- src/PVE/HA/Manager.pm | 8 ++--- src/PVE/HA/Tools.pm | 17 ++++++++++ src/test/test-disarm-idle-lrm1/log.expect | 37 ++++++--------------- src/test/test-disarm-idle-lrm2/log.expect | 39 +++++++---------------- 5 files changed, 58 insertions(+), 62 deletions(-) diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm index 426982cc..9100d611 100644 --- a/src/PVE/HA/LRM.pm +++ b/src/PVE/HA/LRM.pm @@ -312,6 +312,18 @@ sub active_service_count { return PVE::HA::Tools::count_active_services($ss, $nodename); } +# returns a truthy value if there are HA resources in transient states, which +# need to be resolved, e.g. to complete the disarm procedure. 
+sub has_disarm_deferred_services { + my ($self) = @_; + + my $ss = $self->{service_status}; + my $nodename = $self->{haenv}->nodename(); + my $deferred_sids = PVE::HA::Tools::get_disarm_deferred_services($ss, $nodename); + + return %$deferred_sids; +} + my $wrote_lrm_status_at_startup = 0; sub do_one_iteration { @@ -371,7 +383,7 @@ sub work { my $service_count = $self->active_service_count(); - if ($self->{mode} eq 'disarm') { + if ($self->{mode} eq 'disarm' && !$self->has_disarm_deferred_services()) { # stay idle while disarmed, don't acquire lock } elsif (!$fence_request && $service_count && $haenv->quorate()) { if ($self->get_protected_ha_agent_lock()) { @@ -709,12 +721,17 @@ sub manage_resources { my $nodename = $haenv->nodename(); my $ss = $self->{service_status}; + my $deferred_sids; + $deferred_sids = PVE::HA::Tools::get_disarm_deferred_services($ss, $nodename) + if $self->{mode} eq 'disarm'; foreach my $sid (keys %{ $self->{restart_tries} }) { delete $self->{restart_tries}->{$sid} if !$ss->{$sid}; } foreach my $sid (keys %$ss) { + next if $deferred_sids && !$deferred_sids->{$sid}; + my $sd = $ss->{$sid}; next if !$sd->{node} || !$sd->{uid}; next if $sd->{node} ne $nodename; diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm index 9b901c4f..a2baf349 100644 --- a/src/PVE/HA/Manager.pm +++ b/src/PVE/HA/Manager.pm @@ -929,15 +929,13 @@ sub handle_disarm { } # defer disarm if any services are in a transient state that needs the state machine to resolve - my $deferred_sids = {}; - for my $sid (sort keys %$ss) { + my $deferred_sids = PVE::HA::Tools::get_disarm_deferred_services($ss); + for my $sid (sort keys %$deferred_sids) { my $state = $ss->{$sid}->{state}; if ($state eq 'fence' || $state eq 'recovery') { $haenv->log('warning', "deferring disarm - service '$sid' is in '$state' state"); - $deferred_sids->{$sid} = 1; - } elsif ($state eq 'migrate' || $state eq 'relocate') { + } else { $haenv->log('info', "deferring disarm - service '$sid' is in '$state' 
state"); - $deferred_sids->{$sid} = 1; } } diff --git a/src/PVE/HA/Tools.pm b/src/PVE/HA/Tools.pm index 26629fb5..37b27e11 100644 --- a/src/PVE/HA/Tools.pm +++ b/src/PVE/HA/Tools.pm @@ -213,6 +213,23 @@ sub count_active_services { return $active_count; } +sub get_disarm_deferred_services { + my ($ss, $node) = @_; + + my $deferred_sids = {}; + my @deferrable_states = qw(fence recovery migrate relocate); + + for my $sid (keys %$ss) { + my ($state, $current_node, $target_node) = $ss->{$sid}->@{qw(state node target)}; + + next if $node && (!$current_node || $current_node ne $node); + + $deferred_sids->{$sid} = 1 if grep { $state eq $_ } @deferrable_states; + } + + return $deferred_sids; +} + sub get_verbose_service_state { my ($service_state, $service_conf) = @_; diff --git a/src/test/test-disarm-idle-lrm1/log.expect b/src/test/test-disarm-idle-lrm1/log.expect index 1b7f4ece..d46fbebd 100644 --- a/src/test/test-disarm-idle-lrm1/log.expect +++ b/src/test/test-disarm-idle-lrm1/log.expect @@ -26,34 +26,15 @@ info 20 node1/crm: recover service 'vm:102' from fenced node 'node2' to n info 20 node1/crm: service 'vm:102': state changed from 'recovery' to 'started' (node = node1) info 22 node2/crm: status change wait_for_quorum => slave info 24 node3/crm: status change wait_for_quorum => slave +info 25 node3/lrm: got lock 'ha_agent_node3_lock' +info 25 node3/lrm: status change wait_for_agent_lock => active +info 25 node3/lrm: service vm:103 - start migrate to node 'node2' +info 25 node3/lrm: service vm:103 - end migrate to node 'node2' info 40 node1/crm: node 'node2': state changed from 'unknown' => 'online' info 40 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 60 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 80 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 100 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 120 node1/crm: deferring disarm - service 'vm:103' is in 
'migrate' state -info 140 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 160 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 180 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 200 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 220 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 240 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 260 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 280 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 300 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 320 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 340 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 360 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 380 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 400 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 420 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 440 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 460 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 480 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 500 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 520 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 540 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 560 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 580 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 600 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state +info 40 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = 
node2) +info 45 node3/lrm: HA disarm requested, releasing agent lock and watchdog +info 45 node3/lrm: status change active => wait_for_agent_lock +info 60 node1/crm: all LRMs disarmed, HA stack is now fully disarmed +info 60 node1/crm: HA stack fully disarmed, releasing CRM watchdog info 620 hardware: exit simulation - done diff --git a/src/test/test-disarm-idle-lrm2/log.expect b/src/test/test-disarm-idle-lrm2/log.expect index d0ba96ff..13e3e2a7 100644 --- a/src/test/test-disarm-idle-lrm2/log.expect +++ b/src/test/test-disarm-idle-lrm2/log.expect @@ -23,34 +23,17 @@ info 20 node1/crm: recover service 'vm:102' from fenced node 'node2' to n info 20 node1/crm: service 'vm:102': state changed from 'recovery' to 'started' (node = node1) info 22 node2/crm: status change wait_for_quorum => slave info 24 node3/crm: status change wait_for_quorum => slave +info 25 node3/lrm: got lock 'ha_agent_node3_lock' +info 25 node3/lrm: status change wait_for_agent_lock => active +info 25 node3/lrm: service vm:103 - start migrate to node 'node2' +info 25 node3/lrm: service vm:103 - end migrate to node 'node2' info 40 node1/crm: node 'node2': state changed from 'unknown' => 'online' info 40 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 60 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 80 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 100 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 120 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 140 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 160 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 180 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 200 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 220 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 240 node1/crm: deferring 
disarm - service 'vm:103' is in 'migrate' state -info 260 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 280 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 300 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 320 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 340 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 360 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 380 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 400 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 420 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 440 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 460 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 480 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 500 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 520 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 540 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 560 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 580 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state -info 600 node1/crm: deferring disarm - service 'vm:103' is in 'migrate' state +info 40 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node2) +info 45 node3/lrm: HA disarm requested, releasing agent lock and watchdog +info 45 node3/lrm: status change active => wait_for_agent_lock +info 60 node1/crm: disarm: freezing service 'vm:102' (was 'started') +info 60 node1/crm: disarm: freezing service 'vm:103' (was 'started') +info 60 node1/crm: all LRMs disarmed, HA stack is now fully disarmed +info 60 node1/crm: HA stack fully disarmed, releasing CRM watchdog info 620 
hardware: exit simulation - done -- 2.47.3