From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 531231FF144 for ; Tue, 24 Mar 2026 19:32:58 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 7445F1AD4D; Tue, 24 Mar 2026 19:31:34 +0100 (CET) From: Daniel Kral To: pve-devel@lists.proxmox.com Subject: [PATCH ha-manager v2 35/40] implement automatic rebalancing Date: Tue, 24 Mar 2026 19:30:19 +0100 Message-ID: <20260324183029.1274972-36-d.kral@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260324183029.1274972-1-d.kral@proxmox.com> References: <20260324183029.1274972-1-d.kral@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1774376989870 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.007 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment PROLO_LEO1 0.1 Meta Catches all Leo drug variations so far SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: HFVYSLVUFNNHOJHW7IZO5VL7OGIUD2IN X-Message-ID-Hash: HFVYSLVUFNNHOJHW7IZO5VL7OGIUD2IN X-MailFrom: d.kral@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: If the automatic load balancing system is enabled, it checks whether the cluster node imbalance exceeds some 
user-defined threshold for some HA Manager rounds ("hold duration"). If it does exceed on consecutive HA Manager rounds, it will choose the best resource motion to improve the cluster node imbalance and queue it if it significantly improves it by some user-defined imbalance improvement ("margin"). This patch introduces resource bundles, which ensure that HA resources in strict positive resource affinity rules are considered as a whole "bundle" instead of individual HA resources. Specifically, active and stationary resource bundles are resource bundles, that have at least one resource running and all resources located on the same node. This distinction is needed as newly created strict positive resource affinity rules may still require some resource motions to enforce the rule. Additionally, the migration candidate generation prunes any target nodes, which do not adhere to the HA rules of these resource bundles before scoring these migration candidates. Signed-off-by: Daniel Kral --- changes v1 -> v2: - add more context in patch message - add comment to sustained_imbalance_round (as suggested by @Thomas) - fix issue where resource bundle was created even though some dependent resources were still migrating or relocating - remove debug logging of node imbalance - remove unused calculate_node_loads() - remove select_best_balancing_migration{,_topsis}() from Static and Dynamic and make it a proxy in PVE::HA::Usage src/PVE/HA/Manager.pm | 177 +++++++++++++++++++++++++++++++++++- src/PVE/HA/Usage.pm | 34 +++++++ src/PVE/HA/Usage/Dynamic.pm | 33 +++++++ src/PVE/HA/Usage/Static.pm | 33 +++++++ 4 files changed, 276 insertions(+), 1 deletion(-) diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm index 872d43c4..73146b56 100644 --- a/src/PVE/HA/Manager.pm +++ b/src/PVE/HA/Manager.pm @@ -59,10 +59,17 @@ sub new { my $self = bless { haenv => $haenv, - crs => {}, + crs => { + auto_rebalance => {}, + }, last_rules_digest => '', last_groups_digest => '', 
last_services_digest => '', + # used to track how many HA rounds the imbalance threshold has been exceeded + # + # this is not persisted for a CRM failover as in the meantime + # the usage statistics might have changed quite a bit already + sustained_imbalance_round => 0, group_migration_round => 3, # wait a little bit }, $class; @@ -94,6 +101,13 @@ sub update_crs_scheduler_mode { my $crs_cfg = $dc_cfg->{crs}; $self->{crs}->{rebalance_on_request_start} = !!$crs_cfg->{'ha-rebalance-on-start'}; + $self->{crs}->{auto_rebalance}->{enable} = !!$crs_cfg->{'ha-auto-rebalance'}; + $self->{crs}->{auto_rebalance}->{threshold} = $crs_cfg->{'ha-auto-rebalance-threshold'} // 0.7; + $self->{crs}->{auto_rebalance}->{method} = $crs_cfg->{'ha-auto-rebalance-method'} + // 'bruteforce'; + $self->{crs}->{auto_rebalance}->{hold_duration} = $crs_cfg->{'ha-auto-rebalance-hold-duration'} + // 3; + $self->{crs}->{auto_rebalance}->{margin} = $crs_cfg->{'ha-auto-rebalance-margin'} // 0.1; my $old_mode = $self->{crs}->{scheduler}; my $new_mode = $crs_cfg->{ha} || 'basic'; @@ -111,6 +125,150 @@ sub update_crs_scheduler_mode { return; } +# Returns a hash of lists, which contain the running, non-moving HA resource +# bundles, which are on the same node, implied by the strict positive resource +# affinity rules. +# +# Each resource bundle has a leader, which is the alphabetically first running +# HA resource in the resource bundle and also the key of each resource bundle +# in the returned hash.
+sub get_active_stationary_resource_bundles { + my ($ss, $resource_affinity) = @_; + + my $resource_bundles = {}; +OUTER: for my $sid (sort keys %$ss) { + # do not consider non-started resource as 'active' leading resource + next if $ss->{$sid}->{state} ne 'started'; + + my @resources = ($sid); + my $nodes = { $ss->{$sid}->{node} => 1 }; + + my ($dependent_resources) = get_affinitive_resources($resource_affinity, $sid); + if (%$dependent_resources) { + for my $csid (keys %$dependent_resources) { + next if !defined($ss->{$csid}); + my ($state, $node) = $ss->{$csid}->@{qw(state node)}; + + # do not consider stationary bundle if a dependent resource moves + next OUTER if $state eq 'migrate' || $state eq 'relocate'; + # do not add non-started resource to active bundle + next if $state ne 'started'; + + $nodes->{$node} = 1; + + push @resources, $csid; + } + + @resources = sort @resources; + } + + # skip resource bundles, which are not on the same node yet + next if keys %$nodes > 1; + + my $leader_sid = $resources[0]; + + $resource_bundles->{$leader_sid} = \@resources; + } + + return $resource_bundles; +} + +# Returns a hash of hashes, where each item contains the resource bundle's +# leader, the list of HA resources in the resource bundle, and the list of +# possible nodes to migrate to. 
+sub get_resource_migration_candidates { + my ($self) = @_; + + my ($ss, $compiled_rules, $online_node_usage) = + $self->@{qw(ss compiled_rules online_node_usage)}; + my ($node_affinity, $resource_affinity) = + $compiled_rules->@{qw(node-affinity resource-affinity)}; + + my $resource_bundles = get_active_stationary_resource_bundles($ss, $resource_affinity); + + my @compact_migration_candidates = (); + for my $leader_sid (sort keys %$resource_bundles) { + my $current_leader_node = $ss->{$leader_sid}->{node}; + my $online_nodes = { map { $_ => 1 } $online_node_usage->list_nodes() }; + + my (undef, $target_nodes) = get_node_affinity($node_affinity, $leader_sid, $online_nodes); + my ($together, $separate) = + get_resource_affinity($resource_affinity, $leader_sid, $ss, $online_nodes); + apply_negative_resource_affinity($separate, $target_nodes); + + delete $target_nodes->{$current_leader_node}; + + next if !%$target_nodes; + + push @compact_migration_candidates, + { + leader => $leader_sid, + nodes => [sort keys %$target_nodes], + resources => $resource_bundles->{$leader_sid}, + }; + } + + return \@compact_migration_candidates; +} + +sub load_balance { + my ($self) = @_; + + my ($crs, $haenv, $online_node_usage) = $self->@{qw(crs haenv online_node_usage)}; + my ($auto_rebalance_opts) = $crs->{auto_rebalance}; + + return if !$auto_rebalance_opts->{enable}; + return if $crs->{scheduler} ne 'static' && $crs->{scheduler} ne 'dynamic'; + return if $self->any_resource_motion_queued_or_running(); + + my ($threshold, $method, $hold_duration, $margin) = + $auto_rebalance_opts->@{qw(threshold method hold_duration margin)}; + + my $imbalance = $online_node_usage->calculate_node_imbalance(); + + # do not load balance unless imbalance threshold has been exceeded + # consecutively for $hold_duration calls to load_balance() + if ($imbalance < $threshold) { + $self->{sustained_imbalance_round} = 0; + return; + } else { + $self->{sustained_imbalance_round}++; + return if 
$self->{sustained_imbalance_round} < $hold_duration; + $self->{sustained_imbalance_round} = 0; + } + + my $candidates = $self->get_resource_migration_candidates(); + + my $result; + if ($method eq 'bruteforce') { + $result = $online_node_usage->select_best_balancing_migration($candidates); + } elsif ($method eq 'topsis') { + $result = $online_node_usage->select_best_balancing_migration_topsis($candidates); + } + + # happens if $candidates is empty or $method isn't handled above + return if !$result; + + my ($migration, $target_imbalance) = $result->@{qw(migration imbalance)}; + + my $relative_change = ($imbalance - $target_imbalance) / $imbalance; + return if $relative_change < $margin; + + my ($sid, $source, $target) = $migration->@{qw(sid source-node target-node)}; + + my (undef, $type, $id) = $haenv->parse_sid($sid); + my $task = $type eq 'vm' ? "migrate" : "relocate"; + my $cmd = "$task $sid $target"; + + my $target_imbalance_str = int(100 * $target_imbalance + 0.5) / 100; + $haenv->log( + 'info', + "auto rebalance - $task $sid to $target (expected target imbalance: $target_imbalance_str)", + ); + + $self->queue_resource_motion($cmd, $task, $sid, $target); +} + sub cleanup { my ($self) = @_; @@ -463,6 +621,21 @@ sub queue_resource_motion { } } +sub any_resource_motion_queued_or_running { + my ($self) = @_; + + my ($ss) = $self->@{qw(ss)}; + + for my $sid (keys %$ss) { + my ($cmd, $state) = $ss->{$sid}->@{qw(cmd state)}; + + return 1 if $state eq 'migrate' || $state eq 'relocate'; + return 1 if defined($cmd) && ($cmd->[0] eq 'migrate' || $cmd->[0] eq 'relocate'); + } + + return 0; +} + # read new crm commands and save them into crm master status sub update_crm_commands { my ($self) = @_; @@ -746,6 +919,8 @@ sub manage { $self->update_crm_commands(); + $self->load_balance(); + for (;;) { my $repeat = 0; diff --git a/src/PVE/HA/Usage.pm b/src/PVE/HA/Usage.pm index 822b884c..dc029e86 100644 --- a/src/PVE/HA/Usage.pm +++ b/src/PVE/HA/Usage.pm @@ -59,6 +59,40 @@ sub 
remove_service_usage { die "implement in subclass"; } +sub calculate_node_imbalance { + my ($self) = @_; + + die "implement in subclass"; +} + +sub score_best_balancing_migrations { + my ($self, $migration_candidates, $limit) = @_; + + die "implement in subclass"; +} + +sub select_best_balancing_migration { + my ($self, $migration_candidates) = @_; + + my $migrations = $self->score_best_balancing_migrations($migration_candidates, 1); + + return $migrations->[0]; +} + +sub score_best_balancing_migrations_topsis { + my ($self, $migration_candidates, $limit) = @_; + + die "implement in subclass"; +} + +sub select_best_balancing_migration_topsis { + my ($self, $migration_candidates) = @_; + + my $migrations = $self->score_best_balancing_migrations_topsis($migration_candidates, 1); + + return $migrations->[0]; +} + # Returns a hash with $nodename => $score pairs. A lower $score is better. sub score_nodes_to_start_service { my ($self, $sid) = @_; diff --git a/src/PVE/HA/Usage/Dynamic.pm b/src/PVE/HA/Usage/Dynamic.pm index 7e11715d..a8adfe83 100644 --- a/src/PVE/HA/Usage/Dynamic.pm +++ b/src/PVE/HA/Usage/Dynamic.pm @@ -92,6 +92,39 @@ sub remove_service_usage { $self->{haenv}->log('warning', "unable to remove service '$sid' usage - $@") if $@; } +sub calculate_node_imbalance { + my ($self) = @_; + + my $node_imbalance = eval { $self->{scheduler}->calculate_node_imbalance() }; + $self->{haenv}->log('warning', "unable to calculate dynamic node imbalance - $@") if $@; + + return $node_imbalance // 0.0; +} + +sub score_best_balancing_migrations { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = eval { + $self->{scheduler} + ->score_best_balancing_migration_candidates($migration_candidates, $limit); + }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + +sub score_best_balancing_migrations_topsis { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = eval { + 
$self->{scheduler} + ->score_best_balancing_migration_candidates_topsis($migration_candidates, $limit); + }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + sub score_nodes_to_start_service { my ($self, $sid) = @_; diff --git a/src/PVE/HA/Usage/Static.pm b/src/PVE/HA/Usage/Static.pm index 835f4300..92bfaaa7 100644 --- a/src/PVE/HA/Usage/Static.pm +++ b/src/PVE/HA/Usage/Static.pm @@ -99,6 +99,39 @@ sub remove_service_usage { $self->{haenv}->log('warning', "unable to remove service '$sid' usage - $@") if $@; } +sub calculate_node_imbalance { + my ($self) = @_; + + my $node_imbalance = eval { $self->{scheduler}->calculate_node_imbalance() }; + $self->{haenv}->log('warning', "unable to calculate static node imbalance - $@") if $@; + + return $node_imbalance // 0.0; +} + +sub score_best_balancing_migrations { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = eval { + $self->{scheduler} + ->score_best_balancing_migration_candidates($migration_candidates, $limit); + }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + +sub score_best_balancing_migrations_topsis { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = eval { + $self->{scheduler} + ->score_best_balancing_migration_candidates_topsis($migration_candidates, $limit); + }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + sub score_nodes_to_start_service { my ($self, $sid) = @_; -- 2.47.3