From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id D64161FF137 for ; Tue, 17 Feb 2026 15:17:34 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 1C73E4D6A; Tue, 17 Feb 2026 15:16:18 +0100 (CET) From: Daniel Kral To: pve-devel@lists.proxmox.com Subject: [RFC ha-manager 20/21] implement automatic rebalancing Date: Tue, 17 Feb 2026 15:14:27 +0100 Message-ID: <20260217141437.584852-34-d.kral@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260217141437.584852-1-d.kral@proxmox.com> References: <20260217141437.584852-1-d.kral@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1771337678532 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.019 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: YLK2GHVIY3SHAUWV3RLL4UMBEVYSPRQB X-Message-ID-Hash: YLK2GHVIY3SHAUWV3RLL4UMBEVYSPRQB X-MailFrom: d.kral@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: The automatic load rebalancing system checks whether the cluster node imbalance exceeds some user-defined threshold for some HA Manager rounds ("hold duration"). 
If the imbalance exceeds the threshold on that many consecutive HA Manager rounds, it will choose the best service migration/relocation to improve the cluster node imbalance and queue it if it significantly improves the imbalance by some user-defined margin. This introduces resource bundles, which make sure that HA resources in strict positive resource affinity rules are considered as a whole "bundle" instead of individually. Additionally, the migration candidate generation prunes any target nodes, which do not adhere to the HA rules, before scoring these migration candidates. Signed-off-by: Daniel Kral --- As noted by the TODO, the migration candidate generation will likely be moved to the perlmod bindings or proxmox-resource-scheduling. This version includes a debug log statement which will report the current node imbalance through the pve-ha-crm's log, which will obviously not be part of a final revision due to syslog spam. src/PVE/HA/Manager.pm | 171 +++++++++++++++++++++++++++++++++++- src/PVE/HA/Usage.pm | 36 ++++++++ src/PVE/HA/Usage/Dynamic.pm | 65 +++++++++++++- src/PVE/HA/Usage/Static.pm | 60 +++++++++++++ 4 files changed, 329 insertions(+), 3 deletions(-) diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm index e3ab1ee7..5915b55a 100644 --- a/src/PVE/HA/Manager.pm +++ b/src/PVE/HA/Manager.pm @@ -54,10 +54,13 @@ sub new { my $self = bless { haenv => $haenv, - crs => {}, + crs => { + auto_rebalance => {}, + }, last_rules_digest => '', last_groups_digest => '', last_services_digest => '', + sustained_imbalance_round => 0, group_migration_round => 3, # wait a little bit }, $class; @@ -89,6 +92,13 @@ sub update_crs_scheduler_mode { my $crs_cfg = $dc_cfg->{crs}; $self->{crs}->{rebalance_on_request_start} = !!$crs_cfg->{'ha-rebalance-on-start'}; + $self->{crs}->{auto_rebalance}->{enable} = !!$crs_cfg->{'ha-auto-rebalance'}; + $self->{crs}->{auto_rebalance}->{threshold} = $crs_cfg->{'ha-auto-rebalance-threshold'} // 0.7; + $self->{crs}->{auto_rebalance}->{method} = 
$crs_cfg->{'ha-auto-rebalance-method'} + // 'bruteforce'; + $self->{crs}->{auto_rebalance}->{hold_duration} = $crs_cfg->{'ha-auto-rebalance-hold-duration'} + // 3; + $self->{crs}->{auto_rebalance}->{margin} = $crs_cfg->{'ha-auto-rebalance-margin'} // 0.1; my $old_mode = $self->{crs}->{scheduler}; my $new_mode = $crs_cfg->{ha} || 'basic'; @@ -106,6 +116,148 @@ sub update_crs_scheduler_mode { return; } +# Returns a hash of lists, which contain the running, non-moving HA resource +# bundles, which are on the same node, implied by the strict positive resource +# affinity rules. +# +# Each resource bundle has a leader, which is the alphabetically first running +# HA resource in the resource bundle and also the key of each resource bundle +# in the returned hash. +my sub get_active_stationary_resource_bundles { + my ($ss, $resource_affinity) = @_; + + my $resource_bundles = {}; + for my $sid (sort keys %$ss) { + next if $ss->{$sid}->{state} ne 'started'; + + my @resources = ($sid); + my $nodes = { $ss->{$sid}->{node} => 1 }; + + my ($dependent_resources) = get_affinitive_resources($resource_affinity, $sid); + if (%$dependent_resources) { + for my $csid (keys %$dependent_resources) { + my ($state, $node) = $ss->{$csid}->@{qw(state node)}; + + next if $state ne 'started'; + + $nodes->{$node} = 1; + + push @resources, $csid; + } + + @resources = sort @resources; + } + + # skip resource bundles, which are not on the same node yet + next if keys %$nodes > 1; + + my $leader_sid = $resources[0]; + + $resource_bundles->{$leader_sid} = \@resources; + } + + return $resource_bundles; +} + +# Returns a hash of hashes, where each item contains the resource bundle's +# leader, the list of HA resources in the resource bundle, and the list of +# possible nodes to migrate to. 
+sub get_resource_migration_candidates { + my ($self) = @_; + + my ($ss, $compiled_rules, $online_node_usage) = + $self->@{qw(ss compiled_rules online_node_usage)}; + my ($node_affinity, $resource_affinity) = + $compiled_rules->@{qw(node-affinity resource-affinity)}; + + my $resource_bundles = get_active_stationary_resource_bundles($ss, $resource_affinity); + + my @compact_migration_candidates = (); + for my $leader_sid (sort keys %$resource_bundles) { + my $current_leader_node = $ss->{$leader_sid}->{node}; + my $online_nodes = { map { $_ => 1 } $online_node_usage->list_nodes() }; + + my (undef, $target_nodes) = get_node_affinity($node_affinity, $leader_sid, $online_nodes); + my ($together, $separate) = + get_resource_affinity($resource_affinity, $leader_sid, $ss, $online_nodes); + apply_negative_resource_affinity($separate, $target_nodes); + + delete $target_nodes->{$current_leader_node}; + + next if !%$target_nodes; + + push @compact_migration_candidates, + { + leader => $leader_sid, + nodes => [sort keys %$target_nodes], + services => $resource_bundles->{$leader_sid}, + }; + } + + return \@compact_migration_candidates; +} + +sub load_balance { + my ($self) = @_; + + my ($crs, $haenv, $online_node_usage) = $self->@{qw(crs haenv online_node_usage)}; + my ($auto_rebalance_opts) = $crs->{auto_rebalance}; + + return if !$auto_rebalance_opts->{enable}; + return if $crs->{scheduler} ne 'static' && $crs->{scheduler} ne 'dynamic'; + return if $self->any_resource_motion_queued_or_running(); + + my ($threshold, $method, $hold_duration, $margin) = + $auto_rebalance_opts->@{qw(threshold method hold_duration margin)}; + + my $node_loads = $online_node_usage->calculate_node_loads(); + my $imbalance = $online_node_usage->calculate_node_imbalance(); + + $haenv->log('debug', "auto rebalance - node imbalance: $imbalance"); + + # do not load balance unless imbalance threshold has been exceeded + # consecutively for $hold_duration calls to load_balance() + if ($imbalance < 
$threshold) { + $self->{sustained_imbalance_round} = 0; + return; + } else { + $self->{sustained_imbalance_round}++; + return if $self->{sustained_imbalance_round} < $hold_duration; + $self->{sustained_imbalance_round} = 0; + } + + # TODO Move migration candidate generation into PVE::RS::ResourceScheduling + my $candidates = $self->get_resource_migration_candidates(); + + my $result; + if ($method eq 'bruteforce') { + $result = $online_node_usage->select_best_balancing_migration($candidates); + } elsif ($method eq 'topsis') { + $result = $online_node_usage->select_best_balancing_migration_topsis($candidates); + } + + return if !$result; + + my ($migration, $target_imbalance) = $result->@{qw(migration imbalance)}; + + my $relative_change = ($imbalance - $target_imbalance) / $imbalance; + return if $relative_change < $margin; + + my ($sid, $source, $target) = $migration->@{qw(sid source-node target-node)}; + + my (undef, $type, $id) = $haenv->parse_sid($sid); + my $task = $type eq 'vm' ? "migrate" : "relocate"; + my $cmd = "$task $sid $target"; + + my $target_imbalance_str = int(100 * $target_imbalance + 0.5) / 100; + $haenv->log( + 'info', + "auto rebalance - $task $sid to $target (expected target imbalance: $target_imbalance_str)", + ); + + $self->queue_resource_motion($cmd, $task, $sid, $target); +} + sub cleanup { my ($self) = @_; @@ -455,6 +607,21 @@ sub queue_resource_motion { } } +sub any_resource_motion_queued_or_running { + my ($self) = @_; + + my ($ss) = $self->@{qw(ss)}; + + for my $sid (keys %$ss) { + my ($cmd, $state) = $ss->{$sid}->@{qw(cmd state)}; + + return 1 if $state eq 'migrate' || $state eq 'relocate'; + return 1 if defined($cmd) && ($cmd->[0] eq 'migrate' || $cmd->[0] eq 'relocate'); + } + + return 0; +} + # read new crm commands and save them into crm master status sub update_crm_commands { my ($self) = @_; @@ -738,6 +905,8 @@ sub manage { $self->update_crm_commands(); + $self->load_balance(); + for (;;) { my $repeat = 0; diff --git 
a/src/PVE/HA/Usage.pm b/src/PVE/HA/Usage.pm index 9f19a82b..3515b48f 100644 --- a/src/PVE/HA/Usage.pm +++ b/src/PVE/HA/Usage.pm @@ -59,6 +59,42 @@ sub remove_service_usage { die "implement in subclass"; } +sub calculate_node_loads { + my ($self) = @_; + + die "implement in subclass"; +} + +sub calculate_node_imbalance { + my ($self) = @_; + + die "implement in subclass"; +} + +sub score_best_balancing_migrations { + my ($self, $migration_candidates, $limit) = @_; + + die "implement in subclass"; +} + +sub select_best_balancing_migration { + my ($self, $migration_candidates) = @_; + + die "implement in subclass"; +} + +sub score_best_balancing_migrations_topsis { + my ($self, $migration_candidates, $limit) = @_; + + die "implement in subclass"; +} + +sub select_best_balancing_migration_topsis { + my ($self, $migration_candidates) = @_; + + die "implement in subclass"; +} + # Returns a hash with $nodename => $score pairs. A lower $score is better. sub score_nodes_to_start_service { my ($self, $sid) = @_; diff --git a/src/PVE/HA/Usage/Dynamic.pm b/src/PVE/HA/Usage/Dynamic.pm index f4049f62..12bdc383 100644 --- a/src/PVE/HA/Usage/Dynamic.pm +++ b/src/PVE/HA/Usage/Dynamic.pm @@ -59,7 +59,7 @@ my sub get_service_usage { my ($self, $sid) = @_; my $service_stats = $self->{'service-stats'}->{$sid}->{usage} - or die "did not get static service usage information for '$sid'\n"; + or die "did not get dynamic service usage information for '$sid'\n"; return $service_stats; } @@ -82,6 +82,66 @@ sub remove_service_usage { $self->{haenv}->log('warning', "unable to remove service '$sid' usage - $@") if $@; } +sub calculate_node_loads { + my ($self) = @_; + + my $node_loads = eval { $self->{scheduler}->calculate_node_loads() }; + $self->{haenv}->log('warning', "unable to calculate dynamic node loads - $@") if $@; + + return { map { $_->[0] => $_->[1] } $node_loads->@* }; +} + +sub calculate_node_imbalance { + my ($self) = @_; + + my $node_imbalance = eval { 
$self->{scheduler}->calculate_node_imbalance() }; + $self->{haenv}->log('warning', "unable to calculate dynamic node imbalance - $@") if $@; + + return $node_imbalance // 0.0; +} + +sub score_best_balancing_migrations { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = + eval { $self->{scheduler}->score_best_balancing_migrations($migration_candidates, $limit); }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + +sub select_best_balancing_migration { + my ($self, $migration_candidates) = @_; + + my $result = + eval { $self->{scheduler}->select_best_balancing_migration($migration_candidates) }; + $self->{haenv}->log('warning', "unable to select best balancing migration - $@") if $@; + + return $result; +} + +sub score_best_balancing_migrations_topsis { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = eval { + $self->{scheduler} + ->score_best_balancing_migrations_topsis($migration_candidates, $limit); + }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + +sub select_best_balancing_migration_topsis { + my ($self, $migration_candidates) = @_; + + my $result = + eval { $self->{scheduler}->select_best_balancing_migration_topsis($migration_candidates) }; + $self->{haenv}->log('warning', "unable to select best balancing migration - $@") if $@; + + return $result; +} + sub score_nodes_to_start_service { my ($self, $sid) = @_; @@ -90,7 +150,8 @@ sub score_nodes_to_start_service { $self->{scheduler}->score_nodes_to_start_service($service_usage); }; $self->{haenv} - ->log('err', "unable to score nodes according to dynamic usage for service '$sid' - $@"); + ->log('err', "unable to score nodes according to dynamic usage for service '$sid' - $@") + if $@; # Take minus the value, so that a lower score is better, which our caller(s) expect(s). 
return { map { $_->[0] => -$_->[1] } $score_list->@* }; diff --git a/src/PVE/HA/Usage/Static.pm b/src/PVE/HA/Usage/Static.pm index c8460fd7..ecc2a14f 100644 --- a/src/PVE/HA/Usage/Static.pm +++ b/src/PVE/HA/Usage/Static.pm @@ -91,6 +91,66 @@ sub remove_service_usage { $self->{haenv}->log('warning', "unable to remove service '$sid' usage - $@") if $@; } +sub calculate_node_loads { + my ($self) = @_; + + my $node_loads = eval { $self->{scheduler}->calculate_node_loads() }; + $self->{haenv}->log('warning', "unable to calculate static node loads - $@") if $@; + + return { map { $_->[0] => $_->[1] } $node_loads->@* }; +} + +sub calculate_node_imbalance { + my ($self) = @_; + + my $node_imbalance = eval { $self->{scheduler}->calculate_node_imbalance() }; + $self->{haenv}->log('warning', "unable to calculate static node imbalance - $@") if $@; + + return $node_imbalance // 0.0; +} + +sub score_best_balancing_migrations { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = + eval { $self->{scheduler}->score_best_balancing_migrations($migration_candidates, $limit); }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + +sub select_best_balancing_migration { + my ($self, $migration_candidates) = @_; + + my $result = + eval { $self->{scheduler}->select_best_balancing_migration($migration_candidates) }; + $self->{haenv}->log('warning', "unable to select best balancing migration - $@") if $@; + + return $result; +} + +sub score_best_balancing_migrations_topsis { + my ($self, $migration_candidates, $limit) = @_; + + my $migrations = eval { + $self->{scheduler} + ->score_best_balancing_migrations_topsis($migration_candidates, $limit); + }; + $self->{haenv}->log('warning', "unable to score best balancing migration - $@") if $@; + + return $migrations; +} + +sub select_best_balancing_migration_topsis { + my ($self, $migration_candidates) = @_; + + my $result = + eval { 
$self->{scheduler}->select_best_balancing_migration_topsis($migration_candidates) }; + $self->{haenv}->log('warning', "unable to select best balancing migration - $@") if $@; + + return $result; +} + sub score_nodes_to_start_service { my ($self, $sid) = @_; -- 2.47.3