From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id ABD491FF144 for ; Tue, 24 Mar 2026 19:33:10 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 641C01AFEC; Tue, 24 Mar 2026 19:31:36 +0100 (CET) From: Daniel Kral To: pve-devel@lists.proxmox.com Subject: [PATCH proxmox v2 09/40] resource-scheduling: implement rebalancing migration selection Date: Tue, 24 Mar 2026 19:29:53 +0100 Message-ID: <20260324183029.1274972-10-d.kral@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260324183029.1274972-1-d.kral@proxmox.com> References: <20260324183029.1274972-1-d.kral@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1774376988153 X-SPAM-LEVEL: Spam detection results: 0 AWL -1.593 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment POISEN_SPAM_PILL 0.1 Meta: its spam POISEN_SPAM_PILL_1 0.1 random spam to be learned in bayes POISEN_SPAM_PILL_3 0.1 random spam to be learned in bayes SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLACK 3 Contains an URL listed in the URIBL blacklist [node.name] Message-ID-Hash: PPROJ6B35WBFKD3GAUXYAHKFQMGLGEH5 X-Message-ID-Hash: PPROJ6B35WBFKD3GAUXYAHKFQMGLGEH5 X-MailFrom: d.kral@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development 
discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Assuming that a resource will hold the same dynamic resource usage on a new node as on the previous node, score possible migrations, where: - the cluster node imbalance is minimal (brute force), or - the shifted root mean square and maximum resource usages of the cpu and memory are minimal across the cluster nodes (TOPSIS). Signed-off-by: Daniel Kral --- changes v1 -> v2: - add saturating_sub() in remove_running_resource(...) (as suggested by @Thomas) - slightly move declarations and impls around so that reading from top-to-bottom is a little easier - pass NodeUsage vec instead of NodeStats vec to calculate_node_imbalance(...) - pass a closure to calculate_node_imbalance(...) (as suggested by @Dominik) - also use `migration` for `Ord` impl of `ScoredMigration`, s.t. the struct is now ordered first by the imbalance and then the strings in the `Migration` struct - fix floating-point issue for the imbalance ordering for ScoredMigration - correctly implement `Ord` (essentially removing the reverse() and moving these Reverse() wrappers to the usages for the BinaryHeap) - use the `Migration` struct in `MigrationCandidate` as well - drop Scheduler::node_stats() as it's unused now - use Vec::with_capacity(...) 
where possible - eagerly implement common traits (especially Clone and Debug) - add test cases for the ScoredMigration ordering, node imbalance calculation and the two rebalancing migration scoring methods - s/score_best_balancing_migrations /score_best_balancing_migration_candidates to possibly allow the Scheduler/Usage impls handling the migration candidate generation in the future instead of the callers proxmox-resource-scheduling/src/node.rs | 17 ++ proxmox-resource-scheduling/src/scheduler.rs | 282 ++++++++++++++++++ .../tests/scheduler.rs | 169 ++++++++++- 3 files changed, 467 insertions(+), 1 deletion(-) diff --git a/proxmox-resource-scheduling/src/node.rs b/proxmox-resource-scheduling/src/node.rs index be462782..2dcef75e 100644 --- a/proxmox-resource-scheduling/src/node.rs +++ b/proxmox-resource-scheduling/src/node.rs @@ -29,6 +29,18 @@ impl NodeStats { self.mem += resource_stats.maxmem; } + /// Adds the resource stats to the node stats as if the resource is running on the node. + pub fn add_running_resource(&mut self, resource_stats: &ResourceStats) { + self.cpu += resource_stats.cpu; + self.mem += resource_stats.mem; + } + + /// Removes the resource stats from the node stats as if the resource is not running on the node. + pub fn remove_running_resource(&mut self, resource_stats: &ResourceStats) { + self.cpu -= resource_stats.cpu; + self.mem = self.mem.saturating_sub(resource_stats.mem); + } + /// Returns the current cpu usage as a percentage. pub fn cpu_load(&self) -> f64 { self.cpu / self.maxcpu as f64 @@ -38,6 +50,11 @@ impl NodeStats { pub fn mem_load(&self) -> f64 { self.mem as f64 / self.maxmem as f64 } + + /// Returns a combined node usage as a percentage. + pub fn load(&self) -> f64 { + (self.cpu_load() + self.mem_load()) / 2.0 + } } /// A node in the cluster context. 
diff --git a/proxmox-resource-scheduling/src/scheduler.rs b/proxmox-resource-scheduling/src/scheduler.rs index 69dc6f4e..a25babad 100644 --- a/proxmox-resource-scheduling/src/scheduler.rs +++ b/proxmox-resource-scheduling/src/scheduler.rs @@ -2,6 +2,12 @@ use anyhow::Error; use crate::{node::NodeStats, resource::ResourceStats, topsis}; +use serde::{Deserialize, Serialize}; +use std::{ + cmp::{Ordering, Reverse}, + collections::BinaryHeap, +}; + /// The scheduler view of a node. #[derive(Clone, Debug)] pub struct NodeUsage { @@ -11,6 +17,36 @@ pub struct NodeUsage { pub stats: NodeStats, } +/// Returns the load imbalance among the nodes. +/// +/// The load balance is measured as the statistical dispersion of the individual node loads. +/// +/// The current implementation uses the dimensionless coefficient of variation, which expresses the +/// standard deviation in relation to the average mean of the node loads. +/// +/// The coefficient of variation is not robust, which is a desired property here, because outliers +/// should be detected as much as possible. +fn calculate_node_imbalance(nodes: &[NodeUsage], to_load: impl Fn(&NodeUsage) -> f64) -> f64 { + let node_count = nodes.len(); + let node_loads = nodes.iter().map(to_load).collect::>(); + + let load_sum = node_loads.iter().sum::(); + + // load_sum is guaranteed to be -0.0 for empty `nodes` + if load_sum == 0.0 { + 0.0 + } else { + let load_mean = load_sum / node_count as f64; + + let squared_diff_sum = node_loads + .iter() + .fold(0.0, |sum, node_load| sum + (node_load - load_mean).powi(2)); + let load_sd = (squared_diff_sum / node_count as f64).sqrt(); + + load_sd / load_mean + } +} + criteria_struct! { /// A given alternative. struct PveTopsisAlternative { @@ -32,6 +68,83 @@ pub struct Scheduler { nodes: Vec, } +/// A possible migration. 
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct Migration { + /// The identifier of a leading resource. + pub sid: String, + /// The current node of the leading resource. + pub source_node: String, + /// The possible migration target node for the resource. + pub target_node: String, +} + +/// A possible migration with a score. +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub struct ScoredMigration { + /// The possible migration. + pub migration: Migration, + /// The expected node imbalance after the migration. + pub imbalance: f64, +} + +impl Ord for ScoredMigration { + fn cmp(&self, other: &Self) -> Ordering { + self.imbalance + .total_cmp(&other.imbalance) + .then(self.migration.cmp(&other.migration)) + } +} + +impl PartialOrd for ScoredMigration { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ScoredMigration { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for ScoredMigration {} + +impl ScoredMigration { + pub fn new>(migration: T, imbalance: f64) -> Self { + // Depending how the imbalance is calculated, it can contain minor approximation errors. As + // this struct implements the Ord trait, users of the struct's cmp() can run into cases, + // where the imbalance is the same up to the significant digits in base 10, but treated as + // different values. + // + // Therefore, truncate any non-significant digits to prevent these cases. + let factor = 10_f64.powf(f64::DIGITS as f64); + let truncated_imbalance = f64::trunc(factor * imbalance) / factor; + + Self { + migration: migration.into(), + imbalance: truncated_imbalance, + } + } +} + +/// A possible migration candidate with the migrated usage stats. +#[derive(Clone, Debug)] +pub struct MigrationCandidate { + /// The possible migration. 
+ pub migration: Migration, + /// The to-be-migrated resource usage stats. + pub stats: ResourceStats, +} + +impl From for Migration { + fn from(candidate: MigrationCandidate) -> Self { + candidate.migration + } +} + impl Scheduler { /// Instantiate scheduler instance from node usages. pub fn from_nodes(nodes: I) -> Self @@ -81,6 +194,123 @@ impl Scheduler { } } + /// Returns the load imbalance among the nodes. + /// + /// See [`calculate_node_imbalance`] for more information. + pub fn node_imbalance(&self) -> f64 { + calculate_node_imbalance(&self.nodes, |node| node.stats.load()) + } + + /// Returns the load imbalance among the nodes as if a specific resource was moved. + /// + /// See [`calculate_node_imbalance`] for more information. + fn node_imbalance_with_migration_candidate(&self, candidate: &MigrationCandidate) -> f64 { + calculate_node_imbalance(&self.nodes, |node| { + let mut new_stats = node.stats; + + if node.name == candidate.migration.source_node { + new_stats.remove_running_resource(&candidate.stats); + } else if node.name == candidate.migration.target_node { + new_stats.add_running_resource(&candidate.stats); + } + + new_stats.load() + }) + } + + /// Scores the given migration `candidates` by the best node imbalance improvement with + /// exhaustive search. + /// + /// The `candidates` are assumed to be consistent with the scheduler. No further validation is + /// done whether the given nodenames actually exist in the scheduler. + /// + /// The scoring is done as if each resource migration has already been done. This assumes that + /// the already migrated resource consumes the same amount of each stat as on the previous node + /// according to its `stats`. + /// + /// Returns up to `limit` of the best scored migrations. 
+ pub fn score_best_balancing_migration_candidates( + &self, + candidates: I, + limit: usize, + ) -> Vec + where + I: IntoIterator, + { + let mut scored_migrations = candidates + .into_iter() + .map(|candidate| { + let imbalance = self.node_imbalance_with_migration_candidate(&candidate); + + Reverse(ScoredMigration::new(candidate, imbalance)) + }) + .collect::>(); + + let mut best_migrations = Vec::with_capacity(limit); + + // BinaryHeap::into_iter_sorted() is still in nightly unfortunately + while best_migrations.len() < limit { + match scored_migrations.pop() { + Some(Reverse(alternative)) => best_migrations.push(alternative), + None => break, + } + } + + best_migrations + } + + /// Scores the given migration `candidates` by the best node imbalance improvement with the + /// TOPSIS method. + /// + /// The `candidates` are assumed to be consistent with the scheduler. No further validation is + /// done whether the given nodenames actually exist in the scheduler. + /// + /// The scoring is done as if each resource migration has already been done. This assumes that + /// the already migrated resource consumes the same amount of each stat as on the previous node + /// according to its `stats`. + /// + /// Returns up to `limit` of the best scored migrations. 
+ pub fn score_best_balancing_migration_candidates_topsis( + &self, + candidates: &[MigrationCandidate], + limit: usize, + ) -> Result, Error> { + let matrix = candidates + .iter() + .map(|candidate| { + let resource_stats = &candidate.stats; + let source_node = &candidate.migration.source_node; + let target_node = &candidate.migration.target_node; + + self.topsis_alternative_with(|node| { + let mut new_stats = node.stats; + + if &node.name == source_node { + new_stats.remove_running_resource(resource_stats); + } else if &node.name == target_node { + new_stats.add_running_resource(resource_stats); + } + + new_stats + }) + .into() + }) + .collect::>(); + + let best_alternatives = + topsis::rank_alternatives(&topsis::Matrix::new(matrix)?, &PVE_HA_TOPSIS_CRITERIA)?; + + Ok(best_alternatives + .into_iter() + .take(limit) + .map(|i| { + let imbalance = self.node_imbalance_with_migration_candidate(&candidates[i]); + + ScoredMigration::new(candidates[i].clone(), imbalance) + }) + .collect()) + } + /// Scores nodes to start a resource with the usage statistics `resource_stats` on. /// /// The scoring is done as if the resource is already started on each node. 
This assumes that @@ -122,3 +352,55 @@ impl Scheduler { .collect()) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_scored_migration_order() { + let migration1 = ScoredMigration::new( + Migration { + sid: String::from("vm:102"), + source_node: String::from("node1"), + target_node: String::from("node2"), + }, + 0.7231749488916931, + ); + let migration2 = ScoredMigration::new( + Migration { + sid: String::from("vm:102"), + source_node: String::from("node1"), + target_node: String::from("node3"), + }, + 0.723174948891693, + ); + let migration3 = ScoredMigration::new( + Migration { + sid: String::from("vm:101"), + source_node: String::from("node1"), + target_node: String::from("node2"), + }, + 0.723174948891693 + 1e-15, + ); + + let mut migrations = vec![migration2.clone(), migration3.clone(), migration1.clone()]; + + migrations.sort(); + + assert_eq!( + vec![migration1.clone(), migration2.clone(), migration3.clone()], + migrations + ); + + let mut heap = BinaryHeap::from(vec![ + Reverse(migration2.clone()), + Reverse(migration3.clone()), + Reverse(migration1.clone()), + ]); + + assert_eq!(heap.pop(), Some(Reverse(migration1))); + assert_eq!(heap.pop(), Some(Reverse(migration2))); + assert_eq!(heap.pop(), Some(Reverse(migration3))); + } +} diff --git a/proxmox-resource-scheduling/tests/scheduler.rs b/proxmox-resource-scheduling/tests/scheduler.rs index c7a9dab9..8672f40d 100644 --- a/proxmox-resource-scheduling/tests/scheduler.rs +++ b/proxmox-resource-scheduling/tests/scheduler.rs @@ -2,9 +2,13 @@ use anyhow::Error; use proxmox_resource_scheduling::{ node::NodeStats, resource::ResourceStats, - scheduler::{NodeUsage, Scheduler}, + scheduler::{Migration, MigrationCandidate, NodeUsage, Scheduler, ScoredMigration}, }; +fn new_empty_cluster_scheduler() -> Scheduler { + Scheduler::from_nodes(Vec::::new()) +} + fn new_homogeneous_cluster_scheduler() -> Scheduler { let (maxcpu, maxmem) = (16, 64 * (1 << 30)); @@ -75,6 +79,169 @@ fn 
new_heterogeneous_cluster_scheduler() -> Scheduler { Scheduler::from_nodes(vec![node1, node2, node3]) } +#[test] +fn test_node_imbalance_with_empty_cluster() { + let scheduler = new_empty_cluster_scheduler(); + + assert_eq!(scheduler.node_imbalance(), 0.0); +} + +#[test] +fn test_node_imbalance_with_perfectly_balanced_cluster() { + let node = NodeUsage { + name: String::from("node1"), + stats: NodeStats { + cpu: 1.7, + maxcpu: 16, + mem: 224395264, + maxmem: 68719476736, + }, + }; + + let scheduler = Scheduler::from_nodes(vec![node.clone()]); + + assert_eq!(scheduler.node_imbalance(), 0.0); + + let scheduler = Scheduler::from_nodes(vec![node.clone(), node.clone(), node]); + + assert_eq!(scheduler.node_imbalance(), 0.0); +} + +fn new_simple_migration_candidates() -> (Vec, Migration, Migration) { + let migration1 = Migration { + sid: String::from("vm:101"), + source_node: String::from("node1"), + target_node: String::from("node2"), + }; + let migration2 = Migration { + sid: String::from("vm:101"), + source_node: String::from("node1"), + target_node: String::from("node3"), + }; + let stats = ResourceStats { + cpu: 0.7, + maxcpu: 4.0, + mem: 8 << 30, + maxmem: 16 << 30, + }; + + let candidates = vec![ + MigrationCandidate { + migration: migration1.clone(), + stats, + }, + MigrationCandidate { + migration: migration2.clone(), + stats, + }, + ]; + + (candidates, migration1, migration2) +} + +fn assert_imbalance(imbalance: f64, expected_imbalance: f64) { + assert!( + (expected_imbalance - imbalance).abs() <= f64::EPSILON, + "imbalance is {imbalance}, but was expected to be {expected_imbalance}" + ); +} + +#[test] +fn test_score_best_balancing_migration_candidates_with_no_candidates() { + let scheduler = new_homogeneous_cluster_scheduler(); + + assert_eq!( + scheduler.score_best_balancing_migration_candidates(vec![], 2), + vec![] + ); +} + +#[test] +fn test_score_best_balancing_migration_candidates_in_homogeneous_cluster() { + let scheduler = 
new_homogeneous_cluster_scheduler(); + + assert_imbalance(scheduler.node_imbalance(), 0.4893954724628247); + + let (candidates, migration1, migration2) = new_simple_migration_candidates(); + + assert_eq!( + scheduler.score_best_balancing_migration_candidates(candidates, 2), + vec![ + ScoredMigration::new(migration2.clone(), 0.5972874658664057), + ScoredMigration::new(migration1.clone(), 0.7239828690397611) + ] + ); +} + +#[test] +fn test_score_best_balancing_migration_candidates_in_heterogeneous_cluster() { + let scheduler = new_heterogeneous_cluster_scheduler(); + + assert_imbalance(scheduler.node_imbalance(), 0.33026013056867354); + + let (candidates, migration1, migration2) = new_simple_migration_candidates(); + + assert_eq!( + scheduler.score_best_balancing_migration_candidates(candidates, 2), + vec![ + ScoredMigration::new(migration2, 0.525031850557711), + ScoredMigration::new(migration1, 0.5794177040605537) + ] + ); +} + +#[test] +fn test_score_best_balancing_migration_candidates_topsis_with_no_candidates() -> Result<(), Error> { + let scheduler = new_homogeneous_cluster_scheduler(); + + assert_eq!( + scheduler.score_best_balancing_migration_candidates_topsis(&vec![], 2)?, + vec![] + ); + + Ok(()) +} + +#[test] +fn test_score_best_balancing_migration_candidates_topsis_in_homogeneous_cluster( +) -> Result<(), Error> { + let scheduler = new_homogeneous_cluster_scheduler(); + + assert_imbalance(scheduler.node_imbalance(), 0.4893954724628247); + + let (candidates, migration1, migration2) = new_simple_migration_candidates(); + + assert_eq!( + scheduler.score_best_balancing_migration_candidates_topsis(&candidates, 2)?, + vec![ + ScoredMigration::new(migration1.clone(), 0.7239828690397611), + ScoredMigration::new(migration2.clone(), 0.5972874658664057), + ] + ); + + Ok(()) +} + +#[test] +fn test_score_best_balancing_migration_candidates_topsis_in_heterogeneous_cluster( +) -> Result<(), Error> { + let scheduler = new_heterogeneous_cluster_scheduler(); + + 
assert_imbalance(scheduler.node_imbalance(), 0.33026013056867354); + + let (candidates, migration1, migration2) = new_simple_migration_candidates(); + + assert_eq!( + scheduler.score_best_balancing_migration_candidates_topsis(&candidates, 2)?, + vec![ + ScoredMigration::new(migration1, 0.5794177040605537), + ScoredMigration::new(migration2, 0.525031850557711), + ] + ); + + Ok(()) +} + fn rank_nodes_to_start_resource( scheduler: &Scheduler, resource_stats: ResourceStats, -- 2.47.3