From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 297991FF13A for ; Wed, 29 Apr 2026 14:21:49 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 654D54B31; Wed, 29 Apr 2026 14:21:32 +0200 (CEST) From: Dominik Rusovac To: pve-devel@lists.proxmox.com Subject: [PATCH proxmox v2 1/6] resource-scheduling: clamp imbalance value to unit interval Date: Wed, 29 Apr 2026 14:20:46 +0200 Message-ID: <20260429122051.179485-2-d.rusovac@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260429122051.179485-1-d.rusovac@proxmox.com> References: <20260429122051.179485-1-d.rusovac@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1777465159921 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.406 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: TYVYX5N72WZAECDJJFCW7IBQSXCXG2YR X-Message-ID-Hash: TYVYX5N72WZAECDJJFCW7IBQSXCXG2YR X-MailFrom: d.rusovac@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: The currently used load imbalance value is given as the so-called coefficient of variation (CV), a value that may exceed 1. 
As such, the CV value alone lacks meaning. A CV value of 0.0 means no imbalance, but what does a value of, say, 1.7 mean? Relative to the number of nodes in a cluster, it is possible to determine the upper bound of the CV value [0][1]. By dividing the CV value by its upper bound, the load imbalance can be represented as a value that varies between 0 and 1. Expressing the CV as a percentage makes the concept of load imbalance easier to interpret. Re-adjust hardcoded imbalance values within tests accordingly. [0] https://repositorio.ipbeja.pt/server/api/core/bitstreams/8ed9a444-dbe0-402f-9d2f-90c5bf6e418c/content [1] https://stats.stackexchange.com/questions/18621/maximum-value-of-coefficient-of-variation-for-bounded-data-set Signed-off-by: Dominik Rusovac <d.rusovac@proxmox.com> --- Notes: changes since v1: * squash commit that re-adjusts tests into this one * back to multiple `as f64` casts of node_count variable * go from if-else to early-return * make comment above early return clause more explanatory * re-order cv and max_cv bindings * add comment with ref relating to computation of max_cv proxmox-resource-scheduling/src/scheduler.rs | 47 +++++++++++-------- .../tests/scheduler.rs | 8 ++-- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/proxmox-resource-scheduling/src/scheduler.rs b/proxmox-resource-scheduling/src/scheduler.rs index 49d16f9f..87eccfee 100644 --- a/proxmox-resource-scheduling/src/scheduler.rs +++ b/proxmox-resource-scheduling/src/scheduler.rs @@ -17,34 +17,43 @@ pub struct NodeUsage { pub stats: NodeStats, } -/// Returns the load imbalance among the nodes. +/// Returns the load imbalance among the nodes, which is a value between 0 and 1 that describes the +/// statistical dispersion of the individual node loads around the mean node load. The lower the +/// value, the better. /// -/// The load balance is measured as the statistical dispersion of the individual node loads. 
-/// -/// The current implementation uses the dimensionless coefficient of variation, which expresses the -/// standard deviation in relation to the average mean of the node loads. -/// -/// The coefficient of variation is not robust, which is a desired property here, because outliers -/// should be detected as much as possible. +/// In more detail, the current implementation computes the so-called coefficient of variation (CV), +/// which is the ratio of the standard deviation to the mean of the given node loads. The lower +/// bound of the CV is reached if all node loads are equal. The upper bound is reached if all nodes +/// except one are idle. To present the CV as a value between 0 and 1, it is divided by the +/// upper bound of the CV for the given number of nodes. fn calculate_node_imbalance(nodes: &[NodeUsage], to_load: impl Fn(&NodeUsage) -> f64) -> f64 { let node_count = nodes.len(); - let node_loads = nodes.iter().map(to_load).collect::<Vec<_>>(); + // early return with perfect balance to avoid division by zero + if node_count < 2 { + return 0.0; + } + + let node_loads = nodes.iter().map(to_load).collect::<Vec<_>>(); let load_sum = node_loads.iter().sum::<f64>(); - // load_sum is guaranteed to be -0.0 for empty `nodes` + // early return with perfect balance to avoid division by zero if load_sum == 0.0 { - 0.0 - } else { - let load_mean = load_sum / node_count as f64; + return 0.0; + } - let squared_diff_sum = node_loads - .iter() - .fold(0.0, |sum, node_load| sum + (node_load - load_mean).powi(2)); - let load_sd = (squared_diff_sum / node_count as f64).sqrt(); + let load_mean = load_sum / node_count as f64; + let squared_diff_sum = node_loads + .iter() + .fold(0.0, |sum, node_load| sum + (node_load - load_mean).powi(2)); + let load_sd = (squared_diff_sum / node_count as f64).sqrt(); - load_sd / load_mean - } + let cv = load_sd / load_mean; + + // https://stats.stackexchange.com/questions/18621 + let max_cv = ((node_count - 1) as f64).sqrt(); + + cv / max_cv } 
criteria_struct! { diff --git a/proxmox-resource-scheduling/tests/scheduler.rs b/proxmox-resource-scheduling/tests/scheduler.rs index be90e4f9..21dbe451 100644 --- a/proxmox-resource-scheduling/tests/scheduler.rs +++ b/proxmox-resource-scheduling/tests/scheduler.rs @@ -172,7 +172,7 @@ fn test_score_best_balancing_migration_candidates_with_no_candidates() { fn test_score_best_balancing_migration_candidates_in_homogeneous_cluster() { let scheduler = new_homogeneous_cluster_scheduler(); - assert_imbalance(scheduler.node_imbalance(), 0.4893954724628247); + assert_imbalance(scheduler.node_imbalance(), 0.3460548572604576); let (candidates, migration1, migration2) = new_simple_migration_candidates(); @@ -186,7 +186,7 @@ fn test_score_best_balancing_migration_candidates_in_homogeneous_cluster() { fn test_score_best_balancing_migration_candidates_in_heterogeneous_cluster() { let scheduler = new_heterogeneous_cluster_scheduler(); - assert_imbalance(scheduler.node_imbalance(), 0.33026013056867354); + assert_imbalance(scheduler.node_imbalance(), 0.23352917788066363); let (candidates, migration1, migration2) = new_simple_migration_candidates(); @@ -225,7 +225,7 @@ fn test_score_best_balancing_migration_candidates_topsis_in_homogeneous_cluster( ) -> Result<(), Error> { let scheduler = new_homogeneous_cluster_scheduler(); - assert_imbalance(scheduler.node_imbalance(), 0.4893954724628247); + assert_imbalance(scheduler.node_imbalance(), 0.3460548572604576); let (candidates, migration1, migration2) = new_simple_migration_candidates(); @@ -242,7 +242,7 @@ fn test_score_best_balancing_migration_candidates_topsis_in_heterogeneous_cluste ) -> Result<(), Error> { let scheduler = new_heterogeneous_cluster_scheduler(); - assert_imbalance(scheduler.node_imbalance(), 0.33026013056867354); + assert_imbalance(scheduler.node_imbalance(), 0.23352917788066363); let (candidates, migration1, migration2) = new_simple_migration_candidates(); -- 2.47.3