From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 1A6371FF144 for ; Tue, 24 Mar 2026 19:34:47 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 811291C102; Tue, 24 Mar 2026 19:32:01 +0100 (CET) From: Daniel Kral To: pve-devel@lists.proxmox.com Subject: [PATCH proxmox v2 04/40] resource-scheduling: introduce generic scheduler implementation Date: Tue, 24 Mar 2026 19:29:48 +0100 Message-ID: <20260324183029.1274972-5-d.kral@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260324183029.1274972-1-d.kral@proxmox.com> References: <20260324183029.1274972-1-d.kral@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1774376987804 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.057 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: JCHMZAFFOLW2XGUEXBN4SIPWUOK77HSC X-Message-ID-Hash: JCHMZAFFOLW2XGUEXBN4SIPWUOK77HSC X-MailFrom: d.kral@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: The existing score_nodes_to_start_resource(...) function is dependent on the StaticNodeUsage and StaticServiceUsage structs. 
To use this function for other usage stats structs as well, declare generic NodeStats and ResourceStats structs that the users can convert into. These are used to make score_nodes_to_start_resource(...) and its documentation generic. The pve_static::score_nodes_to_start_service(...) is marked as deprecated accordingly. The usage-related structs are marked as deprecated as well, as the specific usage implementations - including their serialization and deserialization - should be handled by the caller now. This is best viewed with the git option --ignore-all-space. No functional changes intended. Signed-off-by: Daniel Kral --- This patch was "[RFC proxmox 2/5] resource-scheduling: introduce generic cluster usage implementation" in the RFC v1. Sorry for the ambiguous naming with the next patch! changes v1 -> v2: - add more information to the patch message - split out `NodeStats` and `ResourceStats` to their own modules, which will also be used in an upcoming patch for the `Usage` implementation - add deprecation note to pve_static::score_nodes_to_start_service() - add deprecation attribute to other major pve_static items as well - impl `Add` and `Sum` for ResourceStats, which will be used for the resource bundling in pve-rs later - eagerly implement common traits (especially Clone and Debug) - add test cases for the scheduler::Scheduler::score_nodes_to_start_resource() proxmox-resource-scheduling/src/lib.rs | 6 + proxmox-resource-scheduling/src/node.rs | 39 ++++ proxmox-resource-scheduling/src/pve_static.rs | 46 +++- proxmox-resource-scheduling/src/resource.rs | 33 +++ proxmox-resource-scheduling/src/scheduler.rs | 157 ++++++++------ .../tests/scheduler.rs | 200 ++++++++++++++++++ 6 files changed, 408 insertions(+), 73 deletions(-) create mode 100644 proxmox-resource-scheduling/src/node.rs create mode 100644 proxmox-resource-scheduling/src/resource.rs create mode 100644 proxmox-resource-scheduling/tests/scheduler.rs diff --git a/proxmox-resource-scheduling/src/lib.rs 
b/proxmox-resource-scheduling/src/lib.rs index c73e7b1e..12b743fe 100644 --- a/proxmox-resource-scheduling/src/lib.rs +++ b/proxmox-resource-scheduling/src/lib.rs @@ -1,6 +1,12 @@ #[macro_use] pub mod topsis; +pub mod node; +pub mod resource; + pub mod scheduler; +// pve_static exists only for backwards compatibility to not break builds +// The allow(deprecated) is to not report its own use of deprecated items +#[allow(deprecated)] pub mod pve_static; diff --git a/proxmox-resource-scheduling/src/node.rs b/proxmox-resource-scheduling/src/node.rs new file mode 100644 index 00000000..e6227eda --- /dev/null +++ b/proxmox-resource-scheduling/src/node.rs @@ -0,0 +1,39 @@ +use crate::resource::ResourceStats; + +/// Usage statistics of a node. +#[derive(Copy, Clone, PartialEq, PartialOrd, Debug, Default)] +pub struct NodeStats { + /// CPU utilization in CPU cores. + pub cpu: f64, + /// Total number of CPU cores. + pub maxcpu: usize, + /// Used memory in bytes. + pub mem: usize, + /// Total memory in bytes. + pub maxmem: usize, +} + +impl NodeStats { + /// Adds the resource stats to the node stats as if the resource has started on the node. + pub fn add_started_resource(&mut self, resource_stats: &ResourceStats) { + // a maxcpu value of `0.0` means no cpu usage limit on the node + let resource_cpu = if resource_stats.maxcpu == 0.0 { + self.maxcpu as f64 + } else { + resource_stats.maxcpu + }; + + self.cpu += resource_cpu; + self.mem += resource_stats.maxmem; + } + + /// Returns the current cpu usage as a percentage. + pub fn cpu_load(&self) -> f64 { + self.cpu / self.maxcpu as f64 + } + + /// Returns the current memory usage as a percentage. 
+ pub fn mem_load(&self) -> f64 { + self.mem as f64 / self.maxmem as f64 + } +} diff --git a/proxmox-resource-scheduling/src/pve_static.rs b/proxmox-resource-scheduling/src/pve_static.rs index c7e1d1b1..229ee3c6 100644 --- a/proxmox-resource-scheduling/src/pve_static.rs +++ b/proxmox-resource-scheduling/src/pve_static.rs @@ -1,10 +1,12 @@ use anyhow::Error; use serde::{Deserialize, Serialize}; -use crate::scheduler; +use crate::scheduler::{NodeUsage, Scheduler}; +use crate::{node::NodeStats, resource::ResourceStats}; -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] +#[deprecated = "specific node usage structs should be declared where they are used"] /// Static usage information of a node. pub struct StaticNodeUsage { /// Hostname of the node. @@ -33,6 +35,22 @@ impl AsRef for StaticNodeUsage { } } +impl From for NodeUsage { + fn from(usage: StaticNodeUsage) -> Self { + let stats = NodeStats { + cpu: usage.cpu, + maxcpu: usage.maxcpu, + mem: usage.mem, + maxmem: usage.maxmem, + }; + + Self { + name: usage.name, + stats, + } + } +} + /// Calculate new CPU usage in percent. /// `add` being `0.0` means "unlimited" and results in `max` being added. fn add_cpu_usage(old: f64, max: f64, add: f64) -> f64 { @@ -43,8 +61,9 @@ fn add_cpu_usage(old: f64, max: f64, add: f64) -> f64 { } } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Copy, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] +#[deprecated = "specific service usage structs should be declared where they are used"] /// Static usage information of an HA resource. pub struct StaticServiceUsage { /// Number of assigned CPUs or CPU limit. 
@@ -53,14 +72,33 @@ pub struct StaticServiceUsage { pub maxmem: usize, } +impl From for ResourceStats { + fn from(usage: StaticServiceUsage) -> Self { + Self { + cpu: usage.maxcpu, + maxcpu: usage.maxcpu, + mem: usage.maxmem, + maxmem: usage.maxmem, + } + } +} + /// Scores candidate `nodes` to start a `service` on. Scoring is done according to the static memory /// and CPU usages of the nodes as if the service would already be running on each. /// /// Returns a vector of (nodename, score) pairs. Scores are between 0.0 and 1.0 and a higher score /// is better. +#[deprecated = "use Scheduler::score_nodes_to_start_resource(...) directly instead"] pub fn score_nodes_to_start_service>( nodes: &[T], service: &StaticServiceUsage, ) -> Result, Error> { - scheduler::score_nodes_to_start_resource(nodes, service) + let nodes = nodes + .iter() + .map(|node| node.as_ref().clone().into()) + .collect::>(); + + let scheduler = Scheduler::from_nodes(nodes); + + scheduler.score_nodes_to_start_resource(*service) } diff --git a/proxmox-resource-scheduling/src/resource.rs b/proxmox-resource-scheduling/src/resource.rs new file mode 100644 index 00000000..1eb9d15e --- /dev/null +++ b/proxmox-resource-scheduling/src/resource.rs @@ -0,0 +1,33 @@ +use std::{iter::Sum, ops::Add}; + +/// Usage statistics for a resource. +#[derive(Copy, Clone, PartialEq, PartialOrd, Debug, Default)] +pub struct ResourceStats { + /// CPU utilization in CPU cores. + pub cpu: f64, + /// Number of assigned CPUs or CPU limit. + pub maxcpu: f64, + /// Used memory in bytes. + pub mem: usize, + /// Maximum assigned memory in bytes. 
+ pub maxmem: usize, +} + +impl Add for ResourceStats { + type Output = Self; + + fn add(self, other: Self) -> Self { + Self { + cpu: self.cpu + other.cpu, + maxcpu: self.maxcpu + other.maxcpu, + mem: self.mem + other.mem, + maxmem: self.maxmem + other.maxmem, + } + } +} + +impl Sum for ResourceStats { + fn sum>(iter: I) -> Self { + iter.fold(Self::default(), |a, b| a + b) + } +} diff --git a/proxmox-resource-scheduling/src/scheduler.rs b/proxmox-resource-scheduling/src/scheduler.rs index 39ee44ce..bb38f238 100644 --- a/proxmox-resource-scheduling/src/scheduler.rs +++ b/proxmox-resource-scheduling/src/scheduler.rs @@ -1,9 +1,15 @@ use anyhow::Error; -use crate::{ - pve_static::{StaticNodeUsage, StaticServiceUsage}, - topsis, -}; +use crate::{node::NodeStats, resource::ResourceStats, topsis}; + +/// The scheduler view of a node. +#[derive(Clone, Debug)] +pub struct NodeUsage { + /// The identifier of the node. + pub name: String, + /// The usage statistics of the node. + pub stats: NodeStats, +} criteria_struct! { /// A given alternative. @@ -22,69 +28,82 @@ criteria_struct! { static PVE_HA_TOPSIS_CRITERIA; } -/// Scores candidate `nodes` to start a `resource` on. Scoring is done according to the static memory -/// and CPU usages of the nodes as if the resource would already be running on each. -/// -/// Returns a vector of (nodename, score) pairs. Scores are between 0.0 and 1.0 and a higher score -/// is better. -pub fn score_nodes_to_start_resource>( - nodes: &[T], - resource: &StaticServiceUsage, -) -> Result, Error> { - let len = nodes.len(); - - let matrix = nodes - .iter() - .enumerate() - .map(|(target_index, _)| { - // Base values on percentages to allow comparing nodes with different stats. 
- let mut highest_cpu = 0.0; - let mut squares_cpu = 0.0; - let mut highest_mem = 0.0; - let mut squares_mem = 0.0; - - for (index, node) in nodes.iter().enumerate() { - let node = node.as_ref(); - let new_cpu = if index == target_index { - if resource.maxcpu == 0.0 { - node.cpu + node.maxcpu as f64 - } else { - node.cpu + resource.maxcpu - } - } else { - node.cpu - } / (node.maxcpu as f64); - highest_cpu = f64::max(highest_cpu, new_cpu); - squares_cpu += new_cpu.powi(2); - - let new_mem = if index == target_index { - node.mem + resource.maxmem - } else { - node.mem - } as f64 - / node.maxmem as f64; - highest_mem = f64::max(highest_mem, new_mem); - squares_mem += new_mem.powi(2); - } - - // Add 1.0 to avoid boosting tiny differences: e.g. 0.004 is twice as much as 0.002, but - // 1.004 is only slightly more than 1.002. - PveTopsisAlternative { - average_cpu: 1.0 + (squares_cpu / len as f64).sqrt(), - highest_cpu: 1.0 + highest_cpu, - average_memory: 1.0 + (squares_mem / len as f64).sqrt(), - highest_memory: 1.0 + highest_mem, - } - .into() - }) - .collect::>(); - - let scores = - topsis::score_alternatives(&topsis::Matrix::new(matrix)?, &PVE_HA_TOPSIS_CRITERIA)?; - - Ok(scores - .into_iter() - .enumerate() - .map(|(n, score)| (nodes[n].as_ref().name.clone(), score)) - .collect()) +pub struct Scheduler { + nodes: Vec, +} + +impl Scheduler { + /// Instantiate scheduler instance from node usages. + pub fn from_nodes(nodes: I) -> Self + where + I: IntoIterator>, + { + Self { + nodes: nodes.into_iter().map(|node| node.into()).collect(), + } + } + + /// Scores nodes to start a resource with the usage statistics `resource_stats` on. + /// + /// The scoring is done as if the resource is already started on each node. This assumes that + /// the already started resource consumes the maximum amount of each stat according to its + /// `resource_stats`. + /// + /// Returns a vector of (nodename, score) pairs. Scores are between 0.0 and 1.0 and a higher + /// score is better. 
+ pub fn score_nodes_to_start_resource>( + &self, + resource_stats: T, + ) -> Result, Error> { + let len = self.nodes.len(); + let resource_stats = resource_stats.into(); + + let matrix = self + .nodes + .iter() + .enumerate() + .map(|(target_index, _)| { + // Base values on percentages to allow comparing nodes with different stats. + let mut highest_cpu = 0.0; + let mut squares_cpu = 0.0; + let mut highest_mem = 0.0; + let mut squares_mem = 0.0; + + for (index, node) in self.nodes.iter().enumerate() { + let mut new_stats = node.stats; + + if index == target_index { + new_stats.add_started_resource(&resource_stats) + }; + + let new_cpu = new_stats.cpu_load(); + highest_cpu = f64::max(highest_cpu, new_cpu); + squares_cpu += new_cpu.powi(2); + + let new_mem = new_stats.mem_load(); + highest_mem = f64::max(highest_mem, new_mem); + squares_mem += new_mem.powi(2); + } + + // Add 1.0 to avoid boosting tiny differences: e.g. 0.004 is twice as much as 0.002, but + // 1.004 is only slightly more than 1.002. 
+ PveTopsisAlternative { + average_cpu: 1.0 + (squares_cpu / len as f64).sqrt(), + highest_cpu: 1.0 + highest_cpu, + average_memory: 1.0 + (squares_mem / len as f64).sqrt(), + highest_memory: 1.0 + highest_mem, + } + .into() + }) + .collect::>(); + + let scores = + topsis::score_alternatives(&topsis::Matrix::new(matrix)?, &PVE_HA_TOPSIS_CRITERIA)?; + + Ok(scores + .into_iter() + .enumerate() + .map(|(n, score)| (self.nodes[n].name.to_string(), score)) + .collect()) + } } diff --git a/proxmox-resource-scheduling/tests/scheduler.rs b/proxmox-resource-scheduling/tests/scheduler.rs new file mode 100644 index 00000000..c7a9dab9 --- /dev/null +++ b/proxmox-resource-scheduling/tests/scheduler.rs @@ -0,0 +1,200 @@ +use anyhow::Error; +use proxmox_resource_scheduling::{ + node::NodeStats, + resource::ResourceStats, + scheduler::{NodeUsage, Scheduler}, +}; + +fn new_homogeneous_cluster_scheduler() -> Scheduler { + let (maxcpu, maxmem) = (16, 64 * (1 << 30)); + + let node1 = NodeUsage { + name: String::from("node1"), + stats: NodeStats { + cpu: 1.7, + maxcpu, + mem: 12334 << 20, + maxmem, + }, + }; + + let node2 = NodeUsage { + name: String::from("node2"), + stats: NodeStats { + cpu: 15.184, + maxcpu, + mem: 529 << 20, + maxmem, + }, + }; + + let node3 = NodeUsage { + name: String::from("node3"), + stats: NodeStats { + cpu: 5.2, + maxcpu, + mem: 9381 << 20, + maxmem, + }, + }; + + Scheduler::from_nodes(vec![node1, node2, node3]) +} + +fn new_heterogeneous_cluster_scheduler() -> Scheduler { + let node1 = NodeUsage { + name: String::from("node1"), + stats: NodeStats { + cpu: 1.7, + maxcpu: 16, + mem: 12334 << 20, + maxmem: 128 << 30, + }, + }; + + let node2 = NodeUsage { + name: String::from("node2"), + stats: NodeStats { + cpu: 15.184, + maxcpu: 32, + mem: 529 << 20, + maxmem: 96 << 30, + }, + }; + + let node3 = NodeUsage { + name: String::from("node3"), + stats: NodeStats { + cpu: 5.2, + maxcpu: 24, + mem: 9381 << 20, + maxmem: 64 << 30, + }, + }; + + 
Scheduler::from_nodes(vec![node1, node2, node3]) +} + +fn rank_nodes_to_start_resource( + scheduler: &Scheduler, + resource_stats: ResourceStats, +) -> Result, Error> { + let mut alternatives = scheduler.score_nodes_to_start_resource(resource_stats)?; + + alternatives.sort_by(|a, b| b.1.total_cmp(&a.1)); + + Ok(alternatives + .iter() + .map(|alternative| alternative.0.to_string()) + .collect()) +} + +#[test] +fn test_score_homogeneous_nodes_to_start_resource() -> Result<(), Error> { + let scheduler = new_homogeneous_cluster_scheduler(); + + let heavy_memory_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 1.0, + mem: 0, + maxmem: 12 << 30, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, heavy_memory_resource_stats)?, + vec!["node2", "node3", "node1"] + ); + + let heavy_cpu_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 12.0, + mem: 0, + maxmem: 0, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, heavy_cpu_resource_stats)?, + vec!["node1", "node3", "node2"] + ); + + let unlimited_cpu_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 0.0, + mem: 0, + maxmem: 0, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_stats)?, + vec!["node1", "node3", "node2"] + ); + + let unlimited_cpu_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 12.0, + mem: 0, + maxmem: 12 << 30, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_stats)?, + vec!["node2", "node3", "node1"] + ); + + Ok(()) +} + +#[test] +fn test_score_heterogeneous_nodes_to_start_resource() -> Result<(), Error> { + let scheduler = new_heterogeneous_cluster_scheduler(); + + let heavy_memory_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 1.0, + mem: 0, + maxmem: 12 << 30, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, heavy_memory_resource_stats)?, + vec!["node2", "node1", "node3"] + ); + + let heavy_cpu_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 12.0, + mem: 
0, + maxmem: 0, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, heavy_cpu_resource_stats)?, + vec!["node3", "node2", "node1"] + ); + + let unlimited_cpu_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 0.0, + mem: 0, + maxmem: 0, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_stats)?, + vec!["node1", "node3", "node2"] + ); + + let unlimited_cpu_resource_stats = ResourceStats { + cpu: 0.0, + maxcpu: 12.0, + mem: 0, + maxmem: 12 << 30, + }; + + assert_eq!( + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_stats)?, + vec!["node2", "node1", "node3"] + ); + + Ok(()) +} -- 2.47.3