From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 6DAC51FF13F for ; Thu, 26 Mar 2026 11:19:24 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id A303AC17E; Thu, 26 Mar 2026 11:19:46 +0100 (CET) Content-Type: text/plain; charset=UTF-8 Date: Thu, 26 Mar 2026 11:19:08 +0100 Message-Id: Subject: Re: [PATCH proxmox v2 04/40] resource-scheduling: introduce generic scheduler implementation From: "Dominik Rusovac" To: "Daniel Kral" , Content-Transfer-Encoding: quoted-printable Mime-Version: 1.0 X-Mailer: aerc 0.20.0 References: <20260324183029.1274972-1-d.kral@proxmox.com> <20260324183029.1274972-5-d.kral@proxmox.com> In-Reply-To: <20260324183029.1274972-5-d.kral@proxmox.com> X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1774520299862 X-SPAM-LEVEL: Spam detection results: 0 AWL -1.184 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment RCVD_IN_VALIDITY_CERTIFIED_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_RPBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. RCVD_IN_VALIDITY_SAFE_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to Validity was blocked. See https://knowledge.validity.com/hc/en-us/articles/20961730681243 for more information. SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record URIBL_BLACK 3 Contains an URL listed in the URIBL blacklist [node.name] Message-ID-Hash: 7F5IEWDFZHSTUARKABKDDG53FRXDXAF4 X-Message-ID-Hash: 7F5IEWDFZHSTUARKABKDDG53FRXDXAF4 X-MailFrom: d.rusovac@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: lgtm=20 pls find my comments inline, mostly nits. On Tue Mar 24, 2026 at 7:29 PM CET, Daniel Kral wrote: > The existing score_nodes_to_start_resource(...) function is dependent on > the StaticNodeUsage and StaticServiceUsage structs. > > To use this function for other usage stats structs as well, declare > generic NodeStats and ResourceStats structs, that the users can convert > into. These are used to make score_nodes_to_start_resource(...) and its > documentation generic. > > The pve_static::score_nodes_to_start_service(...) is marked as > deprecated accordingly. The usage-related structs are marked as > deprecated as well as the specific usage implementations - including > their serialization and deserialization - should be handled by the > caller now. > > This is best viewed with the git option --ignore-all-space. > > No functional changes intended. > > Signed-off-by: Daniel Kral > --- > This patch was "[RFC proxmox 2/5] resource-scheduling: introduce generic > cluster usage implementation" in the RFC v1. Sorry for the ambigious > naming with the next patch! > > changes v1 -> v2: > - add more information to the patch message > - split out `NodeStats` and `ResourceStats` to their own modules, which > will also be used in an upcoming patch for the `Usage` implementation > - add deprecation note to pve_static::score_nodes_to_start_service() > - add deprecation attribute to other major pve_static items as well > - impl `Add` and `Sum` for ResourceStats, which will be used for the > resource bundling in pve-rs later > - eagerly implement common traits (especially Clone and Debug) > - add test cases for the > scheduler::Scheduler::score_nodes_to_start_resource() [snip] good to have:=20 #[derive(Clone, Debug)] > +pub struct Scheduler { > + nodes: Vec, > +} > + nit: The implementation of `Scheduler` is totally fine as-is. This is just my two cents, as this was mentioned off-list.=20 I believe that for the implementation of the scheduler working with enum variants and a trait and then exploiting static dispatch is more convenient and easier to maintain, e.g.: pub enum Schedulerr { Topsis(Nodes), BruteForce(Nodes), } =20 pub trait Decide { fn node_imbalance(&self) -> f64; =20 fn node_imbalance_with_migration_candidate(&self, candidate: &Migra= tionCandidate) -> f64; =20 fn score_best_balancing_migration_candidates( &self, candidates: &[MigrationCandidate], limit: usize, ) -> Result, Error>; } =20 impl Decide for Schedulerr> { fn node_imbalance(&self) -> f64 { match self { Self::Topsis(nodes) | Self::BruteForce(nodes) =3D> { calculate_node_imbalance(nodes, |node| node.stats.load(= )) } } } =20 fn node_imbalance_with_migration_candidate(&self, candidate: &Migra= tionCandidate) -> f64 { match self { Self::Topsis(nodes) | Self::BruteForce(nodes) =3D> { calculate_node_imbalance(nodes, |node| { let mut new_stats =3D node.stats; =20 if node.name =3D=3D candidate.migration.source_node= { new_stats.remove_running_resource(&candidate.st= ats); } else if node.name =3D=3D candidate.migration.targ= et_node { new_stats.add_running_resource(&candidate.stats= ); } =20 new_stats.load() }) } } } =20 fn score_best_balancing_migration_candidates( &self, candidates: &[MigrationCandidate], limit: usize, ) -> Result, Error> { match self { Self::Topsis(nodes) =3D> { let len =3D nodes.len(); =20 let matrix =3D candidates .iter() .map(|candidate| { let resource_stats =3D &candidate.stats; let source_node =3D &candidate.migration.source= _node; let target_node =3D &candidate.migration.target= _node; =20 let mut highest_cpu =3D 0.0; let mut squares_cpu =3D 0.0; let mut highest_mem =3D 0.0; let mut squares_mem =3D 0.0; =20 for node in nodes.iter() { let new_stats =3D { let mut new_stats =3D node.stats; =20 if &node.name =3D=3D source_node { new_stats.remove_running_resource(r= esource_stats); } else if &node.name =3D=3D target_node= { new_stats.add_running_resource(reso= urce_stats); } =20 new_stats }; =20 let new_cpu =3D new_stats.cpu_load(); highest_cpu =3D f64::max(highest_cpu, new_c= pu); squares_cpu +=3D new_cpu.powi(2); =20 let new_mem =3D new_stats.mem_load(); highest_mem =3D f64::max(highest_mem, new_m= em); squares_mem +=3D new_mem.powi(2); } =20 // Add 1.0 to avoid boosting tiny differences: = e.g. 0.004 is twice as much as 0.002, but // 1.004 is only slightly more than 1.002. PveTopsisAlternative { average_cpu: 1.0 + (squares_cpu / len as f6= 4).sqrt(), highest_cpu: 1.0 + highest_cpu, average_memory: 1.0 + (squares_mem / len as= f64).sqrt(), highest_memory: 1.0 + highest_mem, } .into() }) .collect::>(); =20 let best_alternatives =3D topsis::rank_alternatives( &topsis::Matrix::new(matrix)?, &PVE_HA_TOPSIS_CRITERIA, )?; =20 Ok(best_alternatives .into_iter() .take(limit) .map(|i| { let imbalance =3D self.node_imbalance_with_migration_candidat= e(&candidates[i]); =20 ScoredMigration::new(candidates[i].clone(), imb= alance) }) .collect()) } Self::BruteForce(_) =3D> { let mut scored_migrations =3D candidates .iter() .map(|candidate| { let imbalance =3D self.node_imbalance_with_migr= ation_candidate(candidate); =20 // NOTE: could avoid clone if Migration had add= itional score field Reverse(ScoredMigration::new(candidate.clone(),= imbalance)) }) .collect::>(); =20 let mut best_migrations =3D Vec::with_capacity(limit); =20 // BinaryHeap::into_iter_sorted() is still in nightly u= nfortunately while best_migrations.len() < limit { match scored_migrations.pop() { Some(Reverse(alternative)) =3D> best_migrations= .push(alternative), None =3D> break, } } =20 Ok(best_migrations) } } } } =20 pub fn score_best_balancing_migration_candidates( scheduler: &impl Decide, candidates: &[MigrationCandidate], limit: usize, ) -> Result, Error> { scheduler.score_best_balancing_migration_candidates(candidates, lim= it) } In a nutshell, this declares what a scheduler ought to be able to do to be = used for scoring (that is, implementing the `Decide` trait); and implements all the functionality for all the variants in one place. =20 Nice side-effects of this design:=20 * one scoring function implements all the variants in one place, which is n= ice, I think * adding/removing a scheduler variant would become more systematic * modifying scheduler variants in terms of how they score or, for example, = how they measure imbalance, would also be more straightforward Again, just my two cents. > +impl Scheduler { > + /// Instantiate scheduler instance from node usages. > + pub fn from_nodes(nodes: I) -> Self > + where > + I: IntoIterator>, > + { > + Self { > + nodes: nodes.into_iter().map(|node| node.into()).collect(), > + } > + } > + > + /// Scores nodes to start a resource with the usage statistics `reso= urce_stats` on. > + /// > + /// The scoring is done as if the resource is already started on eac= h node. This assumes that > + /// the already started resource consumes the maximum amount of each= stat according to its > + /// `resource_stats`. > + /// > + /// Returns a vector of (nodename, score) pairs. Scores are between = 0.0 and 1.0 and a higher > + /// score is better. > + pub fn score_nodes_to_start_resource>( > + &self, > + resource_stats: T, > + ) -> Result, Error> { > + let len =3D self.nodes.len(); > + let resource_stats =3D resource_stats.into(); > + > + let matrix =3D self > + .nodes > + .iter() > + .enumerate() > + .map(|(target_index, _)| { > + // Base values on percentages to allow comparing nodes w= ith different stats. > + let mut highest_cpu =3D 0.0; > + let mut squares_cpu =3D 0.0; > + let mut highest_mem =3D 0.0; > + let mut squares_mem =3D 0.0; > + > + for (index, node) in self.nodes.iter().enumerate() { > + let mut new_stats =3D node.stats; > + > + if index =3D=3D target_index { > + new_stats.add_started_resource(&resource_stats) > + }; > + > + let new_cpu =3D new_stats.cpu_load(); > + highest_cpu =3D f64::max(highest_cpu, new_cpu); > + squares_cpu +=3D new_cpu.powi(2); > + > + let new_mem =3D new_stats.mem_load(); > + highest_mem =3D f64::max(highest_mem, new_mem); > + squares_mem +=3D new_mem.powi(2); > + } > + > + // Add 1.0 to avoid boosting tiny differences: e.g. 0.00= 4 is twice as much as 0.002, but > + // 1.004 is only slightly more than 1.002. > + PveTopsisAlternative { > + average_cpu: 1.0 + (squares_cpu / len as f64).sqrt()= , > + highest_cpu: 1.0 + highest_cpu, > + average_memory: 1.0 + (squares_mem / len as f64).sqr= t(), > + highest_memory: 1.0 + highest_mem, > + } > + .into() > + }) > + .collect::>(); > + > + let scores =3D > + topsis::score_alternatives(&topsis::Matrix::new(matrix)?, &P= VE_HA_TOPSIS_CRITERIA)?; > + > + Ok(scores > + .into_iter() > + .enumerate() > + .map(|(n, score)| (self.nodes[n].name.to_string(), score)) > + .collect()) > + } > } [snip] in the future, the proptest crate could come in handy for such helper functions.=20 in general, I would propose to add proptests in the future. for some examples and inspiration, see [0]. [0] https://lore.proxmox.com/all/20260306082046.34311-1-d.rusovac@proxmox.c= om/T/ > +fn new_homogeneous_cluster_scheduler() -> Scheduler { > + let (maxcpu, maxmem) =3D (16, 64 * (1 << 30)); > + > + let node1 =3D NodeUsage { > + name: String::from("node1"), > + stats: NodeStats { > + cpu: 1.7, > + maxcpu, > + mem: 12334 << 20, > + maxmem, > + }, > + }; > + > + let node2 =3D NodeUsage { > + name: String::from("node2"), > + stats: NodeStats { > + cpu: 15.184, > + maxcpu, > + mem: 529 << 20, > + maxmem, > + }, > + }; > + > + let node3 =3D NodeUsage { > + name: String::from("node3"), > + stats: NodeStats { > + cpu: 5.2, > + maxcpu, > + mem: 9381 << 20, > + maxmem, > + }, > + }; > + > + Scheduler::from_nodes(vec![node1, node2, node3]) > +} > + > +fn new_heterogeneous_cluster_scheduler() -> Scheduler { > + let node1 =3D NodeUsage { > + name: String::from("node1"), > + stats: NodeStats { > + cpu: 1.7, > + maxcpu: 16, > + mem: 12334 << 20, > + maxmem: 128 << 30, > + }, > + }; > + > + let node2 =3D NodeUsage { > + name: String::from("node2"), > + stats: NodeStats { > + cpu: 15.184, > + maxcpu: 32, > + mem: 529 << 20, > + maxmem: 96 << 30, > + }, > + }; > + > + let node3 =3D NodeUsage { > + name: String::from("node3"), > + stats: NodeStats { > + cpu: 5.2, > + maxcpu: 24, > + mem: 9381 << 20, > + maxmem: 64 << 30, > + }, > + }; > + > + Scheduler::from_nodes(vec![node1, node2, node3]) > +} [snip] > +#[test] > +fn test_score_homogeneous_nodes_to_start_resource() -> Result<(), Error>= { > + let scheduler =3D new_homogeneous_cluster_scheduler(); > + > + let heavy_memory_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 1.0, > + mem: 0, > + maxmem: 12 << 30, > + }; > + > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, heavy_memory_resource_s= tats)?, > + vec!["node2", "node3", "node1"] > + ); > + > + let heavy_cpu_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 12.0, > + mem: 0, > + maxmem: 0, > + }; > + > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, heavy_cpu_resource_stat= s)?, > + vec!["node1", "node3", "node2"] > + ); > + > + let unlimited_cpu_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 0.0, > + mem: 0, > + maxmem: 0, > + }; > + > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_= stats)?, > + vec!["node1", "node3", "node2"] > + ); > + nit: confusing variable name > + let unlimited_cpu_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 12.0, > + mem: 0, > + maxmem: 12 << 30, > + }; > + > + assert_eq!( nit: confusing variable name > + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_= stats)?, > + vec!["node2", "node3", "node1"] > + ); > + > + Ok(()) > +} > + > +#[test] > +fn test_score_heterogeneous_nodes_to_start_resource() -> Result<(), Erro= r> { > + let scheduler =3D new_heterogeneous_cluster_scheduler(); > + > + let heavy_memory_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 1.0, > + mem: 0, > + maxmem: 12 << 30, > + }; > + > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, heavy_memory_resource_s= tats)?, > + vec!["node2", "node1", "node3"] > + ); > + > + let heavy_cpu_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 12.0, > + mem: 0, > + maxmem: 0, > + }; > + > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, heavy_cpu_resource_stat= s)?, > + vec!["node3", "node2", "node1"] > + ); > + > + let unlimited_cpu_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 0.0, > + mem: 0, > + maxmem: 0, > + }; > + > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_= stats)?, > + vec!["node1", "node3", "node2"] > + ); > + nit: confusing variable name > + let unlimited_cpu_resource_stats =3D ResourceStats { > + cpu: 0.0, > + maxcpu: 12.0, > + mem: 0, > + maxmem: 12 << 30, > + }; > + nit: confusing variable name > + assert_eq!( > + rank_nodes_to_start_resource(&scheduler, unlimited_cpu_resource_= stats)?, > + vec!["node2", "node1", "node3"] > + ); > + > + Ok(()) > +}