From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id 063DBA841
 for ; Wed, 27 Apr 2022 17:34:30 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id F204828820
 for ; Wed, 27 Apr 2022 17:33:59 +0200 (CEST)
Received: from bastionodiso.odiso.net (bastionodiso.odiso.net [185.151.191.93])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS id 2C8A8287D3
 for ; Wed, 27 Apr 2022 17:33:53 +0200 (CEST)
Received: from kvmformation3.odiso.net (formationkvm3.odiso.net [10.3.94.12])
 by bastionodiso.odiso.net (Postfix) with ESMTP id BCB6E159B9;
 Wed, 27 Apr 2022 17:33:52 +0200 (CEST)
Received: by kvmformation3.odiso.net (Postfix, from userid 0)
 id B5C09F958C; Wed, 27 Apr 2022 17:33:52 +0200 (CEST)
From: Alexandre Derumier 
To: pve-devel@lists.proxmox.com
Date: Wed, 27 Apr 2022 17:33:46 +0200
Message-Id: <20220427153351.1773666-4-aderumier@odiso.com>
X-Mailer: git-send-email 2.30.2
In-Reply-To: <20220427153351.1773666-1-aderumier@odiso.com>
References: <20220427153351.1773666-1-aderumier@odiso.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results: 0
 AWL 0.062 Adjusted score from AWL reputation of From: address
 BAYES_00 -1.9 Bayes spam probability is 0 to 1%
 HEADER_FROM_DIFFERENT_DOMAINS 0.248 From and EnvelopeFrom 2nd level mail domains are different
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 KAM_LAZY_DOMAIN_SECURITY 1 Sending domain does not have any anti-forgery methods
 NO_DNS_FOR_FROM 0.001 Envelope sender has no MX or A DNS records
 PROLO_LEO1 0.1 Meta Catches all Leo drug variations so far
 SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record
 SPF_NONE 0.001 SPF: sender does not publish an SPF Record
Subject: [pve-devel] [PATCH pve-ha-manager 3/8] implement resource aware service recovery
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion
List-Unsubscribe: ,
List-Archive: 
List-Post: 
List-Help: 
List-Subscribe: ,
X-List-Received-Date: Wed, 27 Apr 2022 15:34:30 -0000

Use a new method to find the destination node for service recovery.

First, we order the services by their TOPSIS score, then we try to find
the best target node for each of them.

FILTERING
---------

1) We check if the node is able to start the vm:
   - the host has enough cores
   - the host has enough memory
   - the needed storages are available
   - not yet implemented: more checks could be added (cpu model
     compatibility, bridge availability, anti-affinity, local devices, ...)

2) If the balancing option is enabled, we check the load of the target
   node and skip the node if:
   - the global pressure is already too high (some cores are already at
     more than 100%, so we can't trust the cpu average)
   - a vm on this node already has a high cpu pressure (> 0.75)
   - the target host cpu usage is > 70%
   - the target host mem usage is > 83%, or > 75% if ksm sharing is
     already big (> 20% of the memory in ksm)

   (cpu pressure && ksm are not currently available in the rrd, this
   still needs to be implemented)

ORDERING
--------

1a) If balancing is enabled, we compute the TOPSIS score of the nodes with:
    - biggest hagroup priority
    - lowest cpu pressure
    - lowest target cpu
    - lowest target mem

    Then we return the node with the highest score.

1b) If balancing is disabled, we use the classic ordering:
    - hagroup priority
    - online_node_usage
    - node name

    Then we return the first node of the list.
---
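Note for reviewers: PVE::HA::Balancer::Topsis::score() used below comes
from an earlier patch in this series and is not part of this diff. As a
rough standalone sketch of the TOPSIS ranking idea only (simplified: the
sub name is made up, and the real module returns { score => ... } records
per node rather than plain numbers):

use strict;
use warnings;

# Given per-node criteria values, per-criterion weights and a '+'/'-'
# benefit/cost orientation, return a closeness score per node in [0,1].
sub topsis_score_sketch {
    my ($nodes, $weights, $order) = @_;

    my @criteria = keys %$weights;
    my @names = keys %$nodes;

    # vector-normalize each criterion column, then apply its weight
    my %norm;
    foreach my $crit (@criteria) {
        my $len = 0;
        $len += $nodes->{$_}->{$crit} ** 2 for @names;
        $len = sqrt($len) || 1;
        $norm{$_}->{$crit} = $nodes->{$_}->{$crit} / $len * $weights->{$crit}
            for @names;
    }

    # the ideal best/worst value per criterion depends on whether the
    # criterion is a benefit ('+', bigger is better) or a cost ('-')
    my (%best, %worst);
    foreach my $crit (@criteria) {
        my @vals = sort { $a <=> $b } map { $norm{$_}->{$crit} } @names;
        ($worst{$crit}, $best{$crit}) = $order->{$crit} eq '+'
            ? ($vals[0], $vals[-1]) : ($vals[-1], $vals[0]);
    }

    # score = distance to worst / (distance to best + distance to worst),
    # so nodes close to the per-criterion ideal score near 1
    my %scores;
    foreach my $node (@names) {
        my ($dbest, $dworst) = (0, 0);
        foreach my $crit (@criteria) {
            $dbest  += ($norm{$node}->{$crit} - $best{$crit}) ** 2;
            $dworst += ($norm{$node}->{$crit} - $worst{$crit}) ** 2;
        }
        $scores{$node} = sqrt($dworst) / ((sqrt($dbest) + sqrt($dworst)) || 1);
    }
    return \%scores;
}

This is why $find_target_by_score below can simply sort by score and pick
the first entry.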
 debian/pve-ha-manager.install |   1 +
 src/PVE/HA/Balancer/Makefile  |   2 +-
 src/PVE/HA/Balancer/Nodes.pm  | 217 ++++++++++++++++++++++++++++++++++
 src/PVE/HA/Env.pm             |  11 ++
 src/PVE/HA/Env/PVE2.pm        |  52 ++++++++
 src/PVE/HA/Manager.pm         |  32 +++--
 src/PVE/HA/Sim/TestEnv.pm     |  20 ++++
 7 files changed, 327 insertions(+), 8 deletions(-)
 create mode 100644 src/PVE/HA/Balancer/Nodes.pm

diff --git a/debian/pve-ha-manager.install b/debian/pve-ha-manager.install
index 6297997..e083214 100644
--- a/debian/pve-ha-manager.install
+++ b/debian/pve-ha-manager.install
@@ -22,6 +22,7 @@
 /usr/share/perl5/PVE/HA/Balancer/AHP.pm
 /usr/share/perl5/PVE/HA/Balancer/Topsis.pm
 /usr/share/perl5/PVE/HA/Balancer/Stats.pm
+/usr/share/perl5/PVE/HA/Balancer/Nodes.pm
 /usr/share/perl5/PVE/HA/Config.pm
 /usr/share/perl5/PVE/HA/Config.pm
 /usr/share/perl5/PVE/HA/Env.pm
diff --git a/src/PVE/HA/Balancer/Makefile b/src/PVE/HA/Balancer/Makefile
index 95ff86c..92ab8d3 100644
--- a/src/PVE/HA/Balancer/Makefile
+++ b/src/PVE/HA/Balancer/Makefile
@@ -1,4 +1,4 @@
-SOURCES=Topsis.pm AHP.pm Stats.pm
+SOURCES=Topsis.pm AHP.pm Stats.pm Nodes.pm
 
 .PHONY: install
 install:
diff --git a/src/PVE/HA/Balancer/Nodes.pm b/src/PVE/HA/Balancer/Nodes.pm
new file mode 100644
index 0000000..a06ed62
--- /dev/null
+++ b/src/PVE/HA/Balancer/Nodes.pm
@@ -0,0 +1,217 @@
+package PVE::HA::Balancer::Nodes;
+
+use strict;
+use warnings;
+use PVE::HA::Balancer::Topsis;
+use PVE::HA::Balancer::AHP;
+
+
+my $compute_node_target_cpu_pct = sub {
+    my ($node_stats, $vm_stats) = @_;
+
+    return 0 if $node_stats->{maxcpu} == 0;
+    return ($node_stats->{totalcpu} + $vm_stats->{totalcpu}) / $node_stats->{maxcpu};
+};
+
+my $compute_node_target_mem_pct = sub {
+    my ($node_stats, $vm_stats) = @_;
+
+    return 0 if $node_stats->{maxmem} == 0;
+    return ($node_stats->{mem} + $vm_stats->{mem}) * 100 / $node_stats->{maxmem};
+};
+
+my $add_prio = sub {
+    my ($self, $sd, $nodename, $group_members_prio) = @_;
+
+    my $vm_stats = $sd->{stats};
+    my $node_stats = $self->{online_node_stats}->{$nodename}->{stats};
+
+    my $node = {};
+    $node->{prio} = $group_members_prio->{$nodename};
+    $node->{affinity} = 0; #fixme: vm groups still need to be implemented
+    $node->{online_node_usage} = $self->{online_node_usage}->{$nodename};
+    $node->{name} = $nodename;
+    $node->{cpu_pressure} = 0; #fixme: the rrd graph needs to be streamed first
+    $node->{target_cpu_pct} = &$compute_node_target_cpu_pct($node_stats, $vm_stats);
+    $node->{target_mem_pct} = &$compute_node_target_mem_pct($node_stats, $vm_stats);
+
+    return $node;
+};
+
+my $find_target_by_score = sub {
+    my ($self, $nodes) = @_;
+
+    return if !keys %$nodes;
+
+    my $weights = $self->{balancer}->{topsis}->{nodes}->{weights};
+    my $order = $self->{balancer}->{topsis}->{nodes}->{order};
+    my $scores = PVE::HA::Balancer::Topsis::score($nodes, $weights, $order);
+
+    my @targets = sort {
+        $scores->{$b}->{score} <=> $scores->{$a}->{score}
+    } keys %$scores;
+
+    return $targets[0];
+};
+
+my $find_target_by_prio = sub {
+    my ($self, $nodes) = @_;
+
+    return if !keys %$nodes;
+
+    my @targets = sort {
+        $nodes->{$b}->{prio} <=> $nodes->{$a}->{prio} ||
+        $nodes->{$a}->{online_node_usage} <=> $nodes->{$b}->{online_node_usage} ||
+        $nodes->{$a}->{name} cmp $nodes->{$b}->{name}
+    } keys %$nodes;
+
+    return $targets[0];
+};
+
+my $check_bridge_availability = sub {
+    my ($vmconf, $node) = @_;
+    #fixme
+    return 1;
+};
+
+my $check_cpumodel_compatibility = sub {
+    my ($vmconf, $node) = @_;
+    #fixme
+    return 1;
+};
+
+my $check_target_load = sub {
+    my ($self, $sd, $node) = @_;
+
+    return 1 if !$self->{balancer}->{enabled};
+
+    my $vm_stats = $sd->{stats};
+    my $node_stats = $self->{online_node_stats}->{$node}->{stats};
+
+    my $max_threshold = { cpu => 70, mem => 83, cpu_pressure => 3, vm_pressure => 0.75 };
+
+    # if ksm sharing is already huge (20% of total memory), reduce the mem threshold to 75%
+    $max_threshold->{mem} = 75 if $node_stats->{ksm} > $node_stats->{maxmem} * 0.2;
+
+    my $target_mem_percent = &$compute_node_target_mem_pct($node_stats, $vm_stats);
+    return if $target_mem_percent > $max_threshold->{mem};
+
+    # don't use the node if the global pressure is already too high (some cores
+    # are already at more than 100%, so we can't trust the cpu average)
+    return if $node_stats->{cpu_pressure} > $max_threshold->{cpu_pressure};
+
+    # don't use the node if a vm is already overloaded on it
+    return if $node_stats->{max_vm_pressure} > $max_threshold->{vm_pressure};
+
+    my $target_cpu_percent = &$compute_node_target_cpu_pct($node_stats, $vm_stats);
+    return if $target_cpu_percent > $max_threshold->{cpu};
+
+    return 1;
+};
+
+my $check_hard_constraints = sub {
+    my ($self, $sd, $node, $group_members_prio) = @_;
+
+    my $haenv = $self->{haenv};
+    my $vm_stats = $sd->{stats};
+    my $node_stats = $self->{online_node_stats}->{$node}->{stats};
+    my $vmconf = $sd->{vmconf};
+
+    # the node needs to have a prio (restricted group)
+    return if !defined($group_members_prio->{$node});
+
+    # the vm can't start if the host has fewer cores
+    return if $node_stats->{maxcpu} < $vm_stats->{maxcpu};
+    # the vm can't start if the node doesn't have enough free mem for the vm max mem
+    return if ($node_stats->{maxmem} - $node_stats->{mem}) < $vm_stats->{maxmem};
+
+    return if !$haenv->check_storage_availability($vmconf, $sd->{type}, $node, $self->{storecfg});
+
+    return if !&$check_bridge_availability($vmconf, $node);
+
+    return if !&$check_cpumodel_compatibility($vmconf, $node);
+
+    return 1;
+};
+
+sub find_target {
+    my ($self, $cd, $sd, $group_members_prio) = @_;
+
+    my $online_nodes = $self->{online_node_stats};
+
+    my $target_nodes = {};
+
+    foreach my $node (keys %$online_nodes) {
+
+        #### FILTERING NODES WITH HARD CONSTRAINTS (the vm can't be started)
+        next if !&$check_hard_constraints($self, $sd, $node, $group_members_prio);
+
+        ### FILTERING overloaded nodes
+        next if !&$check_target_load($self, $sd, $node);
+
+        #### compute the different prios
+        $target_nodes->{$node} = &$add_prio($self, $sd, $node, $group_members_prio);
+    }
+
+    # if resource awareness is enabled, order by score
+    if ($self->{balancer}->{enabled}) {
+        return &$find_target_by_score($self, $target_nodes);
+    } else {
+        return &$find_target_by_prio($self, $target_nodes);
+    }
+}
+
+
+sub compute_ahp_weights {
+    my ($self) = @_;
+
+    # "+" -> benefit -> bigger is better
+    # "-" -> cost    -> lower is better
+    my $bestorder = {
+        prio => "+",
+        affinity => "-",
+        cpu_pressure => "-",
+        target_cpu_pct => "-",
+        target_mem_pct => "-",
+    };
+
+    # Saaty scale for the pairwise comparisons:
+    # 1 : equal importance (two activities contribute equally to the objective)
+    # 3 : moderate importance of one factor over another (experience and
+    #     judgment slightly favor one activity over another)
+    # 5 : strong or essential importance (experience and judgment strongly
+    #     favor one activity over another)
+    # 7 : very strong importance (an activity is strongly favored and its
+    #     dominance is demonstrated in practice)
+    # 9 : extreme importance (the evidence favoring one activity over another
+    #     is of the highest possible order of affirmation)
+
+    # same weight for cpu && mem, prefer well-balanced nodes
+    my $preferences = {
+        prio => {
+            prio => 1,
+            affinity => 3,
+            cpu_pressure => 5,
+            target_cpu_pct => 9,
+            target_mem_pct => 9,
+        },
+        affinity => {
+            affinity => 1,
+            cpu_pressure => 3,
+            target_cpu_pct => 7,
+            target_mem_pct => 7,
+        },
+        cpu_pressure => {
+            cpu_pressure => 1,
+            target_cpu_pct => 5,
+            target_mem_pct => 7,
+        },
+        target_cpu_pct => {
+            target_cpu_pct => 1,
+            target_mem_pct => 1,
+        },
+        target_mem_pct => {
+            target_mem_pct => 1,
+        },
+    };
+
+    my $weights = PVE::HA::Balancer::AHP::compute_weights($preferences);
+
+    $self->{balancer}->{topsis}->{nodes}->{weights} = $weights;
+    $self->{balancer}->{topsis}->{nodes}->{order} = $bestorder;
+}
+
+
+1;
diff --git a/src/PVE/HA/Env.pm b/src/PVE/HA/Env.pm
index 2ecc186..757c5e0 100644
--- a/src/PVE/HA/Env.pm
+++ b/src/PVE/HA/Env.pm
@@ -297,4 +297,15 @@ sub read_vm_ct_config {
     }
 }
 
+sub read_storecfg {
+    my ($self) = @_;
+
+    return $self->{plug}->read_storecfg();
+}
+
+sub check_storage_availability {
+    my ($self, $vmconf, $type, $node, $storecfg) = @_;
+
+    return $self->{plug}->check_storage_availability($vmconf, $type, $node, $storecfg);
+}
 1;
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index 917aa62..ee97559 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -16,6 +16,7 @@ use PVE::API2Tools;
 use PVE::QemuConfig;
 use PVE::QemuServer;
 use PVE::LXC::Config;
+use PVE::Storage;
 use RRDs;
 
 use PVE::HA::Tools ':exit_codes';
@@ -608,4 +609,55 @@ sub read_ct_config {
     return $finalconf;
 }
 
+sub read_storecfg {
+    my ($self) = @_;
+
+    return PVE::Storage::config();
+}
+
+sub check_storage_availability {
+    my ($self, $vmconf, $type, $node, $storecfg) = @_;
+
+    if ($type eq 'vm') {
+        eval { PVE::QemuServer::check_storage_availability($storecfg, $vmconf, $node) };
+        return if $@;
+    } elsif ($type eq 'ct') {
+        eval { check_lxc_storage_availability($storecfg, $vmconf, $node) };
+        return if $@;
+    }
+    return 1;
+}
+
+
+# copy/paste from PVE::LXC::Migrate; should be added as
+# PVE::LXC::check_storage_availability, like in qemu-server
+sub check_lxc_storage_availability {
+    my ($storecfg, $conf, $node) = @_;
+
+    PVE::LXC::Config->foreach_volume_full($conf, { include_unused => 1 }, sub {
+        my ($ms, $mountpoint) = @_;
+
+        my $volid = $mountpoint->{volume};
+        my $type = $mountpoint->{type};
+
+        # skip dev/bind mps when shared
+        if ($type ne 'volume') {
+            if ($mountpoint->{shared}) {
+                return;
+            } else {
+                die "cannot migrate local $type mount point '$ms'\n";
+            }
+        }
+
+        my ($storage, $volname) = $volid ? PVE::Storage::parse_volume_id($volid, 1) : ();
+        die "can't determine assigned storage for mount point '$ms'\n" if !$storage;
+
+        # check if the storage is available on both nodes
+        my $scfg = PVE::Storage::storage_check_enabled($storecfg, $storage);
+        PVE::Storage::storage_check_enabled($storecfg, $storage, $node);
+
+        die "content type 'rootdir' is not available on storage '$storage'\n"
+            if !$scfg->{content}->{rootdir};
+    });
+}
 1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 68b2872..e021d60 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -8,6 +8,7 @@ use PVE::Tools;
 use PVE::HA::Tools ':exit_codes';
 use PVE::HA::NodeStatus;
 use PVE::HA::Balancer::Stats;
+use PVE::HA::Balancer::Nodes;
 
 sub new {
     my ($this, $haenv) = @_;
@@ -28,6 +29,7 @@ sub new {
     $self->{ms} = { master_node => $haenv->nodename() };
 
     PVE::HA::Balancer::Stats::compute_ahp_recovery_weights($self);
+    PVE::HA::Balancer::Nodes::compute_ahp_weights($self);
 
     return $self;
 }
@@ -399,8 +401,14 @@ sub manage {
 
     $self->recompute_online_node_usage();
     PVE::HA::Balancer::Stats::recompute_node_service_stats($self, $ss, $sc);
+    $self->{storecfg} = $haenv->read_storecfg();
+
+    foreach my $sid (
+        sort {
+            $ss->{$b}->{stats}->{recovery_score} <=> $ss->{$a}->{stats}->{recovery_score} ||
+            $ss->{$a}->{type} cmp $ss->{$b}->{type}
+        } keys %$ss
+    ) {
-    foreach my $sid (sort keys %$ss) {
         my $sd = $ss->{$sid};
         my $cd = $sc->{$sid} || { state => 'disabled' };
@@ -816,12 +824,7 @@ sub next_state_recovery {
 
     $self->recompute_online_node_usage(); # we want the most current node state
 
-    my $recovery_node = select_service_node(
-        $self->{groups},
-        $self->{online_node_usage},
-        $cd,
-        $sd->{node},
-    );
+    my $recovery_node = $self->find_node_target($cd, $sd);
 
     if ($recovery_node) {
         my $msg = "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'";
@@ -836,6 +839,11 @@ sub next_state_recovery {
         $haenv->steal_service($sid, $sd->{node}, $recovery_node);
         $self->{online_node_usage}->{$recovery_node}++;
 
+        # add the vm cpu/mem to the current node stats (this is an estimation
+        # based on the last 20min of vm stats)
+        my $node_stats = $self->{online_node_stats}->{$recovery_node}->{stats};
+        $node_stats->{totalcpu} += $sd->{stats}->{totalcpu};
+        $node_stats->{mem} += $sd->{stats}->{mem};
+
         # NOTE: $sd *is normally read-only*, fencing is the exception
         $cd->{node} = $sd->{node} = $recovery_node;
 
         my $new_state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
@@ -853,4 +861,14 @@ sub next_state_recovery {
     }
 }
 
+sub find_node_target {
+    my ($self, $cd, $sd) = @_;
+
+    my $online_nodes = $self->{online_node_stats};
+    my $groups = $self->{groups};
+    my $hagroup = get_service_group($groups, $online_nodes, $cd);
+    my ($pri_groups, $group_members_prio) = get_node_priority_groups($hagroup, $online_nodes);
+
+    return PVE::HA::Balancer::Nodes::find_target($self, $cd, $sd, $group_members_prio);
+}
+
 1;
diff --git a/src/PVE/HA/Sim/TestEnv.pm b/src/PVE/HA/Sim/TestEnv.pm
index ee261ef..8c86c84 100644
--- a/src/PVE/HA/Sim/TestEnv.pm
+++ b/src/PVE/HA/Sim/TestEnv.pm
@@ -164,4 +164,24 @@ sub read_ct_config {
     return $self->{hardware}->{vm_config}->{$vmid};
 }
 
+sub read_storecfg {
+    my ($self) = @_;
+
+    return $self->{hardware}->{storecfg};
+}
+
+sub check_storage_availability {
+    my ($self, $vmconf, $type, $node, $storecfg) = @_;
+
+    return 1 if !$vmconf;
+
+    my $vm_storage = $vmconf->{storage};
+    return 1 if !defined($vm_storage);
+
+    foreach my $storeid (keys %$storecfg) {
+        next if !defined($storecfg->{$storeid}->{nodes}->{$node});
+        return 1 if $vm_storage eq $storeid;
+    }
+
+    # the vm storage is not available on this node
+    return;
+}
 1;
\ No newline at end of file
-- 
2.30.2
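P.S. for reviewers: the $preferences matrix in compute_ahp_weights() only
fills the upper triangle, and PVE::HA::Balancer::AHP::compute_weights()
(from an earlier patch in this series, not shown here) turns it into
normalized per-criterion weights. As a standalone sketch of one common way
to do that, the geometric-mean approximation of the principal eigenvector
(the sub name is made up and the real module may differ):

use strict;
use warnings;

sub ahp_weights_sketch {
    my ($preferences) = @_;

    my @criteria = keys %$preferences;

    # complete the pairwise comparison matrix: the diagonal is given as 1
    # and missing cells are the reciprocals of their mirrored entries
    my %matrix;
    foreach my $row (@criteria) {
        foreach my $col (@criteria) {
            $matrix{$row}{$col} = $preferences->{$row}->{$col}
                // (1 / $preferences->{$col}->{$row});
        }
    }

    # the geometric mean of each row approximates the principal eigenvector
    my %gm;
    my $total = 0;
    foreach my $row (@criteria) {
        my $prod = 1;
        $prod *= $matrix{$row}{$_} for @criteria;
        $gm{$row} = $prod ** (1 / scalar(@criteria));
        $total += $gm{$row};
    }

    # normalize so that the weights sum to 1
    return { map { $_ => $gm{$_} / $total } @criteria };
}

With the matrix above this gives prio roughly half of the total weight, so
the hagroup priority still dominates the load criteria in the TOPSIS
ranking.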