From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id DD6C8844AB for ; Mon, 13 Dec 2021 08:43:29 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id D96C013E56 for ; Mon, 13 Dec 2021 08:43:29 +0100 (CET) Received: from bastionodiso.odiso.net (bastionodiso.odiso.net [IPv6:2a0a:1580:2000::2d]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS id C011613E28 for ; Mon, 13 Dec 2021 08:43:24 +0100 (CET) Received: from kvmformation3.odiso.net (formationkvm3.odiso.net [10.3.94.12]) by bastionodiso.odiso.net (Postfix) with ESMTP id 5FE682DAEF; Mon, 13 Dec 2021 08:43:17 +0100 (CET) Received: by kvmformation3.odiso.net (Postfix, from userid 0) id 49962153919; Mon, 13 Dec 2021 08:43:17 +0100 (CET) From: Alexandre Derumier To: pve-devel@lists.proxmox.com Date: Mon, 13 Dec 2021 08:43:14 +0100 Message-Id: <20211213074316.2565139-2-aderumier@odiso.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20211213074316.2565139-1-aderumier@odiso.com> References: <20211213074316.2565139-1-aderumier@odiso.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL 0.025 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% HEADER_FROM_DIFFERENT_DOMAINS 0.249 From and EnvelopeFrom 2nd level mail domains are different KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment KAM_LAZY_DOMAIN_SECURITY 1 Sending domain does not have any anti-forgery methods NO_DNS_FOR_FROM 0.001 
Envelope sender has no MX or A DNS records SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_NONE 0.001 SPF: sender does not publish an SPF Record
Subject: [pve-devel] [PATCH pve-ha-manager 1/3] add ressource awareness manager
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Mon, 13 Dec 2021 07:43:29 -0000

---
 src/PVE/HA/Env.pm         |  28 ++++
 src/PVE/HA/Env/PVE2.pm    | 112 ++++++++++++++
 src/PVE/HA/Manager.pm     | 307 ++++++++++++++++++++++++++++++++++++--
 src/PVE/HA/Sim/TestEnv.pm |  25 ++++
 4 files changed, 465 insertions(+), 7 deletions(-)

diff --git a/src/PVE/HA/Env.pm b/src/PVE/HA/Env.pm
index ac569a9..73b6407 100644
--- a/src/PVE/HA/Env.pm
+++ b/src/PVE/HA/Env.pm
@@ -269,4 +269,32 @@ sub get_ha_settings {
     return $self->{plug}->get_ha_settings();
 }
 
+# Delegates to the environment plugin: resource statistics of $node.
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    return $self->{plug}->get_node_rrd_stats($node);
+}
+
+# Delegates to the environment plugin: $percentile-th percentile of the usage of guest $vmid.
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    return $self->{plug}->get_vm_rrd_stats($vmid, $percentile);
+}
+
+# Delegates to the environment plugin: configuration of the VM $vmid.
+sub read_vm_config {
+    my ($self, $vmid) = @_;
+
+    return $self->{plug}->read_vm_config($vmid);
+}
+
+# Delegates to the environment plugin: configuration of the container $vmid.
+sub read_ct_config {
+    my ($self, $vmid) = @_;
+
+    return $self->{plug}->read_ct_config($vmid);
+}
+
 1;
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index 5e0a683..2e1585c 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -9,9 +9,14 @@ use IO::Socket::UNIX;
 use PVE::SafeSyslog;
 use PVE::Tools;
 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
+use PVE::Cluster;
 use PVE::DataCenterConfig;
 use PVE::INotify;
 use PVE::RPCEnvironment;
+use PVE::API2Tools;
+use PVE::QemuConfig;
+use PVE::LXC::Config;
+use RRDs;
 
 use PVE::HA::Tools ':exit_codes';
 use PVE::HA::Env;
@@ -459,4 +464,111 @@ sub get_ha_settings {
     return $datacenterconfig->{ha};
 }
 
+# Returns the current resource statistics of $node, extracted from the
+# cluster-wide RRD dump by PVE::API2Tools::extract_node_stats().
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    my $rrd = PVE::Cluster::rrd_dump();
+    my $members = PVE::Cluster::get_members();
+
+    my $stats = PVE::API2Tools::extract_node_stats($node, $members, $rrd);
+
+    return $stats;
+}
+
+# Returns the $percentile-th percentile of the resource usage of guest $vmid
+# over the last 20 minutes, read from its RRD file with a 60s resolution.
+#
+# Keys of the returned hash:
+#   cpu      - cpu usage ratio (assumes the RRD stores it relative to all
+#              cores of the guest, like for nodes - TODO confirm)
+#   maxcpu   - number of cores
+#   mem      - used memory
+#   maxmem   - configured memory
+#   totalcpu - cpu * maxcpu * 100
+# Zero samples are ignored. All values are 0 if there is no usable sample or
+# if the RRD data cannot be fetched.
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    my $rrd = "/var/lib/rrdcached/db/pve2-vm/$vmid";
+    my $cf = "AVERAGE";
+
+    my $reso = 60;
+    my $ctime = $reso * int(time() / $reso);
+
+    # last 20 minutes
+    my $req_start = $ctime - $reso * 20;
+    my $req_end = $ctime - $reso * 1;
+
+    my @args = (
+        "-s" => $req_start,
+        "-e" => $req_end,
+        "-r" => $reso,
+    );
+
+    my $socket = "/var/run/rrdcached.sock";
+    push @args, "--daemon" => "unix:$socket" if -S $socket;
+
+    my ($start, $step, $names, $data) = RRDs::fetch($rrd, $cf, @args);
+    if (my $err = RRDs::error()) {
+        # best effort: continue as if there were no samples
+        warn "unable to fetch rrd stats of guest '$vmid' - $err\n";
+        $data = [];
+    }
+
+    my @cpu = ();
+    my @mem = ();
+    my @maxmem = ();
+    my @maxcpu = ();
+
+    foreach my $rec (@$data) {
+        my ($maxcpu, $cpu, $maxmem, $mem) = map { $_ || 0 } @$rec[0 .. 3];
+
+        # skip zero values, e.g. if the vm is down
+        # cpu stays unscaled here: it is multiplied by maxcpu only once, for
+        # totalcpu below
+        push @cpu, $cpu if $cpu > 0;
+        push @mem, $mem if $mem > 0;
+        push @maxcpu, $maxcpu if $maxcpu > 0;
+        push @maxmem, $maxmem if $maxmem > 0;
+    }
+
+    my $stats = {};
+    $stats->{cpu} = percentile($percentile, \@cpu) || 0;
+    $stats->{mem} = percentile($percentile, \@mem) || 0;
+    $stats->{maxmem} = percentile($percentile, \@maxmem) || 0;
+    $stats->{maxcpu} = percentile($percentile, \@maxcpu) || 0;
+    $stats->{totalcpu} = $stats->{cpu} * $stats->{maxcpu} * 100;
+
+    return $stats;
+}
+
+# Returns the value at the $p-th percentile (0..100) of the numbers in $aref,
+# or nothing (undef in scalar context) if $aref is empty.
+sub percentile {
+    my ($p, $aref) = @_;
+
+    return if !@$aref;
+
+    my $index = int($p * $#{$aref} / 100);
+    # sort numerically, a plain sort would compare the values as strings
+    return (sort { $a <=> $b } @$aref)[$index];
+}
+
+# Returns the configuration of the VM $vmid.
+sub read_vm_config {
+    my ($self, $vmid) = @_;
+
+    return PVE::QemuConfig->load_config($vmid);
+}
+
+# Returns the configuration of the container $vmid.
+sub read_ct_config {
+    my ($self, $vmid) = @_;
+
+    return PVE::LXC::Config->load_config($vmid);
+}
+
 1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 1c66b43..ae5fbcb 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -1,8 +1,12 @@
 package PVE::HA::Manager;
 
 use strict;
 use warnings;
 use Digest::MD5 qw(md5_base64);
+use RRDs;
+use POSIX qw/ceil/;
+use PVE::API2Tools;
+use PVE::Storage;
 
 use PVE::Tools;
 use PVE::HA::Tools ':exit_codes';
@@ -394,8 +398,21 @@ sub manage {
         my $repeat = 0;
 
         $self->recompute_online_node_usage();
+        $self->recompute_online_node_stats();
 
-        foreach my $sid (sort keys %$ss) {
+        $self->get_service_stats($ss);
+
+        # order services by size: bigger mem first, then bigger cpu; the sid is
+        # the last criterion, so that the order stays deterministic
+        # could be improved with the bubblesearch heuristic:
+        # https://www.cs.tufts.edu/~nr/cs257/archive/michael-mitzenmacher/bubblesearch.pdf
+        my @sids = sort {
+            $ss->{$b}->{stats}->{memg} <=> $ss->{$a}->{stats}->{memg}
+                || $ss->{$b}->{stats}->{totalcpuround} <=> $ss->{$a}->{stats}->{totalcpuround}
+                || $a cmp $b
+        } keys %$ss;
+
+        foreach my $sid (@sids) {
             my $sd = $ss->{$sid};
             my $cd = $sc->{$sid} || { state => 'disabled' };
 
@@ -802,12 +819,18 @@ sub next_state_recovery {
 
     $self->recompute_online_node_usage(); # we want the most current node state
 
-    my $recovery_node = select_service_node(
-        $self->{groups},
-        $self->{online_node_usage},
-        $cd,
-        $sd->{node},
-    );
+    my $storecfg = PVE::Storage::config();
+    my $recovery_node = find_bestfit_node_target(
+        $haenv,
+        $sid,
+        $cd,
+        $sd->{node},
+        $sd->{stats},
+        $self->{online_node_usage},
+        $self->{online_node_stats},
+        $self->{groups},
+        $storecfg,
+    );
 
     if ($recovery_node) {
         my $msg = "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'";
@@ -822,6 +845,14 @@ sub next_state_recovery {
         $haenv->steal_service($sid, $sd->{node}, $recovery_node);
         $self->{online_node_usage}->{$recovery_node}++;
 
+        # add the cpu/mem of the service to the stats of its new node, so that
+        # the next placement sees it (an estimation based on the service stats)
+        my $node_stats = $self->{online_node_stats}->{$recovery_node}->{stats};
+        $node_stats->{totalcpu} += $sd->{stats}->{totalcpu};
+        $node_stats->{mem} += $sd->{stats}->{mem};
+        $node_stats->{totalfreecpu} = (100 * $node_stats->{maxcpu}) - $node_stats->{totalcpu};
+        $node_stats->{freemem} = $node_stats->{maxmem} - $node_stats->{mem};
+
         # NOTE: $sd *is normally read-only*, fencing is the exception
         $cd->{node} = $sd->{node} = $recovery_node;
         my $new_state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
@@ -839,4 +870,266 @@ sub next_state_recovery {
     }
 }
 
+# Returns the dot product of the equally sized vectors $vec_a and $vec_b.
+#
+# The optional $mode selects a normalization of the result:
+#   'normR' - divide by the product of the norms of both vectors
+#   'normC' - divide by the squared norm of $vec_b
+# Normalizing by a zero norm returns 0 instead of dividing by zero.
+sub dotprod {
+    my ($vec_a, $vec_b, $mode) = @_;
+
+    die "they must have the same size\n" unless @$vec_a == @$vec_b;
+    $mode = "" if !$mode;
+
+    my $sum = 0;
+    my $norm_a = 0;
+    my $norm_b = 0;
+
+    for my $i (0 .. $#{$vec_a}) {
+        # not named $a/$b, as those are the special variables of sort
+        my ($va, $vb) = ($vec_a->[$i], $vec_b->[$i]);
+
+        $sum += $va * $vb;
+        $norm_a += $va * $va;
+        $norm_b += $vb * $vb;
+    }
+
+    if ($mode eq 'normR') {
+        my $norm = sqrt($norm_a) * sqrt($norm_b);
+        return $norm ? $sum / $norm : 0;
+    } elsif ($mode eq 'normC') {
+        return $norm_b ? $sum / $norm_b : 0;
+    }
+
+    return $sum;
+}
+
+# Returns the euclidean distance between the equally sized vectors $vec_a
+# and $vec_b.
+sub euclidean_distance {
+    my ($vec_a, $vec_b) = @_;
+
+    die "they must have the same size\n" unless @$vec_a == @$vec_b;
+
+    my $sum = 0;
+    for my $i (0 .. $#{$vec_a}) {
+        $sum += ($vec_b->[$i] - $vec_a->[$i])**2;
+    }
+
+    return sqrt($sum);
+}
+
+# Selects the node the service $sid should be recovered to.
+#
+# Nodes failing check_hard_constraints() are skipped. The remaining ones are
+# ordered by HA group priority (highest first), number of unmet soft
+# constraints, weight (best fit: the smallest distance between the usage of
+# the service and the free resources of the node first), number of services
+# on the node and finally the node name.
+#
+# $nodename, the current node of the service, is not used for the selection.
+# Returns the name of the best node, or undef if there is no suitable node.
+sub find_bestfit_node_target {
+    my ($haenv, $sid, $cd, $nodename, $vm_stats, $online_node_usage, $online_nodes, $groups, $storecfg) = @_;
+
+    my (undef, $vmid) = split(/:/, $sid);
+
+    my $hagroup = get_service_group($groups, $online_nodes, $cd);
+    my ($pri_groups, $group_members_prio) = get_node_priority_groups($hagroup, $online_nodes);
+
+    my $target_nodes = {};
+    foreach my $node (keys %$online_nodes) {
+        my $node_stats = $online_nodes->{$node}->{stats};
+
+        # hard constraints: the service cannot be started on this node
+        next if !check_hard_constraints(
+            $haenv, $vmid, $cd, $node, $node_stats, $vm_stats, $storecfg, $group_members_prio);
+
+        # add the group priority and the euclidean distance weight
+        $target_nodes->{$node} = add_node_prio(
+            $node, 'distance', $node_stats, $vm_stats, $group_members_prio, $online_node_usage);
+    }
+
+    my @target_array = sort {
+        $target_nodes->{$b}->{prio} <=> $target_nodes->{$a}->{prio}
+            || $target_nodes->{$a}->{soft_constraint_prio} <=> $target_nodes->{$b}->{soft_constraint_prio}
+            || $target_nodes->{$a}->{weight} <=> $target_nodes->{$b}->{weight}
+            || $target_nodes->{$a}->{online_node_usage} <=> $target_nodes->{$b}->{online_node_usage}
+            || $target_nodes->{$a}->{name} cmp $target_nodes->{$b}->{name}
+    } keys %$target_nodes;
+
+    return $target_array[0];
+}
+
+# Returns 1 if the service can be started on $node, nothing otherwise.
+#
+# $node has to be a member of the (restricted) HA group, needs at least as
+# many cores as the service and enough free memory for its maximal memory,
+# the measured usage of the service has to fit into 95% of the free memory
+# and cpu of the node and, for VMs, all used storages must be available.
+sub check_hard_constraints {
+    my ($haenv, $vmid, $cd, $node, $node_stats, $vm_stats, $storecfg, $group_members_prio) = @_;
+
+    # the node needs to have a priority (restricted group)
+    return if !defined($group_members_prio->{$node});
+
+    # the vm can't start if the host has less cores
+    return if $node_stats->{maxcpu} < $vm_stats->{maxcpu};
+    # the vm can't start if the node doesn't have enough free memory for its max memory
+    return if $node_stats->{freemem} < $vm_stats->{maxmem};
+
+    # max 95% cpu/ram
+    my $mem_threshold = 0.95;
+    my $cpu_threshold = 0.95;
+
+    # check if the target node has enough memory resources under the threshold
+    return if $node_stats->{freemem} * $mem_threshold < $vm_stats->{mem};
+
+    # check if the target node has enough cpu resources under the threshold
+    return if $node_stats->{totalfreecpu} * $cpu_threshold < $vm_stats->{totalcpu};
+
+    # check storage availability
+    if ($cd->{type} eq 'vm') {
+        # best effort: a config which cannot be read is not checked
+        my $conf = eval { $haenv->read_vm_config($vmid) };
+        if (!$@) {
+            eval {
+                # this module does not load PVE::QemuServer anywhere else
+                require PVE::QemuServer;
+                PVE::QemuServer::check_storage_availability($storecfg, $conf, $node);
+            };
+            return if $@;
+        }
+    }
+    # fixme: check storage availability for containers too
+    # fixme: check bridge availability
+    # fixme: vm: add a check for cpumodel compatibility ?
+
+    return 1;
+}
+
+# Returns the number of soft constraints which the node does not meet for the
+# service, i.e. how often the usage of the service does not fit into 80% of
+# the free memory or cpu of the node.
+sub compute_soft_constraints {
+    my ($node_stats, $vm_stats) = @_;
+
+    # try to reach 80% max cpu/ram
+    my $mem_threshold = 0.8;
+    my $cpu_threshold = 0.8;
+
+    my $count = 0;
+    # check if the target node has enough memory resources under the threshold
+    $count++ if $node_stats->{freemem} * $mem_threshold < $vm_stats->{mem};
+
+    # check if the target node has enough cpu resources under the threshold
+    $count++ if $node_stats->{totalfreecpu} * $cpu_threshold < $vm_stats->{totalcpu};
+
+    # fixme: add antiaffinity
+
+    return $count;
+}
+
+# Returns the hash with the sort criteria of node $nodename, as used by
+# find_bestfit_node_target(): weight, soft_constraint_prio, prio,
+# online_node_usage and name.
+#
+# $method selects how the weight is computed from the (cpu, mem) vectors of
+# the service usage and of the free node resources: 'distance' (euclidean
+# distance) or 'dotprod' (dot product); any other value gives a weight of 0.
+sub add_node_prio {
+    my ($nodename, $method, $node_stats, $vm_stats, $group_members_prio, $online_node_usage) = @_;
+
+    # rounded values to compute vectors (cpu 0-100, mem 0G-->XG)
+    my $vm_totalcpu = ceil($vm_stats->{totalcpu});
+    my $vm_mem = ceil($vm_stats->{mem} / 1024 / 1024 / 1024);
+    my $node_freecpu = ceil($node_stats->{totalfreecpu});
+    my $node_freemem = ceil($node_stats->{freemem} / 1024 / 1024 / 1024);
+
+    my @vec_vm = ($vm_totalcpu, $vm_mem); #? add network usage dimension ?
+    my @vec_node = ($node_freecpu, $node_freemem); #? add network usage dimension ?
+
+    my $weight = 0;
+    if ($method eq 'distance') {
+        $weight = euclidean_distance(\@vec_vm, \@vec_node);
+    } elsif ($method eq 'dotprod') {
+        $weight = dotprod(\@vec_vm, \@vec_node);
+    }
+
+    return {
+        weight => $weight,
+        soft_constraint_prio => compute_soft_constraints($node_stats, $vm_stats),
+        prio => $group_members_prio->{$nodename},
+        online_node_usage => $online_node_usage->{$nodename},
+        name => $nodename,
+    };
+}
+
+# Stores the resource usage statistics of every service in $ss->{$sid}->{stats},
+# plus the {type} and {name} (the vmid) parsed from the sid.
+#
+# The statistics are only fetched for services in 'recovery' state, the only
+# state using them for now; all other services get zeroed statistics.
+sub get_service_stats {
+    my ($self, $ss) = @_;
+
+    foreach my $sid (sort keys %$ss) {
+        my $sd = $ss->{$sid};
+
+        if ($sid =~ m/^(vm|ct|fa):(\d+)$/) {
+            $sd->{type} = $1;
+            $sd->{name} = $2;
+        }
+
+        my $stats = {
+            cpu => 0,
+            maxcpu => 0,
+            mem => 0,
+            maxmem => 0,
+            totalcpu => 0,
+        };
+
+        # avoid to compute all stats, as currently we only support recovery
+        if ($sd->{state} eq 'recovery' && defined($sd->{name})) {
+            # 95th percentile of the recent usage of the guest
+            $stats = $self->{haenv}->get_vm_rrd_stats($sd->{name}, 95);
+        }
+        # fixme: windows vm fill memory with zero at boot, so mem = maxmem
+
+        # rounded values for ordering
+        $stats->{totalcpuround} = ceil($stats->{cpu} * 100 * $stats->{maxcpu});
+        $stats->{memg} = ceil($stats->{mem} / 1024 / 1024 / 1024);
+
+        $sd->{stats} = $stats;
+    }
+}
+
+# Rebuilds $self->{online_node_stats} with the statistics of all online nodes.
+# Missing values default to 0, the following values are derived:
+#   totalcpu     - cpu * 100 * maxcpu
+#   totalfreecpu - 100 * maxcpu - totalcpu
+#   freemem      - maxmem - mem
+sub recompute_online_node_stats {
+    my ($self) = @_;
+
+    my $online_node_stats = {};
+    my $online_nodes = $self->{ns}->list_online_nodes();
+
+    foreach my $node (@$online_nodes) {
+        my $stats = $self->{haenv}->get_node_rrd_stats($node);
+        $stats->{cpu} = 0 if !defined($stats->{cpu});
+        $stats->{maxcpu} = 0 if !defined($stats->{maxcpu});
+        $stats->{mem} = 0 if !defined($stats->{mem});
+        $stats->{maxmem} = 0 if !defined($stats->{maxmem});
+        # how to handle different cpu model power ? bogomips ?
+        $stats->{totalcpu} = $stats->{cpu} * 100 * $stats->{maxcpu};
+        $stats->{totalfreecpu} = (100 * $stats->{maxcpu}) - $stats->{totalcpu};
+        $stats->{freemem} = $stats->{maxmem} - $stats->{mem};
+        $online_node_stats->{$node}->{stats} = $stats;
+    }
+
+    $self->{online_node_stats} = $online_node_stats;
+}
+
 1;
diff --git a/src/PVE/HA/Sim/TestEnv.pm b/src/PVE/HA/Sim/TestEnv.pm
index 6718d8c..08f27c7 100644
--- a/src/PVE/HA/Sim/TestEnv.pm
+++ b/src/PVE/HA/Sim/TestEnv.pm
@@ -118,4 +118,29 @@ sub get_max_workers {
     return 0;
 }
 
+# The test environment has no RRD data, all node statistics are zero.
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    return {
+        cpu => 0,
+        maxcpu => 0,
+        mem => 0,
+        maxmem => 0,
+    };
+}
+
+# The test environment has no RRD data, all guest statistics are zero.
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    return {
+        cpu => 0,
+        mem => 0,
+        maxmem => 0,
+        maxcpu => 0,
+        totalcpu => 0,
+    };
+}
+
 1;
-- 
2.30.2