From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <root@kvmformation3.odiso.net>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id E2FE9A83F
 for <pve-devel@lists.proxmox.com>; Wed, 27 Apr 2022 17:34:29 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id D08DE2881F
 for <pve-devel@lists.proxmox.com>; Wed, 27 Apr 2022 17:33:59 +0200 (CEST)
Received: from bastionodiso.odiso.net (bastionodiso.odiso.net
 [IPv6:2a0a:1580:2000::2d])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS id 1BF7B287D2
 for <pve-devel@lists.proxmox.com>; Wed, 27 Apr 2022 17:33:53 +0200 (CEST)
Received: from kvmformation3.odiso.net (formationkvm3.odiso.net [10.3.94.12])
 by bastionodiso.odiso.net (Postfix) with ESMTP id BCAF8159B8;
 Wed, 27 Apr 2022 17:33:52 +0200 (CEST)
Received: by kvmformation3.odiso.net (Postfix, from userid 0)
 id B0462F94BC; Wed, 27 Apr 2022 17:33:52 +0200 (CEST)
From: Alexandre Derumier <aderumier@odiso.com>
To: pve-devel@lists.proxmox.com
Date: Wed, 27 Apr 2022 17:33:45 +0200
Message-Id: <20220427153351.1773666-3-aderumier@odiso.com>
X-Mailer: git-send-email 2.30.2
In-Reply-To: <20220427153351.1773666-1-aderumier@odiso.com>
References: <20220427153351.1773666-1-aderumier@odiso.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
 AWL -0.003 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 HEADER_FROM_DIFFERENT_DOMAINS 0.248 From and EnvelopeFrom 2nd level mail
 domains are different
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 KAM_LAZY_DOMAIN_SECURITY 1 Sending domain does not have any anti-forgery
 methods
 NO_DNS_FOR_FROM         0.001 Envelope sender has no MX or A DNS records
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_NONE                0.001 SPF: sender does not publish an SPF Record
Subject: [pve-devel] [PATCH pve-ha-manager 2/8] get services && nodes stats
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion <pve-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pve-devel/>
List-Post: <mailto:pve-devel@lists.proxmox.com>
List-Help: <mailto:pve-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Wed, 27 Apr 2022 15:34:29 -0000

For offline VMs in recovery state, we look at the RRD average over the last 20 minutes
(excluding spikes by using the 95th percentile).

For online VMs, we use the last value streamed to RRD.
We still need a method to compute the last-minute average of the CPU usage without re-reading the RRD file.
For the other metrics, we can use the last value.

For nodes, we also use the last value streamed to RRD.
(The last-minute average of the CPU usage still needs to be computed here as well.)

A TOPSIS score is computed for VMs in recovery state, ordered by:
   - biggest boot memory usage (windows = 100% memory, linux: last mem used)
   - biggest cpu usage

We want to restart the biggest services first, to have a better chance of finding a node with enough resources
---
 debian/pve-ha-manager.install |   1 +
 src/PVE/HA/Balancer/Makefile  |   2 +-
 src/PVE/HA/Balancer/Stats.pm  | 134 ++++++++++++++++++++++++++++++
 src/PVE/HA/Env.pm             |  28 +++++++
 src/PVE/HA/Env/PVE2.pm        | 149 ++++++++++++++++++++++++++++++++++
 src/PVE/HA/Manager.pm         |   5 ++
 src/PVE/HA/Sim/TestEnv.pm     |  48 ++++++++++-
 7 files changed, 365 insertions(+), 2 deletions(-)
 create mode 100644 src/PVE/HA/Balancer/Stats.pm

diff --git a/debian/pve-ha-manager.install b/debian/pve-ha-manager.install
index d6979c4..6297997 100644
--- a/debian/pve-ha-manager.install
+++ b/debian/pve-ha-manager.install
@@ -21,6 +21,7 @@
 /usr/share/perl5/PVE/HA/CRM.pm
 /usr/share/perl5/PVE/HA/Balancer/AHP.pm
 /usr/share/perl5/PVE/HA/Balancer/Topsis.pm
+/usr/share/perl5/PVE/HA/Balancer/Stats.pm
 /usr/share/perl5/PVE/HA/Config.pm
 /usr/share/perl5/PVE/HA/Config.pm
 /usr/share/perl5/PVE/HA/Env.pm
diff --git a/src/PVE/HA/Balancer/Makefile b/src/PVE/HA/Balancer/Makefile
index de4b1b2..95ff86c 100644
--- a/src/PVE/HA/Balancer/Makefile
+++ b/src/PVE/HA/Balancer/Makefile
@@ -1,4 +1,4 @@
-SOURCES=Topsis.pm AHP.pm
+SOURCES=Topsis.pm AHP.pm Stats.pm
 
 .PHONY: install
 install:
diff --git a/src/PVE/HA/Balancer/Stats.pm b/src/PVE/HA/Balancer/Stats.pm
new file mode 100644
index 0000000..15162ce
--- /dev/null
+++ b/src/PVE/HA/Balancer/Stats.pm
@@ -0,0 +1,134 @@
+package PVE::HA::Balancer::Stats;
+
+use strict;
+use warnings;
+use PVE::HA::Balancer::Topsis;
+use PVE::HA::Balancer::AHP;
+
+my $compute_node_vms_pressure = sub {
+    my ($self, $node, $vm_stats) = @_;
+
+    # only nodes that have an entry in online_node_stats are aggregated
+    my $entry = $self->{online_node_stats}->{$node};
+    return if !defined($entry);
+    my $agg = $entry->{stats};
+
+    # running sum of the vcpus of every guest accounted to this node
+    $agg->{total_vm_vcpus} //= 0;
+    $agg->{total_vm_vcpus} += $vm_stats->{maxcpu};
+    # remember the highest cpu pressure among those guests
+    $agg->{max_vm_pressure} //= 0;
+    $agg->{max_vm_pressure} = $vm_stats->{cpu_pressure} if $vm_stats->{cpu_pressure} > $agg->{max_vm_pressure};
+};
+
+# Gather stats of every service into $ss->{$sid}->{stats} and give services in 'recovery' state a TOPSIS recovery_score.
+my $get_service_stats = sub {
+    my ($self, $ss, $sc) = @_;
+
+    my $haenv = $self->{haenv};
+    my $recovery_stats = {};
+
+    foreach my $sid (sort keys %$ss) {
+	my $cd = $sc->{$sid};
+	my $node = $cd->{node};
+
+	my (undef, $type, $vmid) = $haenv->parse_sid($sid);
+	$ss->{$sid}->{type} = $type;
+	$ss->{$sid}->{vmid} = $vmid;
+
+	my $stats = { cpu => 0, maxcpu => 0, startmem => 0, mem => 0, maxmem => 0, totalcpu => 0, cpu_pressure => 0, recovery_score => 0 };
+	$ss->{$sid}->{stats} = $stats;
+
+	if ($ss->{$sid}->{state} eq 'recovery') {
+	    #get vm/ct stats history on last 20min (95percentile)
+	    $stats = $haenv->get_vm_offline_rrd_stats($vmid, 95);
+	} elsif ($ss->{$sid}->{state} eq 'started') {
+	    #get last stats from cache.
+	    $stats = $haenv->get_vm_rrd_stats($vmid);
+	} else {
+	    #avoid to compute all stats, as currently we only support recovery and started for balancing;
+	    next;
+	}
+	#the env may leave fields undefined (or return nothing): default them to 0
+	$stats //= {};
+	$stats->{$_} //= 0 for qw(cpu maxcpu mem maxmem cpu_pressure);
+
+	my $vmconf = $haenv->read_vm_ct_config($vmid, $type);
+	$ss->{$sid}->{vmconf} = $vmconf;
+
+	#windows vm fill memory with zero at boot, so mem = maxmem
+	my $is_windows = $vmconf && defined($vmconf->{ostype}) && $vmconf->{ostype} eq 'windows';
+	$stats->{startmem} = $is_windows ? $stats->{maxmem} : $stats->{mem};
+
+	#totalcpu = relative cpu for 1core. 50% of 4 cores = 200% of 1 core
+	$stats->{totalcpu} = $stats->{cpu} * 100 * $stats->{maxcpu};
+	$stats->{recovery_score} = 0;
+	#account the guest on its node: the lookup key is the node name, not the config hash
+	$compute_node_vms_pressure->($self, $node, $stats) if defined($node);
+
+	$ss->{$sid}->{stats} = $stats;
+	$recovery_stats->{$sid} = $stats if $ss->{$sid}->{state} eq 'recovery';
+    }
+
+    #compute scores for recovery services
+    return if !keys %$recovery_stats;
+
+    my $weights = $self->{balancer}->{topsis}->{services_recovery}->{weights};
+    my $order = $self->{balancer}->{topsis}->{services_recovery}->{order};
+    my $scores = PVE::HA::Balancer::Topsis::score($recovery_stats, $weights, $order);
+    foreach my $sid (sort keys %$scores) {
+	$ss->{$sid}->{stats}->{recovery_score} = $scores->{$sid}->{score};
+    }
+};
+
+# Rebuild $self->{online_node_stats} from the rrd stats of all online nodes, then refresh the service stats in $ss.
+sub recompute_node_service_stats {
+    my ($self, $ss, $sc) = @_;
+
+    my $online_node_stats = {};
+    my $online_nodes = $self->{ns}->list_online_nodes();
+
+    foreach my $node (@$online_nodes) {
+	my $stats = $self->{haenv}->get_node_rrd_stats($node);
+	#missing metrics count as 0 (fixme: implement rrd for cpu_pressure and ksm)
+	$stats->{$_} //= 0 for qw(cpu cpu_pressure maxcpu mem ksm maxmem);
+	$stats->{totalcpu} = $stats->{cpu} * 100 * $stats->{maxcpu}; #how to handle different cpu model power ? bogomips ?
+	#per-node vm aggregates start at 0 on every run
+	$stats->{total_vm_vcpus} = 0;
+	$stats->{max_vm_pressure} = 0;
+	$online_node_stats->{$node}->{stats} = $stats;
+    }
+
+    #the service pass reads $self->{online_node_stats}, so publish the fresh stats before running it
+    $self->{online_node_stats} = $online_node_stats;
+    $get_service_stats->($self, $ss, $sc);
+
+    return $online_node_stats;
+}
+
+# Compute the AHP weights and the per-criterion order used to score services in
+# recovery state; both are stored in $self->{balancer}->{topsis}->{services_recovery}.
+sub compute_ahp_recovery_weights {
+    my ($self) = @_;
+
+    #bigger memory/cpu for offline service, better chance to find free space first
+    #(both criteria are flagged "+")
+    my %order = (
+	startmem => "+",
+	totalcpu => "+",
+    );
+
+    #pairwise criteria preferences handed to the AHP weight computation
+    my %pairwise = (
+	startmem => { startmem => 1, totalcpu => 2 },
+	totalcpu => { startmem => 1 },
+    );
+
+    my $weights = PVE::HA::Balancer::AHP::compute_weights(\%pairwise);
+
+    my $recovery = $self->{balancer}->{topsis}->{services_recovery} //= {};
+    $recovery->{weights} = $weights;
+    $recovery->{order} = \%order;
+}
+
+1;
diff --git a/src/PVE/HA/Env.pm b/src/PVE/HA/Env.pm
index ac569a9..2ecc186 100644
--- a/src/PVE/HA/Env.pm
+++ b/src/PVE/HA/Env.pm
@@ -269,4 +269,32 @@ sub get_ha_settings {
     return $self->{plug}->get_ha_settings();
 }
 
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+    my $plug = $self->{plug}; # environment plugin that does the real work
+    return $plug->get_node_rrd_stats($node);
+}
+
+sub get_vm_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+    my $plug = $self->{plug}; # both arguments are handed through to the plugin as they are
+    return $plug->get_vm_rrd_stats($vmid, $percentile);
+}
+
+sub get_vm_offline_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+    my $plug = $self->{plug}; # environment plugin that does the real work
+    return $plug->get_vm_offline_rrd_stats($vmid, $percentile);
+}
+
+sub read_vm_ct_config {
+    my ($self, $vmid, $type) = @_;
+
+    # dispatch on the service type; any other type yields no config at all
+    # (falling off the end used to return the false value of the last comparison)
+    return $self->{plug}->read_vm_config($vmid) if $type eq 'vm';
+    return $self->{plug}->read_ct_config($vmid) if $type eq 'ct';
+    return;
+}
+
 1;
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index 5e0a683..917aa62 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -12,6 +12,11 @@ use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file
 use PVE::DataCenterConfig;
 use PVE::INotify;
 use PVE::RPCEnvironment;
+use PVE::API2Tools;
+use PVE::QemuConfig;
+use PVE::QemuServer;
+use PVE::LXC::Config;
+use RRDs;
 
 use PVE::HA::Tools ':exit_codes';
 use PVE::HA::Env;
@@ -459,4 +464,148 @@ sub get_ha_settings {
     return $datacenterconfig->{ha};
 }
 
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    # improve me
+    # we could compute last average minute for cpu usage,
+    # for other values, we can use last value
+
+    # gather what extract_node_stats() needs: the rrd dump and the cluster members
+    my $rrd_dump = PVE::Cluster::rrd_dump();
+    my $cluster_members = PVE::Cluster::get_members();
+
+    my $node_stats = PVE::API2Tools::extract_node_stats($node, $cluster_members, $rrd_dump);
+    return $node_stats;
+}
+
+sub get_vm_rrd_stats {
+    my ($self, $vmid) = @_;
+
+    # improve me
+    # we could compute last average minute for cpu usage,
+    # for other values, we can use last value
+
+    my $rrd_dump = PVE::Cluster::rrd_dump();
+
+    # guest list entry of this vmid (undef when the guest is not listed)
+    my $guests = (PVE::Cluster::get_vmlist() || {})->{ids} || {};
+    my $guest = $guests->{$vmid};
+
+    my $vm_stats = PVE::API2Tools::extract_vm_stats($vmid, $guest, $rrd_dump);
+    return $vm_stats;
+}
+
+# Per-metric percentile of the non-zero rrd samples of a guest over the last 20 minutes.
+sub get_vm_offline_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    # rrd database of this guest
+    my $rrdfile = "/var/lib/rrdcached/db/pve2-vm/$vmid";
+
+    # 60 second resolution, window aligned on a multiple of it
+    my $reso = 60;
+    my $ctime = $reso*int(time()/$reso);
+
+    #last 20minutes average?
+    my @args = (
+	"-s" => $ctime - $reso*20,
+	"-e" => $ctime - $reso*1,
+	"-r" => $reso,
+    );
+
+    # query through rrdcached when its socket is present
+    my $socket = "/var/run/rrdcached.sock";
+    push @args, "--daemon" => "unix:$socket" if -S $socket;
+
+    my ($start, $step, $names, $data) = RRDs::fetch($rrdfile, "AVERAGE", @args);
+
+    #fixme: implement true cgroup host cpu/mem && pressure in rrd
+
+    # non-zero samples collected per metric
+    my %samples = (
+	cpu => [],
+	mem => [],
+	maxcpu => [],
+	maxmem => [],
+    );
+
+    foreach my $rec (@$data) {
+	# the first four columns are read as maxcpu, cpu, maxmem, mem
+	my ($maxcpu, $cpu, $maxmem, $mem) = map { $_ || 0 } @{$rec}[0..3];
+
+	#skip zeros values if vm is down
+	# NOTE(review): cpu samples are scaled by maxcpu here and totalcpu below
+	# multiplies by maxcpu again - confirm this is intended
+	push @{$samples{cpu}}, $cpu*$maxcpu if $cpu > 0;
+	push @{$samples{mem}}, $mem if $mem > 0;
+	push @{$samples{maxcpu}}, $maxcpu if $maxcpu > 0;
+	push @{$samples{maxmem}}, $maxmem if $maxmem > 0;
+    }
+
+    # requested percentile of each metric, 0 when no sample was kept
+    my $stats = {};
+    foreach my $metric (qw(cpu mem maxmem maxcpu)) {
+	$stats->{$metric} = percentile($percentile, $samples{$metric}) || 0;
+    }
+
+    $stats->{totalcpu} = $stats->{cpu} * $stats->{maxcpu} * 100;
+    $stats->{cpu_pressure} = 0;
+
+    return $stats;
+}
+
+sub percentile {
+    my ($p, $aref) = @_; # $p: percentile (0..100), $aref: reference to a list of numbers
+    my $rank = int($p * $#{$aref}/100); # index into the sorted list, rounded down
+    return (sort { $a <=> $b } @$aref)[$rank]; # numeric sort: the default string sort ranks 9 above 10
+}
+
+sub read_vm_config {
+    my ($self, $vmid) = @_;
+
+    # look up the owning node without autovivifying a vmlist entry for an unknown vmid
+    my $vmlist = PVE::Cluster::get_vmlist() || {};
+    my $entry = ($vmlist->{ids} || {})->{$vmid} || {};
+    my $node = $entry->{node};
+
+    # a config that cannot be loaded is not fatal: return nothing
+    my $conf = eval { PVE::QemuConfig->load_config($vmid, $node) };
+    return if !$conf;
+
+    my $finalconf = {};
+
+    # every ostype that windows_version() accepts is reported as 'windows'
+    my $is_windows = PVE::QemuServer::windows_version($conf->{ostype});
+    $finalconf->{ostype} = $is_windows ? 'windows' : $conf->{ostype};
+
+    # copy the volume entries over unchanged
+    PVE::QemuConfig->foreach_volume($conf, sub {
+	my ($ds, $drive) = @_;
+	$finalconf->{$ds} = $conf->{$ds};
+    });
+
+    return $finalconf;
+}
+
+sub read_ct_config {
+    my ($self, $vmid) = @_;
+
+    # look up the owning node without autovivifying a vmlist entry for an unknown vmid
+    my $vmlist = PVE::Cluster::get_vmlist() || {};
+    my $entry = ($vmlist->{ids} || {})->{$vmid} || {};
+    my $node = $entry->{node};
+
+    my $conf = eval { PVE::LXC::Config->load_config($vmid, $node) };
+    return if !$conf;
+
+    # keep only the volume entries, copied over unchanged
+    my $finalconf = {};
+    PVE::LXC::Config->foreach_volume($conf, sub {
+	my ($ms, $mountpoint) = @_;
+	$finalconf->{$ms} = $conf->{$ms};
+    });
+    return $finalconf;
+}
+
 1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 2deea57..68b2872 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -7,6 +7,7 @@ use Digest::MD5 qw(md5_base64);
 use PVE::Tools;
 use PVE::HA::Tools ':exit_codes';
 use PVE::HA::NodeStatus;
+use PVE::HA::Balancer::Stats;
 
 sub new {
     my ($this, $haenv) = @_;
@@ -26,6 +27,8 @@ sub new {
 
     $self->{ms} = { master_node => $haenv->nodename() };
 
+    PVE::HA::Balancer::Stats::compute_ahp_recovery_weights($self);
+
     return $self;
 }
 
@@ -395,6 +398,8 @@ sub manage {
 
 	$self->recompute_online_node_usage();
 
+	PVE::HA::Balancer::Stats::recompute_node_service_stats($self, $ss, $sc);
+
 	foreach my $sid (sort keys %$ss) {
 	    my $sd = $ss->{$sid};
 	    my $cd = $sc->{$sid} || { state => 'disabled' };
diff --git a/src/PVE/HA/Sim/TestEnv.pm b/src/PVE/HA/Sim/TestEnv.pm
index b448d72..ee261ef 100644
--- a/src/PVE/HA/Sim/TestEnv.pm
+++ b/src/PVE/HA/Sim/TestEnv.pm
@@ -118,4 +118,50 @@ sub get_max_workers {
     return 0;
 }
 
-1;
+sub get_node_rrd_stats {
+    my ($self, $node) = @_;
+
+    # node stats are read from $self->{hardware}, keyed by node name
+    my $all_node_stats = $self->{hardware}->{node_stats};
+
+    return $all_node_stats->{$node};
+}
+
+# Stats of one guest from $self->{hardware}, with defaults for everything left unset.
+sub get_vm_rrd_stats {
+    my ($self, $vmid) = @_;
+
+    my $all_vm_stats = $self->{hardware}->{service_stats};
+    my $vm_stats = $all_vm_stats->{$vmid};
+
+    # unset or zero values fall back to their default
+    $vm_stats->{uptime} ||= 400;
+    $vm_stats->{$_} ||= 0 for qw(cpu mem maxmem maxcpu);
+    # derived once cpu and maxcpu are guaranteed to be defined
+    $vm_stats->{totalcpu} = $vm_stats->{cpu} * $vm_stats->{maxcpu} * 100;
+    $vm_stats->{cpu_pressure} ||= 0;
+
+    return $vm_stats;
+}
+
+sub get_vm_offline_rrd_stats {
+    my ($self, $vmid, $percentile) = @_;
+
+    # $percentile is not used here: the result is whatever
+    # get_vm_rrd_stats() returns for this guest
+    return scalar $self->get_vm_rrd_stats($vmid);
+}
+
+sub read_vm_config {
+    my ($self, $vmid) = @_;
+    # guest configs are looked up in $self->{hardware}->{vm_config}, keyed by vmid
+    return $self->{hardware}->{vm_config}->{$vmid};
+}
+
+sub read_ct_config {
+    my ($self, $vmid) = @_;
+    # containers are looked up in the same vm_config table as vms, keyed by vmid
+    return $self->{hardware}->{vm_config}->{$vmid};
+}
+
+1;
\ No newline at end of file
-- 
2.30.2