From: Alexandre Derumier
To: pve-devel@lists.proxmox.com
Date: Wed, 27 Apr 2022 17:33:47 +0200
Message-Id: <20220427153351.1773666-5-aderumier@odiso.com>
In-Reply-To: <20220427153351.1773666-1-aderumier@odiso.com>
References: <20220427153351.1773666-1-aderumier@odiso.com>
X-Mailer: git-send-email 2.30.2
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: [pve-devel] [PATCH pve-ha-manager 4/8] add vm loadbalancing

This is a VM-centric load balancer, inspired by the VMware DRS 2.0 scheduler:
https://blogs.vmware.com/vsphere/2020/05/vsphere-7-a-closer-look-at-the-vm-drs-score.html

It looks for badly performing VMs and computes a cluster-wide TOPSIS score for
each of them.

For each service (CTs are skipped, as we can't live-migrate them), we keep VMs with:
- high cpu pressure (> 0.75)
- high pseudo memory pressure. We could use the real VM memory pressure, but by the
  time it rises it is already too late: the host is already swapping. The pseudo
  memory pressure is the percentage of host memory above the threshold relative to
  the VM memory. The memory threshold is 85% of host memory, or 80% if KSM is
  already big. (A worked example follows after the '---' marker below.)
- not yet implemented: VM affinity could be checked here

Then we compute a TOPSIS score from:
- biggest affinity
- biggest cpu pressure
- biggest pseudo memory pressure
- lowest cpu usage
- lowest memory

and we try to migrate the highest-scored VM to the best target node (chosen the
same way as for service recovery).
---
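
Note (not part of the patch): a minimal worked example of the pseudo memory
pressure check. The numbers and variable names are made up for illustration;
only the formula mirrors check_pseudo_mem_pressure() in Services.pm below.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # hypothetical host/VM values in GiB, not taken from any real cluster
    my $node_maxmem = 64;   # total host memory
    my $node_mem    = 58;   # memory currently used on the host
    my $node_ksm    = 2;    # memory currently shared by KSM
    my $vm_mem      = 16;   # memory used by the candidate VM

    # threshold is 85% of host memory, lowered to 80% if KSM is already big
    my $eviction_threshold = ($node_ksm > $node_maxmem * 0.2) ? 80 : 85;
    my $node_mem_threshold = $node_maxmem * $eviction_threshold / 100;   # 54.4 GiB

    # pseudo pressure: host overshoot above the threshold, relative to the VM memory
    if ($node_mem > $node_mem_threshold
        && ($node_mem - $vm_mem) < $node_mem_threshold) {
        my $pseudo_pressure = ($node_mem - $node_mem_threshold) / $vm_mem;
        printf "pseudo memory pressure: %.3f\n", $pseudo_pressure;   # (58 - 54.4) / 16 = 0.225
    }

So a 16 GiB VM on a host that is 3.6 GiB over its 54.4 GiB threshold gets a
pseudo pressure of 0.225; such VMs are then ranked together with affinity, cpu
pressure, cpu usage and memory by the TOPSIS score.
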
 debian/pve-ha-manager.install   |   1 +
 src/PVE/HA/Balancer/Makefile    |   2 +-
 src/PVE/HA/Balancer/Services.pm | 187 ++++++++++++++++++++++++++++++++
 src/PVE/HA/Manager.pm           |  47 +++++++-
 4 files changed, 235 insertions(+), 2 deletions(-)
 create mode 100644 src/PVE/HA/Balancer/Services.pm

diff --git a/debian/pve-ha-manager.install b/debian/pve-ha-manager.install
index e083214..3bc7cc8 100644
--- a/debian/pve-ha-manager.install
+++ b/debian/pve-ha-manager.install
@@ -23,6 +23,7 @@
 /usr/share/perl5/PVE/HA/Balancer/Topsis.pm
 /usr/share/perl5/PVE/HA/Balancer/Stats.pm
 /usr/share/perl5/PVE/HA/Balancer/Nodes.pm
+/usr/share/perl5/PVE/HA/Balancer/Services.pm
 /usr/share/perl5/PVE/HA/Config.pm
 /usr/share/perl5/PVE/HA/Config.pm
 /usr/share/perl5/PVE/HA/Env.pm
diff --git a/src/PVE/HA/Balancer/Makefile b/src/PVE/HA/Balancer/Makefile
index 92ab8d3..ec1823d 100644
--- a/src/PVE/HA/Balancer/Makefile
+++ b/src/PVE/HA/Balancer/Makefile
@@ -1,4 +1,4 @@
-SOURCES=Topsis.pm AHP.pm Stats.pm Nodes.pm
+SOURCES=Topsis.pm AHP.pm Stats.pm Nodes.pm Services.pm
 
 .PHONY: install
 install:
diff --git a/src/PVE/HA/Balancer/Services.pm b/src/PVE/HA/Balancer/Services.pm
new file mode 100644
index 0000000..6cce6a7
--- /dev/null
+++ b/src/PVE/HA/Balancer/Services.pm
@@ -0,0 +1,187 @@
+package PVE::HA::Balancer::Services;
+
+use strict;
+use warnings;
+use PVE::HA::Balancer::Topsis;
+use PVE::HA::Balancer::AHP;
+
+my $check_anti_affinity = sub {
+    my ($vmid, $node, $vm_stats) = @_;
+
+    # implement me
+
+    return undef;
+};
+
+my $check_cpu_pressure = sub {
+    my ($vm_stats) = @_;
+
+    my $eviction_threshold = 0.75;
+    return 1 if $vm_stats->{cpu_pressure} > $eviction_threshold;
+};
+
+my $check_pseudo_mem_pressure = sub {
+    my ($node_stats, $vm_stats) = @_;
+
+    my $eviction_threshold = 85;
+
+    # if ksm is already big, lower the threshold
+    $eviction_threshold = 80 if $node_stats->{ksm} > ($node_stats->{maxmem} * 0.2);
+    my $node_mem_threshold = $node_stats->{maxmem} * $eviction_threshold / 100;
+
+    if ($node_stats->{mem} > $node_mem_threshold) {
+        # if removing the vm gets the host back under the threshold,
+        # we compute a pseudo pressure (percentage of host mem over threshold vs vm mem)
+
+        if (($node_stats->{mem} - $vm_stats->{mem}) < $node_mem_threshold) {
+            $vm_stats->{mem_pseudo_pressure} = ($node_stats->{mem} - $node_mem_threshold) / $vm_stats->{mem};
+        }
+        # other vms are still added (in case no single vm can bring memory under the threshold);
+        # in that case, we simply order by the vm's used memory
+        return 1;
+    }
+};
+
+my $get_bad_vms = sub {
+    my ($self, $ss, $sc, $online_nodes) = @_;
+
+    my $bad_vms = {};
+
+    foreach my $sid (keys %$ss) {
+
+        my $cd = $sc->{$sid};
+        my $sd = $ss->{$sid};
+
+        next if !$cd;
+        # can't live migrate a ct
+        next if $cd->{type} ne 'vm';
+        # only migrate services in started state
+        next if $cd->{state} ne 'started';
+
+        # don't migrate the same sid multiple times in a row
+        next if $self->{balancer}->{last_migrate_sid} && $self->{balancer}->{last_migrate_sid} eq $sid;
+
+        my $node = $cd->{node};
+        # skip if node is not online or not responding
+        next if !defined($online_nodes->{$node});
+
+        my $node_stats = $online_nodes->{$node}->{stats};
+        my $vm_stats = $sd->{stats};
+
+        # skip vm if it was recently started or migrated
+        next if !defined($vm_stats->{uptime}) || $vm_stats->{uptime} < 300;
+
+        # fixme: skip if local disk
+        # &$check_vm_disks_local($storecfg, $vmconf, $vmid);
+        # fixme: skip if local resources
+        # PVE::QemuServer::check_local_resources($vmconf, 1);
+
+        $vm_stats->{affinity} = 0;
+        $vm_stats->{mem_pseudo_pressure} = 0;
+
+        my $add_vm = undef;
+
+        $add_vm = 1 if &$check_anti_affinity($sid, $node, $vm_stats);
+        $add_vm = 1 if &$check_cpu_pressure($vm_stats);
+        $add_vm = 1 if &$check_pseudo_mem_pressure($node_stats, $vm_stats);
+        next if !$add_vm;
+
+        my $prio = {
+            affinity => $vm_stats->{affinity},
+            mem_pseudo_pressure => $vm_stats->{mem_pseudo_pressure},
+            mem => $vm_stats->{mem},
+            totalcpu => $vm_stats->{totalcpu},
+            cpu_pressure => $vm_stats->{cpu_pressure},
+        };
+
+        $bad_vms->{$sid} = $prio;
+    }
+    return $bad_vms if keys %$bad_vms;
+};
+
+my $get_score = sub {
+    my ($self, $vms) = @_;
+
+    my $weights = $self->{balancer}->{topsis}->{bad_vms}->{weights};
+    my $order = $self->{balancer}->{topsis}->{bad_vms}->{order};
+    my $scores = PVE::HA::Balancer::Topsis::score($vms, $weights, $order);
+
+    return $scores;
+};
+
+sub get_vm_targetnode {
+    my ($self, $ss, $sc) = @_;
+
+    my $online_nodes = $self->{online_node_stats};
+
+    my $bad_vms = &$get_bad_vms($self, $ss, $sc, $online_nodes);
+    return if !$bad_vms;
+
+    my $vm_scores = &$get_score($self, $bad_vms);
+
+    foreach my $sid (
+        sort {
+            $vm_scores->{$b}->{score} <=> $vm_scores->{$a}->{score}
+        } keys %$vm_scores) {
+
+        my $cd = $sc->{$sid};
+        my $sd = $ss->{$sid};
+
+        my $node = $self->find_node_target($cd, $sd);
+        next if !$node;
+
+        # remember the last sid we tried to migrate, so we don't balance it again in a loop
+        $self->{balancer}->{last_migrate_sid} = $sid;
+
+        return ($sid, $node);
+    }
+}
+
+sub compute_ahp_weights {
+    my ($self) = @_;
+
+    my $bestorder = {
+        affinity => "+",
+        cpu_pressure => "+",
+        totalcpu => "-",
+        mem => "-",
+        mem_pseudo_pressure => "+",
+    };
+
+    my $preferences = {
+        affinity => {
+            affinity => 1,
+            cpu_pressure => 9,
+            mem_pseudo_pressure => 9,
+            mem => 9,
+            totalcpu => 9,
+        },
+        cpu_pressure => {
+            cpu_pressure => 1,
+            mem_pseudo_pressure => 3,
+            mem => 3,
+            totalcpu => 5,
+        },
+        mem_pseudo_pressure => {
+            mem_pseudo_pressure => 1,
+            mem => 3,
+            totalcpu => 3,
+        },
+        mem => {
+            mem => 1,
+            totalcpu => 2,
+        },
+        totalcpu => {
+            totalcpu => 1,
+        },
+    };
+
+    my $weights = PVE::HA::Balancer::AHP::compute_weights($preferences);
+
+    $self->{balancer}->{topsis}->{bad_vms}->{weights} = $weights;
+    $self->{balancer}->{topsis}->{bad_vms}->{order} = $bestorder;
+}
+
+1;
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index e021d60..6fa866a 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -9,6 +9,7 @@ use PVE::HA::Tools ':exit_codes';
 use PVE::HA::NodeStatus;
 use PVE::HA::Balancer::Stats;
 use PVE::HA::Balancer::Nodes;
+use PVE::HA::Balancer::Services;
 
 sub new {
     my ($this, $haenv) = @_;
@@ -30,6 +31,7 @@ sub new {
 
     PVE::HA::Balancer::Stats::compute_ahp_recovery_weights($self);
     PVE::HA::Balancer::Nodes::compute_ahp_weights($self);
+    PVE::HA::Balancer::Services::compute_ahp_weights($self);
 
     return $self;
 }
@@ -251,6 +253,9 @@ my $change_service_state = sub {
     $text_state = " ($text_state)" if $text_state;
     $haenv->log('info', "service '$sid': state changed from '${old_state}'" .
         " to '${new_state}'$text_state");
+
+    $self->{balancer}->{last_state_change} = $haenv->get_time();
+
 };
 
 # clean up a possible bad state from a recovered service to allow its start
@@ -402,6 +407,7 @@ sub manage {
     PVE::HA::Balancer::Stats::recompute_node_service_stats($self, $ss, $sc);
 
     $self->{storecfg} = $haenv->read_storecfg();
+    $self->balancer_status();
 
     foreach my $sid (
         sort {
@@ -493,10 +499,12 @@ sub manage {
                $haenv->log('notice', "node '$node' in fence state but no services to-fence! admin interference?!");
                $repeat = 1 if $ns->fence_node($node);
            }
-
            last if !$repeat;
        }
 
+       # if all service states are stable, do the loadbalancing
+       $self->loadbalance($ss, $sc);
+
        $self->flush_master_status();
 }
@@ -871,4 +879,41 @@ sub find_node_target {
     return PVE::HA::Balancer::Nodes::find_target($self, $cd, $sd, $group_members_prio);
 }
 
+sub loadbalance {
+    my ($self, $ss, $sc) = @_;
+
+    return if !$self->{balancer}->{enabled};
+
+    my $haenv = $self->{haenv};
+
+    my $now = $haenv->get_time();
+
+    # check only once per minute
+    $self->{balancer}->{lastcheck} = $now if !defined($self->{balancer}->{lastcheck});
+    my $delta_check = $now - $self->{balancer}->{lastcheck};
+    return if $delta_check < 60;
+    $self->{balancer}->{lastcheck} = $now;
+
+    # only balance if the last service state change is at least 1 minute old
+    $self->{balancer}->{last_state_change} = $now if !defined($self->{balancer}->{last_state_change});
+    my $delta_state_change = $now - $self->{balancer}->{last_state_change};
+    return if !$delta_state_change || $delta_state_change < 60;
+
+    my ($sid, $node) = PVE::HA::Balancer::Services::get_vm_targetnode($self, $ss, $sc);
+
+    return if !$sid || !$node;
+
+    # do the migration
+    my $sd = $ss->{$sid};
+    $self->{haenv}->log('info', "balancer - migrate service '$sid' to node '$node' (running)");
+    &$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
+}
+
+sub balancer_status {
+    my ($self) = @_;
+
+    my $dc_ha_cfg = $self->{haenv}->get_ha_settings();
+    $self->{balancer}->{enabled} = $dc_ha_cfg->{balancer};
+}
+
 1;
-- 
2.30.2