From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id A04BE9BB9 for ; Fri, 18 Nov 2022 13:14:00 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 77C161D43 for ; Fri, 18 Nov 2022 13:13:30 +0100 (CET) Received: from bastionodiso.odiso.net (bastionodiso.odiso.net [IPv6:2a0a:1580:2000::2d]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS for ; Fri, 18 Nov 2022 13:13:28 +0100 (CET) Received: from kvmformation3.odiso.net (formationkvm3.odiso.net [10.3.94.12]) by bastionodiso.odiso.net (Postfix) with ESMTP id C665180BA; Fri, 18 Nov 2022 13:13:21 +0100 (CET) Received: by kvmformation3.odiso.net (Postfix, from userid 0) id B347B195C77; Fri, 18 Nov 2022 13:13:21 +0100 (CET) From: Alexandre Derumier To: pve-devel@lists.proxmox.com Date: Fri, 18 Nov 2022 13:13:19 +0100 Message-Id: <20221118121320.132283-2-aderumier@odiso.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20221118121320.132283-1-aderumier@odiso.com> References: <20221118121320.132283-1-aderumier@odiso.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: =?UTF-8?Q?0=0A=09?=AWL 0.033 Adjusted score from AWL reputation of From: =?UTF-8?Q?address=0A=09?=BAYES_00 -1.9 Bayes spam probability is 0 to 1% HEADER_FROM_DIFFERENT_DOMAINS 0.25 From and EnvelopeFrom 2nd level mail domains are =?UTF-8?Q?different=0A=09?=KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict =?UTF-8?Q?Alignment=0A=09?=KAM_LAZY_DOMAIN_SECURITY 1 Sending domain does not have any anti-forgery 
=?UTF-8?Q?methods=0A=09?=NO_DNS_FOR_FROM 0.001 Envelope sender has no MX or A DNS =?UTF-8?Q?records=0A=09?=SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF =?UTF-8?Q?Record=0A=09?=SPF_NONE 0.001 SPF: sender does not publish an SPF =?UTF-8?Q?Record=0A=09?=URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more information. [pci.pm, qemuserver.pm, memory.pm] Subject: [pve-devel] [PATCH v3 qemu-server 1/2] add virtio-mem support X-BeenThere: pve-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox VE development discussion List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 18 Nov 2022 12:14:00 -0000 This patch adds virtio-mem support through a new memory_max option. 4GB of static memory is needed for DMA+boot memory, as this memory can almost never be unplugged. One virtio-mem PCI device is set up for each NUMA node on the pci.4 bridge. virtio-mem uses a fixed block size with a maximum of 32k blocks, so the block size is computed from (memory_max - static memory)/32000, rounded up to the next power of two, with a minimum of 2MB to stay aligned with THP (a lower block size gives a better chance of unplugging memory).
Signed-off-by: Alexandre Derumier --- PVE/QemuServer.pm | 9 ++- PVE/QemuServer/Memory.pm | 141 ++++++++++++++++++++++++++++++--------- PVE/QemuServer/PCI.pm | 8 +++ 3 files changed, 126 insertions(+), 32 deletions(-) diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm index 9a20647..d0f6d1f 100644 --- a/PVE/QemuServer.pm +++ b/PVE/QemuServer.pm @@ -338,6 +338,13 @@ my $confdesc = { maximum => 262144, default => 'cgroup v1: 1024, cgroup v2: 100', }, + 'memory_max' => { + optional => 1, + type => 'integer', + description => "Max hotpluggable virtio-mem memory", + minimum => 4096, + default => undef, + }, memory => { optional => 1, type => 'integer', @@ -3858,7 +3865,7 @@ sub config_to_command { push @$cmd, get_cpu_options($conf, $arch, $kvm, $kvm_off, $machine_version, $winversion, $gpu_passthrough); } - PVE::QemuServer::Memory::config($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd); + PVE::QemuServer::Memory::config($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd, $devices, $bridges, $arch, $machine_type); push @$cmd, '-S' if $conf->{freeze}; diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm index 013917e..ed7eff6 100644 --- a/PVE/QemuServer/Memory.pm +++ b/PVE/QemuServer/Memory.pm @@ -8,9 +8,45 @@ use PVE::Exception qw(raise raise_param_exc); use PVE::QemuServer; use PVE::QemuServer::Monitor qw(mon_cmd); +use PVE::QemuServer::PCI qw(print_pci_addr); my $MAX_NUMA = 8; -my $STATICMEM = 1024; + +my sub get_static_mem { + my ($conf, $defaults) = @_; + + my $sockets = 1; + $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer iused + $sockets = $conf->{sockets} if $conf->{sockets}; + my $hotplug_features = PVE::QemuServer::parse_hotplug_features(defined($conf->{hotplug}) ? 
$conf->{hotplug} : '1'); + + my $static_memory = 0; + + if ($hotplug_features->{memory} || $conf->{'memory_max'}) { + $static_memory = 1024; + $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024); + $static_memory = 4096 if $conf->{'memory_max'}; + } else { + $static_memory = $conf->{memory} || $defaults->{memory}; + } + + return $static_memory; +} + +my sub get_virtiomem_block_size { + my ($conf, $static_memory) = @_; + + my $maxmemory = $conf->{'memory_max'}; + return undef if !$maxmemory; + + #virtiomem can map 32000 block size. try to use lowerst blocksize, lower = more chance to unplug memory. + my $blocksize = ($maxmemory - $static_memory) / 32000; + #round next power of 2 + $blocksize = 2**(int(log($blocksize)/log(2))+1); + #2MB is the minimum to be aligned with THP + $blocksize = 2 if $blocksize < 2; + return $blocksize; +} my $_host_bits; my sub get_host_phys_address_bits { @@ -58,7 +94,14 @@ my sub get_max_mem { # remove 20 bits to get MB and half that as QEMU needs some overhead my $bits_to_max_mem = int(1 << ($bits - 21)); - return $bits_to_max_mem > 4*1024*1024 ? 4*1024*1024 : $bits_to_max_mem; + my $max_mem = $bits_to_max_mem > 4*1024*1024 ? 
4*1024*1024 : $bits_to_max_mem; + + if($conf->{'memory_max'}) { + die "memory_max can't be bigger than $max_mem MB" if $conf->{'memory_max'} > $max_mem; + return $conf->{'memory_max'}; + } + + return $max_mem; } sub get_numa_node_list { @@ -152,6 +195,8 @@ sub foreach_reverse_dimm { } } + + sub qemu_memory_hotplug { my ($vmid, $conf, $defaults, $opt, $value) = @_; @@ -164,14 +209,46 @@ sub qemu_memory_hotplug { $value = $defaults->{memory} if !$value; return $value if $value == $memory; - my $static_memory = $STATICMEM; - $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024); + my $static_memory = get_static_mem($conf, $defaults); + my $max_mem = get_max_mem($conf); die "memory can't be lower than $static_memory MB" if $value < $static_memory; - my $MAX_MEM = get_max_mem($conf); - die "you cannot add more memory than max mem $MAX_MEM MB!\n" if $memory > $MAX_MEM; + die "you cannot add more memory than max mem $max_mem MB!\n" if $value > $max_mem; + + if ($conf->{'memory_max'}) { + + die "memory size need to be multiple of 32MB when memory_max is defined" if $value % 32 != 0; + + my $requested_size = ($value - $static_memory) / $sockets * 1024 * 1024; - if ($value > $memory) { + my $totalsize = $static_memory; + my $err = undef; + + for (my $i = 0; $i < $sockets; $i++) { + + my $id = "virtiomem$i"; + my $retry = 0; + mon_cmd($vmid, 'qom-set', path => "/machine/peripheral/$id", property => "requested-size", value => int($requested_size)); + + my $size = 0; + while (1) { + sleep 1; + $size = mon_cmd($vmid, 'qom-get', path => "/machine/peripheral/$id", property => "size"); + $err = 1 if $retry > 5; + last if $size eq $requested_size || $retry > 5; + $retry++; + } + $totalsize += ($size / 1024 / 1024 ); + } + #update conf after each succesful module unplug + if($err) { + $conf->{memory} = $totalsize; + PVE::QemuConfig->write_config($vmid, $conf); + raise_param_exc({ 'memory' => "error modify virtio memory" }) if $err; + } + 
return $totalsize; + + } elsif($value > $memory) { my $numa_hostmap; @@ -266,33 +343,27 @@ sub qemu_dimm_list { } sub config { - my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd) = @_; + my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd, $devices, $bridges, $arch, $machine_type) = @_; my $memory = $conf->{memory} || $defaults->{memory}; - my $static_memory = 0; + my $static_memory = get_static_mem($conf, $defaults); - if ($hotplug_features->{memory}) { + if ($hotplug_features->{memory} || $conf->{'memory_max'}) { die "NUMA needs to be enabled for memory hotplug\n" if !$conf->{numa}; - my $MAX_MEM = get_max_mem($conf); - die "Total memory is bigger than ${MAX_MEM}MB\n" if $memory > $MAX_MEM; + my $max_mem = get_max_mem($conf); + die "Total memory is bigger than ${max_mem}MB\n" if $memory > $max_mem; for (my $i = 0; $i < $MAX_NUMA; $i++) { die "cannot enable memory hotplugging with custom NUMA topology\n" if $conf->{"numa$i"}; } - my $sockets = 1; - $sockets = $conf->{sockets} if $conf->{sockets}; - - $static_memory = $STATICMEM; - $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024); - - die "minimum memory must be ${static_memory}MB\n" if($memory < $static_memory); - push @$cmd, '-m', "size=${static_memory},slots=255,maxmem=${MAX_MEM}M"; + die "memory size need to be multiple of 32MB when memory_max is defined" if $conf->{'memory_max'} && ($memory % 32 != 0); + my $cmdstr = "size=${static_memory},maxmem=${max_mem}M"; + $cmdstr .= ",slots=255" if !$conf->{'memory_max'}; + push @$cmd, '-m', $cmdstr; } else { - - $static_memory = $memory; push @$cmd, '-m', $static_memory; } @@ -359,7 +430,21 @@ sub config { } } - if ($hotplug_features->{memory}) { + if ($conf->{'memory_max'}) { + my $node_maxmem = ($conf->{'memory_max'} - $static_memory) / $sockets; + my $node_mem = ($memory - $static_memory) / $sockets; + my $blocksize = get_virtiomem_block_size($conf, $static_memory); + + for (my 
$i = 0; $i < $sockets; $i++) { + + my $id = "virtiomem$i"; + my $pciaddr = print_pci_addr($id, $bridges, $arch, $machine_type); + my $mem_object = print_mem_object($conf, "mem-$id", $node_maxmem); + + push @$cmd, "-object" , $mem_object; + push @$devices, "-device", "virtio-mem-pci,block-size=${blocksize}M,requested-size=${node_mem}M,id=$id,memdev=mem-$id,node=$i$pciaddr"; + } + } elsif ($hotplug_features->{memory}) { foreach_dimm($conf, $vmid, $memory, $sockets, sub { my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_; @@ -485,20 +570,14 @@ sub hugepages_topology { my $defaults = PVE::QemuServer::load_defaults(); my $memory = $conf->{memory} || $defaults->{memory}; - my $static_memory = 0; + my $static_memory = get_static_mem($conf, $defaults); + my $sockets = 1; $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer iused $sockets = $conf->{sockets} if $conf->{sockets}; my $numa_custom_topology = undef; my $hotplug_features = PVE::QemuServer::parse_hotplug_features(defined($conf->{hotplug}) ? 
$conf->{hotplug} : '1'); - if ($hotplug_features->{memory}) { - $static_memory = $STATICMEM; - $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024); - } else { - $static_memory = $memory; - } - #custom numa topology for (my $i = 0; $i < $MAX_NUMA; $i++) { next if !$conf->{"numa$i"}; diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm index a18b974..0187c74 100644 --- a/PVE/QemuServer/PCI.pm +++ b/PVE/QemuServer/PCI.pm @@ -249,6 +249,14 @@ sub get_pci_addr_map { 'scsihw2' => { bus => 4, addr => 1 }, 'scsihw3' => { bus => 4, addr => 2 }, 'scsihw4' => { bus => 4, addr => 3 }, + 'virtiomem0' => { bus => 4, addr => 4 }, + 'virtiomem1' => { bus => 4, addr => 5 }, + 'virtiomem2' => { bus => 4, addr => 6 }, + 'virtiomem3' => { bus => 4, addr => 7 }, + 'virtiomem4' => { bus => 4, addr => 8 }, + 'virtiomem5' => { bus => 4, addr => 9 }, + 'virtiomem6' => { bus => 4, addr => 10 }, + 'virtiomem7' => { bus => 4, addr => 11 }, } if !defined($pci_addr_map); return $pci_addr_map; } -- 2.30.2