From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <d.csapak@proxmox.com>
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by lists.proxmox.com (Postfix) with ESMTPS id 3CFBF8B6E5
 for <pve-devel@lists.proxmox.com>; Thu, 25 Aug 2022 11:25:38 +0200 (CEST)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id A1D65233A3
 for <pve-devel@lists.proxmox.com>; Thu, 25 Aug 2022 11:25:06 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com
 [94.136.29.106])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (2048 bits))
 (No client certificate requested)
 by firstgate.proxmox.com (Proxmox) with ESMTPS
 for <pve-devel@lists.proxmox.com>; Thu, 25 Aug 2022 11:24:56 +0200 (CEST)
Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1])
 by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 3983643FC9
 for <pve-devel@lists.proxmox.com>; Thu, 25 Aug 2022 11:24:45 +0200 (CEST)
From: Dominik Csapak <d.csapak@proxmox.com>
To: pve-devel@lists.proxmox.com
Date: Thu, 25 Aug 2022 11:24:26 +0200
Message-Id: <20220825092440.1810328-18-d.csapak@proxmox.com>
X-Mailer: git-send-email 2.30.2
In-Reply-To: <20220825092440.1810328-1-d.csapak@proxmox.com>
References: <20220825092440.1810328-1-d.csapak@proxmox.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-SPAM-LEVEL: Spam detection results:  0
 AWL 0.094 Adjusted score from AWL reputation of From: address
 BAYES_00                 -1.9 Bayes spam probability is 0 to 1%
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE           0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS               -0.001 SPF: sender matches SPF record
 T_SCC_BODY_TEXT_LINE    -0.01 -
 URIBL_BLOCKED 0.001 ADMINISTRATOR NOTICE: The query to URIBL was blocked. See
 http://wiki.apache.org/spamassassin/DnsBlocklists#dnsbl-block for more
 information. [pci.pm, qemuserver.pm]
Subject: [pve-devel] [PATCH qemu-server v2 12/13] fix #3574: enable multi
 pci device mapping from config
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion <pve-devel.lists.proxmox.com>
List-Unsubscribe: <https://lists.proxmox.com/cgi-bin/mailman/options/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=unsubscribe>
List-Archive: <http://lists.proxmox.com/pipermail/pve-devel/>
List-Post: <mailto:pve-devel@lists.proxmox.com>
List-Help: <mailto:pve-devel-request@lists.proxmox.com?subject=help>
List-Subscribe: <https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel>, 
 <mailto:pve-devel-request@lists.proxmox.com?subject=subscribe>
X-List-Received-Date: Thu, 25 Aug 2022 09:25:38 -0000

The hardware config now supports multiple devices as a semicolon-separated
list. With this, instead of only having one device in a pci mapping,
we now have a list from which we can choose on vm start. This way one can
dynamically start vms with a pool of (identical) pci devices without
having to manually assign the proper ids.

For that we have to change the internal representation of a parsed
device, such that we have the separately configured paths in the mapping
in different lists (because multifunction devices still are interpreted
as single devices).

For mdev devices we now can also have multiple devices, where we simply
try to create the appropriate type on each until we either have one
created, or bail out.

Since we now have to reserve the pci ids in print_hostpci_devices, we
have to add a 'reserve' parameter to config_to_command (and chain it
through to reserve_pci_usage) so that a 'qm showcmd' does not actually
reserve any pci id (this would break when using that on running vms).
Additionally this also prevents the migration tests from failing
(they use vm_commandline which in turn uses config_to_command)

Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
 PVE/QemuServer.pm     | 43 ++++++++++++++++++++++++++++++-------
 PVE/QemuServer/PCI.pm | 49 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index 4551e2b..ce24b19 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -3507,8 +3507,9 @@ my sub should_disable_smm {
 
 sub config_to_command {
     my ($storecfg, $vmid, $conf, $defaults, $forcemachine, $forcecpu,
-        $pbs_backing) = @_;
+        $pbs_backing, $reserve) = @_;
 
+    $reserve //= 1;
     my $cmd = [];
     my ($globalFlags, $machineFlags, $rtcFlags) = ([], [], []);
     my $devices = [];
@@ -3724,7 +3725,7 @@ sub config_to_command {
 
     # host pci device passthrough
     my ($kvm_off, $gpu_passthrough, $legacy_igd, $pci_devices) = PVE::QemuServer::PCI::print_hostpci_devices(
-	$vmid, $conf, $devices, $vga, $winversion, $q35, $bridges, $arch, $machine_type, $bootorder);
+	$vmid, $conf, $devices, $vga, $winversion, $q35, $bridges, $arch, $machine_type, $bootorder, $reserve);
 
     # usb devices
     my $usb_dev_features = {};
@@ -5623,13 +5624,30 @@ sub vm_start_nolock {
 	my $uuid;
 	for my $id (sort keys %$pci_devices) {
 	    my $d = $pci_devices->{$id}->{device};
-	    for my $dev ($d->{pciid}->@*) {
-		my $info = PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $id, $d->{mdev});
 
-		# nvidia grid needs the uuid of the mdev as qemu parameter
-		if ($d->{mdev} && !defined($uuid) && $info->{vendor} eq '10de') {
-		    $uuid = PVE::QemuServer::PCI::generate_mdev_uuid($vmid, $id);
+	    # used pci devices for non-mdev
+	    if (!$d->{mdev}) {
+		for my $dev ($pci_devices->{$id}->{used}->@*) {
+		    PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $id);
 		}
+		next;
+	    }
+
+	    # try each configured pci device for mdevs
+	    my $devs = [map { $_->{id} } map { @$_ } $d->{ids}->@*]; # flatten ids
+
+	    my $info;
+	    for my $dev (@$devs) {
+		$info = eval { PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev, $id, $d->{mdev}) };
+		warn $@ if $@;
+		last if $info; # if successful, we're done
+	    }
+
+	    die "could not create mediated device\n" if !defined($info);
+
+	    # nvidia grid needs the uuid of the mdev as qemu parameter
+	    if (!defined($uuid) && $info->{vendor} eq '10de') {
+		$uuid = PVE::QemuServer::PCI::generate_mdev_uuid($vmid, $id);
 	    }
 	}
 	push @$cmd, '-uuid', $uuid if defined($uuid);
@@ -5862,7 +5880,16 @@ sub vm_commandline {
 
     my $defaults = load_defaults();
 
-    my $cmd = config_to_command($storecfg, $vmid, $conf, $defaults, $forcemachine, $forcecpu);
+    my $cmd = config_to_command(
+	$storecfg,
+	$vmid,
+	$conf,
+	$defaults,
+	$forcemachine,
+	$forcecpu,
+	undef,
+	0,
+    );
 
     return PVE::Tools::cmd2string($cmd);
 }
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index 8c171f3..df8d16a 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -386,6 +386,7 @@ sub parse_hostpci {
 
     my $res = PVE::JSONSchema::parse_property_string($hostpci_fmt, $value);
 
+    my $idlist = [];
     if ($res->{host} !~ m/:/) {
 	# we have no ordinary pci id, must be a mapping
 	my $device = PVE::HardwareMap::find_device_on_current_node('pci', $res->{host});
@@ -396,15 +397,27 @@ sub parse_hostpci {
 	if (my $err = $@) {
 	    die "PCI device mapping invalid (hardware probably changed): $err\n";
 	}
-	$res->{host} = $device->{path};
+	$idlist = [split(/;/, $device->{path})];
+	# if we have a list of mapped devices, we want to choose the first available one
+	$res->{choose} = 1 if scalar(@$idlist > 1);
+    } else {
+	$idlist = [split(/;/, $res->{host})];
     }
 
-    my @idlist = split(/;/, $res->{host});
     delete $res->{host};
-    foreach my $id (@idlist) {
+    my $ignore_mdev = !$res->{choose} && scalar(@$idlist) > 1;
+
+    $res->{ids} = [];
+    foreach my $id (@$idlist) {
 	my $devs = PVE::SysFSTools::lspci($id);
 	die "no PCI device found for '$id'\n" if !scalar(@$devs);
-	push @{$res->{pciid}}, @$devs;
+	$ignore_mdev = 1 if scalar(@$devs) > 1;
+	push @{$res->{ids}}, $devs;
+    }
+    # ignore mdev for multiple devices, except when from mapping
+    if ($res->{mdev} && $ignore_mdev) {
+	warn "ignoring mediated device with multifunction device\n";
+	delete $res->{mdev};
     }
     return $res;
 }
@@ -433,11 +446,13 @@ my $print_pci_device = sub {
 };
 
 sub print_hostpci_devices {
-    my ($vmid, $conf, $devices, $vga, $winversion, $q35, $bridges, $arch, $machine_type, $bootorder) = @_;
+    my ($vmid, $conf, $devices, $vga, $winversion, $q35, $bridges, $arch, $machine_type, $bootorder, $reserve) = @_;
 
+    $reserve //= 1;
     my $kvm_off = 0;
     my $gpu_passthrough = 0;
     my $legacy_igd = 0;
+    my $used_pci_ids = {};
     my $parsed_devices = {};
 
     my $pciaddr;
@@ -469,7 +484,24 @@ sub print_hostpci_devices {
 	    $pciaddr = print_pci_addr($pci_name, $bridges, $arch, $machine_type);
 	}
 
-	my $pcidevices = $d->{pciid};
+	# choose devices
+	my $pcidevices = [];
+	if (!$d->{mdev}) {
+	    for my $devs ($d->{ids}->@*) {
+		my $ids = [map { $_->{id} } @$devs];
+
+		if ($d->{choose}) {
+		    next if grep { defined($used_pci_ids->{$_}) } @$ids; # already used
+		    eval { reserve_pci_usage($ids, $vmid, 10, undef, $reserve) };
+		    next if $@;
+		}
+
+		map { $used_pci_ids->{$_} = 1 } @$ids;
+		push @$pcidevices, @$devs;
+		last if $d->{choose};
+	    }
+	    die "could not find a free device\n" if scalar(@$pcidevices) < 1;
+	}
 	$parsed_devices->{$i}->{used} = $pcidevices;
 	my $multifunction = @$pcidevices > 1;
 
@@ -599,8 +631,9 @@ sub remove_pci_reservation {
 }
 
 sub reserve_pci_usage {
-    my ($requested_ids, $vmid, $timeout, $pid) = @_;
+    my ($requested_ids, $vmid, $timeout, $pid, $reserve) = @_;
 
+    $reserve //= 1;
     $requested_ids = [ $requested_ids ] if !ref($requested_ids);
     return if !scalar(@$requested_ids); # do nothing for empty list
 
@@ -633,7 +666,7 @@ sub reserve_pci_usage {
 		$reservation_list->{$id}->{time} = $ctime + $timeout + 5;
 	    }
 	}
-	$write_pci_reservation_unlocked->($reservation_list);
+	$write_pci_reservation_unlocked->($reservation_list) if $reserve;
     });
     die $@ if $@;
 }
-- 
2.30.2