From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id A28401FF13C for ; Thu, 05 Mar 2026 10:16:54 +0100 (CET) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 3ACF21E6CB; Thu, 5 Mar 2026 10:17:48 +0100 (CET) From: Dominik Csapak To: pve-devel@lists.proxmox.com Subject: [PATCH qemu-server 1/2] pci: move mdev related code to own module Date: Thu, 5 Mar 2026 10:16:54 +0100 Message-ID: <20260305091711.1221589-11-d.csapak@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260305091711.1221589-1-d.csapak@proxmox.com> References: <20260305091711.1221589-1-d.csapak@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL -0.963 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment KAM_MAILER 2 Automated Mailer Tag Left in Email SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: Y7QVAQVW6P6YNT2SHRTDBPQZBEV7WF3G X-Message-ID-Hash: Y7QVAQVW6P6YNT2SHRTDBPQZBEV7WF3G X-MailFrom: d.csapak@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: some from PVE::QemuServer::PCI but also from PVE::SysFSTools, since it makes much more sense to have this here. 
Use the current PVE::File module instead of the legacy calls to PVE::Tools, and modernize the code with Perl's v5.36 signature syntax. While at it, change some old uses of SysFSTools' generate_mdev_uuid to the local one. Signed-off-by: Dominik Csapak --- src/PVE/QemuServer.pm | 5 +- src/PVE/QemuServer/Makefile | 1 + src/PVE/QemuServer/PCI.pm | 58 +++---------- src/PVE/QemuServer/PCI/Makefile | 9 ++ src/PVE/QemuServer/PCI/Mdev.pm | 145 ++++++++++++++++++++++++++++++++ 5 files changed, 171 insertions(+), 47 deletions(-) create mode 100644 src/PVE/QemuServer/PCI/Makefile create mode 100644 src/PVE/QemuServer/PCI/Mdev.pm diff --git a/src/PVE/QemuServer.pm b/src/PVE/QemuServer.pm index dbcd8841..b407a1ed 100644 --- a/src/PVE/QemuServer.pm +++ b/src/PVE/QemuServer.pm @@ -87,6 +87,7 @@ use PVE::QemuServer::Monitor qw(mon_cmd); use PVE::QemuServer::Network; use PVE::QemuServer::OVMF; use PVE::QemuServer::PCI qw(print_pci_addr print_pcie_addr print_pcie_root_port parse_hostpci); +use PVE::QemuServer::PCI::Mdev; use PVE::QemuServer::QemuImage; use PVE::QemuServer::QMPHelpers qw(qemu_deviceadd qemu_devicedel qemu_objectadd qemu_objectdel); use PVE::QemuServer::QSD; @@ -5665,7 +5666,7 @@ sub vm_start_nolock { my $smbios_conf = parse_smbios1($conf->{smbios1}); $uuid = $smbios_conf->{uuid} if defined($smbios_conf->{uuid}); } - $uuid = PVE::QemuServer::PCI::generate_mdev_uuid($vmid, $index) + $uuid = PVE::QemuServer::PCI::Mdev::generate_mdev_uuid($vmid, $index) if !defined($uuid); } } @@ -6095,7 +6096,7 @@ sub cleanup_pci_devices { foreach my $key (keys %$conf) { next if $key !~ m/^hostpci(\d+)$/; my $hostpciindex = $1; - my $uuid = PVE::SysFSTools::generate_mdev_uuid($vmid, $hostpciindex); + my $uuid = PVE::QemuServer::PCI::Mdev::generate_mdev_uuid($vmid, $hostpciindex); my $d = parse_hostpci($conf->{$key}); if ($d->{mdev}) { # NOTE: avoid PVE::SysFSTools::pci_cleanup_mdev_device as it requires PCI ID and we diff --git a/src/PVE/QemuServer/Makefile b/src/PVE/QemuServer/Makefile 
index 7e48c388..821556ef 100644 --- a/src/PVE/QemuServer/Makefile +++ b/src/PVE/QemuServer/Makefile @@ -35,3 +35,4 @@ SOURCES=Agent.pm \ install: $(SOURCES) for i in $(SOURCES); do install -D -m 0644 $$i $(DESTDIR)$(PERLDIR)/PVE/QemuServer/$$i; done $(MAKE) -C Cfg2Cmd install + $(MAKE) -C PCI install diff --git a/src/PVE/QemuServer/PCI.pm b/src/PVE/QemuServer/PCI.pm index c9cf8de0..0b67943c 100644 --- a/src/PVE/QemuServer/PCI.pm +++ b/src/PVE/QemuServer/PCI.pm @@ -12,6 +12,7 @@ use PVE::Tools; use PVE::QemuServer::Helpers; use PVE::QemuServer::Machine; +use PVE::QemuServer::PCI::Mdev; use base 'Exporter'; @@ -282,11 +283,6 @@ sub get_pci_addr_map { return $pci_addr_map; } -sub generate_mdev_uuid { - my ($vmid, $index) = @_; - return sprintf("%08d-0000-0000-0000-%012d", $index, $vmid); -} - my $get_addr_mapping_from_id = sub { my ($map, $id) = @_; @@ -543,41 +539,6 @@ sub parse_hostpci_devices { return $parsed_devices; } -# set vgpu type of a vf of an nvidia gpu with kernel 6.8 or newer -my sub create_nvidia_device { - my ($id, $model) = @_; - - $id = PVE::SysFSTools::normalize_pci_id($id); - - my $creation = "/sys/bus/pci/devices/$id/nvidia/current_vgpu_type"; - - die "no nvidia sysfs api for '$id'\n" if !-f $creation; - - my $current = PVE::Tools::file_read_firstline($creation); - if ($current ne "0") { - return 1 if $current eq $model; - # reset vgpu type so we can see all available and set the real device - die "unable to reset vgpu type for '$id'\n" if !PVE::SysFSTools::file_write($creation, "0"); - } - - my $types = PVE::SysFSTools::get_mdev_types($id); - my $selected; - for my $type_definition ($types->@*) { - next if $type_definition->{type} ne "nvidia-$model"; - $selected = $type_definition; - } - - if (!defined($selected) || $selected->{available} < 1) { - die "vgpu type '$model' not available for '$id'\n"; - } - - if (!PVE::SysFSTools::file_write($creation, $model)) { - die "could not set vgpu type to '$model' for '$id'\n"; - } - - return 1; -} - # takes 
the hash returned by parse_hostpci_devices and for all non mdev gpus, # selects one of the given alternatives by trying to reserve it # @@ -612,7 +573,10 @@ sub choose_hostpci_devices { $add_used_device->($device->{ids}); if ($device->{nvidia} && !$dry_run) { reserve_pci_usage($device->{ids}->[0]->{id}, $vmid, 10, undef); - create_nvidia_device($device->{ids}->[0]->{id}, $device->{nvidia}); + PVE::QemuServer::PCI::Mdev::create_nvidia_device( + $device->{ids}->[0]->{id}, + $device->{nvidia}, + ); } next; } @@ -628,7 +592,11 @@ sub choose_hostpci_devices { } if ($device->{nvidia} && !$dry_run) { - eval { create_nvidia_device($ids->[0], $device->{nvidia}) }; + eval { + PVE::QemuServer::PCI::Mdev::create_nvidia_device( + $ids->[0], $device->{nvidia}, + ); + }; if (my $err = $@) { warn $err; remove_pci_reservation($vmid, $ids); @@ -696,7 +664,7 @@ sub print_hostpci_devices { my $sysfspath; if ($d->{mdev}) { - my $uuid = generate_mdev_uuid($vmid, $i); + my $uuid = PVE::QemuServer::PCI::Mdev::generate_mdev_uuid($vmid, $i); $sysfspath = "/sys/bus/mdev/devices/$uuid"; } @@ -748,8 +716,8 @@ sub prepare_pci_device { if ($device->{nvidia} || $driver eq "keep") { # nothing to do } elsif (my $mdev = $device->{mdev}) { - my $uuid = generate_mdev_uuid($vmid, $index); - PVE::SysFSTools::pci_create_mdev_device($pciid, $uuid, $mdev); + my $uuid = PVE::QemuServer::PCI::Mdev::generate_mdev_uuid($vmid, $index); + PVE::QemuServer::PCI::Mdev::pci_create_mdev_device($pciid, $uuid, $mdev); } else { die "can't unbind/bind PCI group to VFIO '$pciid'\n" if !PVE::SysFSTools::pci_dev_group_bind_to_vfio($pciid); diff --git a/src/PVE/QemuServer/PCI/Makefile b/src/PVE/QemuServer/PCI/Makefile new file mode 100644 index 00000000..ecf37411 --- /dev/null +++ b/src/PVE/QemuServer/PCI/Makefile @@ -0,0 +1,9 @@ +DESTDIR= +PREFIX=/usr +PERLDIR=$(PREFIX)/share/perl5 + +SOURCES=Mdev.pm + +.PHONY: install +install: $(SOURCES) + for i in $(SOURCES); do install -D -m 0644 $$i 
$(DESTDIR)$(PERLDIR)/PVE/QemuServer/PCI/$$i; done diff --git a/src/PVE/QemuServer/PCI/Mdev.pm b/src/PVE/QemuServer/PCI/Mdev.pm new file mode 100644 index 00000000..3b42ce2d --- /dev/null +++ b/src/PVE/QemuServer/PCI/Mdev.pm @@ -0,0 +1,145 @@ +package PVE::QemuServer::PCI::Mdev; + +use v5.36; + +use PVE::SysFSTools; +use PVE::File qw(file_read_first_line dir_glob_foreach file_get_contents); + +my $pcisysfs = "/sys/bus/pci"; + +sub generate_mdev_uuid($vmid, $index) { + return sprintf("%08d-0000-0000-0000-%012d", $index, $vmid); +} + +# +# return format: +# [ +# { +# type => 'FooType_1', +# description => "a longer description with custom format\nand newlines", +# available => 5, +# }, +# ... +# ] +# +sub get_mdev_types($id) { + $id = PVE::SysFSTools::normalize_pci_id($id); + + my $types = []; + + my $dev_path = "$pcisysfs/devices/$id"; + my $mdev_path = "$dev_path/mdev_supported_types"; + my $nvidia_path = "$dev_path/nvidia/creatable_vgpu_types"; + if (-d $mdev_path) { + dir_glob_foreach( + $mdev_path, + '[^\.].*', + sub { + my ($type) = @_; + + my $type_path = "$mdev_path/$type"; + + my $available = int(file_read_first_line("$type_path/available_instances")); + my $description = file_get_contents("$type_path/description"); + + my $entry = { + type => $type, + description => $description, + available => $available, + }; + + my $name = file_read_first_line("$type_path/name"); + $entry->{name} = $name if defined($name); + + push @$types, $entry; + }, + ); + } elsif (-f $nvidia_path) { + my $creatable = PVE::Tools::file_get_contents($nvidia_path); + for my $line (split("\n", $creatable)) { + next if $line =~ m/^ID/; # header + next if $line !~ m/^(.*?)\s*:\s*(.*)$/; + my $id = $1; + my $name = $2; + + push $types->@*, { + type => "nvidia-$id", # backwards compatibility + description => "", # TODO, read from xml/nvidia-smi ? 
+ available => 1, + name => $name, + }; + } + } + + return $types; +} + +sub pci_create_mdev_device($pciid, $uuid, $type) { + $pciid = PVE::SysFSTools::normalize_pci_id($pciid); + + my $basedir = "$pcisysfs/devices/$pciid"; + my $mdev_dir = "$basedir/mdev_supported_types"; + + die "pci device '$pciid' does not support mediated devices \n" + if !-d $mdev_dir; + + die "pci device '$pciid' has no type '$type'\n" + if !-d "$mdev_dir/$type"; + + if (-d "$basedir/$uuid") { + # it already exists, checking type + my $typelink = readlink("$basedir/$uuid/mdev_type"); + my ($existingtype) = $typelink =~ m|/([^/]+)$|; + die "mdev instance '$uuid' already exists, but type is not '$type'\n" + if $type ne $existingtype; + + # instance exists, so use it but warn the user + warn "mdev instance '$uuid' already existed, using it.\n"; + return undef; + } + + my $instances = file_read_first_line("$mdev_dir/$type/available_instances"); + my ($avail) = $instances =~ m/^(\d+)$/; + die "pci device '$pciid' has no available instances of '$type'\n" + if $avail < 1; + + die "could not create '$type' for pci devices '$pciid'\n" + if !PVE::SysFSTools::file_write("$mdev_dir/$type/create", $uuid); + + return undef; +} + +# set vgpu type of a vf of an nvidia gpu with kernel 6.8 or newer +sub create_nvidia_device($id, $model) { + $id = PVE::SysFSTools::normalize_pci_id($id); + + my $creation = "$pcisysfs/devices/$id/nvidia/current_vgpu_type"; + + die "no nvidia sysfs api for '$id'\n" if !-f $creation; + + my $current = file_read_first_line($creation); + if ($current ne "0") { + return 1 if $current eq $model; + # reset vgpu type so we can see all available and set the real device + die "unable to reset vgpu type for '$id'\n" if !PVE::SysFSTools::file_write($creation, "0"); + } + + my $types = get_mdev_types($id); + my $selected; + for my $type_definition ($types->@*) { + next if $type_definition->{type} ne "nvidia-$model"; + $selected = $type_definition; + } + + if (!defined($selected) || 
$selected->{available} < 1) { + die "vgpu type '$model' not available for '$id'\n"; + } + + if (!PVE::SysFSTools::file_write($creation, $model)) { + die "could not set vgpu type to '$model' for '$id'\n"; + } + + return 1; +} + +1; -- 2.47.3