* [pve-devel] [PATCH qemu-server v3 1/3] pci: refactor pci device preparation
2021-10-07 13:45 [pve-devel] [PATCH qemu-server v3 0/3] fix #3258: check for in-use pci devices on vm start Dominik Csapak
@ 2021-10-07 13:45 ` Dominik Csapak
2021-10-11 6:49 ` [pve-devel] applied: " Thomas Lamprecht
2021-10-07 13:45 ` [pve-devel] [PATCH qemu-server v3 2/3] pci: add helpers to (un)reserve pciids for a vm Dominik Csapak
2021-10-07 13:45 ` [pve-devel] [PATCH qemu-server v3 3/3] fix #3258: block vm start when pci device is already in use Dominik Csapak
2 siblings, 1 reply; 7+ messages in thread
From: Dominik Csapak @ 2021-10-07 13:45 UTC (permalink / raw)
To: pve-devel
makes the vm start a bit less crowded
Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
PVE/QemuServer.pm | 14 +-------------
PVE/QemuServer/PCI.pm | 20 ++++++++++++++++++++
2 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index e5175b3..f78b2cc 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -5389,19 +5389,7 @@ sub vm_start_nolock {
foreach my $pcidevice (@$pcidevices) {
my $pciid = $pcidevice->{id};
- my $info = PVE::SysFSTools::pci_device_info("$pciid");
- die "IOMMU not present\n" if !PVE::SysFSTools::check_iommu_support();
- die "no pci device info for device '$pciid'\n" if !$info;
-
- if ($d->{mdev}) {
- my $uuid = PVE::SysFSTools::generate_mdev_uuid($vmid, $i);
- PVE::SysFSTools::pci_create_mdev_device($pciid, $uuid, $d->{mdev});
- } else {
- die "can't unbind/bind PCI group to VFIO '$pciid'\n"
- if !PVE::SysFSTools::pci_dev_group_bind_to_vfio($pciid);
- die "can't reset PCI device '$pciid'\n"
- if $info->{has_fl_reset} && !PVE::SysFSTools::pci_dev_reset($info);
- }
+ PVE::QemuServer::PCI::prepare_pci_device($vmid, $pciid, $i, $d->{mdev});
}
}
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index 5608207..b94a99a 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -461,4 +461,24 @@ sub print_hostpci_devices {
return ($kvm_off, $gpu_passthrough, $legacy_igd);
}
+sub prepare_pci_device {
+ my ($vmid, $pciid, $confslot, $mdev) = @_;
+
+ my $info = PVE::SysFSTools::pci_device_info("$pciid");
+ die "IOMMU not present\n" if !PVE::SysFSTools::check_iommu_support();
+ die "no pci device info for device '$pciid'\n" if !$info;
+
+ if ($mdev) {
+ my $uuid = PVE::SysFSTools::generate_mdev_uuid($vmid, $confslot);
+ PVE::SysFSTools::pci_create_mdev_device($pciid, $uuid, $mdev);
+ } else {
+ die "can't unbind/bind PCI group to VFIO '$pciid'\n"
+ if !PVE::SysFSTools::pci_dev_group_bind_to_vfio($pciid);
+ die "can't reset PCI device '$pciid'\n"
+ if $info->{has_fl_reset} && !PVE::SysFSTools::pci_dev_reset($info);
+ }
+
+ return;
+}
+
1;
--
2.30.2
^ permalink raw reply [flat|nested] 7+ messages in thread
* [pve-devel] [PATCH qemu-server v3 2/3] pci: add helpers to (un)reserve pciids for a vm
2021-10-07 13:45 [pve-devel] [PATCH qemu-server v3 0/3] fix #3258: check for in-use pci devices on vm start Dominik Csapak
2021-10-07 13:45 ` [pve-devel] [PATCH qemu-server v3 1/3] pci: refactor pci device preparation Dominik Csapak
@ 2021-10-07 13:45 ` Dominik Csapak
2021-10-15 17:59 ` [pve-devel] applied: " Thomas Lamprecht
2021-10-07 13:45 ` [pve-devel] [PATCH qemu-server v3 3/3] fix #3258: block vm start when pci device is already in use Dominik Csapak
2 siblings, 1 reply; 7+ messages in thread
From: Dominik Csapak @ 2021-10-07 13:45 UTC (permalink / raw)
To: pve-devel
saves a list of pciid <-> vmid mappings in /var/run
that we can check when we start a vm
if we're not given a pid but a timeout, we save the time when the
reservation will run out (current time + timeout + 5s) since each
vm start (until we can save the pid) varies from config to config
reserve_pci_usage and remove_pci_reservation always expect a list of ids
so that we can update the reservation for a vm all at once
Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
PVE/QemuServer/PCI.pm | 103 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 103 insertions(+)
diff --git a/PVE/QemuServer/PCI.pm b/PVE/QemuServer/PCI.pm
index b94a99a..102d509 100644
--- a/PVE/QemuServer/PCI.pm
+++ b/PVE/QemuServer/PCI.pm
@@ -5,6 +5,7 @@ use strict;
use PVE::JSONSchema;
use PVE::SysFSTools;
+use PVE::Tools;
use base 'Exporter';
@@ -481,4 +482,106 @@ sub prepare_pci_device {
return;
}
+my $PCIID_RESERVATION_FILE = "/var/run/pve-reserved-pciids";
+my $PCIID_RESERVATION_LCK = "/var/lock/pve-reserved-pciids.lck";
+
+my $parse_pci_reservation = sub {
+ my $pciids = {};
+
+ if (my $fh = IO::File->new ($PCIID_RESERVATION_FILE, "r")) {
+ while (my $line = <$fh>) {
+ if ($line =~ m/^($PCIRE)\s(\d+)\s(time|pid)\:(\d+)$/) {
+ $pciids->{$1} = {
+ vmid => $2,
+ "$3" => $4,
+ };
+ }
+ }
+ }
+
+ return $pciids;
+};
+
+my $write_pci_reservation = sub {
+ my ($pciids) = @_;
+
+ my $data = "";
+ foreach my $p (keys %$pciids) {
+ my $entry = $pciids->{$p};
+ if (defined($entry->{pid})) {
+ $data .= "$p $entry->{vmid} pid:$entry->{pid}\n";
+ } else {
+ $data .= "$p $entry->{vmid} time:$entry->{time}\n";
+ }
+ }
+
+ PVE::Tools::file_set_contents($PCIID_RESERVATION_FILE, $data);
+};
+
+sub remove_pci_reservation {
+ my ($ids) = @_;
+
+ return if !scalar(@$ids); # do nothing for empty list
+
+ my $code = sub {
+ my $pciids = $parse_pci_reservation->();
+
+ for my $id (@$ids) {
+ delete $pciids->{$id};
+ }
+
+ $write_pci_reservation->($pciids);
+ };
+
+ PVE::Tools::lock_file($PCIID_RESERVATION_LCK, 10, $code);
+ die $@ if $@;
+
+ return;
+}
+
+sub reserve_pci_usage {
+ my ($ids, $vmid, $timeout, $pid) = @_;
+
+ return if !scalar(@$ids); # do nothing for empty list
+
+ PVE::Tools::lock_file($PCIID_RESERVATION_LCK, 10, sub {
+
+ my $ctime = time();
+ my $pciids = $parse_pci_reservation->();
+
+ for my $id (@$ids) {
+ my $errmsg = "PCI device '$id' already in use\n";
+ if (my $pciid = $pciids->{$id}) {
+ if ($pciid->{vmid} != $vmid) {
+ # check time based reservation
+ die $errmsg if defined($pciid->{time}) && $pciid->{time} > $ctime;
+
+ # check running vm
+ my $pid = PVE::QemuServer::Helpers::vm_running_locally($pciid->{vmid});
+ if (my $oldpid = $pciid->{pid}) {
+ die $errmsg if defined($pid) && $pid == $oldpid;
+ warn "leftover PCI reservation found for $id, ignoring...\n"
+ if !defined($pid) || $pid != $oldpid;
+ }
+ }
+ }
+
+ $pciids->{$id} = {
+ vmid => $vmid,
+ };
+ if (defined($timeout)) {
+ # we save the time when the reservation gets invalid (+5s),
+ # since the start timeout depends on the config
+ $pciids->{$id}->{time} = $ctime + $timeout + 5;
+ }
+ $pciids->{$id}->{pid} = $pid if defined($pid);
+ }
+
+ $write_pci_reservation->($pciids);
+
+ return;
+ });
+ die $@ if $@;
+}
+
1;
--
2.30.2
^ permalink raw reply [flat|nested] 7+ messages in thread
* [pve-devel] [PATCH qemu-server v3 3/3] fix #3258: block vm start when pci device is already in use
2021-10-07 13:45 [pve-devel] [PATCH qemu-server v3 0/3] fix #3258: check for in-use pci devices on vm start Dominik Csapak
2021-10-07 13:45 ` [pve-devel] [PATCH qemu-server v3 1/3] pci: refactor pci device preparation Dominik Csapak
2021-10-07 13:45 ` [pve-devel] [PATCH qemu-server v3 2/3] pci: add helpers to (un)reserve pciids for a vm Dominik Csapak
@ 2021-10-07 13:45 ` Dominik Csapak
2021-10-15 17:59 ` [pve-devel] applied: " Thomas Lamprecht
2 siblings, 1 reply; 7+ messages in thread
From: Dominik Csapak @ 2021-10-07 13:45 UTC (permalink / raw)
To: pve-devel
on vm start, we reserve all pciids that we use, and
remove the reservation again in vm_stop_cleanup
first with only a time-based reservation but after the vm is started,
we reserve again but with the pid.
for this, we have to move the start_timeout calculation above the
hostpci handling.
also moved the pci initialization out of the conf parsing loop
so that we can reserve all ids before we actually touch any of them
while touching the lines, fix the indentation
this way, when a vm starts with a pci device that is already configured
for a different running vm, will not be started and the user gets
the error that the device is already in use
Signed-off-by: Dominik Csapak <d.csapak@proxmox.com>
---
PVE/QemuServer.pm | 50 +++++++++++++++++++++++++++++++++++++++--------
1 file changed, 42 insertions(+), 8 deletions(-)
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index f78b2cc..e504e9a 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -5381,16 +5381,40 @@ sub vm_start_nolock {
push @$cmd, '-S';
}
+ my $start_timeout = $params->{timeout} // config_aware_timeout($conf, $resume);
+ my $pciids = [];
+ my $pci_devices = {};
+
# host pci devices
for (my $i = 0; $i < $PVE::QemuServer::PCI::MAX_HOSTPCI_DEVICES; $i++) {
- my $d = parse_hostpci($conf->{"hostpci$i"});
- next if !$d;
- my $pcidevices = $d->{pciid};
- foreach my $pcidevice (@$pcidevices) {
- my $pciid = $pcidevice->{id};
+ my $d = parse_hostpci($conf->{"hostpci$i"});
+ next if !$d;
+ $pci_devices->{$i} = $d;
- PVE::QemuServer::PCI::prepare_pci_device($vmid, $pciid, $i, $d->{mdev});
- }
+ my $pcidevices = $d->{pciid};
+
+ my $ids = [map { $_->{id} } @$pcidevices];
+ push @$pciids, @$ids;
+ }
+
+ # reserve all pci ids before actually doing anything with them
+ PVE::QemuServer::PCI::reserve_pci_usage($pciids, $vmid, $start_timeout);
+
+ eval {
+ for my $i (sort keys %$pci_devices) {
+ my $d = $pci_devices->{$i};
+ my $pcidevices = $d->{pciid};
+ foreach my $pcidevice (@$pcidevices) {
+ my $pciid = $pcidevice->{id};
+ PVE::QemuServer::PCI::prepare_pci_device($vmid, $pciid, $i, $d->{mdev});
+ }
+ }
+ };
+
+ if (my $err = $@) {
+ eval { PVE::QemuServer::PCI::remove_pci_reservation($pciids) };
+ warn $@ if $@;
+ die $err;
}
PVE::Storage::activate_volumes($storecfg, $vollist);
@@ -5405,7 +5429,6 @@ sub vm_start_nolock {
my $cpuunits = get_cpuunits($conf);
- my $start_timeout = $params->{timeout} // config_aware_timeout($conf, $resume);
my %run_params = (
timeout => $statefile ? undef : $start_timeout,
umask => 0077,
@@ -5485,9 +5508,17 @@ sub vm_start_nolock {
if (my $err = $@) {
# deactivate volumes if start fails
eval { PVE::Storage::deactivate_volumes($storecfg, $vollist); };
+ eval { PVE::QemuServer::PCI::remove_pci_reservation($pciids) };
+
die "start failed: $err";
}
+ # reserve all pciids again with the pid
+ # the vm is already started, we can only warn on error here
+ my $pid = PVE::QemuServer::Helpers::vm_running_locally($vmid);
+ eval { PVE::QemuServer::PCI::reserve_pci_usage($pciids, $vmid, undef, $pid) };
+ warn $@ if $@;
+
print "migration listens on $migrate_uri\n" if $migrate_uri;
$res->{migrate_uri} = $migrate_uri;
@@ -5676,6 +5707,7 @@ sub vm_stop_cleanup {
unlink '/dev/shm/pve-shm-' . ($ivshmem->{name} // $vmid);
}
+ my $ids = [];
foreach my $key (keys %$conf) {
next if $key !~ m/^hostpci(\d+)$/;
my $hostpciindex = $1;
@@ -5684,9 +5716,11 @@ sub vm_stop_cleanup {
foreach my $pci (@{$d->{pciid}}) {
my $pciid = $pci->{id};
+ push @$ids, $pci->{id};
PVE::SysFSTools::pci_cleanup_mdev_device($pciid, $uuid);
}
}
+ PVE::QemuServer::PCI::remove_pci_reservation($ids);
vmconfig_apply_pending($vmid, $conf, $storecfg) if $apply_pending_changes;
};
--
2.30.2
^ permalink raw reply [flat|nested] 7+ messages in thread