* [pve-devel] [PATCH many v2] Add container device passthrough
@ 2023-10-24 12:55 Filip Schauer
2023-10-24 12:55 ` [pve-devel] [PATCH v2 container 1/1] Add " Filip Schauer
2023-10-24 12:55 ` [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device Filip Schauer
0 siblings, 2 replies; 9+ messages in thread
From: Filip Schauer @ 2023-10-24 12:55 UTC (permalink / raw)
To: pve-devel
Changes since v2:
* mknod the devices in /var/lib/lxc/$vmid/passthrough and setup proper
permissions instead of bind mounting the devices from /dev directly
* Add support for USB mapping
* Add foreach_passthrough_device helper function
pve-container:
Filip Schauer (1):
Add device passthrough
src/PVE/LXC.pm | 34 +++++++++++++++++++++++-
src/PVE/LXC/Config.pm | 60 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 1 deletion(-)
pve-guest-common:
Filip Schauer (1):
Add foreach_passthrough_device
src/PVE/AbstractConfig.pm | 44 +++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
^ permalink raw reply [flat|nested] 9+ messages in thread
* [pve-devel] [PATCH v2 container 1/1] Add device passthrough
2023-10-24 12:55 [pve-devel] [PATCH many v2] Add container device passthrough Filip Schauer
@ 2023-10-24 12:55 ` Filip Schauer
2023-10-30 13:34 ` Wolfgang Bumiller
2023-10-24 12:55 ` [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device Filip Schauer
1 sibling, 1 reply; 9+ messages in thread
From: Filip Schauer @ 2023-10-24 12:55 UTC (permalink / raw)
To: pve-devel
Add a dev[n] argument to the container config to pass devices through to
a container. A device can be passed by its path. Alternatively a mapped
USB device can be passed through with usbmapping=<name>.
Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
src/PVE/LXC.pm | 34 +++++++++++++++++++++++-
src/PVE/LXC/Config.pm | 60 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 1 deletion(-)
diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index c9b5ba7..a3ddb62 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -5,7 +5,8 @@ use warnings;
use Cwd qw();
use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
+use File::Basename;
use File::Path;
use File::Spec;
use IO::Poll qw(POLLIN POLLHUP);
@@ -639,6 +640,37 @@ sub update_lxc_config {
$raw .= "lxc.mount.auto = sys:mixed\n";
}
+ # Clear passthrough directory from previous run
+ my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
+ File::Path::rmtree($passthrough_dir);
+
+ PVE::LXC::Config->foreach_passthrough_device($conf, sub {
+ my ($key, $sanitized_path) = @_;
+
+ my $absolute_path = "/$sanitized_path";
+ my ($mode, $rdev) = (stat($absolute_path))[2, 6];
+ die "Could not find major and minor ids of device $absolute_path.\n"
+ unless ($mode && $rdev);
+
+ my $major = PVE::Tools::dev_t_major($rdev);
+ my $minor = PVE::Tools::dev_t_minor($rdev);
+ my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
+ my $passthrough_device_path = "$passthrough_dir/$sanitized_path";
+ File::Path::make_path(dirname($passthrough_device_path));
+ PVE::Tools::run_command([
+ '/usr/bin/mknod',
+ '-m', '0660',
+ $passthrough_device_path,
+ $device_type_char,
+ $major,
+ $minor
+ ]);
+ chown 100000, 100000, $passthrough_device_path if ($unprivileged);
+
+ $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
+ $raw .= "lxc.mount.entry = $passthrough_device_path $sanitized_path none bind,create=file\n";
+ });
+
# WARNING: DO NOT REMOVE this without making sure that loop device nodes
# cannot be exposed to the container with r/w access (cgroup perms).
# When this is enabled mounts will still remain in the monitor's namespace
diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index 56e1f10..edd813e 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -29,6 +29,7 @@ mkdir $lockdir;
mkdir "/etc/pve/nodes/$nodename/lxc";
my $MAX_MOUNT_POINTS = 256;
my $MAX_UNUSED_DISKS = $MAX_MOUNT_POINTS;
+my $MAX_DEVICES = 256;
# BEGIN implemented abstract methods from PVE::AbstractConfig
@@ -908,6 +909,49 @@ for (my $i = 0; $i < $MAX_UNUSED_DISKS; $i++) {
}
}
+PVE::JSONSchema::register_format('pve-lxc-dev-string', \&verify_lxc_dev_string);
+sub verify_lxc_dev_string {
+ my ($dev, $noerr) = @_;
+
+ if (
+ $dev =~ m@/\.\.?/@ ||
+ $dev =~ m@/\.\.?$@ ||
+ $dev !~ m!^/dev/!
+ ) {
+ return undef if $noerr;
+ die "$dev is not a valid device path\n";
+ }
+
+ return $dev;
+}
+
+my $dev_desc = {
+ path => {
+ optional => 1,
+ type => 'string',
+ default_key => 1,
+ format => 'pve-lxc-dev-string',
+ format_description => 'Path',
+ description => 'Device to pass through to the container',
+ verbose_description => 'Path to the device to pass through to the container'
+ },
+ usbmapping => {
+ optional => 1,
+ type => 'string',
+ format => 'pve-configid',
+ format_description => 'mapping-id',
+ description => 'The ID of a cluster wide USB mapping.'
+ }
+};
+
+for (my $i = 0; $i < $MAX_DEVICES; $i++) {
+ $confdesc->{"dev$i"} = {
+ optional => 1,
+ type => 'string', format => $dev_desc,
+ description => "Device to pass through to the container",
+ }
+}
+
sub parse_pct_config {
my ($filename, $raw, $strict) = @_;
@@ -1255,6 +1299,22 @@ sub parse_volume {
return;
}
+sub parse_device {
+ my ($class, $device_string, $noerr) = @_;
+
+ my $res;
+ eval { $res = PVE::JSONSchema::parse_property_string($dev_desc, $device_string) };
+ if ($@) {
+ return undef if $noerr;
+ die $@;
+ }
+
+ die "Either path or usbmapping has to be defined"
+ unless (defined($res->{path}) || defined($res->{usbmapping}));
+
+ return $res;
+}
+
sub print_volume {
my ($class, $key, $volume) = @_;
--
2.39.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device
2023-10-24 12:55 [pve-devel] [PATCH many v2] Add container device passthrough Filip Schauer
2023-10-24 12:55 ` [pve-devel] [PATCH v2 container 1/1] Add " Filip Schauer
@ 2023-10-24 12:55 ` Filip Schauer
2023-10-30 13:12 ` Wolfgang Bumiller
1 sibling, 1 reply; 9+ messages in thread
From: Filip Schauer @ 2023-10-24 12:55 UTC (permalink / raw)
To: pve-devel
Add a function to iterate over passthrough devices of a provided
container config.
Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
src/PVE/AbstractConfig.pm | 44 +++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/src/PVE/AbstractConfig.pm b/src/PVE/AbstractConfig.pm
index a286b13..ed26e91 100644
--- a/src/PVE/AbstractConfig.pm
+++ b/src/PVE/AbstractConfig.pm
@@ -484,6 +484,50 @@ sub foreach_volume {
return $class->foreach_volume_full($conf, undef, $func, @param);
}
+sub foreach_passthrough_device {
+ my ($class, $conf, $func, @param) = @_;
+
+ foreach my $key (keys %$conf) {
+ next if $key !~ m/^dev(\d+)$/;
+
+ my $device = $class->parse_device($conf->{$key});
+
+ if (defined($device->{path})) {
+ die "Device path $device->{path} does not start with /dev/\n"
+ if $device->{path} !~ m!^/dev/!;
+
+ my $sanitized_path = substr($conf->{$key}, 1);
+ die "Device /$sanitized_path does not exist\n" unless (-e "/$sanitized_path");
+
+ $func->($key, $sanitized_path, @param);
+ } elsif (defined($device->{usbmapping})) {
+ my $mapping = $device->{usbmapping};
+ my $map_devices = PVE::Mapping::USB::find_on_current_node($mapping);
+
+ die "USB device mapping not found for '$mapping'\n"
+ if !$map_devices || !scalar($map_devices->@*);
+
+ die "More than one USB mapping per host not supported\n"
+ if scalar($map_devices->@*) > 1;
+
+ eval {
+ PVE::Mapping::USB::assert_valid($mapping, $map_devices->[0]);
+ };
+ if (my $err = $@) {
+ die "USB Mapping invalid (hardware probably changed): $err\n";
+ }
+
+ my $map_device = $map_devices->[0];
+ my $lsusb_output = `/usr/bin/lsusb -d $map_device->{id}`;
+ my ($bus_id, $device_id) = $lsusb_output =~ /Bus (\d+) Device (\d+)/;
+
+ $func->($key, "dev/bus/usb/$bus_id/$device_id", @param);
+ } else {
+ die "Either path or usbmapping has to be defined for $key";
+ }
+ }
+}
+
# $volume_map is a hash of 'old_volid' => 'new_volid' pairs.
# This method replaces 'old_volid' by 'new_volid' throughout the config including snapshots, pending
# changes, unused volumes and vmstate volumes.
--
2.39.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device
2023-10-24 12:55 ` [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device Filip Schauer
@ 2023-10-30 13:12 ` Wolfgang Bumiller
2023-10-31 9:00 ` Thomas Lamprecht
0 siblings, 1 reply; 9+ messages in thread
From: Wolfgang Bumiller @ 2023-10-30 13:12 UTC (permalink / raw)
To: Filip Schauer; +Cc: pve-devel
On Tue, Oct 24, 2023 at 02:55:54PM +0200, Filip Schauer wrote:
> Add a function to iterate over passthrough devices of a provided
> container config.
As container specific code this should be in pve-container.
>
> Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
> ---
> src/PVE/AbstractConfig.pm | 44 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 44 insertions(+)
>
> diff --git a/src/PVE/AbstractConfig.pm b/src/PVE/AbstractConfig.pm
> index a286b13..ed26e91 100644
> --- a/src/PVE/AbstractConfig.pm
> +++ b/src/PVE/AbstractConfig.pm
> @@ -484,6 +484,50 @@ sub foreach_volume {
> return $class->foreach_volume_full($conf, undef, $func, @param);
> }
>
> +sub foreach_passthrough_device {
> + my ($class, $conf, $func, @param) = @_;
> +
> + foreach my $key (keys %$conf) {
> + next if $key !~ m/^dev(\d+)$/;
> +
> + my $device = $class->parse_device($conf->{$key});
(`parse_device` does not exist in AbstractConfig)
> +
> + if (defined($device->{path})) {
> + die "Device path $device->{path} does not start with /dev/\n"
> + if $device->{path} !~ m!^/dev/!;
IMO `parse_device()` should be responsible for the above check, and
`verify_lxc_dev_string()` already does this, so the above check can just
be dropped.
> +
> + my $sanitized_path = substr($conf->{$key}, 1);
> + die "Device /$sanitized_path does not exist\n" unless (-e "/$sanitized_path");
Since we now have a property string and `parse_device()`, it would make
more sense to pass `$device` rather than this path.
In fact, thinking about this more I'm not sure passing the substring
through was the best idea, since it's rather specific to how it'll end
up in the lxc config. That's my bad.
> +
> + $func->($key, $sanitized_path, @param);
> + } elsif (defined($device->{usbmapping})) {
> + my $mapping = $device->{usbmapping};
> + my $map_devices = PVE::Mapping::USB::find_on_current_node($mapping);
> +
> + die "USB device mapping not found for '$mapping'\n"
> + if !$map_devices || !scalar($map_devices->@*);
> +
> + die "More than one USB mapping per host not supported\n"
> + if scalar($map_devices->@*) > 1;
> +
> + eval {
> + PVE::Mapping::USB::assert_valid($mapping, $map_devices->[0]);
> + };
> + if (my $err = $@) {
> + die "USB Mapping invalid (hardware probably changed): $err\n";
> + }
> +
> + my $map_device = $map_devices->[0];
> + my $lsusb_output = `/usr/bin/lsusb -d $map_device->{id}`;
> + my ($bus_id, $device_id) = $lsusb_output =~ /Bus (\d+) Device (\d+)/;
^ qx/backticks should be avoided, it'll be interpreted by a shell
Also, we don't need this :-)
See `PVE::SysFSTools::scan_usb()` which does essentially the same thing
without invoking an external binary.
> +
> + $func->($key, "dev/bus/usb/$bus_id/$device_id", @param);
So this will do... something :-)
But unfortunately it won't deal with the *interesting* bits.
So while I do like the idea of having such mappings, for it to make
sense we'd also need to figure out the device specific nodes.
Eg. for input devices we'd want to include `/dev/input/eventXY`, for eg.
FIDO keys we'd want `/dev/hidraw*` devices.
I'm not sure how much work we should put into this in the initial
implementation, the question is mainly whether we want to do it like
*this* in the first place and how much we plan to support.
Or maybe it would make more sense to have specific entries for hidraw,
event devices, block devices, ... instead?
I'm not sure.
> + } else {
> + die "Either path or usbmapping has to be defined for $key";
> + }
> + }
> +}
> +
> # $volume_map is a hash of 'old_volid' => 'new_volid' pairs.
> # This method replaces 'old_volid' by 'new_volid' throughout the config including snapshots, pending
> # changes, unused volumes and vmstate volumes.
> --
> 2.39.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [pve-devel] [PATCH v2 container 1/1] Add device passthrough
2023-10-24 12:55 ` [pve-devel] [PATCH v2 container 1/1] Add " Filip Schauer
@ 2023-10-30 13:34 ` Wolfgang Bumiller
2023-11-02 14:28 ` Filip Schauer
0 siblings, 1 reply; 9+ messages in thread
From: Wolfgang Bumiller @ 2023-10-30 13:34 UTC (permalink / raw)
To: Filip Schauer; +Cc: pve-devel
On Tue, Oct 24, 2023 at 02:55:53PM +0200, Filip Schauer wrote:
> Add a dev[n] argument to the container config to pass devices through to
> a container. A device can be passed by its path. Alternatively a mapped
> USB device can be passed through with usbmapping=<name>.
>
> Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
> ---
> src/PVE/LXC.pm | 34 +++++++++++++++++++++++-
> src/PVE/LXC/Config.pm | 60 +++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 93 insertions(+), 1 deletion(-)
>
> diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
> index c9b5ba7..a3ddb62 100644
> --- a/src/PVE/LXC.pm
> +++ b/src/PVE/LXC.pm
> @@ -5,7 +5,8 @@ use warnings;
>
> use Cwd qw();
> use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
> -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
> +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
> +use File::Basename;
> use File::Path;
> use File::Spec;
> use IO::Poll qw(POLLIN POLLHUP);
> @@ -639,6 +640,37 @@ sub update_lxc_config {
> $raw .= "lxc.mount.auto = sys:mixed\n";
> }
>
> + # Clear passthrough directory from previous run
> + my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
> + File::Path::rmtree($passthrough_dir);
I think we need to make a few changes here.
First: we don't necessarily need this directory.
Having a device list would certainly be nice, but it makes more sense to
just have a file we can easily parse (possibly even just a json hash),
like the `devices` file we already create in the pre-start hook, except
prepared *for* the pre-start hook, which *should* be able to just
`mknod` the devices right into the container's `/dev` on startup.
We'd also avoid "lingering" device nodes with potentially harmful
uid/permissions in /var, which is certainly better from a security POV.
But note that we do need the `lxc.cgroup2.*` entries before starting the
container in order to ensure the devices cgroup has the right
permissions.
> +
> + PVE::LXC::Config->foreach_passthrough_device($conf, sub {
> + my ($key, $sanitized_path) = @_;
> +
> + my $absolute_path = "/$sanitized_path";
> + my ($mode, $rdev) = (stat($absolute_path))[2, 6];
> + die "Could not find major and minor ids of device $absolute_path.\n"
> + unless ($mode && $rdev);
> +
> + my $major = PVE::Tools::dev_t_major($rdev);
> + my $minor = PVE::Tools::dev_t_minor($rdev);
> + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
> + my $passthrough_device_path = "$passthrough_dir/$sanitized_path";
> + File::Path::make_path(dirname($passthrough_device_path));
> + PVE::Tools::run_command([
> + '/usr/bin/mknod',
> + '-m', '0660',
> + $passthrough_device_path,
> + $device_type_char,
> + $major,
> + $minor
> + ]);
It's probably worth adding a helper for the mknod syscall to
`PVE::Tools`, there are a bunch of syscalls in there already.
> + chown 100000, 100000, $passthrough_device_path if ($unprivileged);
^ This isn't necessarily the correct id. Users may have custom id
mappings.
`PVE::LXC::parse_id_maps($conf)` returns the mapping alongside the root
uid and gid. (See for example `sub mount_all` for how it's used.
> +
> + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
> + $raw .= "lxc.mount.entry = $passthrough_device_path $sanitized_path none bind,create=file\n";
> + });
> +
> # WARNING: DO NOT REMOVE this without making sure that loop device nodes
> # cannot be exposed to the container with r/w access (cgroup perms).
> # When this is enabled mounts will still remain in the monitor's namespace
> diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
> index 56e1f10..edd813e 100644
> --- a/src/PVE/LXC/Config.pm
> +++ b/src/PVE/LXC/Config.pm
> @@ -29,6 +29,7 @@ mkdir $lockdir;
> mkdir "/etc/pve/nodes/$nodename/lxc";
> my $MAX_MOUNT_POINTS = 256;
> my $MAX_UNUSED_DISKS = $MAX_MOUNT_POINTS;
> +my $MAX_DEVICES = 256;
>
> # BEGIN implemented abstract methods from PVE::AbstractConfig
>
> @@ -908,6 +909,49 @@ for (my $i = 0; $i < $MAX_UNUSED_DISKS; $i++) {
> }
> }
>
> +PVE::JSONSchema::register_format('pve-lxc-dev-string', \&verify_lxc_dev_string);
> +sub verify_lxc_dev_string {
> + my ($dev, $noerr) = @_;
> +
> + if (
> + $dev =~ m@/\.\.?/@ ||
> + $dev =~ m@/\.\.?$@ ||
> + $dev !~ m!^/dev/!
> + ) {
> + return undef if $noerr;
> + die "$dev is not a valid device path\n";
> + }
> +
> + return $dev;
> +}
> +
> +my $dev_desc = {
> + path => {
> + optional => 1,
> + type => 'string',
> + default_key => 1,
> + format => 'pve-lxc-dev-string',
> + format_description => 'Path',
> + description => 'Device to pass through to the container',
> + verbose_description => 'Path to the device to pass through to the container'
> + },
> + usbmapping => {
> + optional => 1,
> + type => 'string',
> + format => 'pve-configid',
> + format_description => 'mapping-id',
> + description => 'The ID of a cluster wide USB mapping.'
> + }
> +};
> +
> +for (my $i = 0; $i < $MAX_DEVICES; $i++) {
> + $confdesc->{"dev$i"} = {
> + optional => 1,
> + type => 'string', format => $dev_desc,
> + description => "Device to pass through to the container",
> + }
> +}
> +
> sub parse_pct_config {
> my ($filename, $raw, $strict) = @_;
>
> @@ -1255,6 +1299,22 @@ sub parse_volume {
> return;
> }
>
> +sub parse_device {
> + my ($class, $device_string, $noerr) = @_;
> +
> + my $res;
> + eval { $res = PVE::JSONSchema::parse_property_string($dev_desc, $device_string) };
> + if ($@) {
> + return undef if $noerr;
> + die $@;
> + }
> +
> + die "Either path or usbmapping has to be defined"
> + unless (defined($res->{path}) || defined($res->{usbmapping}));
> +
> + return $res;
> +}
> +
> sub print_volume {
> my ($class, $key, $volume) = @_;
>
> --
> 2.39.2
>
>
>
> _______________________________________________
> pve-devel mailing list
> pve-devel@lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device
2023-10-30 13:12 ` Wolfgang Bumiller
@ 2023-10-31 9:00 ` Thomas Lamprecht
0 siblings, 0 replies; 9+ messages in thread
From: Thomas Lamprecht @ 2023-10-31 9:00 UTC (permalink / raw)
To: Proxmox VE development discussion, Wolfgang Bumiller, Filip Schauer
On 30/10/2023 14:12, Wolfgang Bumiller wrote:
> On Tue, Oct 24, 2023 at 02:55:54PM +0200, Filip Schauer wrote:
>> +
>> + $func->($key, "dev/bus/usb/$bus_id/$device_id", @param);
>
> So this will do... something :-)
> But unfortunately it won't deal with the *interesting* bits.
>
> So while I do like the idea of having such mappings, for it to make
> sense we'd also need to figure out the device specific nodes.
> Eg. for input devices we'd want to include `/dev/input/eventXY`, for eg.
> FIDO keys we'd want `/dev/hidraw*` devices.
>
> I'm not sure how much work we should put into this in the initial
> implementation, the question is mainly whether we want to do it like
> *this* in the first place and how much we plan to support.
Hmm, yeah, let's leave the USB mappings out for now, not really winning
much here.
>
> Or maybe it would make more sense to have specific entries for hidraw,
> event devices, block devices, ... instead?
> I'm not sure.
A device (node) mapping with a sub-type like block ("disk") devices might
be indeed better then, as that could be re-used for VMs too, e.g., for
passing through a /dev/sda (well, more likely via /dev/disk/by-id/...)
device through. IIUC, that would be also more in line with Dominik's
feedback.
In any way, let's get raw pass-through right first, adding in mappings
should be doable then, we added them only much later for VMs too, so
I'd think one would have to actively try to really make that hard to
do in the future here.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [pve-devel] [PATCH v2 container 1/1] Add device passthrough
2023-10-30 13:34 ` Wolfgang Bumiller
@ 2023-11-02 14:28 ` Filip Schauer
2023-11-03 8:14 ` Wolfgang Bumiller
0 siblings, 1 reply; 9+ messages in thread
From: Filip Schauer @ 2023-11-02 14:28 UTC (permalink / raw)
To: Wolfgang Bumiller; +Cc: pve-devel
On 30/10/2023 14:34, Wolfgang Bumiller wrote:
> On Tue, Oct 24, 2023 at 02:55:53PM +0200, Filip Schauer wrote:
>> Add a dev[n] argument to the container config to pass devices through to
>> a container. A device can be passed by its path. Alternatively a mapped
>> USB device can be passed through with usbmapping=<name>.
>>
>> Signed-off-by: Filip Schauer<f.schauer@proxmox.com>
>> ---
>> src/PVE/LXC.pm | 34 +++++++++++++++++++++++-
>> src/PVE/LXC/Config.pm | 60 +++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 93 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
>> index c9b5ba7..a3ddb62 100644
>> --- a/src/PVE/LXC.pm
>> +++ b/src/PVE/LXC.pm
>> @@ -5,7 +5,8 @@ use warnings;
>>
>> use Cwd qw();
>> use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
>> -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
>> +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
>> +use File::Basename;
>> use File::Path;
>> use File::Spec;
>> use IO::Poll qw(POLLIN POLLHUP);
>> @@ -639,6 +640,37 @@ sub update_lxc_config {
>> $raw .= "lxc.mount.auto = sys:mixed\n";
>> }
>>
>> + # Clear passthrough directory from previous run
>> + my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
>> + File::Path::rmtree($passthrough_dir);
> I think we need to make a few changes here.
>
> First: we don't necessarily need this directory.
> Having a device list would certainly be nice, but it makes more sense to
> just have a file we can easily parse (possibly even just a json hash),
> like the `devices` file we already create in the pre-start hook, except
> prepared *for* the pre-start hook, which *should* be able to just
> `mknod` the devices right into the container's `/dev` on startup.
Devices mknoded into the container's /dev directory in the pre-start
hook will not be visible in the container once it is fully started.
Meanwhile mknoding a device to a different path inside the container
works fine. It seems that LXC mounts over the /dev directory. This can
be solved by calling mknod in lxc-pve-autodev-hook, but this does not
work with unprivileged containers without the mknod capability.
So are bind mounts our only option without modifying LXC,
or am I overlooking something?
> We'd also avoid "lingering" device nodes with potentially harmful
> uid/permissions in /var, which is certainly better from a security POV.
>
> But note that we do need the `lxc.cgroup2.*` entries before starting the
> container in order to ensure the devices cgroup has the right
> permissions.
>
>> +
>> + PVE::LXC::Config->foreach_passthrough_device($conf, sub {
>> + my ($key, $sanitized_path) = @_;
>> +
>> + my $absolute_path = "/$sanitized_path";
>> + my ($mode, $rdev) = (stat($absolute_path))[2, 6];
>> + die "Could not find major and minor ids of device $absolute_path.\n"
>> + unless ($mode && $rdev);
>> +
>> + my $major = PVE::Tools::dev_t_major($rdev);
>> + my $minor = PVE::Tools::dev_t_minor($rdev);
>> + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
>> + my $passthrough_device_path = "$passthrough_dir/$sanitized_path";
>> + File::Path::make_path(dirname($passthrough_device_path));
>> + PVE::Tools::run_command([
>> + '/usr/bin/mknod',
>> + '-m', '0660',
>> + $passthrough_device_path,
>> + $device_type_char,
>> + $major,
>> + $minor
>> + ]);
> It's probably worth adding a helper for the mknod syscall to
> `PVE::Tools`, there are a bunch of syscalls in there already.
>
>> + chown 100000, 100000, $passthrough_device_path if ($unprivileged);
> ^ This isn't necessarily the correct id. Users may have custom id
> mappings.
> `PVE::LXC::parse_id_maps($conf)` returns the mapping alongside the root
> uid and gid. (See for example `sub mount_all` for how it's used.
>
>> +
>> + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
>> + $raw .= "lxc.mount.entry = $passthrough_device_path $sanitized_path none bind,create=file\n";
>> + });
>> +
>> # WARNING: DO NOT REMOVE this without making sure that loop device nodes
>> # cannot be exposed to the container with r/w access (cgroup perms).
>> # When this is enabled mounts will still remain in the monitor's namespace
>> diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
>> index 56e1f10..edd813e 100644
>> --- a/src/PVE/LXC/Config.pm
>> +++ b/src/PVE/LXC/Config.pm
>> @@ -29,6 +29,7 @@ mkdir $lockdir;
>> mkdir "/etc/pve/nodes/$nodename/lxc";
>> my $MAX_MOUNT_POINTS = 256;
>> my $MAX_UNUSED_DISKS = $MAX_MOUNT_POINTS;
>> +my $MAX_DEVICES = 256;
>>
>> # BEGIN implemented abstract methods from PVE::AbstractConfig
>>
>> @@ -908,6 +909,49 @@ for (my $i = 0; $i < $MAX_UNUSED_DISKS; $i++) {
>> }
>> }
>>
>> +PVE::JSONSchema::register_format('pve-lxc-dev-string', \&verify_lxc_dev_string);
>> +sub verify_lxc_dev_string {
>> + my ($dev, $noerr) = @_;
>> +
>> + if (
>> + $dev =~m@/\.\.?/@ ||
>> + $dev =~m@/\.\.?$@ ||
>> + $dev !~ m!^/dev/!
>> + ) {
>> + return undef if $noerr;
>> + die "$dev is not a valid device path\n";
>> + }
>> +
>> + return $dev;
>> +}
>> +
>> +my $dev_desc = {
>> + path => {
>> + optional => 1,
>> + type => 'string',
>> + default_key => 1,
>> + format => 'pve-lxc-dev-string',
>> + format_description => 'Path',
>> + description => 'Device to pass through to the container',
>> + verbose_description => 'Path to the device to pass through to the container'
>> + },
>> + usbmapping => {
>> + optional => 1,
>> + type => 'string',
>> + format => 'pve-configid',
>> + format_description => 'mapping-id',
>> + description => 'The ID of a cluster wide USB mapping.'
>> + }
>> +};
>> +
>> +for (my $i = 0; $i < $MAX_DEVICES; $i++) {
>> + $confdesc->{"dev$i"} = {
>> + optional => 1,
>> + type => 'string', format => $dev_desc,
>> + description => "Device to pass through to the container",
>> + }
>> +}
>> +
>> sub parse_pct_config {
>> my ($filename, $raw, $strict) = @_;
>>
>> @@ -1255,6 +1299,22 @@ sub parse_volume {
>> return;
>> }
>>
>> +sub parse_device {
>> + my ($class, $device_string, $noerr) = @_;
>> +
>> + my $res;
>> + eval { $res = PVE::JSONSchema::parse_property_string($dev_desc, $device_string) };
>> + if ($@) {
>> + return undef if $noerr;
>> + die $@;
>> + }
>> +
>> + die "Either path or usbmapping has to be defined"
>> + unless (defined($res->{path}) || defined($res->{usbmapping}));
>> +
>> + return $res;
>> +}
>> +
>> sub print_volume {
>> my ($class, $key, $volume) = @_;
>>
>> --
>> 2.39.2
>>
>>
>>
>> _______________________________________________
>> pve-devel mailing list
>> pve-devel@lists.proxmox.com
>> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
>>
>>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [pve-devel] [PATCH v2 container 1/1] Add device passthrough
2023-11-02 14:28 ` Filip Schauer
@ 2023-11-03 8:14 ` Wolfgang Bumiller
2023-11-07 13:49 ` Filip Schauer
0 siblings, 1 reply; 9+ messages in thread
From: Wolfgang Bumiller @ 2023-11-03 8:14 UTC (permalink / raw)
To: Filip Schauer; +Cc: pve-devel
On Thu, Nov 02, 2023 at 03:28:22PM +0100, Filip Schauer wrote:
>
> On 30/10/2023 14:34, Wolfgang Bumiller wrote:
> > On Tue, Oct 24, 2023 at 02:55:53PM +0200, Filip Schauer wrote:
> > > Add a dev[n] argument to the container config to pass devices through to
> > > a container. A device can be passed by its path. Alternatively a mapped
> > > USB device can be passed through with usbmapping=<name>.
> > >
> > > Signed-off-by: Filip Schauer<f.schauer@proxmox.com>
> > > ---
> > > src/PVE/LXC.pm | 34 +++++++++++++++++++++++-
> > > src/PVE/LXC/Config.pm | 60 +++++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 93 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
> > > index c9b5ba7..a3ddb62 100644
> > > --- a/src/PVE/LXC.pm
> > > +++ b/src/PVE/LXC.pm
> > > @@ -5,7 +5,8 @@ use warnings;
> > > use Cwd qw();
> > > use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
> > > -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
> > > +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
> > > +use File::Basename;
> > > use File::Path;
> > > use File::Spec;
> > > use IO::Poll qw(POLLIN POLLHUP);
> > > @@ -639,6 +640,37 @@ sub update_lxc_config {
> > > $raw .= "lxc.mount.auto = sys:mixed\n";
> > > }
> > > + # Clear passthrough directory from previous run
> > > + my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
> > > + File::Path::rmtree($passthrough_dir);
> > I think we need to make a few changes here.
> >
> > First: we don't necessarily need this directory.
> > Having a device list would certainly be nice, but it makes more sense to
> > just have a file we can easily parse (possibly even just a json hash),
> > like the `devices` file we already create in the pre-start hook, except
> > prepared *for* the pre-start hook, which *should* be able to just
> > `mknod` the devices right into the container's `/dev` on startup.
>
>
> Devices mknoded into the container's /dev directory in the pre-start
> hook will not be visible in the container once it is fully started.
Ah yes, I keep ignoring that.
> Meanwhile mknoding a device to a different path inside the container
> works fine. It seems that LXC mounts over the /dev directory. This can
/dev will be a tmpfs, yes.
> be solved by calling mknod in lxc-pve-autodev-hook, but this does not
> work with unprivileged containers without the mknod capability.
>
> So are bind mounts our only option without modifying LXC,
> or am I overlooking something?
Sort of. We *could* still do this via a separate process we signal from
out of the autodev hook to do the work for it, but that'll make the
startup process even more convoluted.
And I think the seccomp proxying only starts after the entire init
setup, so we also can't just reuly on syscalld (of which the entire
point is to do mknods for the container 🙄).
I'm also working on a seccomp wrapper to allow unprivileged restores of
backups to `mknod()` the basics, but that, too, happens via seccomp, so
not really reusable in this case either (and syscalld is not suitable
for *this* either (for now) as it uses an lxc specific protocol and does
not by itself perform the seccomp setup...)
Perhaps there's a way to unify all that (at least partially) by teaching
syscalld an additional protocol we can use in all 3 cases (although the
the requirements are slightly different... here we only have "known"
paths & permissions, so we wouldn't need to deal with copying another
process' rootfs/chroot/fds/... to perform a syscall on their behalf,
which the other cases do need...)
So yeah, I suppose we can go the bind-mount route first, as it is
simpler, and then maybe change it later.
However, I still don't want to fill `/var/lib/lxc` on the host with
device nodes directly whenever we update the config via
`update_lxc_config()`.
So how about this:
In the prestart hook:
- mount a tmpfs to this path
- mknod the devices into it
And then in the autodev hook do the bind-mounting.
>
>
> > We'd also avoid "lingering" device nodes with potentially harmful
> > uid/permissions in /var, which is certainly better from a security POV.
> >
> > But note that we do need the `lxc.cgroup2.*` entries before starting the
> > container in order to ensure the devices cgroup has the right
> > permissions.
> >
> > > +
> > > + PVE::LXC::Config->foreach_passthrough_device($conf, sub {
> > > + my ($key, $sanitized_path) = @_;
> > > +
> > > + my $absolute_path = "/$sanitized_path";
> > > + my ($mode, $rdev) = (stat($absolute_path))[2, 6];
> > > + die "Could not find major and minor ids of device $absolute_path.\n"
> > > + unless ($mode && $rdev);
> > > +
> > > + my $major = PVE::Tools::dev_t_major($rdev);
> > > + my $minor = PVE::Tools::dev_t_minor($rdev);
> > > + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
> > > + my $passthrough_device_path = "$passthrough_dir/$sanitized_path";
> > > + File::Path::make_path(dirname($passthrough_device_path));
> > > + PVE::Tools::run_command([
> > > + '/usr/bin/mknod',
> > > + '-m', '0660',
Btw. with a property string used for the device entry, we could probably
also have an optional `mode` to use instead of `0660`, as well as a
`uid` and `gid` - but we'd need to map those with the container's id
mapping. Not sure if we already have helpers for that apart from getting
the root ids.
> > > + $passthrough_device_path,
> > > + $device_type_char,
> > > + $major,
> > > + $minor
> > > + ]);
> > It's probably worth adding a helper for the mknod syscall to
> > `PVE::Tools`, there are a bunch of syscalls in there already.
> >
> > > + chown 100000, 100000, $passthrough_device_path if ($unprivileged);
> > ^ This isn't necessarily the correct id. Users may have custom id
> > mappings.
> > `PVE::LXC::parse_id_maps($conf)` returns the mapping alongside the root
> > uid and gid. (See for example `sub mount_all` for how it's used.
> >
> > > +
> > > + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
> > > + $raw .= "lxc.mount.entry = $passthrough_device_path $sanitized_path none bind,create=file\n";
> > > + });
> > > +
> > > # WARNING: DO NOT REMOVE this without making sure that loop device nodes
> > > # cannot be exposed to the container with r/w access (cgroup perms).
> > > # When this is enabled mounts will still remain in the monitor's namespace
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [pve-devel] [PATCH v2 container 1/1] Add device passthrough
2023-11-03 8:14 ` Wolfgang Bumiller
@ 2023-11-07 13:49 ` Filip Schauer
0 siblings, 0 replies; 9+ messages in thread
From: Filip Schauer @ 2023-11-07 13:49 UTC (permalink / raw)
To: Wolfgang Bumiller; +Cc: pve-devel
Patch v3 available:
https://lists.proxmox.com/pipermail/pve-devel/2023-November/059844.html
On 03/11/2023 09:14, Wolfgang Bumiller wrote:
> On Thu, Nov 02, 2023 at 03:28:22PM +0100, Filip Schauer wrote:
>> On 30/10/2023 14:34, Wolfgang Bumiller wrote:
>>> On Tue, Oct 24, 2023 at 02:55:53PM +0200, Filip Schauer wrote:
>>>> Add a dev[n] argument to the container config to pass devices through to
>>>> a container. A device can be passed by its path. Alternatively a mapped
>>>> USB device can be passed through with usbmapping=<name>.
>>>>
>>>> Signed-off-by: Filip Schauer<f.schauer@proxmox.com>
>>>> ---
>>>> src/PVE/LXC.pm | 34 +++++++++++++++++++++++-
>>>> src/PVE/LXC/Config.pm | 60 +++++++++++++++++++++++++++++++++++++++++++
>>>> 2 files changed, 93 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
>>>> index c9b5ba7..a3ddb62 100644
>>>> --- a/src/PVE/LXC.pm
>>>> +++ b/src/PVE/LXC.pm
>>>> @@ -5,7 +5,8 @@ use warnings;
>>>> use Cwd qw();
>>>> use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
>>>> -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
>>>> +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
>>>> +use File::Basename;
>>>> use File::Path;
>>>> use File::Spec;
>>>> use IO::Poll qw(POLLIN POLLHUP);
>>>> @@ -639,6 +640,37 @@ sub update_lxc_config {
>>>> $raw .= "lxc.mount.auto = sys:mixed\n";
>>>> }
>>>> + # Clear passthrough directory from previous run
>>>> + my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
>>>> + File::Path::rmtree($passthrough_dir);
>>> I think we need to make a few changes here.
>>>
>>> First: we don't necessarily need this directory.
>>> Having a device list would certainly be nice, but it makes more sense to
>>> just have a file we can easily parse (possibly even just a json hash),
>>> like the `devices` file we already create in the pre-start hook, except
>>> prepared *for* the pre-start hook, which *should* be able to just
>>> `mknod` the devices right into the container's `/dev` on startup.
>>
>> Devices mknoded into the container's /dev directory in the pre-start
>> hook will not be visible in the container once it is fully started.
> Ah yes, I keep ignoring that.
>
>> Meanwhile mknoding a device to a different path inside the container
>> works fine. It seems that LXC mounts over the /dev directory. This can
> /dev will be a tmpfs, yes.
>
>> be solved by calling mknod in lxc-pve-autodev-hook, but this does not
>> work with unprivileged containers without the mknod capability.
>>
>> So are bind mounts our only option without modifying LXC,
>> or am I overlooking something?
> Sort of. We *could* still do this via a separate process we signal from
> out of the autodev hook to do the work for it, but that'll make the
> startup process even more convoluted.
> And I think the seccomp proxying only starts after the entire init
> setup, so we also can't just reuly on syscalld (of which the entire
> point is to do mknods for the container 🙄).
>
> I'm also working on a seccomp wrapper to allow unprivileged restores of
> backups to `mknod()` the basics, but that, too, happens via seccomp, so
> not really reusable in this case either (and syscalld is not suitable
> for *this* either (for now) as it uses an lxc specific protocol and does
> not by itself perform the seccomp setup...)
>
> Perhaps there's a way to unify all that (at least partially) by teaching
> syscalld an additional protocol we can use in all 3 cases (although the
> the requirements are slightly different... here we only have "known"
> paths & permissions, so we wouldn't need to deal with copying another
> process' rootfs/chroot/fds/... to perform a syscall on their behalf,
> which the other cases do need...)
>
> So yeah, I suppose we can go the bind-mount route first, as it is
> simpler, and then maybe change it later.
>
> However, I still don't want to fill `/var/lib/lxc` on the host with
> device nodes directly whenever we update the config via
> `update_lxc_config()`.
>
> So how about this:
>
> In the prestart hook:
> - mount a tmpfs to this path
> - mknod the devices into it
> And then in the autodev hook do the bind-mounting.
>
>>
>>> We'd also avoid "lingering" device nodes with potentially harmful
>>> uid/permissions in /var, which is certainly better from a security POV.
>>>
>>> But note that we do need the `lxc.cgroup2.*` entries before starting the
>>> container in order to ensure the devices cgroup has the right
>>> permissions.
>>>
>>>> +
>>>> + PVE::LXC::Config->foreach_passthrough_device($conf, sub {
>>>> + my ($key, $sanitized_path) = @_;
>>>> +
>>>> + my $absolute_path = "/$sanitized_path";
>>>> + my ($mode, $rdev) = (stat($absolute_path))[2, 6];
>>>> + die "Could not find major and minor ids of device $absolute_path.\n"
>>>> + unless ($mode && $rdev);
>>>> +
>>>> + my $major = PVE::Tools::dev_t_major($rdev);
>>>> + my $minor = PVE::Tools::dev_t_minor($rdev);
>>>> + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
>>>> + my $passthrough_device_path = "$passthrough_dir/$sanitized_path";
>>>> + File::Path::make_path(dirname($passthrough_device_path));
>>>> + PVE::Tools::run_command([
>>>> + '/usr/bin/mknod',
>>>> + '-m', '0660',
> Btw. with a property string used for the device entry, we could probably
> also have an optional `mode` to use instead of `0660`, as well as a
> `uid` and `gid` - but we'd need to map those with the container's id
> mapping. Not sure if we already have helpers for that apart from getting
> the root ids.
>
>>>> + $passthrough_device_path,
>>>> + $device_type_char,
>>>> + $major,
>>>> + $minor
>>>> + ]);
>>> It's probably worth adding a helper for the mknod syscall to
>>> `PVE::Tools`, there are a bunch of syscalls in there already.
>>>
>>>> + chown 100000, 100000, $passthrough_device_path if ($unprivileged);
>>> ^ This isn't necessarily the correct id. Users may have custom id
>>> mappings.
>>> `PVE::LXC::parse_id_maps($conf)` returns the mapping alongside the root
>>> uid and gid. (See for example `sub mount_all` for how it's used.
>>>
>>>> +
>>>> + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
>>>> + $raw .= "lxc.mount.entry = $passthrough_device_path $sanitized_path none bind,create=file\n";
>>>> + });
>>>> +
>>>> # WARNING: DO NOT REMOVE this without making sure that loop device nodes
>>>> # cannot be exposed to the container with r/w access (cgroup perms).
>>>> # When this is enabled mounts will still remain in the monitor's namespace
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-11-07 13:49 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-24 12:55 [pve-devel] [PATCH many v2] Add container device passthrough Filip Schauer
2023-10-24 12:55 ` [pve-devel] [PATCH v2 container 1/1] Add " Filip Schauer
2023-10-30 13:34 ` Wolfgang Bumiller
2023-11-02 14:28 ` Filip Schauer
2023-11-03 8:14 ` Wolfgang Bumiller
2023-11-07 13:49 ` Filip Schauer
2023-10-24 12:55 ` [pve-devel] [PATCH v2 guest-common 1/1] Add foreach_passthrough_device Filip Schauer
2023-10-30 13:12 ` Wolfgang Bumiller
2023-10-31 9:00 ` Thomas Lamprecht
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox