From: Wolfgang Bumiller <w.bumiller@proxmox.com>
To: Filip Schauer <f.schauer@proxmox.com>
Cc: pve-devel@lists.proxmox.com
Subject: Re: [pve-devel] [PATCH v3 container 1/1] Add device passthrough
Date: Fri, 10 Nov 2023 11:44:29 +0100 [thread overview]
Message-ID: <zkec77e3hncofomgbborccjky63d2wvug7xgirwp52eyqwvhsc@j2zxvycerij2> (raw)
In-Reply-To: <20231107134642.110576-3-f.schauer@proxmox.com>
On Tue, Nov 07, 2023 at 02:46:42PM +0100, Filip Schauer wrote:
> Add a dev[n] argument to the container config to pass devices through to
> a container. A device can be passed by its path. Additionally the access
> mode, uid and gid can be specified through their respective properties.
>
> Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
> ---
> src/PVE/LXC.pm | 29 +++++++++++++-
> src/PVE/LXC/Config.pm | 79 +++++++++++++++++++++++++++++++++++++++
> src/PVE/LXC/Tools.pm | 23 +++++++++---
> src/lxc-pve-autodev-hook | 14 ++++++-
> src/lxc-pve-prestart-hook | 63 +++++++++++++++++++++++++++++++
> 5 files changed, 201 insertions(+), 7 deletions(-)
>
> diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
> index 7ec816b..162f093 100644
> --- a/src/PVE/LXC.pm
> +++ b/src/PVE/LXC.pm
> @@ -5,7 +5,7 @@ use warnings;
>
> use Cwd qw();
> use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
> -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
> +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
> use File::Path;
> use File::Spec;
> use IO::Poll qw(POLLIN POLLHUP);
> @@ -639,6 +639,33 @@ sub update_lxc_config {
> $raw .= "lxc.mount.auto = sys:mixed\n";
> }
>
> + PVE::LXC::Config->foreach_passthrough_device($conf, sub {
> + my ($key, $device) = @_;
> +
> + die "Path is not defined for passthrough device $key"
> + unless (defined($device->{path}));
> +
> + my $absolute_path = $device->{path};
> +
> + die "Device $absolute_path does not exist\n"
> + unless (-e $absolute_path);
> +
> + my ($mode, $rdev) = (stat($absolute_path))[2, 6];
> +
> + die "Could not get mode of device $absolute_path\n"
> + if $mode !~ /^([\d]+)$/;
> + $mode = $1;
> +
> + die "Could not get device ID of $absolute_path\n"
> + if $rdev !~ /^([\d]+)$/;
> + $rdev = $1;
^ The above 2 checks don't seem necessary. `stat()` cannot return
different types of values there.
Just check them for definedness once together.
> +
> + my $major = PVE::Tools::dev_t_major($rdev);
> + my $minor = PVE::Tools::dev_t_minor($rdev);
> + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
> + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
> + });
> +
> # WARNING: DO NOT REMOVE this without making sure that loop device nodes
> # cannot be exposed to the container with r/w access (cgroup perms).
> # When this is enabled mounts will still remain in the monitor's namespace
> diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
> index 56e1f10..b4fa147 100644
> --- a/src/PVE/LXC/Config.pm
> +++ b/src/PVE/LXC/Config.pm
> @@ -29,6 +29,7 @@ mkdir $lockdir;
> mkdir "/etc/pve/nodes/$nodename/lxc";
> my $MAX_MOUNT_POINTS = 256;
> my $MAX_UNUSED_DISKS = $MAX_MOUNT_POINTS;
> +my $MAX_DEVICES = 256;
>
> # BEGIN implemented abstract methods from PVE::AbstractConfig
>
> @@ -908,6 +909,57 @@ for (my $i = 0; $i < $MAX_UNUSED_DISKS; $i++) {
> }
> }
>
> +PVE::JSONSchema::register_format('pve-lxc-dev-string', \&verify_lxc_dev_string);
> +sub verify_lxc_dev_string {
> + my ($dev, $noerr) = @_;
> +
> + if (
> + $dev =~ m@/\.\.?/@ ||
> + $dev =~ m@/\.\.?$@ ||
> + $dev !~ m!^/dev/!
> + ) {
> + return undef if $noerr;
> + die "$dev is not a valid device path\n";
> + }
> +
> + return $dev;
> +}
> +
> +my $dev_desc = {
> + path => {
> + optional => 1,
> + type => 'string',
> + default_key => 1,
> + format => 'pve-lxc-dev-string',
> + format_description => 'Path',
> + description => 'Device to pass through to the container',
> + verbose_description => 'Path to the device to pass through to the container',
> + },
> + mode => {
> + optional => 1,
> + type => 'integer',
> + description => 'Access mode (in octal) to be set on the device node',
> + },
> + uid => {
> + optional => 1,
> + type => 'integer',
> + description => 'User ID to be assigned to the device node',
> + },
> + gid => {
> + optional => 1,
> + type => 'integer',
> + description => 'Group ID to be assigned to the device node',
> + },
> +};
> +
> +for (my $i = 0; $i < $MAX_DEVICES; $i++) {
> + $confdesc->{"dev$i"} = {
> + optional => 1,
> + type => 'string', format => $dev_desc,
> + description => "Device to pass through to the container",
> + }
> +}
> +
> sub parse_pct_config {
> my ($filename, $raw, $strict) = @_;
>
> @@ -1255,6 +1307,21 @@ sub parse_volume {
> return;
> }
>
> +sub parse_device {
> + my ($class, $device_string, $noerr) = @_;
> +
> + my $res;
> + eval { $res = PVE::JSONSchema::parse_property_string($dev_desc, $device_string) };
> + if ($@) {
> + return undef if $noerr;
> + die $@;
> + }
> +
> + die "Path has to be defined" unless (defined($res->{path}));
> +
> + return $res;
> +}
> +
> sub print_volume {
> my ($class, $key, $volume) = @_;
>
> @@ -1762,4 +1829,16 @@ sub get_derived_property {
> }
> }
>
> +sub foreach_passthrough_device {
> + my ($class, $conf, $func, @param) = @_;
> +
> + foreach my $key (keys %$conf) {
> + next if $key !~ m/^dev(\d+)$/;
> +
> + my $device = $class->parse_device($conf->{$key});
> +
> + $func->($key, $device, @param);
> + }
> +}
> +
> 1;
> diff --git a/src/PVE/LXC/Tools.pm b/src/PVE/LXC/Tools.pm
> index 62cdbc1..c9fb834 100644
> --- a/src/PVE/LXC/Tools.pm
> +++ b/src/PVE/LXC/Tools.pm
> @@ -81,10 +81,9 @@ sub lxc_hook($$&) {
> $code->($ct_name, $common_vars, $namespaces, $args);
> }
>
> -sub for_current_devices($&) {
> - my ($vmid, $code) = @_;
> +sub for_devices {
> + my ($devlist_file, $vmid, $code) = @_;
>
> - my $devlist_file = "/var/lib/lxc/$vmid/devices";
> my $fd;
>
> if (! open $fd, '<', $devlist_file) {
> @@ -93,8 +92,8 @@ sub for_current_devices($&) {
> }
>
> while (defined(my $line = <$fd>)) {
> - if ($line !~ m@^(b):(\d+):(\d+):/dev/(\S+)\s*$@) {
> - warn "invalid .pve-devices entry: $line\n";
> + if ($line !~ m@^(b|c):(\d+):(\d+):/dev/(\S+)\s*$@) {
> + warn "invalid $devlist_file entry: $line\n";
> next;
> }
>
> @@ -117,6 +116,20 @@ sub for_current_devices($&) {
> close $fd;
> }
>
> +sub for_current_devices($&) {
> + my ($vmid, $code) = @_;
> +
> + my $devlist_file = "/var/lib/lxc/$vmid/devices";
> + for_devices($devlist_file, $vmid, $code);
> +}
> +
> +sub for_passthrough_devices($&) {
> + my ($vmid, $code) = @_;
> +
> + my $passthrough_devlist_file = "/var/lib/lxc/$vmid/passthrough_devices";
> + for_devices($passthrough_devlist_file, $vmid, $code);
> +}
> +
> sub cgroup_do_write($$) {
> my ($path, $value) = @_;
> my $fd;
> diff --git a/src/lxc-pve-autodev-hook b/src/lxc-pve-autodev-hook
> index 3c45949..cd2b92a 100755
> --- a/src/lxc-pve-autodev-hook
> +++ b/src/lxc-pve-autodev-hook
> @@ -3,8 +3,9 @@
> use strict;
> use warnings;
>
> -use File::Path;
> use File::Basename;
> +use File::Path;
> +use File::Touch;
>
> use PVE::LXC::Tools;
> use PVE::Tools;
> @@ -14,6 +15,17 @@ PVE::LXC::Tools::lxc_hook('autodev', 'lxc', sub {
>
> my $root = $vars->{ROOTFS_MOUNT};
>
> + PVE::LXC::Tools::for_passthrough_devices($vmid, sub {
> + my ($type, $major, $minor, $dev) = @_;
> +
> + my $rel_devpath = "/dev/$dev";
> + my $rel_dir = dirname($rel_devpath);
> + File::Path::mkpath("$root/$rel_dir");
> + touch("$root/dev/$dev");
^ Hint: `touch` is not a syscall. This does an `utimensat()` to update
the file mtimes. This isn't really necessary, and an easy way to just
create a file and do nothing if it already exists is to just use mknod
with S_IFREG for the file type in the mode bits and zero for the device
type:
PVE::Tools::mknod("$root/dev/$dev", S_IFREG, 0);
> +
> + PVE::Tools::mount("/var/lib/lxc/$vmid/passthrough/dev/$dev", "$root/dev/$dev", 0, 4096, 0);
Don't use magic numbers. Please place the MS_BIND constant in PVE::Tools
instead.
Also die() if mount fails.
> + });
> +
> PVE::LXC::Tools::for_current_devices($vmid, sub {
> my ($type, $major, $minor, $dev) = @_;
>
> diff --git a/src/lxc-pve-prestart-hook b/src/lxc-pve-prestart-hook
> index 936d0bf..68c7cec 100755
> --- a/src/lxc-pve-prestart-hook
> +++ b/src/lxc-pve-prestart-hook
> @@ -6,6 +6,7 @@ use strict;
> use warnings;
>
> use Fcntl qw(O_DIRECTORY :mode);
> +use File::Basename;
> use File::Path;
> use POSIX;
>
> @@ -118,6 +119,55 @@ PVE::LXC::Tools::lxc_hook('pre-start', 'lxc', sub {
>
> PVE::LXC::Config->foreach_volume($conf, $setup_mountpoint);
>
> + # Device passthrough
> + my $passthrough_devlist_file = "/var/lib/lxc/$vmid/passthrough_devices";
> + unlink $passthrough_devlist_file;
> + my $passthrough_devices = [];
> +
> + my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
> + File::Path::make_path($passthrough_dir);
> + PVE::Tools::mount("none", $passthrough_dir, "tmpfs", 0, "size=8k");
^ checking that `mount` succeeded is particularly important here
> +
> + my $setup_passthrough_device = sub {
> + my ($key, $device) = @_;
> +
> + my $absolute_path = $device->{path};
> + my ($mode, $rdev) = (stat($absolute_path))[2, 6];
> +
> + die "Could not get mode of device $absolute_path\n"
> + if $mode !~ /^([\d]+)$/;
> + $mode = $1;
> +
> + die "Could not get device ID of $absolute_path\n"
> + if $rdev !~ /^([\d]+)$/;
> + $rdev = $1;
^ As above for the 2 checks with the stat() returned values.
> +
> + my $passthrough_device_path = "$passthrough_dir$absolute_path";
^ The above concatenation would probably be more readable with `.`
instead of quotes ($passthrough_dir . $absolute_path), or with `{}`
around the names.
> +
> + die "passthrough device path is invalid\n"
> + if $passthrough_device_path !~ /^([\w\/]+)$/;
> + $passthrough_device_path = $1;
^ Why this check? Both portions should be fine - $passthrough_dir is
trivial and $absolute_path comes from the parse function after
verification (and is already used in `stat()` so definitely contains a
valid path we also already used.
> +
> + File::Path::make_path(dirname($passthrough_device_path));
> + PVE::Tools::mknod($passthrough_device_path, $mode, $rdev);
^ `mknod` should be checked for errors.
> +
> + # Use chmod because umask could mess with the access mode on mknod
> + my $passthrough_mode = 0660;
> + $passthrough_mode = oct($device->{mode}) if (defined($device->{mode}));
> + chmod $passthrough_mode, $passthrough_device_path;
^ should be checked for errors.
> +
> + # Set uid and gid of the device node
> + my $uid = $rootuid;
> + my $gid = $rootgid;
> + $uid += $device->{uid} if (defined($device->{uid}));
> + $gid += $device->{gid} if (defined($device->{gid}));
^ id mappings can constist of multiple ranges.
You get the mapping from `parse_id_maps` (the 1st return value in the
line fetching `$root*id`).
It's an array of tuples of the form:
[ "g" or "u", <ns id>, <host id>, <length> ]
Make a helper to map ids using that array (see [1]).
$device->{*id} is from the perspective of the container and you want to
map from that to the host.
[1] https://git.proxmox.com/?p=pve-lxc-syscalld.git;a=blob;f=src/process/id_map.rs;h=d61d489da7af7418908fd8d85f1660c435b7c73e;hb=refs/heads/master#l26
> + chown $uid, $gid, $passthrough_device_path;
^ check for errors
> +
> + push @$passthrough_devices, $absolute_path;
push @$passthrough_devices, [$absolute_path, $mode, $rdev];
to avoid the extra stat below...
> + };
> +
> + PVE::LXC::Config->foreach_passthrough_device($conf, $setup_passthrough_device);
> +
> my $lxc_setup = PVE::LXC::Setup->new($conf, $rootdir);
> $lxc_setup->pre_start_hook();
>
> @@ -140,6 +190,19 @@ PVE::LXC::Tools::lxc_hook('pre-start', 'lxc', sub {
> }
> PVE::Tools::file_set_contents($devlist_file, $devlist);
> }
> +
> + if (@$passthrough_devices) {
> + my $devlist = '';
> + foreach my $dev (@$passthrough_devices) {
> + my ($mode, $rdev) = (stat($dev))[2, 6];
> + next if !$mode || !$rdev;
...and then resolve the tuple via:
($dev, my $mode, my $rdev) = @$dev;
> + my $major = PVE::Tools::dev_t_major($rdev);
> + my $minor = PVE::Tools::dev_t_minor($rdev);
> + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
> + $devlist .= "$device_type_char:$major:$minor:$dev\n";
> + }
> + PVE::Tools::file_set_contents($passthrough_devlist_file, $devlist);
> + }
> });
>
> # Leftover cgroups prevent lxc from starting without any useful information
> --
> 2.39.2
next prev parent reply other threads:[~2023-11-10 10:44 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-11-07 13:46 [pve-devel] [PATCH v3 many] Add container " Filip Schauer
2023-11-07 13:46 ` [pve-devel] [PATCH v3 common 1/1] tools: Add mknod syscall Filip Schauer
2023-11-07 13:46 ` [pve-devel] [PATCH v3 container 1/1] Add device passthrough Filip Schauer
2023-11-08 11:03 ` Dominik Csapak
2023-11-10 10:44 ` Wolfgang Bumiller [this message]
2023-11-13 10:33 ` Filip Schauer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=zkec77e3hncofomgbborccjky63d2wvug7xgirwp52eyqwvhsc@j2zxvycerij2 \
--to=w.bumiller@proxmox.com \
--cc=f.schauer@proxmox.com \
--cc=pve-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.