all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Filip Schauer <f.schauer@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH container 3/3] implement device hotplug
Date: Mon, 16 Dec 2024 18:21:32 +0100	[thread overview]
Message-ID: <20241216172132.235857-4-f.schauer@proxmox.com> (raw)
In-Reply-To: <20241216172132.235857-1-f.schauer@proxmox.com>

Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
 src/PVE/LXC.pm        | 93 ++++++++++++++++++++++++++++++++++++++++++-
 src/PVE/LXC/Config.pm | 19 +++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 12a4378..06902a1 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -5,7 +5,7 @@ use warnings;
 
 use Cwd qw();
 use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY O_CREAT :mode);
 use File::Path;
 use File::Spec;
 use IO::Poll qw(POLLIN POLLHUP);
@@ -2008,6 +2008,97 @@ my $enter_mnt_ns_and_change_aa_profile = sub {
 	or die "failed to change apparmor profile (close() failed): $!\n";
 };
 
+# Hotplug a passthrough device into the running container $vmid: create the device node below
+# /var/lib/lxc/$vmid/passthrough, bind mount it to the same path inside the container and update
+# the devices cgroup. $dev is a parsed device entry; the keys read here are path, mode, uid, gid
+# and deny-write. Errors are raised via die.
+sub device_passthrough_hotplug :prototype($$$) {
+    my ($vmid, $conf, $dev) = @_;
+
+    my ($mode, $rdev) = (stat($dev->{path}))[2, 6];
+    die "Could not get mode or device ID of $dev->{path}\n"
+	if (!defined($mode) || !defined($rdev));
+    die "$dev->{path} is not a device\n" if !S_ISBLK($mode) && !S_ISCHR($mode);
+
+    # We do the rest in a fork with an unshared mount namespace:
+    #  -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
+    #     with move_mount privileges on every mount.
+    #  -) create the device node, then grab it, create a file to bind mount the device node onto in
+    #     the container, switch to the container mount namespace, and move_mount the device node.
+
+    PVE::Tools::run_fork(sub {
+	# Pin the container pid longer, we also need to get its monitor/parent:
+	my ($ct_pid, $ct_pidfd) = open_lxc_pid($vmid)
+	    or die "failed to open pidfd of container $vmid\'s init process\n";
+
+	my ($monitor_pid, $monitor_pidfd) = open_ppid($ct_pid)
+	    or die "failed to open pidfd of container $vmid\'s monitor process\n";
+
+	my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
+	my $ct_user_ns = $get_container_namespace->($vmid, $ct_pid, 'user');
+	my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
+
+	# Enter monitor mount namespace and switch to 'pve-container-mounthotplug' apparmor profile.
+	$enter_mnt_ns_and_change_aa_profile->($monitor_mnt_ns, "pve-container-mounthotplug", undef);
+
+	# Create the device node with the type and device ID of the source device
+	my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
+	my $passthrough_device_path = $passthrough_dir . $dev->{path};
+	PVE::Tools::mknod($passthrough_device_path, $mode, $rdev)
+	    or die("failed to mknod $passthrough_device_path: $!\n");
+
+	# Use chmod because umask could mess with the access mode on mknod (mode is an octal string)
+	my $mode_octal = $dev->{mode} // '0660';
+	chmod oct($mode_octal), $passthrough_device_path
+	    or die "failed to chmod $mode_octal $passthrough_device_path: $!\n";
+
+	# Set uid and gid of the device node, translated from container IDs (default 0) to host IDs
+	my $id_map = (PVE::LXC::parse_id_maps($conf))[0];
+	my $uid = PVE::LXC::map_ct_uid_to_host($dev->{uid} // 0, $id_map);
+	my $gid = PVE::LXC::map_ct_gid_to_host($dev->{gid} // 0, $id_map);
+	chown $uid, $gid, $passthrough_device_path
+	    or die("failed to chown $uid:$gid $passthrough_device_path: $!\n");
+
+	my $srcfh = PVE::Tools::open_tree(&AT_FDCWD, $passthrough_device_path, &OPEN_TREE_CLOEXEC | &OPEN_TREE_CLONE)
+	    or die "open_tree() on passthrough device node failed: $!\n";
+
+	if ($conf->{unprivileged}) {
+	    PVE::Tools::setns(fileno($ct_user_ns), PVE::Tools::CLONE_NEWUSER)
+		or die "failed to enter user namespace of container $vmid: $!\n";
+
+	    POSIX::setuid(0);
+	    POSIX::setgid(0);
+	}
+
+	# Create a regular file in the container to bind mount the device node onto.
+	sysopen(my $dstfh, "/proc/$ct_pid/root$dev->{path}", O_CREAT)
+	    or die "failed to open '/proc/$ct_pid/root$dev->{path}': $!\n";
+
+	# Enter the container mount namespace; fail loudly instead of mounting in the wrong one
+	PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS)
+	    or die "failed to enter mount namespace of container $vmid: $!\n";
+	chdir('/')
+	    or die "failed to change root directory within the container's mount namespace: $!\n";
+
+	# Bind mount the device node into the container
+	PVE::Tools::move_mount(fileno($srcfh), '', fileno($dstfh), '', &MOVE_MOUNT_F_EMPTY_PATH | &MOVE_MOUNT_T_EMPTY_PATH)
+	    or die "move_mount failed: $!\n";
+    });
+
+    # Allow or deny device access with cgroup2
+    my $major = PVE::Tools::dev_t_major($rdev);
+    my $minor = PVE::Tools::dev_t_minor($rdev);
+    my $device_type = S_ISBLK($mode) ? 'b' : 'c';
+
+    run_command(["lxc-cgroup", "-n", $vmid, "devices.deny", "$device_type $major:$minor w"])
+	if ($dev->{'deny-write'});
+
+    my $allow_perms = $dev->{'deny-write'} ? 'r' : 'rw';
+    run_command([
+	"lxc-cgroup", "-n", $vmid, "devices.allow", "$device_type $major:$minor $allow_perms"
+    ]);
+}
+
 sub mountpoint_hotplug :prototype($$$$$) {
     my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
 
diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index b44bcce..1acc76d 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -1529,6 +1529,13 @@ sub vmconfig_hotplug_pending {
 		$class->apply_pending_mountpoint($vmid, $conf, $opt, $storecfg, 1);
 		# apply_pending_mountpoint modifies the value if it creates a new disk
 		$value = $conf->{pending}->{$opt};
+	    } elsif ($opt =~ m/^dev(\d+)$/) {
+		if (exists($conf->{$opt})) {
+		    die "skip\n"; # don't try to hotplug over existing dev
+		}
+
+		$class->apply_pending_device_passthrough($vmid, $conf, $opt, 1);
+		$value = $conf->{pending}->{$opt};
 	    } else {
 		die "skip\n"; # skip non-hotpluggable
 	    }
@@ -1623,6 +1630,18 @@ my $rescan_volume = sub {
     warn "Could not rescan volume size - $@\n" if $@;
 };
 
+# Apply the pending device entry $opt: if $running, hotplug it and re-print the pending value.
+# While running, dies with "skip\n" if $opt is already set (editing is not implemented yet).
+sub apply_pending_device_passthrough {
+    my ($class, $vmid, $conf, $opt, $running) = @_;
+    my $pending_dev = $class->parse_device($conf->{pending}->{$opt});
+    if ($running) {
+	die "skip\n" if defined($conf->{$opt}); # TODO: editing a device passthrough
+	PVE::LXC::device_passthrough_hotplug($vmid, $conf, $pending_dev);
+	$conf->{pending}->{$opt} = $class->print_device($pending_dev);
+    }
+}
+
 sub apply_pending_mountpoint {
     my ($class, $vmid, $conf, $opt, $storecfg, $running) = @_;
 
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


  parent reply	other threads:[~2024-12-16 17:22 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-16 17:21 [pve-devel] [PATCH container 0/3] " Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 2/3] config: support printing a device Filip Schauer
2024-12-16 17:21 ` Filip Schauer [this message]
2025-04-23 12:59 ` [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241216172132.235857-4-f.schauer@proxmox.com \
    --to=f.schauer@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal