From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68])
 by lore.proxmox.com (Postfix) with ESMTPS id BD8F31FF15F
 for ; Mon, 16 Dec 2024 18:22:16 +0100 (CET)
Received: from firstgate.proxmox.com (localhost [127.0.0.1])
 by firstgate.proxmox.com (Proxmox) with ESMTP id D0CD412A57;
 Mon, 16 Dec 2024 18:22:26 +0100 (CET)
From: Filip Schauer
To: pve-devel@lists.proxmox.com
Date: Mon, 16 Dec 2024 18:21:32 +0100
Message-Id: <20241216172132.235857-4-f.schauer@proxmox.com>
X-Mailer: git-send-email 2.39.5
In-Reply-To: <20241216172132.235857-1-f.schauer@proxmox.com>
References: <20241216172132.235857-1-f.schauer@proxmox.com>
MIME-Version: 1.0
X-SPAM-LEVEL: Spam detection results: 0
 AWL -0.026 Adjusted score from AWL reputation of From: address
 BAYES_00 -1.9 Bayes spam probability is 0 to 1%
 DMARC_MISSING 0.1 Missing DMARC policy
 KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment
 SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record
 SPF_PASS -0.001 SPF: sender matches SPF record
Subject: [pve-devel] [PATCH container 3/3] implement device hotplug
X-BeenThere: pve-devel@lists.proxmox.com
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Proxmox VE development discussion
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
Reply-To: Proxmox VE development discussion
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: pve-devel-bounces@lists.proxmox.com
Sender: "pve-devel"

Signed-off-by: Filip Schauer
---
 src/PVE/LXC.pm        | 104 ++++++++++++++++++++++++++++++++++++++++++-
 src/PVE/LXC/Config.pm |  23 ++++++++++
 2 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 12a4378..06902a1 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -5,7 +5,7 @@ use warnings;
 
 use Cwd qw();
 use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY O_CREAT :mode);
 use File::Path;
 use File::Spec;
 use IO::Poll qw(POLLIN POLLHUP);
@@ -2008,6 +2008,108 @@ my $enter_mnt_ns_and_change_aa_profile = sub {
         or die "failed to change apparmor profile (close() failed): $!\n";
 };
 
+# Hotplug the passthrough device described by $dev into the running container $vmid.
+#
+# $conf is the container config; 'unprivileged' and the ID maps parsed from it are used here.
+# $dev is the parsed device entry: 'path' is read unconditionally, 'mode', 'uid', 'gid' and
+# 'deny-write' are optional.
+#
+# Dies if the host device cannot be stat()ed; each checked step in the forked helper dies on failure.
+sub device_passthrough_hotplug :prototype($$$) {
+    my ($vmid, $conf, $dev) = @_;
+
+    my ($mode, $rdev) = (stat($dev->{path}))[2, 6];
+
+    die "Could not get mode or device ID of $dev->{path}\n"
+        if (!defined($mode) || !defined($rdev));
+
+    # We do the rest in a fork with an unshared mount namespace:
+    # -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
+    #    with move_mount privileges on every mount.
+    # -) create the device node, then grab it, create a file to bind mount the device node onto in
+    #    the container, switch to the container mount namespace, and move_mount the device node.
+
+    PVE::Tools::run_fork(sub {
+        # Pin the container pid longer, we also need to get its monitor/parent:
+        my ($ct_pid, $ct_pidfd) = open_lxc_pid($vmid)
+            or die "failed to open pidfd of container $vmid\'s init process\n";
+
+        my ($monitor_pid, $monitor_pidfd) = open_ppid($ct_pid)
+            or die "failed to open pidfd of container $vmid\'s monitor process\n";
+
+        my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
+        my $ct_user_ns = $get_container_namespace->($vmid, $ct_pid, 'user');
+        my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
+
+        # Enter monitor mount namespace and switch to 'pve-container-mounthotplug' apparmor profile.
+        $enter_mnt_ns_and_change_aa_profile->($monitor_mnt_ns, "pve-container-mounthotplug", undef);
+
+        # Create the device node
+        my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
+        my $passthrough_device_path = $passthrough_dir . $dev->{path};
+        PVE::Tools::mknod($passthrough_device_path, $mode, $rdev)
+            or die("failed to mknod $passthrough_device_path: $!\n");
+
+        # Use chmod because umask could mess with the access mode on mknod
+        my $passthrough_mode = 0660;
+        $passthrough_mode = oct($dev->{mode}) if defined($dev->{mode});
+        my $mode_str = sprintf("%04o", $passthrough_mode); # octal, for the error message
+        chmod $passthrough_mode, $passthrough_device_path
+            or die "failed to chmod $mode_str $passthrough_device_path: $!\n";
+
+        # Set uid and gid of the device node
+        my $uid = 0;
+        my $gid = 0;
+        $uid = $dev->{uid} if defined($dev->{uid});
+        $gid = $dev->{gid} if defined($dev->{gid});
+        my $id_map = (PVE::LXC::parse_id_maps($conf))[0];
+        $uid = PVE::LXC::map_ct_uid_to_host($uid, $id_map);
+        $gid = PVE::LXC::map_ct_gid_to_host($gid, $id_map);
+        chown $uid, $gid, $passthrough_device_path
+            or die("failed to chown $uid:$gid $passthrough_device_path: $!\n");
+
+        my $srcfh = PVE::Tools::open_tree(&AT_FDCWD, $passthrough_device_path, &OPEN_TREE_CLOEXEC | &OPEN_TREE_CLONE)
+            or die "open_tree() on passthrough device node failed: $!\n";
+
+        if ($conf->{unprivileged}) {
+            PVE::Tools::setns(fileno($ct_user_ns), PVE::Tools::CLONE_NEWUSER)
+                or die "failed to enter user namespace of container $vmid: $!\n";
+
+            POSIX::setgid(0)
+                or die "failed to set gid 0 in user namespace of container $vmid: $!\n";
+            POSIX::setuid(0)
+                or die "failed to set uid 0 in user namespace of container $vmid: $!\n";
+        }
+
+        # Create a regular file in the container to bind mount the device node onto.
+        sysopen(my $dstfh, "/proc/$ct_pid/root$dev->{path}", O_CREAT)
+            or die "failed to open '/proc/$ct_pid/root$dev->{path}': $!\n";
+
+        # Enter the container mount namespace
+        PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS)
+            or die "failed to enter mount namespace of container $vmid: $!\n";
+        chdir('/')
+            or die "failed to change root directory within the container's mount namespace: $!\n";
+
+        # Bind mount the device node into the container
+        PVE::Tools::move_mount(fileno($srcfh), '', fileno($dstfh), '', &MOVE_MOUNT_F_EMPTY_PATH | &MOVE_MOUNT_T_EMPTY_PATH)
+            or die "move_mount failed: $!\n";
+    });
+
+    # Allow or deny device access with cgroup2
+    my $major = PVE::Tools::dev_t_major($rdev);
+    my $minor = PVE::Tools::dev_t_minor($rdev);
+    my $device_type = S_ISBLK($mode) ? 'b' : 'c';
+
+    run_command(["lxc-cgroup", "-n", $vmid, "devices.deny", "$device_type $major:$minor w"])
+        if ($dev->{'deny-write'});
+
+    my $allow_perms = $dev->{'deny-write'} ? 'r' : 'rw';
+    run_command([
+        "lxc-cgroup", "-n", $vmid, "devices.allow", "$device_type $major:$minor $allow_perms"
+    ]);
+}
+
 sub mountpoint_hotplug :prototype($$$$$) {
     my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
 
diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index b44bcce..1acc76d 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -1529,6 +1529,13 @@ sub vmconfig_hotplug_pending {
                 $class->apply_pending_mountpoint($vmid, $conf, $opt, $storecfg, 1);
                 # apply_pending_mountpoint modifies the value if it creates a new disk
                 $value = $conf->{pending}->{$opt};
+            } elsif ($opt =~ m/^dev(\d+)$/) {
+                if (exists($conf->{$opt})) {
+                    die "skip\n"; # don't try to hotplug over existing dev
+                }
+
+                $class->apply_pending_device_passthrough($vmid, $conf, $opt, 1);
+                $value = $conf->{pending}->{$opt};
             } else {
                 die "skip\n"; # skip non-hotpluggable
             }
@@ -1623,6 +1630,22 @@ my $rescan_volume = sub {
     warn "Could not rescan volume size - $@\n" if $@;
 };
 
+# Apply the pending device passthrough entry $opt of container $vmid.
+#
+# Only acts when $running is true: the device is hotplugged and the pending value is rewritten
+# from the parsed entry. Dies with "skip\n" if $opt already has a defined active value.
+sub apply_pending_device_passthrough {
+    my ($class, $vmid, $conf, $opt, $running) = @_;
+
+    my $dev = $class->parse_device($conf->{pending}->{$opt});
+    my $old = $conf->{$opt};
+    if ($running) {
+        die "skip\n" if defined($old); # TODO: editing a device passthrough
+        PVE::LXC::device_passthrough_hotplug($vmid, $conf, $dev);
+        $conf->{pending}->{$opt} = $class->print_device($dev);
+    }
+}
+
 sub apply_pending_mountpoint {
     my ($class, $vmid, $conf, $opt, $storecfg, $running) = @_;
 
-- 
2.39.5


_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel