public inbox for pve-devel@lists.proxmox.com
 help / color / mirror / Atom feed
* [pve-devel] [PATCH container 0/3] implement device hotplug
@ 2024-12-16 17:21 Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper Filip Schauer
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Filip Schauer @ 2024-12-16 17:21 UTC (permalink / raw)
  To: pve-devel

Filip Schauer (3):
  extract apparmor profile & namespace switch to its own helper
  config: support printing a device
  implement device hotplug

 src/PVE/LXC.pm        | 164 ++++++++++++++++++++++++++++++++++--------
 src/PVE/LXC/Config.pm |  35 +++++++--
 2 files changed, 163 insertions(+), 36 deletions(-)

-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper
  2024-12-16 17:21 [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer
@ 2024-12-16 17:21 ` Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 2/3] config: support printing a device Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 3/3] implement device hotplug Filip Schauer
  2 siblings, 0 replies; 4+ messages in thread
From: Filip Schauer @ 2024-12-16 17:21 UTC (permalink / raw)
  To: pve-devel

Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
 src/PVE/LXC.pm | 71 +++++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index e78e365..12a4378 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -1982,15 +1982,42 @@ sub __mountpoint_mount {
     die "unsupported storage";
 }
 
+my $enter_mnt_ns_and_change_aa_profile = sub {
+    my ($mnt_ns, $aa_profile, $code_pre_changeprofile) = @_;
+
+    # Grab a file descriptor to our apparmor label file so we can change the profile
+    sysopen(my $aa_fd, "/proc/self/attr/current", O_WRONLY)
+	or die "failed to open '/proc/self/attr/current' for writing: $!\n";
+
+    # But switch namespaces first, to make sure the namespace switches aren't blocked by
+    # apparmor.
+    PVE::Tools::setns(fileno($mnt_ns), PVE::Tools::CLONE_NEWNS);
+    chdir('/')
+	or die "failed to change root directory within mount namespace: $!\n";
+
+    $code_pre_changeprofile->() if defined($code_pre_changeprofile);
+
+    # Now switch our apparmor profile:
+    my $data = "changeprofile $aa_profile";
+    my $data_written = syswrite($aa_fd, $data, length($data));
+    if (!defined($data_written) || $data_written != length($data)) {
+	die "failed to change apparmor profile: $!\n";
+    }
+    # Check errors on close as well:
+    close($aa_fd)
+	or die "failed to change apparmor profile (close() failed): $!\n";
+};
+
 sub mountpoint_hotplug :prototype($$$$$) {
     my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
 
     my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf);
 
-    # We do the rest in a fork with an unshared mount namespace, because:
-    #  -) change our papparmor profile to that of /usr/bin/lxc-start
-    #  -) we're now going to 'stage' # the mountpoint, then grab it, then move into the
-    #     container's namespace, then mount it.
+    # We do the rest in a fork with an unshared mount namespace:
+    #  -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
+    #     with move_mount privileges on every mount.
+    #  -) we're now going to 'stage' the mountpoint, then grab it, then move into the container's
+    #     namespace, then mount it.
 
     PVE::Tools::run_fork(sub {
 	# Pin the container pid longer, we also need to get its monitor/parent:
@@ -2003,32 +2030,16 @@ sub mountpoint_hotplug :prototype($$$$$) {
 	my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
 	my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
 
-	# Grab a file descriptor to our apparmor label file so we can change into the 'lxc-start'
-	# profile to lower our privileges to the same level we have in the start hook:
-	sysopen(my $aa_fd, "/proc/self/attr/current", O_WRONLY)
-	    or die "failed to open '/proc/self/attr/current' for writing: $!\n";
-	# But switch namespaces first, to make sure the namespace switches aren't blocked by
-	# apparmor.
-
-	# Change into the monitor's mount namespace. We "pin" the mount into the monitor's
-	# namespace for it to remain active there since the container will be able to unmount
-	# hotplugged mount points and thereby potentially free up loop devices, which is a security
-	# concern.
-	PVE::Tools::setns(fileno($monitor_mnt_ns), PVE::Tools::CLONE_NEWNS);
-	chdir('/')
-	    or die "failed to change root directory within the monitor's mount namespace: $!\n";
-
-	my $dir = get_staging_mount_path($opt);
-
-	# Now switch our apparmor profile before mounting:
-	my $data = 'changeprofile pve-container-mounthotplug';
-	my $data_written = syswrite($aa_fd, $data, length($data));
-	if (!defined($data_written) || $data_written != length($data)) {
-	    die "failed to change apparmor profile: $!\n";
-	}
-	# Check errors on close as well:
-	close($aa_fd)
-	    or die "failed to change apparmor profile (close() failed): $!\n";
+	# -) Change into the monitor's mount namespace. We "pin" the mount into the monitor's
+	#    namespace for it to remain active there since the container will be able to unmount
+	#    hotplugged mount points and thereby potentially free up loop devices, which is a
+	#    security concern.
+	# -) Prepare the staging directory.
+	# -) Switch to the 'pve-container-mounthotplug' apparmor profile.
+	my $dir;
+	$enter_mnt_ns_and_change_aa_profile->($monitor_mnt_ns, "pve-container-mounthotplug", sub {
+	    $dir = get_staging_mount_path($opt);
+	});
 
 	my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $root_uid, $root_gid);
 
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [pve-devel] [PATCH container 2/3] config: support printing a device
  2024-12-16 17:21 [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper Filip Schauer
@ 2024-12-16 17:21 ` Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 3/3] implement device hotplug Filip Schauer
  2 siblings, 0 replies; 4+ messages in thread
From: Filip Schauer @ 2024-12-16 17:21 UTC (permalink / raw)
  To: pve-devel

Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
 src/PVE/LXC/Config.pm | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index 5cc37f7..b44bcce 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -1333,6 +1333,14 @@ sub parse_volume {
     return;
 }
 
+sub print_volume {
+    my ($class, $key, $volume) = @_;
+
+    return $class->print_ct_unused($volume) if $key =~ m/^unused(\d+)$/;
+
+    return $class->print_ct_mountpoint($volume, $key eq 'rootfs');
+}
+
 sub parse_device {
     my ($class, $device_string, $noerr) = @_;
 
@@ -1350,12 +1358,10 @@ sub parse_device {
     return $res;
 }
 
-sub print_volume {
-    my ($class, $key, $volume) = @_;
-
-    return $class->print_ct_unused($volume) if $key =~ m/^unused(\d+)$/;
+sub print_device {
+    my ($class, $info) = @_;
 
-    return $class->print_ct_mountpoint($volume, $key eq 'rootfs');
+    return PVE::JSONSchema::print_property_string($info, $dev_desc);
 }
 
 sub volid_key {
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [pve-devel] [PATCH container 3/3] implement device hotplug
  2024-12-16 17:21 [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper Filip Schauer
  2024-12-16 17:21 ` [pve-devel] [PATCH container 2/3] config: support printing a device Filip Schauer
@ 2024-12-16 17:21 ` Filip Schauer
  2 siblings, 0 replies; 4+ messages in thread
From: Filip Schauer @ 2024-12-16 17:21 UTC (permalink / raw)
  To: pve-devel

Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
 src/PVE/LXC.pm        | 93 ++++++++++++++++++++++++++++++++++++++++++-
 src/PVE/LXC/Config.pm | 19 +++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 12a4378..06902a1 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -5,7 +5,7 @@ use warnings;
 
 use Cwd qw();
 use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY O_CREAT :mode);
 use File::Path;
 use File::Spec;
 use IO::Poll qw(POLLIN POLLHUP);
@@ -2008,6 +2008,97 @@ my $enter_mnt_ns_and_change_aa_profile = sub {
 	or die "failed to change apparmor profile (close() failed): $!\n";
 };
 
+sub device_passthrough_hotplug :prototype($$$) {
+    my ($vmid, $conf, $dev) = @_;
+
+    my ($mode, $rdev) = (stat($dev->{path}))[2, 6];
+
+    die "Could not get mode or device ID of $dev->{path}\n"
+	if (!defined($mode) || !defined($rdev));
+
+    # We do the rest in a fork with an unshared mount namespace:
+    #  -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
+    #     with move_mount privileges on every mount.
+    #  -) create the device node, then grab it, create a file to bind mount the device node onto in
+    #     the container, switch to the container mount namespace, and move_mount the device node.
+
+    PVE::Tools::run_fork(sub {
+	# Pin the container pid longer, we also need to get its monitor/parent:
+	my ($ct_pid, $ct_pidfd) = open_lxc_pid($vmid)
+	    or die "failed to open pidfd of container $vmid\'s init process\n";
+
+	my ($monitor_pid, $monitor_pidfd) = open_ppid($ct_pid)
+	    or die "failed to open pidfd of container $vmid\'s monitor process\n";
+
+	my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
+	my $ct_user_ns = $get_container_namespace->($vmid, $ct_pid, 'user');
+	my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
+
+	# Enter monitor mount namespace and switch to 'pve-container-mounthotplug' apparmor profile.
+	$enter_mnt_ns_and_change_aa_profile->($monitor_mnt_ns, "pve-container-mounthotplug", undef);
+
+	# Create the device node
+	my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
+	my $passthrough_device_path = $passthrough_dir . $dev->{path};
+	PVE::Tools::mknod($passthrough_device_path, $mode, $rdev)
+	    or die("failed to mknod $passthrough_device_path: $!\n");
+
+	# Use chmod because umask could mess with the access mode on mknod
+	my $passthrough_mode = 0660;
+	$passthrough_mode = oct($dev->{mode}) if defined($dev->{mode});
+	chmod $passthrough_mode, $passthrough_device_path
+	    or die "failed to chmod $passthrough_mode $passthrough_device_path: $!\n";
+
+	# Set uid and gid of the device node (container IDs are mapped to host IDs first)
+	my $uid = 0;
+	my $gid = 0;
+	$uid = $dev->{uid} if defined($dev->{uid});
+	$gid = $dev->{gid} if defined($dev->{gid});
+	my $id_map = (PVE::LXC::parse_id_maps($conf))[0];
+	$uid = PVE::LXC::map_ct_uid_to_host($uid, $id_map);
+	$gid = PVE::LXC::map_ct_gid_to_host($gid, $id_map);
+	chown $uid, $gid, $passthrough_device_path
+	    or die("failed to chown $uid:$gid $passthrough_device_path: $!\n");
+
+	my $srcfh = PVE::Tools::open_tree(&AT_FDCWD, $passthrough_device_path, &OPEN_TREE_CLOEXEC | &OPEN_TREE_CLONE)
+	    or die "open_tree() on passthrough device node failed: $!\n";
+
+	if ($conf->{unprivileged}) {
+	    PVE::Tools::setns(fileno($ct_user_ns), PVE::Tools::CLONE_NEWUSER)
+		or die "failed to enter user namespace of container $vmid: $!\n";
+
+	    POSIX::setuid(0);
+	    POSIX::setgid(0);
+	}
+
+	# Create a regular file in the container to bind mount the device node onto.
+	sysopen(my $dstfh, "/proc/$ct_pid/root$dev->{path}", O_CREAT)
+	    or die "failed to open '/proc/$ct_pid/root$dev->{path}': $!\n";
+
+	# Enter the container mount namespace
+	PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS);
+	chdir('/')
+	    or die "failed to change root directory within the container's mount namespace: $!\n";
+
+	# Bind mount the device node into the container
+	PVE::Tools::move_mount(fileno($srcfh), '', fileno($dstfh), '', &MOVE_MOUNT_F_EMPTY_PATH | &MOVE_MOUNT_T_EMPTY_PATH)
+	    or die "move_mount failed: $!\n";
+    });
+
+    # Allow or deny device access with cgroup2
+    my $major = PVE::Tools::dev_t_major($rdev);
+    my $minor = PVE::Tools::dev_t_minor($rdev);
+    my $device_type = S_ISBLK($mode) ? 'b' : 'c';
+
+    run_command(["lxc-cgroup", "-n", $vmid, "devices.deny", "$device_type $major:$minor w"])
+	if ($dev->{'deny-write'});
+
+    my $allow_perms = $dev->{'deny-write'} ? 'r' : 'rw';
+    run_command([
+	"lxc-cgroup", "-n", $vmid, "devices.allow", "$device_type $major:$minor $allow_perms"
+    ]);
+}
+
 sub mountpoint_hotplug :prototype($$$$$) {
     my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
 
diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index b44bcce..1acc76d 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -1529,6 +1529,13 @@ sub vmconfig_hotplug_pending {
 		$class->apply_pending_mountpoint($vmid, $conf, $opt, $storecfg, 1);
 		# apply_pending_mountpoint modifies the value if it creates a new disk
 		$value = $conf->{pending}->{$opt};
+	    } elsif ($opt =~ m/^dev(\d+)$/) {
+		if (exists($conf->{$opt})) {
+		    die "skip\n"; # don't try to hotplug over existing dev
+		}
+
+		$class->apply_pending_device_passthrough($vmid, $conf, $opt, 1);
+		$value = $conf->{pending}->{$opt};
 	    } else {
 		die "skip\n"; # skip non-hotpluggable
 	    }
@@ -1623,6 +1630,18 @@ my $rescan_volume = sub {
     warn "Could not rescan volume size - $@\n" if $@;
 };
 
+sub apply_pending_device_passthrough {
+    my ($class, $vmid, $conf, $opt, $running) = @_;
+
+    my $dev = $class->parse_device($conf->{pending}->{$opt});
+    my $old = $conf->{$opt};
+    if ($running) {
+	die "skip\n" if defined($old); # TODO: editing a device passthrough
+	PVE::LXC::device_passthrough_hotplug($vmid, $conf, $dev);
+	$conf->{pending}->{$opt} = $class->print_device($dev);
+    }
+}
+
 sub apply_pending_mountpoint {
     my ($class, $vmid, $conf, $opt, $storecfg, $running) = @_;
 
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-12-16 17:22 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-12-16 17:21 [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 2/3] config: support printing a device Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 3/3] implement device hotplug Filip Schauer

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal