* [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper
2024-12-16 17:21 [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer
@ 2024-12-16 17:21 ` Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 2/3] config: support printing a device Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 3/3] implement device hotplug Filip Schauer
2 siblings, 0 replies; 4+ messages in thread
From: Filip Schauer @ 2024-12-16 17:21 UTC (permalink / raw)
To: pve-devel
Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
src/PVE/LXC.pm | 71 +++++++++++++++++++++++++++++---------------------
1 file changed, 41 insertions(+), 30 deletions(-)
diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index e78e365..12a4378 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -1982,15 +1982,42 @@ sub __mountpoint_mount {
die "unsupported storage";
}
+# Enter a mount namespace and afterwards switch this process to another apparmor profile.
+#
+# Parameters:
+#   $mnt_ns                 - file handle whose fileno() is handed to setns() as the mount
+#                             namespace to enter
+#   $aa_profile             - name of the apparmor profile to change to
+#   $code_pre_changeprofile - optional code reference; called without arguments after the
+#                             namespace switch, but before the profile change
+#
+# Dies with a descriptive message as soon as one of the steps fails.
+my $enter_mnt_ns_and_change_aa_profile = sub {
+    my ($mnt_ns, $aa_profile, $code_pre_changeprofile) = @_;
+
+    # Grab a file descriptor to our apparmor label file so we can change the profile
+    sysopen(my $aa_fd, "/proc/self/attr/current", O_WRONLY)
+        or die "failed to open '/proc/self/attr/current' for writing: $!\n";
+
+    # But switch namespaces first, to make sure the namespace switches aren't blocked by
+    # apparmor.
+    # The result is checked so that we never continue in the wrong mount namespace.
+    PVE::Tools::setns(fileno($mnt_ns), PVE::Tools::CLONE_NEWNS)
+        or die "failed to enter mount namespace: $!\n";
+    chdir('/')
+        or die "failed to change root directory within mount namespace: $!\n";
+
+    $code_pre_changeprofile->() if defined($code_pre_changeprofile);
+
+    # Now switch our apparmor profile:
+    my $data = "changeprofile $aa_profile";
+    my $data_written = syswrite($aa_fd, $data, length($data));
+    die "failed to change apparmor profile: $!\n"
+        if !defined($data_written);
+    # A short write is not reported through $!, so do not print a stale errno for it.
+    die "failed to change apparmor profile: short write ($data_written of "
+        . length($data) . " bytes)\n"
+        if $data_written != length($data);
+
+    # Check errors on close as well:
+    close($aa_fd)
+        or die "failed to change apparmor profile (close() failed): $!\n";
+};
+
sub mountpoint_hotplug :prototype($$$$$) {
my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf);
- # We do the rest in a fork with an unshared mount namespace, because:
- # -) change our papparmor profile to that of /usr/bin/lxc-start
- # -) we're now going to 'stage' # the mountpoint, then grab it, then move into the
- # container's namespace, then mount it.
+ # We do the rest in a fork with an unshared mount namespace:
+ # -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
+ # with move_mount privileges on every mount.
+ # -) we're now going to 'stage' the mountpoint, then grab it, then move into the container's
+ # namespace, then mount it.
PVE::Tools::run_fork(sub {
# Pin the container pid longer, we also need to get its monitor/parent:
@@ -2003,32 +2030,16 @@ sub mountpoint_hotplug :prototype($$$$$) {
my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
- # Grab a file descriptor to our apparmor label file so we can change into the 'lxc-start'
- # profile to lower our privileges to the same level we have in the start hook:
- sysopen(my $aa_fd, "/proc/self/attr/current", O_WRONLY)
- or die "failed to open '/proc/self/attr/current' for writing: $!\n";
- # But switch namespaces first, to make sure the namespace switches aren't blocked by
- # apparmor.
-
- # Change into the monitor's mount namespace. We "pin" the mount into the monitor's
- # namespace for it to remain active there since the container will be able to unmount
- # hotplugged mount points and thereby potentially free up loop devices, which is a security
- # concern.
- PVE::Tools::setns(fileno($monitor_mnt_ns), PVE::Tools::CLONE_NEWNS);
- chdir('/')
- or die "failed to change root directory within the monitor's mount namespace: $!\n";
-
- my $dir = get_staging_mount_path($opt);
-
- # Now switch our apparmor profile before mounting:
- my $data = 'changeprofile pve-container-mounthotplug';
- my $data_written = syswrite($aa_fd, $data, length($data));
- if (!defined($data_written) || $data_written != length($data)) {
- die "failed to change apparmor profile: $!\n";
- }
- # Check errors on close as well:
- close($aa_fd)
- or die "failed to change apparmor profile (close() failed): $!\n";
+ # -) Change into the monitor's mount namespace. We "pin" the mount into the monitor's
+ # namespace for it to remain active there since the container will be able to unmount
+ # hotplugged mount points and thereby potentially free up loop devices, which is a
+ # security concern.
+ # -) Prepare the staging directory.
+ # -) Switch to the 'pve-container-mounthotplug' apparmor profile.
+ my $dir;
+ $enter_mnt_ns_and_change_aa_profile->($monitor_mnt_ns, "pve-container-mounthotplug", sub {
+ $dir = get_staging_mount_path($opt);
+ });
my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $root_uid, $root_gid);
--
2.39.5
_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
* [pve-devel] [PATCH container 3/3] implement device hotplug
2024-12-16 17:21 [pve-devel] [PATCH container 0/3] implement device hotplug Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 1/3] extract apparmor profile & namespace switch to its own helper Filip Schauer
2024-12-16 17:21 ` [pve-devel] [PATCH container 2/3] config: support printing a device Filip Schauer
@ 2024-12-16 17:21 ` Filip Schauer
2 siblings, 0 replies; 4+ messages in thread
From: Filip Schauer @ 2024-12-16 17:21 UTC (permalink / raw)
To: pve-devel
Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
---
src/PVE/LXC.pm | 93 ++++++++++++++++++++++++++++++++++++++++++-
src/PVE/LXC/Config.pm | 19 +++++++++
2 files changed, 111 insertions(+), 1 deletion(-)
diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 12a4378..06902a1 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -5,7 +5,7 @@ use warnings;
use Cwd qw();
use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY O_CREAT :mode);
use File::Path;
use File::Spec;
use IO::Poll qw(POLLIN POLLHUP);
@@ -2008,6 +2008,97 @@ my $enter_mnt_ns_and_change_aa_profile = sub {
or die "failed to change apparmor profile (close() failed): $!\n";
};
+# Pass a host device node through to the running container $vmid.
+#
+# Parameters:
+#   $vmid - ID of the container
+#   $conf - container configuration hash; 'unprivileged' is read here and the hash is handed to
+#           PVE::LXC::parse_id_maps()
+#   $dev  - device description hash; 'path' is used as source and as target path, the optional
+#           keys 'mode', 'uid', 'gid' and 'deny-write' are honored
+#
+# Dies if mode or device ID of $dev->{path} cannot be determined. The forked code dies on the
+# first step that fails.
+sub device_passthrough_hotplug :prototype($$$) {
+    my ($vmid, $conf, $dev) = @_;
+
+    # Elements 2 and 6 of stat() are the file mode and the device ID of the node.
+    my ($mode, $rdev) = (stat($dev->{path}))[2, 6];
+
+    die "Could not get mode or device ID of $dev->{path}\n"
+        if (!defined($mode) || !defined($rdev));
+
+    # We do the rest in a fork with an unshared mount namespace:
+    # -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
+    #    with move_mount privileges on every mount.
+    # -) create the device node, then grab it, create a file to bind mount the device node onto in
+    #    the container, switch to the container mount namespace, and move_mount the device node.
+
+    PVE::Tools::run_fork(sub {
+        # Pin the container pid longer, we also need to get its monitor/parent:
+        my ($ct_pid, $ct_pidfd) = open_lxc_pid($vmid)
+            or die "failed to open pidfd of container $vmid\'s init process\n";
+
+        my ($monitor_pid, $monitor_pidfd) = open_ppid($ct_pid)
+            or die "failed to open pidfd of container $vmid\'s monitor process\n";
+
+        my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
+        my $ct_user_ns = $get_container_namespace->($vmid, $ct_pid, 'user');
+        my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
+
+        # Enter monitor mount namespace and switch to 'pve-container-mounthotplug' apparmor profile.
+        $enter_mnt_ns_and_change_aa_profile->($monitor_mnt_ns, "pve-container-mounthotplug", undef);
+
+        # Create the device node
+        # NOTE(review): the parent directory of the node is assumed to exist already - confirm.
+        my $passthrough_dir = "/var/lib/lxc/$vmid/passthrough";
+        my $passthrough_device_path = $passthrough_dir . $dev->{path};
+        PVE::Tools::mknod($passthrough_device_path, $mode, $rdev)
+            or die("failed to mknod $passthrough_device_path: $!\n");
+
+        # Use chmod because umask could mess with the access mode on mknod
+        my $passthrough_mode = 0660;
+        $passthrough_mode = oct($dev->{mode}) if defined($dev->{mode});
+        # Report the mode in octal, the decimal value is of no use to the reader.
+        chmod($passthrough_mode, $passthrough_device_path)
+            or die sprintf(
+                "failed to chmod %04o %s: %s\n",
+                $passthrough_mode, $passthrough_device_path, $!,
+            );
+
+        # Set uid and gid of the device node
+        my $uid = 0;
+        my $gid = 0;
+        $uid = $dev->{uid} if defined($dev->{uid});
+        $gid = $dev->{gid} if defined($dev->{gid});
+        my $id_map = (PVE::LXC::parse_id_maps($conf))[0];
+        $uid = PVE::LXC::map_ct_uid_to_host($uid, $id_map);
+        $gid = PVE::LXC::map_ct_gid_to_host($gid, $id_map);
+        chown $uid, $gid, $passthrough_device_path
+            or die("failed to chown $uid:$gid $passthrough_device_path: $!\n");
+
+        my $srcfh = PVE::Tools::open_tree(
+            &AT_FDCWD,
+            $passthrough_device_path,
+            &OPEN_TREE_CLOEXEC | &OPEN_TREE_CLONE,
+        ) or die "open_tree() on passthrough device node failed: $!\n";
+
+        if ($conf->{unprivileged}) {
+            PVE::Tools::setns(fileno($ct_user_ns), PVE::Tools::CLONE_NEWUSER)
+                or die "failed to enter user namespace of container $vmid: $!\n";
+
+            # Do not continue with unexpected IDs if one of the switches fails.
+            POSIX::setuid(0)
+                or die "failed to change to uid 0 in user namespace of container $vmid: $!\n";
+            POSIX::setgid(0)
+                or die "failed to change to gid 0 in user namespace of container $vmid: $!\n";
+        }
+
+        # Create a regular file in the container to bind mount the device node onto.
+        sysopen(my $dstfh, "/proc/$ct_pid/root$dev->{path}", O_CREAT)
+            or die "failed to open '/proc/$ct_pid/root$dev->{path}': $!\n";
+
+        # Enter the container mount namespace
+        # The result is checked, just like for the user namespace switch above.
+        PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS)
+            or die "failed to enter mount namespace of container $vmid: $!\n";
+        chdir('/')
+            or die "failed to change root directory within the container's mount namespace: $!\n";
+
+        # Bind mount the device node into the container
+        PVE::Tools::move_mount(
+            fileno($srcfh),
+            '',
+            fileno($dstfh),
+            '',
+            &MOVE_MOUNT_F_EMPTY_PATH | &MOVE_MOUNT_T_EMPTY_PATH,
+        ) or die "move_mount failed: $!\n";
+    });
+
+    # Allow or deny device access with cgroup2
+    my $major = PVE::Tools::dev_t_major($rdev);
+    my $minor = PVE::Tools::dev_t_minor($rdev);
+    my $device_type = S_ISBLK($mode) ? 'b' : 'c';
+
+    run_command(["lxc-cgroup", "-n", $vmid, "devices.deny", "$device_type $major:$minor w"])
+        if ($dev->{'deny-write'});
+
+    my $allow_perms = $dev->{'deny-write'} ? 'r' : 'rw';
+    run_command([
+        "lxc-cgroup", "-n", $vmid, "devices.allow", "$device_type $major:$minor $allow_perms"
+    ]);
+}
+
sub mountpoint_hotplug :prototype($$$$$) {
my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
index b44bcce..1acc76d 100644
--- a/src/PVE/LXC/Config.pm
+++ b/src/PVE/LXC/Config.pm
@@ -1529,6 +1529,13 @@ sub vmconfig_hotplug_pending {
$class->apply_pending_mountpoint($vmid, $conf, $opt, $storecfg, 1);
# apply_pending_mountpoint modifies the value if it creates a new disk
$value = $conf->{pending}->{$opt};
+ } elsif ($opt =~ m/^dev(\d+)$/) {
+ if (exists($conf->{$opt})) {
+ die "skip\n"; # don't try to hotplug over existing dev
+ }
+
+ $class->apply_pending_device_passthrough($vmid, $conf, $opt, 1);
+ $value = $conf->{pending}->{$opt};
} else {
die "skip\n"; # skip non-hotpluggable
}
@@ -1623,6 +1630,18 @@ my $rescan_volume = sub {
warn "Could not rescan volume size - $@\n" if $@;
};
+# Apply the pending device passthrough entry $opt of container $vmid.
+#
+# The pending value is always parsed with parse_device(). Only when $running is true is the
+# device hotplugged and the pending value replaced by the output of print_device().
+# Dies with "skip\n" when $running is true and $opt already has a value in $conf.
+sub apply_pending_device_passthrough {
+    my ($class, $vmid, $conf, $opt, $running) = @_;
+
+    my $device = $class->parse_device($conf->{pending}{$opt});
+    my $current = $conf->{$opt};
+    if ($running) {
+        # TODO: editing a device passthrough
+        if (defined($current)) {
+            die "skip\n";
+        }
+        PVE::LXC::device_passthrough_hotplug($vmid, $conf, $device);
+        $conf->{pending}{$opt} = $class->print_device($device);
+    }
+}
+
sub apply_pending_mountpoint {
my ($class, $vmid, $conf, $opt, $storecfg, $running) = @_;
--
2.39.5
_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
^ permalink raw reply [flat|nested] 4+ messages in thread