all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Thomas Lamprecht <t.lamprecht@proxmox.com>
To: Proxmox VE development discussion <pve-devel@lists.proxmox.com>,
	Filip Schauer <f.schauer@proxmox.com>
Subject: Re: [pve-devel] [PATCH container v4 4/4] implement device hotplug
Date: Wed, 30 Jul 2025 12:59:52 +0200	[thread overview]
Message-ID: <c1cecf40-2544-4d57-af7f-a51db2de4680@proxmox.com> (raw)
In-Reply-To: <20250728124800.96685-5-f.schauer@proxmox.com>

Am 28.07.25 um 14:49 schrieb Filip Schauer:
> This only includes adding devices to a running container. Removing or
> editing existing devices is still not implemented.
> 
> Signed-off-by: Filip Schauer <f.schauer@proxmox.com>
> ---
>  src/PVE/LXC.pm        | 84 ++++++++++++++++++++++++++++++++++++++++++-
>  src/PVE/LXC/Config.pm | 19 ++++++++++
>  2 files changed, 102 insertions(+), 1 deletion(-)
> 
> diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
> index e5c0714..63fb5d1 100644
> --- a/src/PVE/LXC.pm
> +++ b/src/PVE/LXC.pm
> @@ -5,7 +5,7 @@ use warnings;
>  
>  use Cwd qw();
>  use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
> -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
> +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY O_CREAT :mode);
>  use File::Basename;
>  use File::Path;
>  use File::Spec;
> @@ -2178,6 +2178,88 @@ my $enter_mnt_ns_and_change_aa_profile = sub {
>          or die "failed to change apparmor profile (close() failed): $!\n";
>  };
>  
> +sub device_passthrough_hotplug : prototype($$$) {
> +    my ($vmid, $conf, $dev) = @_;
> +
> +    my ($mode, $rdev) = PVE::LXC::Tools::get_device_mode_and_rdev($dev->{path});
> +    my $device_type = S_ISBLK($mode) ? 'b' : 'c';
> +    my $major = PVE::Tools::dev_t_major($rdev);
> +    my $minor = PVE::Tools::dev_t_minor($rdev);
> +
> +    # We do the rest in a fork with an unshared mount namespace:
> +    #  -) change our apparmor profile to 'pve-container-mounthotplug', which is '/usr/bin/lxc-start'
> +    #     with move_mount privileges on every mount.
> +    #  -) create the device node, then grab it, create a file to bind mount the device node onto in
> +    #     the container, switch to the container mount namespace, and move_mount the device node.
> +
> +    PVE::Tools::run_fork(sub {
> +        # Pin the container pid longer, we also need to get its monitor/parent:
> +        my ($ct_pid, $ct_pidfd) = open_lxc_pid($vmid)
> +            or die "failed to open pidfd of container $vmid\'s init process\n";
> +
> +        my ($monitor_pid, $monitor_pidfd) = open_ppid($ct_pid)
> +            or die "failed to open pidfd of container $vmid\'s monitor process\n";
> +
> +        my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
> +        my $ct_user_ns = $get_container_namespace->($vmid, $ct_pid, 'user');
> +        my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
> +
> +        # Enter monitor mount namespace and switch to 'pve-container-mounthotplug' apparmor profile.
> +        $enter_mnt_ns_and_change_aa_profile->(
> +            $monitor_mnt_ns, "pve-container-mounthotplug", undef,
> +        );
> +
> +        my $id_map = (PVE::LXC::parse_id_maps($conf))[0];
> +        my $passthrough_device_path = create_passthrough_device_node(
> +            "/var/lib/lxc/$vmid/passthrough",
> +            $dev, $mode, $rdev, $id_map,
> +        );
> +
> +        my $srcfh = PVE::Tools::open_tree(
> +            &AT_FDCWD,
> +            $passthrough_device_path,
> +            &OPEN_TREE_CLOEXEC | &OPEN_TREE_CLONE,
> +        ) or die "open_tree() on passthrough device node failed: $!\n";
> +
> +        if ($conf->{unprivileged}) {
> +            PVE::Tools::setns(fileno($ct_user_ns), PVE::Tools::CLONE_NEWUSER)
> +                or die "failed to enter user namespace of container $vmid: $!\n";
> +
> +            POSIX::setuid(0);
> +            POSIX::setgid(0);
> +        }
> +
> +        # Create a regular file in the container to bind mount the device node onto.
> +        my $device_path = "/proc/$ct_pid/root$dev->{path}";
> +        File::Path::make_path(dirname($device_path));
> +        sysopen(my $dstfh, $device_path, O_CREAT)
> +            or die "failed to create '$device_path': $!\n";
> +
> +        # Enter the container mount namespace
> +        PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS);
> +        chdir('/')
> +            or die "failed to change directory within the container's mount namespace: $!\n";
> +
> +        # Bind mount the device node into the container
> +        PVE::Tools::move_mount(
> +            fileno($srcfh),
> +            '',
> +            fileno($dstfh),
> +            '',
> +            &MOVE_MOUNT_F_EMPTY_PATH | &MOVE_MOUNT_T_EMPTY_PATH,
> +        ) or die "move_mount failed: $!\n";
> +    });
> +
> +    # Allow or deny device access with cgroup2
> +    run_command(["lxc-cgroup", "-n", $vmid, "devices.deny", "$device_type $major:$minor w"])
> +        if ($dev->{'deny-write'});
> +
> +    my $allow_perms = $dev->{'deny-write'} ? 'r' : 'rw';
> +    run_command([
> +        "lxc-cgroup", "-n", $vmid, "devices.allow", "$device_type $major:$minor $allow_perms",
> +    ]);
> +}
> +
>  sub mountpoint_hotplug : prototype($$$$$) {
>      my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
>  
> diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm
> index 7aa6263..de963bc 100644
> --- a/src/PVE/LXC/Config.pm
> +++ b/src/PVE/LXC/Config.pm
> @@ -1613,6 +1613,13 @@ sub vmconfig_hotplug_pending {
>                  $class->apply_pending_mountpoint($vmid, $conf, $opt, $storecfg, 1);
>                  # apply_pending_mountpoint modifies the value if it creates a new disk
>                  $value = $conf->{pending}->{$opt};
> +            } elsif ($opt =~ m/^dev(\d+)$/) {
> +                if (exists($conf->{$opt})) {
> +                    die "skip\n"; # don't try to hotplug over existing dev
> +                }
> +
> +                $class->apply_pending_device_passthrough($vmid, $conf, $opt, 1);

The assignment below would do well with a comment, and actually I'm not really
sure it's needed: currently, the print_device called in
apply_pending_device_passthrough just prints the property string per the format,
which we just parsed before, so the value should be 1:1 the same before and after
here? That is unlike the mpX mountpoints, where the formatted string might change.
Or is this just preparation for more complex handling/devices in the future?

Either way, this can be fine as is for now, as it doesn't really hurt; it's just
a little bit confusing and might be unnecessary, so maybe take another look
at this or provide some rationale.

> +                $value = $conf->{pending}->{$opt};
>              } else {
>                  die "skip\n"; # skip non-hotpluggable
>              }
> @@ -1732,6 +1739,18 @@ my $rescan_volume = sub {
>      warn "Could not rescan volume size - $@\n" if $@;
>  };
>  
> +sub apply_pending_device_passthrough {
> +    my ($class, $vmid, $conf, $opt, $running) = @_;
> +
> +    my $dev = $class->parse_device($conf->{pending}->{$opt});

parsed here

> +    my $old = $conf->{$opt};
> +    if ($running) {
> +        die "skip\n" if defined($old); # TODO: editing a device passthrough
> +        PVE::LXC::device_passthrough_hotplug($vmid, $conf, $dev);
> +        $conf->{pending}->{$opt} = $class->print_device($dev);

serialized 1:1 again here

> +    }
> +}
> +
>  sub apply_pending_mountpoint {
>      my ($class, $vmid, $conf, $opt, $storecfg, $running) = @_;
>  



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


  reply	other threads:[~2025-07-30 10:58 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-28 12:47 [pve-devel] [PATCH container v4 0/4] " Filip Schauer
2025-07-28 12:47 ` [pve-devel] [PATCH container v4 1/4] extract apparmor profile & namespace switch to a helper Filip Schauer
2025-07-28 12:47 ` [pve-devel] [PATCH container v4 2/4] extract passthrough device node creation " Filip Schauer
2025-07-28 12:47 ` [pve-devel] [PATCH container v4 3/4] config: support printing a device Filip Schauer
2025-07-28 12:47 ` [pve-devel] [PATCH container v4 4/4] implement device hotplug Filip Schauer
2025-07-30 10:59   ` Thomas Lamprecht [this message]
2025-07-30 12:12     ` Filip Schauer
2025-07-30 11:40 ` [pve-devel] applied-series: [PATCH container v4 0/4] " Thomas Lamprecht

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c1cecf40-2544-4d57-af7f-a51db2de4680@proxmox.com \
    --to=t.lamprecht@proxmox.com \
    --cc=f.schauer@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal