From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by lists.proxmox.com (Postfix) with ESMTPS id 5378DB382 for ; Wed, 9 Aug 2023 10:38:40 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id CE7F411EF8 for ; Wed, 9 Aug 2023 10:38:08 +0200 (CEST) Received: from proxmox-new.maurer-it.com (proxmox-new.maurer-it.com [94.136.29.106]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by firstgate.proxmox.com (Proxmox) with ESMTPS for ; Wed, 9 Aug 2023 10:38:05 +0200 (CEST) Received: from proxmox-new.maurer-it.com (localhost.localdomain [127.0.0.1]) by proxmox-new.maurer-it.com (Proxmox) with ESMTP id 5F6F242B3A for ; Wed, 9 Aug 2023 10:38:05 +0200 (CEST) From: Markus Frank To: pve-devel@lists.proxmox.com Date: Wed, 9 Aug 2023 10:37:32 +0200 Message-Id: <20230809083739.100024-5-m.frank@proxmox.com> X-Mailer: git-send-email 2.39.2 In-Reply-To: <20230809083739.100024-1-m.frank@proxmox.com> References: <20230809083739.100024-1-m.frank@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-SPAM-LEVEL: Spam detection results: 0 AWL -0.054 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Subject: [pve-devel] [PATCH qemu-server v7 4/11] feature #1027: virtio-fs support X-BeenThere: pve-devel@lists.proxmox.com X-Mailman-Version: 2.1.29 Precedence: list List-Id: Proxmox VE development discussion 
List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 09 Aug 2023 08:38:40 -0000 add support for sharing directories with a guest vm. virtio-fs needs virtiofsd to be started. In order to start virtiofsd as a process (despite being a daemon it does not run in the background), a double-fork is used. virtiofsd should close itself together with qemu. The available parameters are dirid and the optional parameters direct-io & cache. Additionally the xattr & acl parameters override the directory mapping settings for xattr & acl. The dirid gets mapped to the path on the current node and is also used as a mount-tag (name used to mount the device on the guest). example config: ``` virtiofs0: foo,direct-io=1,cache=always,acl=1 virtiofs1: dirid=bar,cache=never,xattr=1 ``` For information on the optional parameters see: https://gitlab.com/virtio-fs/virtiofsd/-/blob/main/README.md Signed-off-by: Markus Frank --- I did not get virtiofsd to run with run_command without creating zombie processes after shutdown. So I replaced run_command with exec for now. Maybe someone can find out why this happens. PVE/QemuServer.pm | 174 ++++++++++++++++++++++++++++++++++++++- PVE/QemuServer/Memory.pm | 25 ++++-- debian/control | 1 + 3 files changed, 193 insertions(+), 7 deletions(-) diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm index 484bc7f..d547dd6 100644 --- a/PVE/QemuServer.pm +++ b/PVE/QemuServer.pm @@ -43,6 +43,7 @@ use PVE::PBSClient; use PVE::RESTEnvironment qw(log_warn); use PVE::RPCEnvironment; use PVE::Storage; +use PVE::Mapping::Dir; use PVE::SysFSTools; use PVE::Systemd; use PVE::Tools qw(run_command file_read_firstline file_get_contents dir_glob_foreach get_host_arch $IPV6RE); @@ -276,6 +277,42 @@ my $rng_fmt = { }, }; +my $virtiofs_fmt = { + 'dirid' => { + type => 'string', + default_key => 1, + description => "Mapping identifier of the directory mapping to be" + ." shared with the guest. 
Also used as a mount tag inside the VM.", + format_description => 'mapping-id', + format => 'pve-configid', + }, + 'cache' => { + type => 'string', + description => "The caching policy the file system should use" + ." (auto, always, never).", + format_description => "virtiofs-cache", + enum => [qw(auto always never)], + optional => 1, + }, + 'direct-io' => { + type => 'boolean', + description => "Honor the O_DIRECT flag passed down by guest applications", + format_description => "virtiofs-directio", + optional => 1, + }, + xattr => { + type => 'boolean', + description => "Enable support for extended attributes.", + optional => 1, + }, + acl => { + type => 'boolean', + description => "Enable support for posix ACLs (implies --xattr).", + optional => 1, + }, +}; +PVE::JSONSchema::register_format('pve-qm-virtiofs', $virtiofs_fmt); + my $meta_info_fmt = { 'ctime' => { type => 'integer', @@ -840,6 +877,7 @@ while (my ($k, $v) = each %$confdesc) { } my $MAX_NETS = 32; +my $MAX_VIRTIOFS = 10; my $MAX_SERIAL_PORTS = 4; my $MAX_PARALLEL_PORTS = 3; my $MAX_NUMA = 8; @@ -984,6 +1022,21 @@ my $netdesc = { PVE::JSONSchema::register_standard_option("pve-qm-net", $netdesc); +my $virtiofsdesc = { + optional => 1, + type => 'string', format => $virtiofs_fmt, + description => "share files between host and guest", +}; +PVE::JSONSchema::register_standard_option("pve-qm-virtiofs", $virtiofsdesc); + +sub max_virtiofs { + return $MAX_VIRTIOFS; +} + +for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) { + $confdesc->{"virtiofs$i"} = $virtiofsdesc; +} + my $ipconfig_fmt = { ip => { type => 'string', @@ -4113,6 +4166,21 @@ sub config_to_command { push @$devices, '-device', $netdevicefull; } + my $virtiofs_enabled = 0; + for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) { + my $opt = "virtiofs$i"; + + next if !$conf->{$opt}; + my $virtiofs = parse_property_string('pve-qm-virtiofs', $conf->{$opt}); + next if !$virtiofs; + + push @$devices, '-chardev', "socket,id=virtfs$i,path=/var/run/virtiofsd/vm$vmid-fs$i"; + 
push @$devices, '-device', 'vhost-user-fs-pci,queue-size=1024' + .",chardev=virtfs$i,tag=$virtiofs->{dirid}"; + + $virtiofs_enabled = 1; + } + if ($conf->{ivshmem}) { my $ivshmem = parse_property_string($ivshmem_fmt, $conf->{ivshmem}); @@ -4172,6 +4240,14 @@ sub config_to_command { } push @$machineFlags, "type=${machine_type_min}"; + if ($virtiofs_enabled && !$conf->{numa}) { + # kvm: '-machine memory-backend' and '-numa memdev' properties are + # mutually exclusive + push @$devices, '-object', 'memory-backend-file,id=virtiofs-mem' + .",size=$conf->{memory}M,mem-path=/dev/shm,share=on"; + push @$machineFlags, 'memory-backend=virtiofs-mem'; + } + push @$cmd, @$devices; push @$cmd, '-rtc', join(',', @$rtcFlags) if scalar(@$rtcFlags); push @$cmd, '-machine', join(',', @$machineFlags) if scalar(@$machineFlags); @@ -4198,6 +4274,85 @@ sub config_to_command { return wantarray ? ($cmd, $vollist, $spice_port, $pci_devices) : $cmd; } +sub start_virtiofs { + my ($vmid, $fsid, $virtiofs) = @_; + + my $dir_cfg = PVE::Mapping::Dir::config()->{ids}->{$virtiofs->{dirid}}; + my $node_list = PVE::Mapping::Dir::find_on_current_node($virtiofs->{dirid}); + + if (!$node_list || scalar($node_list->@*) != 1) { + die "virtiofs needs exactly one mapping for this node\n"; + } + + eval { + PVE::Mapping::Dir::assert_valid($node_list->[0]); + }; + if (my $err = $@) { + die "Directory Mapping invalid: $err\n"; + } + + my $node_cfg = $node_list->[0]; + my $path = $node_cfg->{path}; + my $socket_path_root = "/var/run/virtiofsd"; + mkdir $socket_path_root; + my $socket_path = "$socket_path_root/vm$vmid-fs$fsid"; + unlink($socket_path); + my $socket = IO::Socket::UNIX->new( + Type => SOCK_STREAM, + Local => $socket_path, + Listen => 1, + ) or die "cannot create socket - $!\n"; + + my $flags = fcntl($socket, F_GETFD, 0) + or die "failed to get file descriptor flags: $!\n"; + fcntl($socket, F_SETFD, $flags & ~FD_CLOEXEC) + or die "failed to remove FD_CLOEXEC from file descriptor\n"; + + my $fd = 
$socket->fileno(); + + my $virtiofsd_bin = '/usr/libexec/virtiofsd'; + + my $pid = fork(); + if ($pid == 0) { + setsid(); + $0 = "task pve-vm$vmid-virtiofs$fsid"; + for my $fd_loop (3 .. POSIX::sysconf( &POSIX::_SC_OPEN_MAX )) { + POSIX::close($fd_loop) if ($fd_loop != $fd); + } + + my $pid2 = fork(); + if ($pid2 == 0) { + my $cmd = [$virtiofsd_bin, "--fd=$fd", "--shared-dir=$path"]; + push @$cmd, '--xattr' if ($virtiofs->{xattr}); + push @$cmd, '--posix-acl' if ($virtiofs->{acl}); + + # Default to dir config xattr & acl settings + push @$cmd, '--xattr' + if !defined $virtiofs->{'xattr'} && $dir_cfg->{'xattr'}; + push @$cmd, '--posix-acl' + if !defined $virtiofs->{'acl'} && $dir_cfg->{'acl'}; + + push @$cmd, '--announce-submounts' if ($node_cfg->{submounts}); + push @$cmd, '--allow-direct-io' if ($virtiofs->{'direct-io'}); + push @$cmd, "--cache=$virtiofs->{'cache'}" if ($virtiofs->{'cache'}); + + exec(@$cmd); + } elsif (!defined($pid2)) { + die "could not fork to start virtiofsd\n"; + } else { + POSIX::_exit(0); + } + } elsif (!defined($pid)) { + die "could not fork to start virtiofsd\n"; + } else { + waitpid($pid, 0); + } + + # return socket to keep it alive, + # so that qemu will wait for virtiofsd to start + return $socket; +} + sub check_rng_source { my ($source) = @_; @@ -5655,7 +5810,6 @@ sub vm_start { }); } - # params: # statefile => 'tcp', 'unix' for migration or path/volid for RAM state # skiplock => 0/1, skip checking for config lock @@ -5918,10 +6072,23 @@ sub vm_start_nolock { } $systemd_properties{timeout} = 10 if $statefile; # setting up the scope shoul be quick + my $run_qemu = sub { PVE::Tools::run_fork sub { PVE::Systemd::enter_systemd_scope($vmid, "Proxmox VE VM $vmid", %systemd_properties); + my @virtiofs_sockets; + for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) { + my $opt = "virtiofs$i"; + + next if !$conf->{$opt}; + my $virtiofs = parse_property_string('pve-qm-virtiofs', $conf->{$opt}); + next if !$virtiofs; + + my $virtiofs_socket = 
start_virtiofs($vmid, $i, $virtiofs); + push @virtiofs_sockets, $virtiofs_socket; + } + my $tpmpid; if (my $tpm = $conf->{tpmstate0}) { # start the TPM emulator so QEMU can connect on start @@ -5936,6 +6103,11 @@ sub vm_start_nolock { } die "QEMU exited with code $exitcode\n"; } + + foreach my $virtiofs_socket (@virtiofs_sockets) { + shutdown($virtiofs_socket, 2); + close($virtiofs_socket); + } }; }; diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm index 0601dd6..648bc08 100644 --- a/PVE/QemuServer/Memory.pm +++ b/PVE/QemuServer/Memory.pm @@ -278,6 +278,16 @@ sub config { die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa}; + my $virtiofs_enabled = 0; + for (my $i = 0; $i < PVE::QemuServer::max_virtiofs(); $i++) { + my $opt = "virtiofs$i"; + next if !$conf->{$opt}; + my $virtiofs = PVE::JSONSchema::parse_property_string('pve-qm-virtiofs', $conf->{$opt}); + if ($virtiofs) { + $virtiofs_enabled = 1; + } + } + if ($conf->{numa}) { my $numa_totalmemory = undef; @@ -290,7 +300,8 @@ sub config { my $numa_memory = $numa->{memory}; $numa_totalmemory += $numa_memory; - my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory); + my $memdev = $virtiofs_enabled ? "virtiofs-mem$i" : "ram-node$i"; + my $mem_object = print_mem_object($conf, $memdev, $numa_memory); # cpus my $cpulists = $numa->{cpus}; @@ -315,7 +326,7 @@ sub config { } push @$cmd, '-object', $mem_object; - push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i"; + push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev"; } die "total memory for NUMA nodes must be equal to vm static memory\n" @@ -329,13 +340,13 @@ sub config { die "host NUMA node$i doesn't exist\n" if !host_numanode_exists($i) && $conf->{hugepages}; - my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory); - push @$cmd, '-object', $mem_object; - my $cpus = ($cores * $i); $cpus .= "-" . 
($cpus + $cores - 1) if $cores > 1; - push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i"; + my $memdev = $virtiofs_enabled ? "virtiofs-mem$i" : "ram-node$i"; + my $mem_object = print_mem_object($conf, $memdev, $numa_memory); + push @$cmd, '-object', $mem_object; + push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev"; } } } @@ -364,6 +375,8 @@ sub print_mem_object { my $path = hugepages_mount_path($hugepages_size); return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes"; + } elsif ($id =~ m/^virtiofs-mem/) { + return "memory-backend-file,id=$id,size=${size}M,mem-path=/dev/shm,share=on"; } else { return "memory-backend-ram,id=$id,size=${size}M"; } diff --git a/debian/control b/debian/control index 49f67b2..f008a9b 100644 --- a/debian/control +++ b/debian/control @@ -53,6 +53,7 @@ Depends: dbus, socat, swtpm, swtpm-tools, + virtiofsd, ${misc:Depends}, ${perl:Depends}, ${shlibs:Depends}, -- 2.39.2