all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Markus Frank <m.frank@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH qemu-server v7 4/11] feature #1027: virtio-fs support
Date: Wed,  9 Aug 2023 10:37:32 +0200	[thread overview]
Message-ID: <20230809083739.100024-5-m.frank@proxmox.com> (raw)
In-Reply-To: <20230809083739.100024-1-m.frank@proxmox.com>

add support for sharing directories with a guest vm

virtio-fs needs virtiofsd to be started.

In order to start virtiofsd as a process (despite being a daemon, it does not run
in the background), a double-fork is used.

virtiofsd should close itself together with qemu.

There are the parameters dirid
and the optional parameters direct-io & cache.
Additionally the xattr & acl parameter overwrite the
directory mapping settings for xattr & acl.

The dirid gets mapped to the path on the current node
and is also used as a mount-tag (name used to mount the
device on the guest).

example config:
```
virtiofs0: foo,direct-io=1,cache=always,acl=1
virtiofs1: dirid=bar,cache=never,xattr=1
```

For information on the optional parameters, see:
https://gitlab.com/virtio-fs/virtiofsd/-/blob/main/README.md

Signed-off-by: Markus Frank <m.frank@proxmox.com>
---
I did not get virtiofsd to run with run_command without creating zombie
processes after shutdown.
So I replaced run_command with exec for now. 
Maybe someone can find out why this happens.

 PVE/QemuServer.pm        | 174 ++++++++++++++++++++++++++++++++++++++-
 PVE/QemuServer/Memory.pm |  25 ++++--
 debian/control           |   1 +
 3 files changed, 193 insertions(+), 7 deletions(-)

diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index 484bc7f..d547dd6 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -43,6 +43,7 @@ use PVE::PBSClient;
 use PVE::RESTEnvironment qw(log_warn);
 use PVE::RPCEnvironment;
 use PVE::Storage;
+use PVE::Mapping::Dir;
 use PVE::SysFSTools;
 use PVE::Systemd;
 use PVE::Tools qw(run_command file_read_firstline file_get_contents dir_glob_foreach get_host_arch $IPV6RE);
@@ -276,6 +277,42 @@ my $rng_fmt = {
     },
 };
 
+my $virtiofs_fmt = {
+    'dirid' => {
+	type => 'string',
+	default_key => 1,
+	description => "Mapping identifier of the directory mapping to be"
+	    ." shared with the guest. Also used as a mount tag inside the VM.",
+	format_description => 'mapping-id',
+	format => 'pve-configid',
+    },
+    'cache' => {
+	type => 'string',
+	description => "The caching policy the file system should use"
+	    ." (auto, always, never).",
+	format_description => "virtiofs-cache",
+	enum => [qw(auto always never)],
+	optional => 1,
+    },
+    'direct-io' => {
+	type => 'boolean',
+	description => "Honor the O_DIRECT flag passed down by guest applications",
+	format_description => "virtiofs-directio",
+	optional => 1,
+    },
+    xattr => {
+	type => 'boolean',
+	description => "Enable support for extended attributes.",
+	optional => 1,
+    },
+    acl => {
+	type => 'boolean',
+	description => "Enable support for posix ACLs (implies --xattr).",
+	optional => 1,
+    },
+};
+PVE::JSONSchema::register_format('pve-qm-virtiofs', $virtiofs_fmt);
+
 my $meta_info_fmt = {
     'ctime' => {
 	type => 'integer',
@@ -840,6 +877,7 @@ while (my ($k, $v) = each %$confdesc) {
 }
 
 my $MAX_NETS = 32;
+my $MAX_VIRTIOFS = 10;
 my $MAX_SERIAL_PORTS = 4;
 my $MAX_PARALLEL_PORTS = 3;
 my $MAX_NUMA = 8;
@@ -984,6 +1022,21 @@ my $netdesc = {
 
 PVE::JSONSchema::register_standard_option("pve-qm-net", $netdesc);
 
+my $virtiofsdesc = {
+    optional => 1,
+    type => 'string', format => $virtiofs_fmt,
+    description => "share files between host and guest",
+};
+PVE::JSONSchema::register_standard_option("pve-qm-virtiofs", $virtiofsdesc);
+
+sub max_virtiofs {
+    return $MAX_VIRTIOFS;
+}
+
+for (my $i = 0; $i < $MAX_VIRTIOFS; $i++)  {
+    $confdesc->{"virtiofs$i"} = $virtiofsdesc;
+}
+
 my $ipconfig_fmt = {
     ip => {
 	type => 'string',
@@ -4113,6 +4166,21 @@ sub config_to_command {
 	push @$devices, '-device', $netdevicefull;
     }
 
+    my $virtiofs_enabled = 0;
+    for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) {
+	my $opt = "virtiofs$i";
+
+	next if !$conf->{$opt};
+	my $virtiofs = parse_property_string('pve-qm-virtiofs', $conf->{$opt});
+	next if !$virtiofs;
+
+	push @$devices, '-chardev', "socket,id=virtfs$i,path=/var/run/virtiofsd/vm$vmid-fs$i";
+	push @$devices, '-device', 'vhost-user-fs-pci,queue-size=1024'
+	    .",chardev=virtfs$i,tag=$virtiofs->{dirid}";
+
+	$virtiofs_enabled = 1;
+    }
+
     if ($conf->{ivshmem}) {
 	my $ivshmem = parse_property_string($ivshmem_fmt, $conf->{ivshmem});
 
@@ -4172,6 +4240,14 @@ sub config_to_command {
     }
     push @$machineFlags, "type=${machine_type_min}";
 
+    if ($virtiofs_enabled && !$conf->{numa}) {
+	# kvm: '-machine memory-backend' and '-numa memdev' properties are
+	# mutually exclusive
+	push @$devices, '-object', 'memory-backend-file,id=virtiofs-mem'
+	    .",size=$conf->{memory}M,mem-path=/dev/shm,share=on";
+	push @$machineFlags, 'memory-backend=virtiofs-mem';
+    }
+
     push @$cmd, @$devices;
     push @$cmd, '-rtc', join(',', @$rtcFlags) if scalar(@$rtcFlags);
     push @$cmd, '-machine', join(',', @$machineFlags) if scalar(@$machineFlags);
@@ -4198,6 +4274,85 @@ sub config_to_command {
     return wantarray ? ($cmd, $vollist, $spice_port, $pci_devices) : $cmd;
 }
 
+sub start_virtiofs {
+    my ($vmid, $fsid, $virtiofs) = @_;
+
+    my $dir_cfg = PVE::Mapping::Dir::config()->{ids}->{$virtiofs->{dirid}};
+    my $node_list = PVE::Mapping::Dir::find_on_current_node($virtiofs->{dirid});
+
+    if (!$node_list || scalar($node_list->@*) != 1) {
+	die "virtiofs needs exactly one mapping for this node\n";
+    }
+
+    eval {
+	PVE::Mapping::Dir::assert_valid($node_list->[0]);
+    };
+    if (my $err = $@) {
+	die "Directory Mapping invalid: $err\n";
+    }
+
+    my $node_cfg = $node_list->[0];
+    my $path = $node_cfg->{path};
+    my $socket_path_root = "/var/run/virtiofsd";
+    mkdir $socket_path_root;
+    my $socket_path = "$socket_path_root/vm$vmid-fs$fsid";
+    unlink($socket_path);
+    my $socket = IO::Socket::UNIX->new(
+	Type => SOCK_STREAM,
+	Local => $socket_path,
+	Listen => 1,
+    ) or die "cannot create socket - $!\n";
+
+    my $flags = fcntl($socket, F_GETFD, 0)
+	or die "failed to get file descriptor flags: $!\n";
+    fcntl($socket, F_SETFD, $flags & ~FD_CLOEXEC)
+	or die "failed to remove FD_CLOEXEC from file descriptor\n";
+
+    my $fd = $socket->fileno();
+
+    my $virtiofsd_bin = '/usr/libexec/virtiofsd';
+
+    my $pid = fork();
+    if ($pid == 0) {
+	setsid();
+	$0 = "task pve-vm$vmid-virtiofs$fsid";
+	for my $fd_loop (3 .. POSIX::sysconf( &POSIX::_SC_OPEN_MAX )) {
+	    POSIX::close($fd_loop) if ($fd_loop != $fd);
+	}
+
+	my $pid2 = fork();
+	if ($pid2 == 0) {
+	    my $cmd = [$virtiofsd_bin, "--fd=$fd", "--shared-dir=$path"];
+	    push @$cmd, '--xattr' if ($virtiofs->{xattr});
+	    push @$cmd, '--posix-acl' if ($virtiofs->{acl});
+
+	    # Default to dir config xattr & acl settings
+	    push @$cmd, '--xattr'
+		if !defined $virtiofs->{'xattr'} && $dir_cfg->{'xattr'};
+	    push @$cmd, '--posix-acl'
+		if !defined $virtiofs->{'acl'} && $dir_cfg->{'acl'};
+
+	    push @$cmd, '--announce-submounts' if ($node_cfg->{submounts});
+	    push @$cmd, '--allow-direct-io' if ($virtiofs->{'direct-io'});
+	    push @$cmd, "--cache=$virtiofs->{'cache'}" if ($virtiofs->{'cache'});
+
+	    exec(@$cmd);
+	} elsif (!defined($pid2)) {
+	    die "could not fork to start virtiofsd\n";
+	} else {
+	    POSIX::_exit(0);
+	}
+    } elsif (!defined($pid)) {
+	die "could not fork to start virtiofsd\n";
+    } else {
+	waitpid($pid, 0);
+    }
+
+    # return socket to keep it alive,
+    # so that qemu will wait for virtiofsd to start
+    return $socket;
+}
+
 sub check_rng_source {
     my ($source) = @_;
 
@@ -5655,7 +5810,6 @@ sub vm_start {
     });
 }
 
-
 # params:
 #   statefile => 'tcp', 'unix' for migration or path/volid for RAM state
 #   skiplock => 0/1, skip checking for config lock
@@ -5918,10 +6072,23 @@ sub vm_start_nolock {
     }
     $systemd_properties{timeout} = 10 if $statefile; # setting up the scope shoul be quick
 
+
     my $run_qemu = sub {
 	PVE::Tools::run_fork sub {
 	    PVE::Systemd::enter_systemd_scope($vmid, "Proxmox VE VM $vmid", %systemd_properties);
 
+	    my @virtiofs_sockets;
+	    for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) {
+		my $opt = "virtiofs$i";
+
+		next if !$conf->{$opt};
+		my $virtiofs = parse_property_string('pve-qm-virtiofs', $conf->{$opt});
+		next if !$virtiofs;
+
+		my $virtiofs_socket = start_virtiofs($vmid, $i, $virtiofs);
+		push @virtiofs_sockets, $virtiofs_socket;
+	    }
+
 	    my $tpmpid;
 	    if (my $tpm = $conf->{tpmstate0}) {
 		# start the TPM emulator so QEMU can connect on start
@@ -5936,6 +6103,11 @@ sub vm_start_nolock {
 		}
 		die "QEMU exited with code $exitcode\n";
 	    }
+
+	    foreach my $virtiofs_socket (@virtiofs_sockets) {
+		shutdown($virtiofs_socket, 2);
+		close($virtiofs_socket);
+	    }
 	};
     };
 
diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm
index 0601dd6..648bc08 100644
--- a/PVE/QemuServer/Memory.pm
+++ b/PVE/QemuServer/Memory.pm
@@ -278,6 +278,16 @@ sub config {
 
     die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};
 
+    my $virtiofs_enabled = 0;
+    for (my $i = 0; $i < PVE::QemuServer::max_virtiofs(); $i++) {
+	my $opt = "virtiofs$i";
+	next if !$conf->{$opt};
+	my $virtiofs = PVE::JSONSchema::parse_property_string('pve-qm-virtiofs', $conf->{$opt});
+	if ($virtiofs) {
+	    $virtiofs_enabled = 1;
+	}
+    }
+
     if ($conf->{numa}) {
 
 	my $numa_totalmemory = undef;
@@ -290,7 +300,8 @@ sub config {
 	    my $numa_memory = $numa->{memory};
 	    $numa_totalmemory += $numa_memory;
 
-	    my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
+	    my $memdev = $virtiofs_enabled ? "virtiofs-mem$i" : "ram-node$i";
+	    my $mem_object = print_mem_object($conf, $memdev, $numa_memory);
 
 	    # cpus
 	    my $cpulists = $numa->{cpus};
@@ -315,7 +326,7 @@ sub config {
 	    }
 
 	    push @$cmd, '-object', $mem_object;
-	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
+	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev";
 	}
 
 	die "total memory for NUMA nodes must be equal to vm static memory\n"
@@ -329,13 +340,13 @@ sub config {
 		die "host NUMA node$i doesn't exist\n"
 		    if !host_numanode_exists($i) && $conf->{hugepages};
 
-		my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
-		push @$cmd, '-object', $mem_object;
-
 		my $cpus = ($cores * $i);
 		$cpus .= "-" . ($cpus + $cores - 1) if $cores > 1;
 
-		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
+		my $memdev = $virtiofs_enabled ? "virtiofs-mem$i" : "ram-node$i";
+		my $mem_object = print_mem_object($conf, $memdev, $numa_memory);
+		push @$cmd, '-object', $mem_object;
+		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev";
 	    }
 	}
     }
@@ -364,6 +375,8 @@ sub print_mem_object {
 	my $path = hugepages_mount_path($hugepages_size);
 
 	return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
+    } elsif ($id =~ m/^virtiofs-mem/) {
+	return "memory-backend-file,id=$id,size=${size}M,mem-path=/dev/shm,share=on";
     } else {
 	return "memory-backend-ram,id=$id,size=${size}M";
     }
diff --git a/debian/control b/debian/control
index 49f67b2..f008a9b 100644
--- a/debian/control
+++ b/debian/control
@@ -53,6 +53,7 @@ Depends: dbus,
          socat,
          swtpm,
          swtpm-tools,
+         virtiofsd,
          ${misc:Depends},
          ${perl:Depends},
          ${shlibs:Depends},
-- 
2.39.2





  parent reply	other threads:[~2023-08-09  8:38 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-09  8:37 [pve-devel] [PATCH cluster/guest-common/docs/qemu-server/manager v6 0/11] virtiofs Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH cluster v7 1/11] add mapping/dir.cfg for resource mapping Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH guest-common v7 2/11] add Dir mapping config Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH docs v7 3/11] added shared filesystem doc for virtio-fs Markus Frank
2023-10-05  8:56   ` Fabian Grünbichler
2023-08-09  8:37 ` Markus Frank [this message]
2023-10-05  8:56   ` [pve-devel] [PATCH qemu-server v7 4/11] feature #1027: virtio-fs support Fabian Grünbichler
2023-08-09  8:37 ` [pve-devel] [PATCH qemu-server v7 5/11] Permission check for virtiofs directory access Markus Frank
2023-10-05  8:56   ` Fabian Grünbichler
2023-08-09  8:37 ` [pve-devel] [PATCH qemu-server v7 6/11] check_local_resources: virtiofs Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH manager v7 07/11] api: add resource map api endpoints for directories Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH manager v7 08/11] ui: add edit window for dir mappings Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH manager v7 09/11] ui: ResourceMapTree for DIR Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH manager v7 10/11] ui: form: add DIRMapSelector Markus Frank
2023-08-09  8:37 ` [pve-devel] [PATCH manager v7 11/11] ui: add options to add virtio-fs to qemu config Markus Frank
2023-10-05  8:57 ` [pve-devel] [PATCH cluster/guest-common/docs/qemu-server/manager v6 0/11] virtiofs Fabian Grünbichler

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230809083739.100024-5-m.frank@proxmox.com \
    --to=m.frank@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal