From: Markus Frank <m.frank@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH qemu-server v5 1/1] feature #1027: virtio-fs support
Date: Wed,  7 Jun 2023 10:57:32 +0200
Message-ID: <20230607085732.32063-4-m.frank@proxmox.com>
In-Reply-To: <20230607085732.32063-1-m.frank@proxmox.com>

Add support for sharing directories with a guest VM via virtio-fs.

virtio-fs requires virtiofsd to be running for each share.

Since virtiofsd does not daemonize itself (it keeps running in the
foreground), it is started via a double fork so that it is fully detached
from the starting process.
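
A minimal sketch of that approach (simplified; $fd and $path stand in for
the listening socket fd and the shared path, and exec replaces the
run_command call of the real start_virtiofs() in the diff below):
---
use POSIX ();

my $pid = fork() // die "fork failed: $!\n";
if ($pid == 0) {
    # intermediate child: fork again so virtiofsd gets reparented to init
    my $pid2 = fork() // die "fork failed: $!\n";
    if ($pid2 == 0) {
        exec('/usr/bin/virtiofsd', "--fd=$fd", "--shared-dir=$path")
            or POSIX::_exit(1);
    }
    POSIX::_exit(0); # intermediate child exits right away
}
---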

virtiofsd exits together with QEMU once the VM is shut down, so no
explicit cleanup is needed.

The new virtiofsX option takes the parameters dirid and tag,
plus the optional parameters direct-io and cache.

The dirid refers to a directory mapping and is resolved to a path on the
current node. The tag is the name under which the share is mounted inside
the guest.

example config:
---
virtiofs0: foo,tag=tag1,direct-io=1,cache=always
virtiofs1: dirid=bar,tag=tag2,cache=never
---
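
Inside the guest, a share is then mounted by its tag, e.g. for the
virtiofs0 example above (the mount point is just an example):
---
mount -t virtiofs tag1 /mnt/shared
---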

For details on the optional parameters, see:
https://gitlab.com/virtio-fs/virtiofsd/-/blob/main/README.md
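
For illustration, with the virtiofs0 example above virtiofsd ends up being
invoked roughly like this (fd number and path depend on the created socket
and the directory mapping; --xattr/--posix-acl/--announce-submounts are
added depending on the mapping options):
---
/usr/bin/virtiofsd --fd=<fd> --shared-dir=<mapped path> --cache=always --allow-direct-io
---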

Signed-off-by: Markus Frank <m.frank@proxmox.com>
---
 PVE/QemuServer.pm        | 170 +++++++++++++++++++++++++++++++++++++++
 PVE/QemuServer/Memory.pm |  25 ++++--
 2 files changed, 189 insertions(+), 6 deletions(-)

diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index ab33aa3..9fe408c 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -41,6 +41,7 @@ use PVE::PBSClient;
 use PVE::RESTEnvironment qw(log_warn);
 use PVE::RPCEnvironment;
 use PVE::Storage;
+use PVE::Mapping::DIR;
 use PVE::SysFSTools;
 use PVE::Systemd;
 use PVE::Tools qw(run_command file_read_firstline file_get_contents dir_glob_foreach get_host_arch $IPV6RE);
@@ -274,6 +275,35 @@ my $rng_fmt = {
     },
 };
 
+my $virtiofs_fmt = {
+    'dirid' => {
+	type => 'string',
+	default_key => 1,
+	description => "dirid of directory you want to share with the guest VM",
+	format_description => "virtiofs-dirid",
+    },
+    'tag' => {
+	type => 'string',
+	description => "tag name for mounting in the guest VM",
+	format_description => "virtiofs-tag",
+    },
+    'cache' => {
+	type => 'string',
+	description => "The caching policy the file system should use"
+	    ." (auto, always, never).",
+	format_description => "virtiofs-cache",
+	enum => [qw(auto always never)],
+	optional => 1,
+    },
+    'direct-io' => {
+	type => 'boolean',
+	description => "Allow the guest to use direct I/O (O_DIRECT)"
+	    ." on the shared directory.",
+	format_description => "virtiofs-directio",
+	optional => 1,
+    },
+};
+
 my $meta_info_fmt = {
     'ctime' => {
 	type => 'integer',
@@ -832,6 +862,7 @@ while (my ($k, $v) = each %$confdesc) {
 
 my $MAX_USB_DEVICES = 14;
 my $MAX_NETS = 32;
+my $MAX_VIRTIOFS = 10;
 my $MAX_SERIAL_PORTS = 4;
 my $MAX_PARALLEL_PORTS = 3;
 my $MAX_NUMA = 8;
@@ -974,6 +1005,12 @@ my $netdesc = {
     description => "Specify network devices.",
 };
 
+my $virtiofsdesc = {
+    optional => 1,
+    type => 'string', format => $virtiofs_fmt,
+    description => "share files between host and guest",
+};
+
 PVE::JSONSchema::register_standard_option("pve-qm-net", $netdesc);
 
 my $ipconfig_fmt = {
@@ -1035,6 +1072,14 @@ for (my $i = 0; $i < $MAX_NETS; $i++)  {
     $confdesc_cloudinit->{"ipconfig$i"} = $ipconfigdesc;
 }
 
+sub max_virtiofs {
+    return $MAX_VIRTIOFS;
+}
+
+for (my $i = 0; $i < $MAX_VIRTIOFS; $i++)  {
+    $confdesc->{"virtiofs$i"} = $virtiofsdesc;
+}
+
 foreach my $key (keys %$confdesc_cloudinit) {
     $confdesc->{$key} = $confdesc_cloudinit->{$key};
 }
@@ -1988,6 +2033,16 @@ sub parse_net {
     return $res;
 }
 
+sub parse_virtiofs {
+    my ($value) = @_;
+
+    return if !$value;
+    my $res = eval { parse_property_string($virtiofs_fmt, $value) };
+
+    warn $@ if $@;
+    return $res;
+}
+
 # ipconfigX ip=cidr,gw=ip,ip6=cidr,gw6=ip
 sub parse_ipconfig {
     my ($data) = @_;
@@ -4107,6 +4162,25 @@ sub config_to_command {
 	push @$devices, '-device', $netdevicefull;
     }
 
+    my $onevirtiofs = 0;
+    for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) {
+	my $virtiofsstr = "virtiofs$i";
+
+	next if !$conf->{$virtiofsstr};
+	my $virtiofs = parse_virtiofs($conf->{$virtiofsstr});
+	next if !$virtiofs;
+
+	push @$devices, '-chardev', "socket,id=virtfs$i,path=/var/run/virtiofsd/vm$vmid-fs$i";
+	push @$devices, '-device', 'vhost-user-fs-pci,queue-size=1024'
+	    .",chardev=virtfs$i,tag=$virtiofs->{tag}";
+
+	$onevirtiofs = 1;
+    }
+
+    if ($onevirtiofs && $conf->{hugepages}){
+	die "hugepages not supported in combination with virtiofs\n";
+    }
+
     if ($conf->{ivshmem}) {
 	my $ivshmem = parse_property_string($ivshmem_fmt, $conf->{ivshmem});
 
@@ -4166,6 +4240,14 @@ sub config_to_command {
     }
     push @$machineFlags, "type=${machine_type_min}";
 
+    if ($onevirtiofs && !$conf->{numa}) {
+	# kvm: '-machine memory-backend' and '-numa memdev' properties are
+	# mutually exclusive
+	push @$devices, '-object', 'memory-backend-file,id=virtiofs-mem'
+	    .",size=$conf->{memory}M,mem-path=/dev/shm,share=on";
+	push @$machineFlags, 'memory-backend=virtiofs-mem';
+    }
+
     push @$cmd, @$devices;
     push @$cmd, '-rtc', join(',', @$rtcFlags) if scalar(@$rtcFlags);
     push @$cmd, '-machine', join(',', @$machineFlags) if scalar(@$machineFlags);
@@ -4192,6 +4274,76 @@ sub config_to_command {
     return wantarray ? ($cmd, $vollist, $spice_port) : $cmd;
 }
 
+sub start_virtiofs {
+    my ($vmid, $fsid, $virtiofs) = @_;
+
+    my $dir_list = PVE::Mapping::DIR::find_on_current_node($virtiofs->{dirid});
+
+    if (!$dir_list || scalar($dir_list->@*) != 1) {
+	die "virtiofs needs exactly one mapping for this node\n";
+    }
+
+    eval {
+	PVE::Mapping::DIR::assert_valid($dir_list->[0]);
+    };
+    if (my $err = $@) {
+	die "Directory Mapping invalid: $err\n";
+    }
+
+    my $dir_cfg = $dir_list->[0];
+    my $path = $dir_cfg->{path};
+    my $socket_path_root = "/var/run/virtiofsd";
+    mkdir $socket_path_root;
+    my $socket_path = "$socket_path_root/vm$vmid-fs$fsid";
+    unlink($socket_path);
+    my $socket = IO::Socket::UNIX->new(
+	Type => SOCK_STREAM,
+	Local => $socket_path,
+	Listen => 1,
+    ) or die "cannot create socket - $!\n";
+
+    my $flags = fcntl($socket, F_GETFD, 0)
+	or die "failed to get file descriptor flags: $!\n";
+    fcntl($socket, F_SETFD, $flags & ~FD_CLOEXEC)
+	or die "failed to remove FD_CLOEXEC from file descriptor\n";
+
+    my $fd = $socket->fileno();
+
+    my $virtiofsd_bin = '/usr/bin/virtiofsd';
+
+    if (! -e $virtiofsd_bin) {
+	die "virtiofsd binary is not installed\n";
+    }
+
+    my $pid = fork();
+    if ($pid == 0) {
+	for my $fd_loop (3 .. POSIX::sysconf( &POSIX::_SC_OPEN_MAX )) {
+	    POSIX::close($fd_loop) if ($fd_loop != $fd);
+	}
+	my $pid2 = fork();
+	if ($pid2 == 0) {
+	    my $cmd = [$virtiofsd_bin, "--fd=$fd", "--shared-dir=$path"];
+	    push @$cmd, '--xattr' if ($dir_cfg->{xattr});
+	    push @$cmd, '--posix-acl' if ($dir_cfg->{acl});
+	    push @$cmd, '--announce-submounts' if ($dir_cfg->{submounts});
+	    push @$cmd, '--allow-direct-io' if ($virtiofs->{'direct-io'});
+	    push @$cmd, "--cache=$virtiofs->{'cache'}" if ($virtiofs->{'cache'});
+	    run_command($cmd);
+	    POSIX::_exit(0);
+	} elsif (!defined($pid2)) {
+	    die "could not fork to start virtiofsd\n";
+	} else {
+	    POSIX::_exit(0);
+	}
+    } elsif (!defined($pid)) {
+	die "could not fork to start virtiofsd\n";
+    }
+
+    # return socket to keep it alive,
+    # so that qemu will wait for virtiofsd to start
+    return $socket;
+}
+
 sub check_rng_source {
     my ($source) = @_;
 
@@ -5749,6 +5901,19 @@ sub vm_start_nolock {
     my ($cmd, $vollist, $spice_port) = config_to_command($storecfg, $vmid,
 	$conf, $defaults, $forcemachine, $forcecpu, $params->{'pbs-backing'});
 
+    my @sockets;
+    for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) {
+	my $virtiofsstr = "virtiofs$i";
+
+	next if !$conf->{$virtiofsstr};
+	my $virtiofs = parse_virtiofs($conf->{$virtiofsstr});
+	next if !$virtiofs;
+
+
+	my $socket = start_virtiofs($vmid, $i, $virtiofs);
+	push @sockets, $socket;
+    }
+
     my $migration_ip;
     my $get_migration_ip = sub {
 	my ($nodename) = @_;
@@ -6096,6 +6261,11 @@ sub vm_start_nolock {
 
     PVE::GuestHelpers::exec_hookscript($conf, $vmid, 'post-start');
 
+    foreach my $socket (@sockets) {
+	shutdown($socket, 2);
+	close($socket);
+    }
+
     return $res;
 }
 
diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm
index 0601dd6..4283162 100644
--- a/PVE/QemuServer/Memory.pm
+++ b/PVE/QemuServer/Memory.pm
@@ -278,6 +278,16 @@ sub config {
 
     die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};
 
+    my $onevirtiofs = 0;
+    for (my $i = 0; $i < PVE::QemuServer::max_virtiofs(); $i++) {
+	my $virtiofsstr = "virtiofs$i";
+	next if !$conf->{$virtiofsstr};
+	my $virtiofs = PVE::QemuServer::parse_virtiofs($conf->{$virtiofsstr});
+	if ($virtiofs) {
+	    $onevirtiofs = 1;
+	}
+    }
+
     if ($conf->{numa}) {
 
 	my $numa_totalmemory = undef;
@@ -290,7 +300,8 @@ sub config {
 	    my $numa_memory = $numa->{memory};
 	    $numa_totalmemory += $numa_memory;
 
-	    my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
+	    my $memdev = $onevirtiofs ? "virtiofs-mem$i" : "ram-node$i";
+	    my $mem_object = print_mem_object($conf, $memdev, $numa_memory);
 
 	    # cpus
 	    my $cpulists = $numa->{cpus};
@@ -315,7 +326,7 @@ sub config {
 	    }
 
 	    push @$cmd, '-object', $mem_object;
-	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
+	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev";
 	}
 
 	die "total memory for NUMA nodes must be equal to vm static memory\n"
@@ -329,13 +340,13 @@ sub config {
 		die "host NUMA node$i doesn't exist\n"
 		    if !host_numanode_exists($i) && $conf->{hugepages};
 
-		my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
-		push @$cmd, '-object', $mem_object;
-
 		my $cpus = ($cores * $i);
 		$cpus .= "-" . ($cpus + $cores - 1) if $cores > 1;
 
-		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
+		my $memdev = $onevirtiofs ? "virtiofs-mem$i" : "ram-node$i";
+		my $mem_object = print_mem_object($conf, $memdev, $numa_memory);
+		push @$cmd, '-object', $mem_object;
+		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev";
 	    }
 	}
     }
@@ -364,6 +375,8 @@ sub print_mem_object {
 	my $path = hugepages_mount_path($hugepages_size);
 
 	return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
+    } elsif ($id =~ m/^virtiofs-mem/) {
+	return "memory-backend-file,id=$id,size=${size}M,mem-path=/dev/shm,share=on";
     } else {
 	return "memory-backend-ram,id=$id,size=${size}M";
     }
-- 
2.30.2
