all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Markus Frank <m.frank@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH qemu-server v5 1/1] feature #1027: virtio-fs support
Date: Wed,  7 Jun 2023 10:57:32 +0200	[thread overview]
Message-ID: <20230607085732.32063-4-m.frank@proxmox.com> (raw)
In-Reply-To: <20230607085732.32063-1-m.frank@proxmox.com>

Adds support for sharing directories with a guest VM.

virtio-fs needs virtiofsd to be started.

In order to start virtiofsd as a process (despite being a daemon, it does not
run in the background), a double-fork is used.

virtiofsd should close itself together with qemu.

The required parameters are dirid and tag;
the parameters direct-io and cache are optional.

The dirid gets mapped to the path on the current node.
The tag parameter is for choosing the tag-name that is used with the
mount command.

example config:
---
virtiofs0: foo,tag=tag1,direct-io=1,cache=always
virtiofs1: dirid=bar,tag=tag2,cache=never
---

For information on the optional parameters see:
https://gitlab.com/virtio-fs/virtiofsd/-/blob/main/README.md

Signed-off-by: Markus Frank <m.frank@proxmox.com>
---
 PVE/QemuServer.pm        | 170 +++++++++++++++++++++++++++++++++++++++
 PVE/QemuServer/Memory.pm |  25 ++++--
 2 files changed, 189 insertions(+), 6 deletions(-)

diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index ab33aa3..9fe408c 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -41,6 +41,7 @@ use PVE::PBSClient;
 use PVE::RESTEnvironment qw(log_warn);
 use PVE::RPCEnvironment;
 use PVE::Storage;
+use PVE::Mapping::DIR;
 use PVE::SysFSTools;
 use PVE::Systemd;
 use PVE::Tools qw(run_command file_read_firstline file_get_contents dir_glob_foreach get_host_arch $IPV6RE);
@@ -274,6 +275,35 @@ my $rng_fmt = {
     },
 };
 
+# Property-string format of a 'virtiofsN' VM config entry (dirid is the default key).
+my $virtiofs_fmt = {
+    'dirid' => {
+	type => 'string',
+	default_key => 1,
+	description => "dirid of directory you want to share with the guest VM",
+	format_description => "virtiofs-dirid",
+    },
+    'tag' => {
+	type => 'string',
+	description => "tag name for mounting in the guest VM",
+	format_description => "virtiofs-tag",
+    },
+    'cache' => {
+	type => 'string',
+	description => "The caching policy the file system should use (auto, always, never).",
+	format_description => "virtiofs-cache",
+	enum => [qw(auto always never)],
+	optional => 1,
+    },
+    'direct-io' => {
+	type => 'boolean',
+	description => "Honor the O_DIRECT flag passed down by guest"
+	    ." applications.",
+	format_description => "virtiofs-directio",
+	optional => 1,
+    },
+};
+
 my $meta_info_fmt = {
     'ctime' => {
 	type => 'integer',
@@ -832,6 +862,7 @@ while (my ($k, $v) = each %$confdesc) {
 
 my $MAX_USB_DEVICES = 14;
 my $MAX_NETS = 32;
+my $MAX_VIRTIOFS = 10; # number of virtiofsN config keys (virtiofs0 .. virtiofs9)
 my $MAX_SERIAL_PORTS = 4;
 my $MAX_PARALLEL_PORTS = 3;
 my $MAX_NUMA = 8;
@@ -974,6 +1005,12 @@ my $netdesc = {
     description => "Specify network devices.",
 };
 
+my $virtiofsdesc = { # schema used for each virtiofsN config key
+    type => 'string', format => $virtiofs_fmt,
+    description => "share files between host and guest",
+    optional => 1,
+};
+
 PVE::JSONSchema::register_standard_option("pve-qm-net", $netdesc);
 
 my $ipconfig_fmt = {
@@ -1035,6 +1072,14 @@ for (my $i = 0; $i < $MAX_NETS; $i++)  {
     $confdesc_cloudinit->{"ipconfig$i"} = $ipconfigdesc;
 }
 
+# Accessor for the file-private $MAX_VIRTIOFS limit, so that other modules
+# (e.g. PVE::QemuServer::Memory) can iterate over all virtiofsN keys.
+sub max_virtiofs { return $MAX_VIRTIOFS; }
+
+for my $idx (0 .. $MAX_VIRTIOFS - 1) { # same schema for every virtiofsN key
+    $confdesc->{"virtiofs$idx"} = $virtiofsdesc;
+}
+
 foreach my $key (keys %$confdesc_cloudinit) {
     $confdesc->{$key} = $confdesc_cloudinit->{$key};
 }
@@ -1988,6 +2033,16 @@ sub parse_net {
     return $res;
 }
 
+# Parse a 'virtiofsN' property string; warns and returns undef if it is invalid.
+sub parse_virtiofs {
+    my ($value) = @_;
+    return if !$value;
+
+    my $parsed = eval { parse_property_string($virtiofs_fmt, $value) };
+    if (my $err = $@) { warn $err; }
+    return $parsed;
+}
+
 # ipconfigX ip=cidr,gw=ip,ip6=cidr,gw6=ip
 sub parse_ipconfig {
     my ($data) = @_;
@@ -4107,6 +4162,25 @@ sub config_to_command {
 	push @$devices, '-device', $netdevicefull;
     }
 
+    my $onevirtiofs = 0;
+    for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) {
+	my $virtiofsstr = "virtiofs$i";
+
+	next if !$conf->{$virtiofsstr};
+	my $virtiofs = parse_virtiofs($conf->{$virtiofsstr});
+	next if !$virtiofs;
+
+	push @$devices, '-chardev', "socket,id=virtfs$i,path=/var/run/virtiofsd/vm$vmid-fs$i";
+	push @$devices, '-device', 'vhost-user-fs-pci,queue-size=1024'
+	    .",chardev=virtfs$i,tag=$virtiofs->{tag}";
+
+	$onevirtiofs = 1;
+    }
+
+    if ($onevirtiofs && $conf->{hugepages}){
+	die "hugepages not supported in combination with virtiofs\n";
+    }
+
     if ($conf->{ivshmem}) {
 	my $ivshmem = parse_property_string($ivshmem_fmt, $conf->{ivshmem});
 
@@ -4166,6 +4240,14 @@ sub config_to_command {
     }
     push @$machineFlags, "type=${machine_type_min}";
 
+    if ($onevirtiofs && !$conf->{numa}) {
+	# kvm: '-machine memory-backend' and '-numa memdev' properties are
+	# mutually exclusive
+	push @$devices, '-object', 'memory-backend-file,id=virtiofs-mem'
+	    .",size=$conf->{memory}M,mem-path=/dev/shm,share=on";
+	push @$machineFlags, 'memory-backend=virtiofs-mem';
+    }
+
     push @$cmd, @$devices;
     push @$cmd, '-rtc', join(',', @$rtcFlags) if scalar(@$rtcFlags);
     push @$cmd, '-machine', join(',', @$machineFlags) if scalar(@$machineFlags);
@@ -4192,6 +4274,76 @@ sub config_to_command {
     return wantarray ? ($cmd, $vollist, $spice_port) : $cmd;
 }
 
+# Start a virtiofsd instance for "virtiofs$fsid" of VM $vmid via a double fork,
+# listening on /var/run/virtiofsd/vm$vmid-fs$fsid. $virtiofs is the parsed
+# property string (dirid, tag, cache, direct-io). Returns the listening socket,
+# which the caller keeps open. Dies on mapping, binary, socket or fork errors.
+sub start_virtiofs {
+    my ($vmid, $fsid, $virtiofs) = @_;
+
+    # fail before creating the socket, so that no stale socket file is left
+    my $virtiofsd_bin = '/usr/bin/virtiofsd';
+    die "virtiofsd binary is not installed\n" if ! -e $virtiofsd_bin;
+
+    my $dir_list = PVE::Mapping::DIR::find_on_current_node($virtiofs->{dirid});
+    if (!$dir_list || scalar($dir_list->@*) != 1) {
+	die "virtiofs needs exactly one mapping for this node\n";
+    }
+    my $dir_cfg = $dir_list->[0];
+    eval { PVE::Mapping::DIR::assert_valid($dir_cfg) };
+    die "Directory Mapping invalid: $@\n" if $@;
+
+    my $path = $dir_cfg->{path};
+    my $socket_path_root = "/var/run/virtiofsd";
+    mkdir $socket_path_root;
+    my $socket_path = "$socket_path_root/vm$vmid-fs$fsid";
+    unlink($socket_path);
+    my $socket = IO::Socket::UNIX->new(
+	Type => SOCK_STREAM,
+	Local => $socket_path,
+	Listen => 1,
+    ) or die "cannot create socket - $!\n";
+
+    # virtiofsd gets the socket via --fd, so the fd has to survive the exec
+    my $flags = fcntl($socket, F_GETFD, 0)
+	or die "failed to get file descriptor flags: $!\n";
+    fcntl($socket, F_SETFD, $flags & ~FD_CLOEXEC)
+	or die "failed to remove FD_CLOEXEC from file descriptor\n";
+
+    my $fd = $socket->fileno();
+
+    # fork returns undef on failure, so check that before comparing with 0
+    my $pid = fork();
+    die "could not fork to start virtiofsd\n" if !defined($pid);
+
+    if ($pid == 0) {
+	# only _exit below: die/return would resume the caller's code in the child
+	for my $fd_loop (3 .. POSIX::sysconf( &POSIX::_SC_OPEN_MAX )) {
+	    POSIX::close($fd_loop) if ($fd_loop != $fd);
+	}
+	my $pid2 = fork();
+	POSIX::_exit(1) if !defined($pid2); # reported by the parent below
+	POSIX::_exit(0) if $pid2 != 0; # intermediate child, orphans virtiofsd
+
+	my $cmd = [$virtiofsd_bin, "--fd=$fd", "--shared-dir=$path"];
+	push @$cmd, '--xattr' if ($dir_cfg->{xattr});
+	push @$cmd, '--posix-acl' if ($dir_cfg->{acl});
+	push @$cmd, '--announce-submounts' if ($dir_cfg->{submounts});
+	push @$cmd, '--allow-direct-io' if ($virtiofs->{'direct-io'});
+	push @$cmd, "--cache=$virtiofs->{'cache'}" if ($virtiofs->{'cache'});
+	eval { run_command($cmd); POSIX::_exit(0); };
+	warn "virtiofsd for virtiofs$fsid failed: $@";
+	POSIX::_exit(1);
+    }
+
+    # reap the intermediate child (no zombie) and pick up a failed second fork
+    die "could not fork to start virtiofsd\n" if waitpid($pid, 0) == $pid && $?;
+
+    # return socket to keep it alive,
+    # so that qemu will wait for virtiofsd to start
+    return $socket;
+}
+
 sub check_rng_source {
     my ($source) = @_;
 
@@ -5749,6 +5901,19 @@ sub vm_start_nolock {
     my ($cmd, $vollist, $spice_port) = config_to_command($storecfg, $vmid,
 	$conf, $defaults, $forcemachine, $forcecpu, $params->{'pbs-backing'});
 
+    my @sockets;
+    for (my $i = 0; $i < $MAX_VIRTIOFS; $i++) {
+	my $virtiofsstr = "virtiofs$i";
+
+	next if !$conf->{$virtiofsstr};
+	my $virtiofs = parse_virtiofs($conf->{$virtiofsstr});
+	next if !$virtiofs;
+
+
+	my $socket = start_virtiofs($vmid, $i, $virtiofs);
+	push @sockets, $socket;
+    }
+
     my $migration_ip;
     my $get_migration_ip = sub {
 	my ($nodename) = @_;
@@ -6096,6 +6261,11 @@ sub vm_start_nolock {
 
     PVE::GuestHelpers::exec_hookscript($conf, $vmid, 'post-start');
 
+    foreach my $socket (@sockets) {
+	shutdown($socket, 2);
+	close($socket);
+    }
+
     return $res;
 }
 
diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm
index 0601dd6..4283162 100644
--- a/PVE/QemuServer/Memory.pm
+++ b/PVE/QemuServer/Memory.pm
@@ -278,6 +278,16 @@ sub config {
 
     die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};
 
+    my $onevirtiofs = 0;
+    for (my $i = 0; $i < PVE::QemuServer::max_virtiofs(); $i++) {
+	my $virtiofsstr = "virtiofs$i";
+	next if !$conf->{$virtiofsstr};
+	my $virtiofs = PVE::QemuServer::parse_virtiofs($conf->{$virtiofsstr});
+	if ($virtiofs) {
+	    $onevirtiofs = 1;
+	}
+    }
+
     if ($conf->{numa}) {
 
 	my $numa_totalmemory = undef;
@@ -290,7 +300,8 @@ sub config {
 	    my $numa_memory = $numa->{memory};
 	    $numa_totalmemory += $numa_memory;
 
-	    my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
+	    my $memdev = $onevirtiofs ? "virtiofs-mem$i" : "ram-node$i";
+	    my $mem_object = print_mem_object($conf, $memdev, $numa_memory);
 
 	    # cpus
 	    my $cpulists = $numa->{cpus};
@@ -315,7 +326,7 @@ sub config {
 	    }
 
 	    push @$cmd, '-object', $mem_object;
-	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
+	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev";
 	}
 
 	die "total memory for NUMA nodes must be equal to vm static memory\n"
@@ -329,13 +340,13 @@ sub config {
 		die "host NUMA node$i doesn't exist\n"
 		    if !host_numanode_exists($i) && $conf->{hugepages};
 
-		my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
-		push @$cmd, '-object', $mem_object;
-
 		my $cpus = ($cores * $i);
 		$cpus .= "-" . ($cpus + $cores - 1) if $cores > 1;
 
-		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
+		my $memdev = $onevirtiofs ? "virtiofs-mem$i" : "ram-node$i";
+		my $mem_object = print_mem_object($conf, $memdev, $numa_memory);
+		push @$cmd, '-object', $mem_object;
+		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=$memdev";
 	    }
 	}
     }
@@ -364,6 +375,8 @@ sub print_mem_object {
 	my $path = hugepages_mount_path($hugepages_size);
 
 	return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
+    } elsif ($id =~ m/^virtiofs-mem/) {
+	return "memory-backend-file,id=$id,size=${size}M,mem-path=/dev/shm,share=on";
     } else {
 	return "memory-backend-ram,id=$id,size=${size}M";
     }
-- 
2.30.2





      parent reply	other threads:[~2023-06-07  8:57 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-07  8:57 [pve-devel] [PATCH cluster/guest-common/qemu-server v5] virtio-fs Markus Frank
2023-06-07  8:57 ` [pve-devel] [PATCH cluster v5 1/1] add mapping/dirs.cfg for resource mapping Markus Frank
2023-06-07  8:57 ` [pve-devel] [PATCH guest-common v5 1/1] add DIR mapping config Markus Frank
2023-06-07  8:57 ` Markus Frank [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230607085732.32063-4-m.frank@proxmox.com \
    --to=m.frank@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal