From: Fiona Ebner <f.ebner@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH master kernel 2/4] backport fix for kvm performance regression with Intel Emerald Rapids
Date: Thu, 19 Dec 2024 15:29:45 +0100 [thread overview]
Message-ID: <20241219142947.1447728-3-f.ebner@proxmox.com> (raw)
In-Reply-To: <20241219142947.1447728-1-f.ebner@proxmox.com>
Adapted to a context change in "arch/x86/kvm/cpuid.h", caused by the
vcpu_supports_xsave_pkru() function that was added by the Proxmox VE
downstream patch "kvm: xsave set: mask-out PKRU bit in xfeatures if
vCPU has no support". Otherwise a clean cherry-pick from linux-next
with no functional changes.
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
...UID.0xD-XSTATE-offsets-sizes-during-.patch | 165 ++++++++++++++++++
1 file changed, 165 insertions(+)
create mode 100644 patches/kernel/0018-KVM-x86-Cache-CPUID.0xD-XSTATE-offsets-sizes-during-.patch
diff --git a/patches/kernel/0018-KVM-x86-Cache-CPUID.0xD-XSTATE-offsets-sizes-during-.patch b/patches/kernel/0018-KVM-x86-Cache-CPUID.0xD-XSTATE-offsets-sizes-during-.patch
new file mode 100644
index 0000000..5b03995
--- /dev/null
+++ b/patches/kernel/0018-KVM-x86-Cache-CPUID.0xD-XSTATE-offsets-sizes-during-.patch
@@ -0,0 +1,165 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Tue, 10 Dec 2024 17:32:58 -0800
+Subject: [PATCH] KVM: x86: Cache CPUID.0xD XSTATE offsets+sizes during module
+ init
+
+Snapshot the output of CPUID.0xD.[1..n] during kvm.ko initialization to
+avoid the overhead of CPUID during runtime. The offset, size, and metadata
+for CPUID.0xD.[1..n] sub-leaves do not depend on XCR0 or XSS values, i.e.
+are constant for a given CPU, and thus can be cached during module load.
+
+On Intel's Emerald Rapids, CPUID is *wildly* expensive, to the point where
+recomputing XSAVE offsets and sizes results in a 4x increase in latency of
+nested VM-Enter and VM-Exit (nested transitions can trigger
+xstate_required_size() multiple times per transition), relative to using
+cached values. The issue is easily visible by running `perf top` while
+triggering nested transitions: kvm_update_cpuid_runtime() shows up at a
+whopping 50%.
+
+As measured via RDTSC from L2 (using KVM-Unit-Test's CPUID VM-Exit test
+and a slightly modified L1 KVM to handle CPUID in the fastpath), a nested
+roundtrip to emulate CPUID on Skylake (SKX), Icelake (ICX), and Emerald
+Rapids (EMR) takes:
+
+ SKX 11650
+ ICX 22350
+ EMR 28850
+
+Using cached values, the latency drops to:
+
+ SKX 6850
+ ICX 9000
+ EMR 7900
+
+The underlying issue is that CPUID itself is slow on ICX, and comically
+slow on EMR. The problem is exacerbated on CPUs which support XSAVES
+and/or XSAVEC, as KVM invokes xstate_required_size() twice on each
+runtime CPUID update, and because there are more supported XSAVE features
+(CPUID for supported XSAVE feature sub-leaves is significantly slower).
+
+ SKX:
+ CPUID.0xD.2 = 348 cycles
+ CPUID.0xD.3 = 400 cycles
+ CPUID.0xD.4 = 276 cycles
+ CPUID.0xD.5 = 236 cycles
+ <other sub-leaves are similar>
+
+ EMR:
+ CPUID.0xD.2 = 1138 cycles
+ CPUID.0xD.3 = 1362 cycles
+ CPUID.0xD.4 = 1068 cycles
+ CPUID.0xD.5 = 910 cycles
+ CPUID.0xD.6 = 914 cycles
+ CPUID.0xD.7 = 1350 cycles
+ CPUID.0xD.8 = 734 cycles
+ CPUID.0xD.9 = 766 cycles
+ CPUID.0xD.10 = 732 cycles
+ CPUID.0xD.11 = 718 cycles
+ CPUID.0xD.12 = 734 cycles
+ CPUID.0xD.13 = 1700 cycles
+ CPUID.0xD.14 = 1126 cycles
+ CPUID.0xD.15 = 898 cycles
+ CPUID.0xD.16 = 716 cycles
+ CPUID.0xD.17 = 748 cycles
+ CPUID.0xD.18 = 776 cycles
+
+Note, updating runtime CPUID information multiple times per nested
+transition is itself a flaw, especially since CPUID is a mandatory
+intercept on both Intel and AMD. E.g. KVM doesn't need to ensure emulated
+CPUID state is up-to-date while running L2. That flaw will be fixed in a
+future patch, as deferring runtime CPUID updates is more subtle than it
+appears at first glance, the benefits aren't super critical to have once
+the XSAVE issue is resolved, and caching CPUID output is desirable even if
+KVM's updates are deferred.
+
+Cc: Jim Mattson <jmattson@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-ID: <20241211013302.1347853-2-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+(cherry picked from commit 1201f226c863b7da739f7420ddba818cedf372fc)
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ arch/x86/kvm/cpuid.c | 31 ++++++++++++++++++++++++++-----
+ arch/x86/kvm/cpuid.h | 1 +
+ arch/x86/kvm/x86.c | 2 ++
+ 3 files changed, 29 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
+index b6892645c4cde238923062e3f71df21d68cc201d..dc73965aa73b21d26b4cf039336da3ca38e89bc6 100644
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -36,6 +36,26 @@
+ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+ EXPORT_SYMBOL_GPL(kvm_cpu_caps);
+
++struct cpuid_xstate_sizes {
++ u32 eax;
++ u32 ebx;
++ u32 ecx;
++};
++
++static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
++
++void __init kvm_init_xstate_sizes(void)
++{
++ u32 ign;
++ int i;
++
++ for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
++ struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
++
++ cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign);
++ }
++}
++
+ u32 xstate_required_size(u64 xstate_bv, bool compacted)
+ {
+ int feature_bit = 0;
+@@ -44,14 +64,15 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
+ xstate_bv &= XFEATURE_MASK_EXTEND;
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+- u32 eax, ebx, ecx, edx, offset;
+- cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
++ struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit];
++ u32 offset;
++
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+- offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
++ offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+- offset = ebx;
+- ret = max(ret, offset + eax);
++ offset = xs->ebx;
++ ret = max(ret, offset + xs->eax);
+ }
+
+ xstate_bv >>= 1;
+diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
+index 0e46c4555311d02b6ec9eba7f33b292c3562b0ee..6287ec33feb7b143a10a7d8ee38d74381a567d24 100644
+--- a/arch/x86/kvm/cpuid.h
++++ b/arch/x86/kvm/cpuid.h
+@@ -34,6 +34,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+
+ bool vcpu_supports_xsave_pkru(struct kvm_vcpu *vcpu);
+
++void __init kvm_init_xstate_sizes(void);
+ u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
+ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index c7c3d04198c6c7f88f97e122bfe78b857f26cb26..4d10fc1a9b4114d1e2edf133717f307043560263 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -14061,6 +14061,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
+
+ static int __init kvm_x86_init(void)
+ {
++ kvm_init_xstate_sizes();
++
+ kvm_mmu_x86_module_init();
+ mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
+ return 0;
--
2.39.5
_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel