Re: [PATCH v4 2/6] arch/x86/kvm: Refactor tlbflush and l1d flush

From: Tom Lendacky
Date: Fri Apr 24 2020 - 15:04:26 EST


On 4/23/20 9:01 AM, Balbir Singh wrote:
Refactor the existing assembly bits into smaller helper functions
and also abstract L1D_FLUSH into a helper function. Use these
functions in kvm for L1D flushing.

Reviewed-by: Kees Cook <keescook@xxxxxxxxxxxx>
Signed-off-by: Balbir Singh <sblbir@xxxxxxxxxx>
---
arch/x86/include/asm/cacheflush.h | 3 ++
arch/x86/kernel/l1d_flush.c | 54 +++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.c | 29 ++---------------
3 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index bac56fcd9790..21cc3b28fa63 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -8,7 +8,10 @@
#define L1D_CACHE_ORDER 4
void clflush_cache_range(void *addr, unsigned int size);
+void l1d_flush_populate_tlb(void *l1d_flush_pages);
void *l1d_flush_alloc_pages(void);
void l1d_flush_cleanup_pages(void *l1d_flush_pages);
+void l1d_flush_sw(void *l1d_flush_pages);
+int l1d_flush_hw(void);
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
index d605878c8f28..5871794f890d 100644
--- a/arch/x86/kernel/l1d_flush.c
+++ b/arch/x86/kernel/l1d_flush.c
@@ -34,3 +34,57 @@ void l1d_flush_cleanup_pages(void *l1d_flush_pages)
free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
}
EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);
+
+/*
+ * Not all users of l1d flush would want to populate the TLB first
+ * split out the function so that callers can optionally flush the L1D
+ * cache via sw without prefetching the TLB.
+ */
+void l1d_flush_populate_tlb(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(l1d_flush_populate_tlb);
+
+int l1d_flush_hw(void)
+{
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return 0;
+ }
+ return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(l1d_flush_hw);
+
+void l1d_flush_sw(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* Fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ecx");
+}

If the answer to my previous question in patch 1/6 about the allocation being twice the size is yes, then could you allocate the flush pages the same size as the cache and just write it twice? Wouldn't that accomplish the same goal and provide a performance improvement with (mostly) now present L1D entries of the flush pages? Also, can't you unroll this loop a bit and operate on multiple entries in each loop, reducing the overall looping compares and jumps as an optimization?

Thanks,
Tom


+EXPORT_SYMBOL_GPL(l1d_flush_sw);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 225aa8219bac..786d1615a09f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5983,8 +5983,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
*/
static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
/*
* This code is only executed when the the flush mode is 'cond' or
* 'always'
@@ -6013,32 +6011,11 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
vcpu->stat.l1d_flush++;
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ if (!l1d_flush_hw())
return;
- }
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
+ l1d_flush_populate_tlb(vmx_l1d_flush_pages);
+ l1d_flush_sw(vmx_l1d_flush_pages);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)