Re: [PATCHv7 10/16] x86/tdx: Convert shared memory back to private on kexec

From: Kirill A. Shutemov
Date: Sun Feb 25 2024 - 09:59:02 EST


On Fri, Feb 23, 2024 at 11:39:07AM -0800, Dave Hansen wrote:
> On 2/12/24 02:44, Kirill A. Shutemov wrote:
> > +static void tdx_kexec_stop_conversion(bool crash)
> > +{
> > + /* Stop new private<->shared conversions */
> > + conversion_allowed = false;
> > +
> > + /*
> > + * Make sure conversion_allowed is cleared before checking
> > + * conversions_in_progress.
> > + */
> > + barrier();
> > +
> > + /*
> > + * Crash kernel reaches here with interrupts disabled: can't wait for
> > + * conversions to finish.
> > + *
> > + * If race happened, just report and proceed.
> > + */
> > + if (!crash) {
> > + unsigned long timeout;
> > +
> > + /*
> > + * Wait for in-flight conversions to complete.
> > + *
> > + * Do not wait more than 30 seconds.
> > + */
> > + timeout = 30 * USEC_PER_SEC;
> > + while (atomic_read(&conversions_in_progress) && timeout--)
> > + udelay(1);
> > + }
> > +
> > + if (atomic_read(&conversions_in_progress))
> > + pr_warn("Failed to finish shared<->private conversions\n");
> > +}
>
> I'd really prefer we find a way to do this with actual locks, especially
> 'conversion_allowed'.
>
> This is _awfully_ close to being able to be handled by a rwsem where the
> readers are the converters and tdx_kexec_stop_conversion() takes a write.

Okay, here's what I came up with. It needs more testing.

Any comments?

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index fd212c9bad89..5eb0dac33f37 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -6,8 +6,10 @@

#include <linux/cpufeature.h>
#include <linux/debugfs.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -15,6 +17,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
+#include <asm/set_memory.h>

/* MMIO direction */
#define EPT_READ 0
@@ -837,6 +840,65 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
return 0;
}

+static void tdx_kexec_stop_conversion(bool crash)
+{
+ /* Stop new private<->shared conversions; on crash, ask the helper not to wait */
+ if (!stop_memory_enc_conversion(!crash))
+ pr_warn("Failed to finish shared<->private conversions\n"); /* only reported, nothing is returned */
+}
+
+static void tdx_kexec_unshare_mem(void)
+{
+ unsigned long addr, end;
+ long found = 0, shared;
+
+ /*
+ * Walk the direct mapping and convert all shared memory back to private.
+ */
+
+ addr = PAGE_OFFSET;
+ end = PAGE_OFFSET + get_max_mapped();
+
+ while (addr < end) {
+ unsigned long size;
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ size = page_level_size(level); /* stride follows the mapping level reported for addr */
+
+ if (pte && pte_decrypted(*pte)) {
+ int pages = size / PAGE_SIZE;
+
+ /*
+ * Touching memory with shared bit set triggers implicit
+ * conversion to shared.
+ *
+ * Make sure nobody touches the shared range from
+ * now on.
+ */
+ set_pte(pte, __pte(0));
+
+ if (!tdx_enc_status_changed(addr, pages, true)) {
+ pr_err("Failed to unshare range %#lx-%#lx\n",
+ addr, addr + size);
+ }
+
+ found += pages; /* counted even if the conversion above failed */
+ }
+
+ addr += size; /* NOTE(review): also taken when pte is NULL - confirm level is meaningful then */
+ }
+
+ __flush_tlb_all(); /* runs once, after all the PTE clears above */
+
+ shared = atomic_long_read(&nr_shared); /* cross-check the walk against the nr_shared counter */
+ if (shared != found) {
+ pr_err("shared page accounting is off\n");
+ pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+ }
+}
+
void __init tdx_early_init(void)
{
struct tdx_module_args args = {
@@ -896,6 +958,9 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;

+ x86_platform.guest.enc_kexec_stop_conversion = tdx_kexec_stop_conversion;
+ x86_platform.guest.enc_kexec_unshare_mem = tdx_kexec_unshare_mem;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index a5e89641bd2d..9d4a8e548820 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -48,8 +48,11 @@ int set_memory_wc(unsigned long addr, int numpages);
int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
+
+bool stop_memory_enc_conversion(bool wait);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+
int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_nonglobal(unsigned long addr, int numpages);
int set_memory_global(unsigned long addr, int numpages);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 0d2267ad4e0e..e074b2aca970 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2176,12 +2176,32 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
return ret;
}

+static DECLARE_RWSEM(mem_enc_lock); /* read side: __set_memory_enc_dec(); write side: stop_memory_enc_conversion() */
+
+bool stop_memory_enc_conversion(bool wait)
+{
+ if (!wait)
+ return down_write_trylock(&mem_enc_lock); /* single attempt; its result is the return value */
+
+ down_write(&mem_enc_lock); /* not released anywhere in this function */
+
+ return true;
+}
+
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
- return __set_memory_enc_pgtable(addr, numpages, enc);
+ int ret = 0; /* stays 0 when CC_ATTR_MEM_ENCRYPT is not set */

- return 0;
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ if (!down_read_trylock(&mem_enc_lock)) /* trylock: never blocks here */
+ return -EBUSY; /* read lock unavailable: no page-table change is attempted */
+
+ ret =__set_memory_enc_pgtable(addr, numpages, enc);
+
+ up_read(&mem_enc_lock); /* dropped on success and on failure of the call above */
+ }
+
+ return ret;
}

int set_memory_encrypted(unsigned long addr, int numpages)
--
Kiryl Shutsemau / Kirill A. Shutemov