[PATCH Part1 v5 22/38] x86/sev: Use SEV-SNP AP creation to start secondary CPUs

From: Brijesh Singh
Date: Fri Aug 20 2021 - 11:21:37 EST


From: Tom Lendacky <thomas.lendacky@xxxxxxx>

To provide a more secure way to start APs under SEV-SNP, use the SEV-SNP
AP Creation NAE event. This allows for guest control over the AP register
state rather than trusting the hypervisor with the SEV-ES Jump Table
address.

During native_smp_prepare_cpus(), invoke an SEV-SNP function that, if
SEV-SNP is active, will set/override apic->wakeup_secondary_cpu. This
will allow the SEV-SNP AP Creation NAE event method to be used to boot
the APs. As a result of installing the override when SEV-SNP is active,
this method of starting the APs becomes the required method. The override
function will fail to start the AP if the hypervisor does not have
support for AP creation.

Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
---
arch/x86/include/asm/sev-common.h | 1 +
arch/x86/include/asm/sev.h | 4 +
arch/x86/include/uapi/asm/svm.h | 5 +
arch/x86/kernel/sev.c | 205 ++++++++++++++++++++++++++++++
arch/x86/kernel/smpboot.c | 3 +
5 files changed, 218 insertions(+)

diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 3388db814fd0..072540dfb129 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -100,6 +100,7 @@ enum psc_op {
(((u64)(v) & GENMASK_ULL(63, 12)) >> 12)

#define GHCB_HV_FT_SNP BIT_ULL(0)
+#define GHCB_HV_FT_SNP_AP_CREATION (BIT_ULL(1) | GHCB_HV_FT_SNP)

/* SNP Page State Change NAE event */
#define VMGEXIT_PSC_MAX_ENTRY 253
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 005f230d0406..7f063127aa66 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -65,6 +65,8 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* RMP page size */
#define RMP_PG_SIZE_4K 0

+#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern struct static_key_false sev_es_enable_key;
extern void __sev_es_ist_enter(struct pt_regs *regs);
@@ -111,6 +113,7 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
void snp_set_memory_shared(unsigned long vaddr, unsigned int npages);
void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
+void snp_set_wakeup_secondary_cpu(void);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
@@ -125,6 +128,7 @@ early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned i
static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { }
static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { }
+static inline void snp_set_wakeup_secondary_cpu(void) { }
#endif

#endif
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 0dcdb6e0c913..8b4c57baec52 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -109,6 +109,10 @@
#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
#define SVM_VMGEXIT_PSC 0x80000010
+#define SVM_VMGEXIT_AP_CREATION 0x80000013
+#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
+#define SVM_VMGEXIT_AP_CREATE 1
+#define SVM_VMGEXIT_AP_DESTROY 2
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff

@@ -221,6 +225,7 @@
{ SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
{ SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
{ SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \
+ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \
{ SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \
{ SVM_EXIT_ERR, "invalid_guest_state" }

diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 106b4aaddfde..ddf8ced4a879 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -18,6 +18,7 @@
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/cpumask.h>

#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
@@ -30,6 +31,7 @@
#include <asm/svm.h>
#include <asm/smp.h>
#include <asm/cpu.h>
+#include <asm/apic.h>

#include "sev-internal.h"

@@ -105,6 +107,8 @@ struct ghcb_state {
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);

+static DEFINE_PER_CPU(struct sev_es_save_area *, snp_vmsa);
+
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);

@@ -858,6 +862,207 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages)
pvalidate_pages(vaddr, npages, 1);
}

+static int rmpadjust(void *va, bool vmsa)
+{
+ u64 attrs;
+ int err;
+
+ /*
+ * The RMPADJUST instruction is used to set or clear the VMSA bit for
+ * a page. A change to the VMSA bit is only performed when running
+ * at VMPL0 and is ignored at other VMPL levels. If too low of a target
+ * VMPL level is specified, the instruction can succeed without changing
+ * the VMSA bit should the kernel not be in VMPL0. Using a target VMPL
+ * level of 1 will return a FAIL_PERMISSION error if the kernel is not
+ * at VMPL0, thus ensuring that the VMSA bit has been properly set when
+ * no error is returned.
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ /* Instruction mnemonic supported in binutils versions v2.36 and later */
+ asm volatile (".byte 0xf3,0x0f,0x01,0xfe\n\t"
+ : "=a" (err)
+ : "a" (va), "c" (RMP_PG_SIZE_4K), "d" (attrs)
+ : "memory", "cc");
+
+ return err;
+}
+
+#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
+#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
+#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
+
+#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
+#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
+
+static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
+{
+ struct sev_es_save_area *cur_vmsa, *vmsa;
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int cpu, err, ret;
+ u8 sipi_vector;
+ u64 cr4;
+
+ if ((sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION) != GHCB_HV_FT_SNP_AP_CREATION)
+ return -EOPNOTSUPP;
+
+ /*
+ * Verify the desired start IP against the known trampoline start IP
+ * to catch any future new trampolines that may be introduced that
+ * would require a new protected guest entry point.
+ */
+ if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
+ "Unsupported SEV-SNP start_ip: %lx\n", start_ip))
+ return -EINVAL;
+
+ /* Override start_ip with known protected guest start IP */
+ start_ip = real_mode_header->sev_es_trampoline_start;
+
+ /* Find the logical CPU for the APIC ID */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apic_id))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ cur_vmsa = per_cpu(snp_vmsa, cpu);
+
+ /*
+ * A new VMSA is created each time because there is no guarantee that
+ * the current VMSA is the kernels or that the vCPU is not running. If
+ * an attempt was done to use the current VMSA with a running vCPU, a
+ * #VMEXIT of that vCPU would wipe out all of the settings being done
+ * here.
+ */
+ vmsa = (struct sev_es_save_area *)get_zeroed_page(GFP_KERNEL);
+ if (!vmsa)
+ return -ENOMEM;
+
+ /* CR4 should maintain the MCE value */
+ cr4 = native_read_cr4() & ~X86_CR4_MCE;
+
+ /* Set the CS value based on the start_ip converted to a SIPI vector */
+ sipi_vector = (start_ip >> 12);
+ vmsa->cs.base = sipi_vector << 12;
+ vmsa->cs.limit = 0xffff;
+ vmsa->cs.attrib = INIT_CS_ATTRIBS;
+ vmsa->cs.selector = sipi_vector << 8;
+
+ /* Set the RIP value based on start_ip */
+ vmsa->rip = start_ip & 0xfff;
+
+ /* Set VMSA entries to the INIT values as documented in the APM */
+ vmsa->ds.limit = 0xffff;
+ vmsa->ds.attrib = INIT_DS_ATTRIBS;
+ vmsa->es = vmsa->ds;
+ vmsa->fs = vmsa->ds;
+ vmsa->gs = vmsa->ds;
+ vmsa->ss = vmsa->ds;
+
+ vmsa->gdtr.limit = 0xffff;
+ vmsa->ldtr.limit = 0xffff;
+ vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
+ vmsa->idtr.limit = 0xffff;
+ vmsa->tr.limit = 0xffff;
+ vmsa->tr.attrib = INIT_TR_ATTRIBS;
+
+ vmsa->efer = 0x1000; /* Must set SVME bit */
+ vmsa->cr4 = cr4;
+ vmsa->cr0 = 0x60000010;
+ vmsa->dr7 = 0x400;
+ vmsa->dr6 = 0xffff0ff0;
+ vmsa->rflags = 0x2;
+ vmsa->g_pat = 0x0007040600070406ULL;
+ vmsa->xcr0 = 0x1;
+ vmsa->mxcsr = 0x1f80;
+ vmsa->x87_ftw = 0x5555;
+ vmsa->x87_fcw = 0x0040;
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ /* Switch the page over to a VMSA page now that it is initialized */
+ ret = rmpadjust(vmsa, true);
+ if (ret) {
+ pr_err("set VMSA page failed (%u)\n", ret);
+ free_page((unsigned long)vmsa);
+
+ return -EINVAL;
+ }
+
+ /* Issue VMGEXIT AP Creation NAE event */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_rax(ghcb, vmsa->sev_features);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+ ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
+ ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ lower_32_bits(ghcb->save.sw_exit_info_1)) {
+ pr_alert("SNP AP Creation error\n");
+ ret = -EINVAL;
+ }
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ /* Perform cleanup if there was an error */
+ if (ret) {
+ err = rmpadjust(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+
+ vmsa = NULL;
+ }
+
+ /* Free up any previous VMSA page */
+ if (cur_vmsa) {
+ err = rmpadjust(cur_vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)cur_vmsa);
+ }
+
+ /* Record the current VMSA page */
+ per_cpu(snp_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+void snp_set_wakeup_secondary_cpu(void)
+{
+ if (!sev_feature_enabled(SEV_SNP))
+ return;
+
+ /*
+ * Always set this override if SEV-SNP is enabled. This makes it the
+ * required method to start APs under SEV-SNP. If the hypervisor does
+ * not support AP creation, then no APs will be started.
+ */
+ apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit;
+}
+
int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
{
u16 startup_cs, startup_ip;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85f6e242b6b4..ca78711620e0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -82,6 +82,7 @@
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
+#include <asm/sev.h>

#ifdef CONFIG_ACPI_CPPC_LIB
#include <acpi/cppc_acpi.h>
@@ -1380,6 +1381,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
smp_quirk_init_udelay();

speculative_store_bypass_ht_init();
+
+ snp_set_wakeup_secondary_cpu();
}

void arch_thaw_secondary_cpus_begin(void)
--
2.17.1