[PATCH v2] x86/sev: Fix host kdump support for SNP

From: Ashish Kalra
Date: Tue Sep 03 2024 - 15:11:21 EST


From: Ashish Kalra <ashish.kalra@xxxxxxx>

With active SNP VMs, SNP_SHUTDOWN_EX invoked during panic notifiers causes
crashkernel boot failure with the following signature:

[ 563.497112] sysrq: Trigger a crash
[ 563.508415] Kernel panic - not syncing: sysrq triggered crash
[ 563.522002] CPU: 10 UID: 0 PID: 4661 Comm: bash Kdump: loaded Not tainted 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty #61
[ 563.549762] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM100AB 10/17/2022
[ 563.566266] Call Trace:
[ 563.576430] <TASK>
[ 563.585932] dump_stack_lvl+0x2b/0x90
[ 563.597244] dump_stack+0x14/0x20
[ 563.608141] panic+0x3b9/0x400
[ 563.618801] ? srso_alias_return_thunk+0x5/0xfbef5
[ 563.631271] sysrq_handle_crash+0x19/0x20
[ 563.642696] __handle_sysrq+0xf9/0x290
[ 563.653691] ? srso_alias_return_thunk+0x5/0xfbef5
[ 563.666126] write_sysrq_trigger+0x60/0x80
...
...
[ 564.186804] in panic
[ 564.194287] in panic_other_cpus_shutdown
[ 564.203674] kexec: in crash_smp_send_stop
[ 564.213205] kexec: in kdump_nmi_shootdown_cpus
[ 564.224338] Kernel Offset: 0x35a00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[ 564.282209] in snp_shutdown_on_panic after decommission, wbinvd + df_flush required
[ 564.462217] ccp 0000:23:00.1: SEV-SNP DF_FLUSH failed with error 14
[ 564.676920] kexec: in native_machine_crash_shutdown
early console in extract_kernel
input_data: 0x000000007410d2cc
input_len: 0x0000000000ce98b2
output: 0x0000000071600000
output_len: 0x000000000379eb8c
kernel_total_size: 0x0000000002c30000
needed_size: 0x0000000003800000
trampoline_32bit: 0x0000000000000000

Invalid physical address chosen!

Physical KASLR disabled: no suitable memory region!

Virtual KASLR using RDRAND RDTSC...

Decompressing Linux... Parsing ELF... Performing relocations... done.
Booting the kernel (entry_offset: 0x0000000000000bda).
[ 0.000000] Linux version 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty (amd@ethanolx7e2ehost) (gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld (GNU Binutils) 2.40) #61 SMP Mon Aug 19 19:59:02 UTC 2024
[ 0.000000] Command line: BOOT_IMAGE=/vmlinuz-6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty root=UUID=4b87a03b-0e78-42ca-a8ad-997e63bba4e0 ro console=tty0 console=ttyS0,115200n8 earlyprintk=ttyS0,115200n8 amd_iommu_dump=1 reset_devices systemd.unit=kdump-tools-dump.service nr_cpus=1 irqpoll nousb elfcorehdr=1916276K
[ 0.000000] KERNEL supported cpus:
...
...
[ 1.671804] AMD-Vi: Using global IVHD EFR:0x841f77e022094ace, EFR2:0x0
[ 1.679835] AMD-Vi: Translation is already enabled - trying to copy translation structures
[ 1.689363] AMD-Vi: Copied DEV table from previous kernel.
[ 1.864369] AMD-Vi: Completion-Wait loop timed out
[ 2.038289] AMD-Vi: Completion-Wait loop timed out
[ 2.212215] AMD-Vi: Completion-Wait loop timed out
[ 2.386141] AMD-Vi: Completion-Wait loop timed out
[ 2.560068] AMD-Vi: Completion-Wait loop timed out
[ 2.733997] AMD-Vi: Completion-Wait loop timed out
[ 2.907927] AMD-Vi: Completion-Wait loop timed out
[ 3.081855] AMD-Vi: Completion-Wait loop timed out
[ 3.225500] AMD-Vi: Completion-Wait loop timed out
[ 3.231083] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
d out
[ 3.579592] AMD-Vi: Completion-Wait loop timed out
[ 3.753164] AMD-Vi: Completion-Wait loop timed out
[ 3.815762] Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC
[ 3.825347] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty #61
[ 3.837188] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM100AB 10/17/2022
[ 3.846215] Call Trace:
[ 3.848939] <TASK>
[ 3.851277] dump_stack_lvl+0x2b/0x90
[ 3.855354] dump_stack+0x14/0x20
[ 3.859050] panic+0x3b9/0x400
[ 3.862454] panic_if_irq_remap+0x21/0x30
[ 3.866925] setup_IO_APIC+0x8aa/0xa50
[ 3.871106] ? __pfx_amd_iommu_enable_faulting+0x10/0x10
[ 3.877032] ? __cpuhp_setup_state+0x5e/0xd0
[ 3.881793] apic_intr_mode_init+0x6a/0xf0
[ 3.886360] x86_late_time_init+0x28/0x40
[ 3.890832] start_kernel+0x6a8/0xb50
[ 3.894914] x86_64_start_reservations+0x1c/0x30
[ 3.900064] x86_64_start_kernel+0xbf/0x110
[ 3.904729] ? setup_ghcb+0x12/0x130
[ 3.908716] common_startup_64+0x13e/0x141
[ 3.913283] </TASK>
[ 3.915715] in panic
[ 3.918149] in panic_other_cpus_shutdown
[ 3.922523] ---[ end Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC ]---

This happens as SNP_SHUTDOWN_EX fails when SNP VMs are active as the
firmware checks every encryption-capable ASID to verify that it is
not in use by a guest and a DF_FLUSH is not required. If a DF_FLUSH
is required, the firmware returns DFFLUSH_REQUIRED.

To fix this, added support to do SNP_DECOMMISSION of all active SNP VMs
in the panic notifier before doing SNP_SHUTDOWN_EX, but then
SNP_DECOMMISSION tags all CPUs on which guest has been activated to do
a WBINVD. This causes SNP_DF_FLUSH command failure with the following
flow: SNP_DECOMMISSION -> SNP_SHUTDOWN_EX -> SNP_DF_FLUSH ->
failure with WBINVD_REQUIRED.

When panic notifier is invoked all other CPUs have already been
shutdown, so it is not possible to do a wbinvd_on_all_cpus() after
SNP_DECOMMISSION has been executed. This eventually causes SNP_SHUTDOWN_EX
to fail after SNP_DECOMMISSION.

Adding fix to do SNP_DECOMMISSION and subsequent WBINVD on all CPUs
during NMI shutdown of CPUs as part of disabling virtualization on
all CPUs via cpu_emergency_disable_virtualization ->
svm_emergency_disable().

SNP_DECOMMISSION unbinds the ASID from SNP context and marks the ASID
as unusable and then transitions the SNP guest context page to a
firmware page and SNP_SHUTDOWN_EX transitions all pages associated
with the IOMMU to reclaim state which the hypervisor then transitions
to hypervisor state, all these page state changes are in the RMP
table, so there is no loss of guest data as such and the complete
host memory is captured with the crashkernel boot. There are no
processes which are being killed and host/guest memory is not
being altered or modified in any way.

This fixes and enables crashkernel/kdump on SNP host.

v2:
- rename all instances of decommision to decommission
- created a new function sev_emergency_disable() which is exported from
sev.c and calls __snp_decommission_all() to do SNP_DECOMMISSION
- added more information to commit message

Fixes: c3b86e61b756 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature")
Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
---
arch/x86/kvm/svm/sev.c | 133 +++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/svm/svm.c | 2 +
arch/x86/kvm/svm/svm.h | 3 +-
3 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 714c517dd4b7..30f286a3afb0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -16,6 +16,7 @@
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/delay.h>
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
@@ -89,6 +90,8 @@ static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;

+static DEFINE_SPINLOCK(snp_decommission_lock);
+static void **snp_asid_to_gctx_pages_map;
static int snp_decommission_context(struct kvm *kvm);

struct enc_region {
@@ -2248,6 +2251,9 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_context;
}

+ if (snp_asid_to_gctx_pages_map)
+ snp_asid_to_gctx_pages_map[sev_get_asid(kvm)] = sev->snp_context;
+
return 0;

e_free_context:
@@ -2884,9 +2890,126 @@ static int snp_decommission_context(struct kvm *kvm)
snp_free_firmware_page(sev->snp_context);
sev->snp_context = NULL;

+ if (snp_asid_to_gctx_pages_map)
+ snp_asid_to_gctx_pages_map[sev_get_asid(kvm)] = NULL;
+
return 0;
}

+static void __snp_decommission_all(void)
+{
+ struct sev_data_snp_addr data = {};
+ int ret, asid;
+
+ if (!snp_asid_to_gctx_pages_map)
+ return;
+
+ for (asid = 1; asid < min_sev_asid; asid++) {
+ if (snp_asid_to_gctx_pages_map[asid]) {
+ data.address = __sme_pa(snp_asid_to_gctx_pages_map[asid]);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ if (!ret) {
+ snp_free_firmware_page(snp_asid_to_gctx_pages_map[asid]);
+ snp_asid_to_gctx_pages_map[asid] = NULL;
+ }
+ }
+ }
+}
+
+/*
+ * NOTE: called in NMI context from svm_emergency_disable().
+ */
+void sev_emergency_disable(void)
+{
+ static atomic_t waiting_for_cpus_synchronized;
+ static bool synchronize_cpus_initiated;
+ static bool snp_decommission_handled;
+ static atomic_t cpus_synchronized;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ /*
+ * SNP_SHUTDOWN_EX fails when SNP VMs are active as the firmware checks
+ * every encryption-capable ASID to verify that it is not in use by a
+ * guest and a DF_FLUSH is not required. If a DF_FLUSH is required,
+ * the firmware returns DFFLUSH_REQUIRED. To address this, SNP_DECOMMISSION
+ * is required to shutdown all active SNP VMs, but SNP_DECOMMISSION tags all
+ * CPUs that guest was activated on to do a WBINVD. When panic notifier
+ * is invoked all other CPUs have already been shutdown, so it is not
+ * possible to do a wbinvd_on_all_cpus() after SNP_DECOMMISSION has been
+ * executed. This eventually causes SNP_SHUTDOWN_EX to fail after
+ * SNP_DECOMMISSION. To fix this, do SNP_DECOMMISSION and subsequent WBINVD
+ * on all CPUs during NMI shutdown of CPUs as part of disabling
+ * virtualization on all CPUs via cpu_emergency_disable_virtualization().
+ */
+
+ spin_lock(&snp_decommission_lock);
+
+ /*
+ * exit early for call from native_machine_crash_shutdown()
+ * as SNP_DECOMMISSION has already been done as part of
+ * NMI shutdown of the CPUs.
+ */
+ if (snp_decommission_handled) {
+ spin_unlock(&snp_decommission_lock);
+ return;
+ }
+
+ /*
+ * Synchronize all CPUs handling NMI before issuing
+ * SNP_DECOMMISSION.
+ */
+ if (!synchronize_cpus_initiated) {
+ /*
+ * one CPU handling panic, the other CPU is initiator for
+ * CPU synchronization.
+ */
+ atomic_set(&waiting_for_cpus_synchronized, num_online_cpus() - 2);
+ synchronize_cpus_initiated = true;
+ /*
+ * Ensure CPU synchronization parameters are setup before dropping
+ * the lock to let other CPUs continue to reach synchronization.
+ */
+ wmb();
+
+ spin_unlock(&snp_decommission_lock);
+
+ /*
+ * This will not cause system to hang forever as the CPU
+ * handling panic waits for maximum one second for
+ * other CPUs to stop in nmi_shootdown_cpus().
+ */
+ while (atomic_read(&waiting_for_cpus_synchronized) > 0)
+ mdelay(1);
+
+ /* Reacquire the lock once CPUs are synchronized */
+ spin_lock(&snp_decommission_lock);
+
+ atomic_set(&cpus_synchronized, 1);
+ } else {
+ atomic_dec(&waiting_for_cpus_synchronized);
+ /*
+ * drop the lock to let other CPUs contiune to reach
+ * synchronization.
+ */
+ spin_unlock(&snp_decommission_lock);
+
+ while (atomic_read(&cpus_synchronized) == 0)
+ mdelay(1);
+
+ /* Try to re-acquire lock after CPUs are synchronized */
+ spin_lock(&snp_decommission_lock);
+ }
+
+ if (!snp_decommission_handled) {
+ __snp_decommission_all();
+ snp_decommission_handled = true;
+ }
+ spin_unlock(&snp_decommission_lock);
+ wbinvd();
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -3052,6 +3175,13 @@ void __init sev_hardware_setup(void)
sev_es_supported = true;
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);

+ if (sev_snp_supported) {
+ snp_asid_to_gctx_pages_map = kmalloc_array(min_sev_asid,
+ sizeof(void *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!snp_asid_to_gctx_pages_map)
+ pr_warn("Could not allocate SNP asid to guest context map\n");
+ }
out:
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
@@ -3094,6 +3224,9 @@ void sev_hardware_unsetup(void)

misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+
+ kfree(snp_asid_to_gctx_pages_map);
+ snp_asid_to_gctx_pages_map = NULL;
}

int sev_cpu_init(struct svm_cpu_data *sd)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ecf4303422ed..899f4481107c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -597,6 +597,8 @@ static void svm_emergency_disable(void)
kvm_rebooting = true;

kvm_cpu_svm_disable();
+
+ sev_emergency_disable();
}

static void svm_hardware_disable(void)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 43fa6a16eb19..644101327c19 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -763,6 +763,7 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+void sev_emergency_disable(void);
#else
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
@@ -793,7 +794,7 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{
return 0;
}
-
+static void sev_emergency_disable(void) {}
#endif

/* vmenter.S */
--
2.34.1