Re: [PATCH 1/4] kvm: svm: Streamline VMSA setting for VCPUs

From: Tom Lendacky

Date: Tue Jun 16 2026 - 16:55:09 EST

On 6/11/26 07:35, Jörg Rödel wrote:
> From: Joerg Roedel <joerg.roedel@xxxxxxx>
>
> Streamline the VMSA setting state of vcpus, where a VMSA can be either
> KVM-allocated or guest-provided. This consolidates the various
> tracking state around VMSAs.
>
> Signed-off-by: Joerg Roedel <joerg.roedel@xxxxxxx>
> ---
> arch/x86/kvm/svm/sev.c | 301 ++++++++++++++++++++++++++++-------------
> arch/x86/kvm/svm/svm.h | 31 ++++-
> 2 files changed, 237 insertions(+), 95 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 6c6a6d663e29..9b1280222e20 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -147,6 +147,9 @@ static bool sev_snp_guest(struct kvm *kvm)
> }
>
> static int snp_decommission_context(struct kvm *kvm);
> +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level);
> +static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va);
> +static int snp_page_reclaim(struct kvm *kvm, u64 pfn);

Can this be worked so that we don't add more forward declarations?

>
> struct enc_region {
> struct list_head list;
> @@ -156,6 +159,173 @@ struct enc_region {
> unsigned long size;
> };
>
> +static void *sev_es_vmsa_ref(struct kvm_vcpu *vcpu)

sev_es_get_kvm_shared_vmsa ?

> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + void *vmsa = NULL;
> +
> + if (svm->sev_es.vmsa.vmsa_state == VMSA_SHARED) {
> + vmsa = page_address(svm->sev_es.vmsa.vmsa_page);
> + }
> +
> + return vmsa;

How about just:

if (svm->sev_es.vmsa.vmsa_state == VMSA_SHARED)
return page_address(svm->sev_es.vmsa.vmsa_page);

return NULL;

> +}
> +
> +static int sev_es_vcpu_alloc_vmsa(struct kvm_vcpu *vcpu)

sev_es_alloc_kvm_vmsa ?

> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + struct page *vmsa_page;
> +
> + if (WARN_ON_ONCE(svm->sev_es.vmsa.vmsa_state != VMSA_NONE))
> + return -EINVAL;
> +
> + /*
> + * SEV-ES guests require a separate (from the VMCB) VMSA page used to
> + * contain the encrypted register state of the guest.
> + */
> + vmsa_page = snp_safe_alloc_page();
> + if (!vmsa_page)
> + return -ENOMEM;
> +
> + svm->sev_es.vmsa.vmsa_state = VMSA_SHARED;
> + svm->sev_es.vmsa.vmsa_page = vmsa_page;
> +
> + return 0;
> +}
> +
> +static int sev_es_vcpu_vmsa_make_private(struct kvm_vcpu *vcpu)

sev_es_make_kvm_vmsa_private ?

> +{
> + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
> + struct vcpu_svm *svm = to_svm(vcpu);
> + void *vmsa = sev_es_vmsa_ref(vcpu);
> +
> + if (!vmsa)
> + return -EINVAL;
> +
> + if (is_sev_snp_guest(vcpu)) {
> + u64 pfn = __pa(vmsa) >> PAGE_SHIFT;
> + int ret;
> +
> + /* Transition the VMSA page to a firmware state. */
> + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
> + if (ret)
> + return ret;
> + }
> +
> + svm->sev_es.vmsa.vmsa_state = VMSA_PRIVATE;
> +
> + return 0;
> +}
> +
> +static void sev_es_vcpu_free_vmsa(struct kvm_vcpu *vcpu)

sev_es_free_kvm_vmsa ?

> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + void *vmsa_ptr;

s/vmsa_ptr/vmsa/

> +
> + switch (svm->sev_es.vmsa.vmsa_state) {
> + case VMSA_NONE:
> + case VMSA_GUEST:
> + break;
> + case VMSA_PRIVATE:
> + vmsa_ptr = page_address(svm->sev_es.vmsa.vmsa_page);
> +
> + if (is_sev_snp_guest(vcpu)) {
> + u64 pfn = __pa(vmsa_ptr) >> PAGE_SHIFT;

PHYS_PFN(__pa(vmsa));

> +
> + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) {
> + pr_err("Failed to make VMSA page shared - leaking it to avoid re-use\n");
> + goto out;

s/goto out/break/
> + }
> + }
> +
> + if (vcpu->arch.guest_state_protected)
> + sev_flush_encrypted_page(vcpu, vmsa_ptr);
> +
> + fallthrough;
> + case VMSA_SHARED:
> + __free_page(svm->sev_es.vmsa.vmsa_page);
> + break;
> + default:
> + BUG();

WARN_ON or WARN_ON_ONCE() instead of BUG().
> + }
> +out:

If using the 'break' above, then no need for this label.

> +
> + svm->sev_es.vmsa.vmsa_page = NULL;
> + svm->sev_es.vmsa.vmsa_state = VMSA_NONE;
> +}
> +
> +static void sev_snp_vcpu_reclaim_vmsa(struct kvm_vcpu *vcpu)

sev_snp_reclaim_kvm_vmsa ?

> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + void *vmsa_ptr;

s/vmsa_ptr/vmsa/

> + u64 pfn;
> +
> + if (WARN_ON_ONCE(!is_sev_snp_guest(vcpu) ||
> + svm->sev_es.vmsa.vmsa_state != VMSA_PRIVATE))
> + return;
> +
> + vmsa_ptr = page_address(svm->sev_es.vmsa.vmsa_page);
> + pfn = __pa(vmsa_ptr) >> PAGE_SHIFT;

PHYS_PFN()

> +
> + if (!snp_page_reclaim(vcpu->kvm, pfn))
> + __free_page(svm->sev_es.vmsa.vmsa_page);

Should you issue a message here similar to above about leaking the page
if snp_page_reclaim() fails?

> +
> + svm->sev_es.vmsa.vmsa_page = NULL;
> + svm->sev_es.vmsa.vmsa_state = VMSA_NONE;
> +}
> +
> +static void sev_es_set_guest_vmsa(struct kvm_vcpu *vcpu, gpa_t vmsa_gpa)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + sev_es_vcpu_free_vmsa(vcpu);
> +
> + svm->sev_es.vmsa.vmsa_state = VMSA_GUEST;
> + svm->sev_es.vmsa.vmsa_gpa = vmsa_gpa;
> +}
> +
> +static u64 sev_es_vmsa_pa(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + enum vmsa_state vmsa_state = svm->sev_es.vmsa.vmsa_state;
> + u64 vmsa_pa = INVALID_PAGE;
> +
> + if (vmsa_state == VMSA_GUEST) {

Would a switch statement like you have in sev_es_vcpu_free_vmsa() be
more consistent?

Also, you could return INVALID_PAGE directly instead of having the goto's.

switch (svm->sev_es.vmsa.vmsa_state) {
case VMSA_NONE:
return INVALID_PAGE;

case VMSA_SHARED:
case VMSA_PRIVATE:
return __pa(page_address(svm->sev_es.vmsa.vmsa_page));

case VMSA_GUEST:
...
}

> + gpa_t vmsa_gpa = svm->sev_es.vmsa.vmsa_gpa;
> + struct kvm_memory_slot *slot;
> + struct page *page;
> + kvm_pfn_t pfn;
> + gfn_t gfn;
> +
> + gfn = gpa_to_gfn(vmsa_gpa);
> +
> + slot = gfn_to_memslot(vcpu->kvm, gfn);
> + if (!slot)
> + goto out;
> +
> + /*
> + * The new VMSA will be private memory guest memory, so retrieve the
> + * PFN from the gmem backend.
> + */
> + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
> + goto out;
> +
> + vmsa_pa = pfn_to_hpa(pfn);
> +
> + /*
> + * gmem pages aren't currently migratable, but if this ever changes
> + * then care should be taken to ensure the guest vmsa is pinned
> + * through some other means.
> + */
> + kvm_release_page_clean(page);
> + } else if (vmsa_state == VMSA_PRIVATE || vmsa_state == VMSA_SHARED) {
> + vmsa_pa = __pa(page_address(svm->sev_es.vmsa.vmsa_page));
> + }
> +
> +out:
> + return vmsa_pa;
> +}
> +
> /* Called with the sev_bitmap_lock held, or on shutdown */
> static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
> {
> @@ -925,7 +1095,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
> {
> struct kvm_vcpu *vcpu = &svm->vcpu;
> struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
> - struct sev_es_save_area *save = svm->sev_es.vmsa;
> + struct sev_es_save_area *save = sev_es_vmsa_ref(vcpu);
> struct xregs_state *xsave;
> const u8 *s;
> u8 *d;
> @@ -1026,6 +1196,7 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
> {
> struct sev_data_launch_update_vmsa vmsa;
> struct vcpu_svm *svm = to_svm(vcpu);
> + void *vmsa_ref = sev_es_vmsa_ref(vcpu);

Might make more sense rename the sev_data_launch_update_vmsa variable to
vmsa_lu and keep this as just vmsa ?\

> int ret;
>
> if (vcpu->guest_debug) {
> @@ -1043,15 +1214,19 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
> * the VMSA memory content (i.e it will write the same memory region
> * with the guest's key), so invalidate it first.
> */
> - clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
> + clflush_cache_range(vmsa_ref, PAGE_SIZE);
>
> vmsa.reserved = 0;
> vmsa.handle = to_kvm_sev_info(kvm)->handle;
> - vmsa.address = __sme_pa(svm->sev_es.vmsa);
> + vmsa.address = __sme_pa(vmsa_ref);
> vmsa.len = PAGE_SIZE;
> ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
> if (ret)
> - return ret;
> + goto free_vmsa;
> +
> + ret = sev_es_vcpu_vmsa_make_private(vcpu);
> + if (ret)
> + goto free_vmsa;

Similar to the SNP path, you can probably move this to before the
LAUNCH_UPDATE, just for consistency.
>
> /*
> * SEV-ES guests maintain an encrypted version of their FPU
> @@ -1069,7 +1244,13 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
> * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
> */
> svm_enable_lbrv(vcpu);
> +
> return 0;
> +
> +free_vmsa:
> + sev_es_vcpu_free_vmsa(vcpu);
> +
> + return ret;
> }
>
> static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
> @@ -2508,23 +2689,22 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
>
> kvm_for_each_vcpu(i, vcpu, kvm) {
> struct vcpu_svm *svm = to_svm(vcpu);
> - u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
> + void *vmsa = sev_es_vmsa_ref(vcpu);
>
> ret = sev_es_sync_vmsa(svm);
> if (ret)
> goto out;
>
> - /* Transition the VMSA page to a firmware state. */
> - ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
> + ret = sev_es_vcpu_vmsa_make_private(vcpu);
> if (ret)
> goto out;
>
> /* Issue the SNP command to encrypt the VMSA */
> - data.address = __sme_pa(svm->sev_es.vmsa);
> + data.address = __sme_pa(vmsa);
> ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
> &data, &argp->error);
> if (ret) {
> - snp_page_reclaim(kvm, pfn);
> + sev_snp_vcpu_reclaim_vmsa(vcpu);
>
> goto out;
> }

It might be nice to break the loop contents out into its own function
like is done with ES. I haven't looked ahead in the series to see if
that would make it easier to supply/measure a single VMSA or not, though.

> @@ -3593,31 +3773,13 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
>
> void sev_free_vcpu(struct kvm_vcpu *vcpu)
> {
> - struct vcpu_svm *svm;
> + struct vcpu_svm *svm = to_svm(vcpu);

You can probably reduce the churn here by keeping this line unchanged
and not deleting the assignment below.

>
> if (!is_sev_es_guest(vcpu))
> return;
>
> - svm = to_svm(vcpu);
> -
> - /*
> - * If it's an SNP guest, then the VMSA was marked in the RMP table as
> - * a guest-owned page. Transition the page to hypervisor state before
> - * releasing it back to the system.
> - */
> - if (is_sev_snp_guest(vcpu)) {
> - u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
> -
> - if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
> - goto skip_vmsa_free;
> - }
> -
> - if (vcpu->arch.guest_state_protected)
> - sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
> -
> - __free_page(virt_to_page(svm->sev_es.vmsa));
> + sev_es_vcpu_free_vmsa(vcpu);
>
> -skip_vmsa_free:
> __sev_es_unmap_ghcb(svm);
> }
>
> @@ -4067,10 +4229,7 @@ static int snp_begin_psc(struct vcpu_svm *svm)
> static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct kvm_memory_slot *slot;
> - struct page *page;
> - kvm_pfn_t pfn;
> - gfn_t gfn;
> + u64 vmsa_pa;
>
> guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
>
> @@ -4092,46 +4251,17 @@ static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
> */
> vmcb_mark_all_dirty(svm->vmcb);
>
> - if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
> - return;
> -
> - gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
> - svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
> + sev_es_set_guest_vmsa(vcpu, svm->sev_es.req_vmsa_gpa);
> + vmsa_pa = sev_es_vmsa_pa(vcpu);
>
> - slot = gfn_to_memslot(vcpu->kvm, gfn);
> - if (!slot)
> + if (!VALID_PAGE(vmsa_pa))
> return;
>
> - /*
> - * The new VMSA will be private memory guest memory, so retrieve the
> - * PFN from the gmem backend.
> - */
> - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
> - return;
> -
> - /*
> - * From this point forward, the VMSA will always be a guest-mapped page
> - * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
> - * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
> - * that involves cleanups like flushing caches, which would ideally be
> - * handled during teardown rather than guest boot. Deferring that also
> - * allows the existing logic for SEV-ES VMSAs to be re-used with
> - * minimal SNP-specific changes.
> - */
> - svm->sev_es.snp_has_guest_vmsa = true;
> -
> /* Use the new VMSA */
> - svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
> + svm->vmcb->control.vmsa_pa = vmsa_pa;
>
> /* Mark the vCPU as runnable */
> kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
> -
> - /*
> - * gmem pages aren't currently migratable, but if this ever changes
> - * then care should be taken to ensure svm->sev_es.vmsa is pinned
> - * through some other means.
> - */
> - kvm_release_page_clean(page);
> }
>
> static int sev_snp_ap_creation(struct vcpu_svm *svm)
> @@ -4187,10 +4317,10 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
> return -EINVAL;
> }
>
> - target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
> + target_svm->sev_es.req_vmsa_gpa = svm->vmcb->control.exit_info_2;
> break;
> case SVM_VMGEXIT_AP_DESTROY:
> - target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
> + target_svm->sev_es.req_vmsa_gpa = INVALID_PAGE;
> break;
> default:
> vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
> @@ -4708,20 +4838,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
> struct vmcb *vmcb = svm->vmcb01.ptr;
>
> svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES;
> -
> - /*
> - * An SEV-ES guest requires a VMSA area that is a separate from the
> - * VMCB page. Do not include the encryption mask on the VMSA physical
> - * address since hardware will access it using the guest key. Note,
> - * the VMSA will be NULL if this vCPU is the destination for intrahost
> - * migration, and will be copied later.
> - */
> - if (!svm->sev_es.snp_has_guest_vmsa) {
> - if (svm->sev_es.vmsa)
> - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
> - else
> - svm->vmcb->control.vmsa_pa = INVALID_PAGE;
> - }
> + svm->vmcb->control.vmsa_pa = sev_es_vmsa_pa(&svm->vcpu);
>
> if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
> svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
> @@ -4797,7 +4914,7 @@ void sev_init_vmcb(struct vcpu_svm *svm, bool init_event)
> int sev_vcpu_create(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> - struct page *vmsa_page;
> + int ret;
>
> mutex_init(&svm->sev_es.snp_vmsa_mutex);
>
> @@ -4808,11 +4925,9 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu)
> * SEV-ES guests require a separate (from the VMCB) VMSA page used to
> * contain the encrypted register state of the guest.
> */
> - vmsa_page = snp_safe_alloc_page();
> - if (!vmsa_page)
> - return -ENOMEM;
> -
> - svm->sev_es.vmsa = page_address(vmsa_page);
> + ret = sev_es_vcpu_alloc_vmsa(vcpu);
> + if (ret)
> + return ret;
>
> vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm);
>
> @@ -5227,12 +5342,14 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
> if (!is_sev_es_guest(vcpu))
> return NULL;
>
> + vmsa = sev_es_vmsa_ref(vcpu);
> +
> /*
> * If the VMSA has not yet been encrypted, return a pointer to the
> * current un-encrypted VMSA.
> */
> - if (!vcpu->arch.guest_state_protected)
> - return (struct vmcb_save_area *)svm->sev_es.vmsa;
> + if (vmsa)
> + return vmsa;
>
> sev = to_kvm_sev_info(vcpu->kvm);
>
> @@ -5303,8 +5420,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
>
> void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
> {
> + struct vmcb_save_area *vmsa_ptr = sev_es_vmsa_ref(vcpu);
> +
> /* If the VMSA has not yet been encrypted, nothing was allocated */
> - if (!vcpu->arch.guest_state_protected || !vmsa)
> + if (vmsa == vmsa_ptr)

Can't this just be

if (sev_es_vmsa_ref(vcpu))

And then you save defining a new variable.

> return;
>
> free_page((unsigned long)vmsa);
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 5137416be593..3d4799f09b23 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -240,9 +240,29 @@ struct svm_nested_state {
> bool force_msr_bitmap_recalc;
> };
>
> +enum vmsa_state {
> + /* No VMSA set */
> + VMSA_NONE,
> + /* VMSA allocated by KVM - Shared in RMP (if applicable) */
> + VMSA_SHARED,

VMSA_KVM_SHARED

> + /* VMSA allocated by KVM - Guest-private in RMP (SEV-SNP only) */
> + VMSA_PRIVATE,

VMSA_KVM_PRIVATE

> + /* Guest-owned VMSA */
> + VMSA_GUEST,

VMSA_GUEST_OWNED

This way ownership of the page becomes more obvious when used in the code.

> +};
> +
> +struct sev_es_vmsa_state {
> + enum vmsa_state vmsa_state;
> + union {
> + /* state == (KVM_SHARED || KVM_PRIVATE) */
> + struct page *vmsa_page;
> + /* state == GUEST */
> + gpa_t vmsa_gpa;
> + };
> +};
> +
> struct vcpu_sev_es_state {
> /* SEV-ES support */
> - struct sev_es_save_area *vmsa;
> struct ghcb *ghcb;
> u8 valid_bitmap[16];
> struct kvm_host_map ghcb_map;
> @@ -266,10 +286,13 @@ struct vcpu_sev_es_state {
>
> u64 ghcb_registered_gpa;
>
> - struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
> - gpa_t snp_vmsa_gpa;
> + /* VMSA related state */
> + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
> + struct sev_es_vmsa_state vmsa; /* VMSA currently used by the VCPU */
> + gpa_t req_vmsa_gpa; /* Requested new VMSA GPA */
> +
> + bool snp_ap_runnable;

I don't see this used in the patch anywhere.

Thanks,
Tom

> bool snp_ap_waiting_for_reset;
> - bool snp_has_guest_vmsa;
> };
>
> struct vcpu_svm {