Re: [RFC PATCH v2 01/10] x86/sgx: Introduce mechanism to prevent new initializations of EPC pages
From: Jarkko Sakkinen
Date: Thu Mar 17 2022 - 00:55:46 EST
On Tue, Mar 15, 2022 at 09:02:51AM +0800, Cathy Zhang wrote:
> == Background ==
>
> EUPDATESVN is a new SGX instruction which allows enclave attestation
> to include information about updated microcode without a reboot.
>
> The SGX hardware maintains metadata for each enclave page to help
> enforce its security guarantees. This includes things like a record
> of the enclave to which the page belongs and the type of the page:
> SGX metadata like "VA" or "SECS" pages, or regular enclave pages
> like those that store user data.
>
> Before an EUPDATESVN operation can succeed, all SGX memory (a.k.a.
> EPC) must be marked as "unused" in the SGX hardware metadata (a.k.a.
> EPCM). The SGX microcode now maintains a reference count of unused
> pages to help determine when all pages have reached the "unused"
> state.
>
> Both bare-metal and KVM guest EPC must be made unused. To increase
> the chance of a successful EUPDATESVN, the kernel prevents existing
> enclaves from creating new, valid pages and prevents new enclave
> creation (creating an enclave involves initializing a "SECS" page).
>
> The entire EUPDATESVN process is very slow since it potentially
> affects gigabytes of enclave memory and can take seconds or minutes
> to complete. Userspace may encounter -EBUSY errors during the update
> and is expected to retry.
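> 
> For example, a userspace loader could retry enclave creation along the
> following lines (an illustrative sketch, not part of this patch; the
> file descriptor, SECS buffer and retry delay are placeholders):
> 
>         struct sgx_enclave_create create_arg = {
>                 .src = (__u64)(uintptr_t)secs_buf,
>         };
> 
>         while (ioctl(enclave_fd, SGX_IOC_ENCLAVE_CREATE, &create_arg)) {
>                 if (errno != EBUSY)
>                         break;                  /* real failure, give up */
>                 usleep(100 * 1000);             /* wait out the SVN update */
>         }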
>
> == Patch contents ==
>
> Introduce mechanism to prevent new initializations of EPC pages.
>
> Use a flag to indicate when SGX EPC pages are "locked", meaning that
> no new EPC pages may be allocated for use. Check the flag in all
> paths that can initialize an EPC page. Use SRCU to ensure that the
> flag is visible across the system before proceeding with an update.
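> 
> Conceptually, the update path sets the flag and then waits for every
> in-flight SRCU reader to drain before touching EPC (a sketch of the
> scheme; the actual helpers are added to sgx/main.c below):
> 
>         /* Writer side, run before starting the SVN update: */
>         sgx_epc_locked = true;
>         /*
>          * After this returns, every EPC-allocating path that started
>          * before the flag was set has left its SRCU read-side section.
>          */
>         synchronize_srcu(&sgx_lock_epc_srcu);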
>
> Add checks to all sites that call SGX instructions that can transition
> pages from unused to initialized to ensure that the SRCU lock is held.
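> 
> Every such call site follows the same pattern (a minimal sketch using
> the names introduced by this patch; the real call sites are in the
> diff below):
> 
>         int srcu_idx;
> 
>         srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
>         if (sgx_epc_is_locked()) {
>                 /* An SVN update is in progress: back off, typically with -EBUSY. */
>                 srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
>                 return -EBUSY;
>         }
> 
>         /* ... run the ENCLS leaf that initializes the EPC page ... */
> 
>         srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);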
>
> Signed-off-by: Cathy Zhang <cathy.zhang@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/sgx/encls.h | 10 +++++++
> arch/x86/kernel/cpu/sgx/sgx.h | 3 ++
> arch/x86/kernel/cpu/sgx/encl.c | 25 +++++++++++++++--
> arch/x86/kernel/cpu/sgx/ioctl.c | 50 ++++++++++++++++++++++++++++++++-
> arch/x86/kernel/cpu/sgx/main.c | 37 ++++++++++++++++++++++++
> arch/x86/kernel/cpu/sgx/virt.c | 20 +++++++++++++
> 6 files changed, 142 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
> index 99004b02e2ed..3f1797ec2445 100644
> --- a/arch/x86/kernel/cpu/sgx/encls.h
> +++ b/arch/x86/kernel/cpu/sgx/encls.h
> @@ -139,6 +139,8 @@ static inline bool encls_failed(int ret)
> /* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */
> static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
> {
> + lockdep_assert_held(&sgx_lock_epc_srcu);
> +
> return __encls_2(ECREATE, pginfo, secs);
> }
>
> @@ -154,6 +156,8 @@ static inline int __eextend(void *secs, void *addr)
> */
> static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
> {
> + lockdep_assert_held(&sgx_lock_epc_srcu);
> +
> return __encls_2(EADD, pginfo, addr);
> }
>
> @@ -191,6 +195,8 @@ static inline int __etrack(void *addr)
> static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
> void *va)
> {
> + lockdep_assert_held(&sgx_lock_epc_srcu);
> +
> return __encls_ret_3(ELDU, pginfo, addr, va);
> }
>
> @@ -205,6 +211,8 @@ static inline int __epa(void *addr)
> {
> unsigned long rbx = SGX_PAGE_TYPE_VA;
>
> + lockdep_assert_held(&sgx_lock_epc_srcu);
> +
> return __encls_2(EPA, rbx, addr);
> }
>
> @@ -230,6 +238,8 @@ static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
> /* Zero a page of EPC memory and add it to an initialized enclave. */
> static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
> {
> + lockdep_assert_held(&sgx_lock_epc_srcu);
> +
> return __encls_2(EAUG, pginfo, addr);
> }
>
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index 85cbf103b0dd..dc8cb58100e3 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -104,4 +104,7 @@ static inline int __init sgx_vepc_init(void)
>
> void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
>
> +extern struct srcu_struct sgx_lock_epc_srcu;
> +bool sgx_epc_is_locked(void);
> +
> #endif /* _X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index a0255d41e075..d2b428992910 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -264,6 +264,7 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
> unsigned long phys_addr;
> struct sgx_encl *encl;
> vm_fault_t ret;
> + int srcu_idx;
>
> encl = vma->vm_private_data;
>
> @@ -275,6 +276,12 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
> if (unlikely(!encl))
> return VM_FAULT_SIGBUS;
>
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return VM_FAULT_SIGBUS;
> + }
> +
> /*
> * The page_array keeps track of all enclave pages, whether they
> * are swapped out or not. If there is no entry for this page and
> @@ -283,14 +290,18 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
> * enclave that will be checked for right away.
> */
> if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
> - (!xa_load(&encl->page_array, PFN_DOWN(addr))))
> - return sgx_encl_eaug_page(vma, encl, addr);
> + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) {
> + ret = sgx_encl_eaug_page(vma, encl, addr);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return ret;
> + }
>
> mutex_lock(&encl->lock);
>
> entry = sgx_encl_load_page(encl, addr);
> if (IS_ERR(entry)) {
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
>
> if (PTR_ERR(entry) == -EBUSY)
> return VM_FAULT_NOPAGE;
> @@ -315,12 +326,14 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
> vm_get_page_prot(page_prot_bits));
> if (ret != VM_FAULT_NOPAGE) {
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
>
> return VM_FAULT_SIGBUS;
> }
>
> sgx_encl_test_and_clear_young(vma->vm_mm, entry);
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
>
> return VM_FAULT_NOPAGE;
> }
> @@ -513,6 +526,7 @@ static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
> struct sgx_encl_page *entry = NULL;
> char data[sizeof(unsigned long)];
> unsigned long align;
> + int srcu_idx;
> int offset;
> int cnt;
> int ret = 0;
> @@ -529,6 +543,12 @@ static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
> return -EFAULT;
>
> for (i = 0; i < len; i += cnt) {
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + ret = -EBUSY;
> + goto out;
> + }
> +
> entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK);
> if (IS_ERR(entry)) {
> ret = PTR_ERR(entry);
> @@ -555,6 +575,7 @@ static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
>
> out:
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
>
> if (ret)
> break;
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index d8c3c07badb3..da3b569a10bd 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -147,6 +147,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
> static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
> {
> struct sgx_enclave_create create_arg;
> + int srcu_idx;
> void *secs;
> int ret;
>
> @@ -162,9 +163,18 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
>
> if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE))
> ret = -EFAULT;
> - else
> + else {
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return -EBUSY;
> + }
> +
> ret = sgx_encl_create(encl, secs);
>
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + }
> +
> kfree(secs);
> return ret;
> }
> @@ -444,6 +454,7 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
> struct sgx_enclave_add_pages add_arg;
> struct sgx_secinfo secinfo;
> unsigned long c;
> + int srcu_idx;
> int ret;
>
> if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
> @@ -477,8 +488,18 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
> if (need_resched())
> cond_resched();
>
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + ret = -EBUSY;
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + break;
> + }
> +
> ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
> &secinfo, add_arg.flags);
> +
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> +
> if (ret)
> break;
> }
> @@ -952,6 +973,7 @@ static long sgx_enclave_restrict_perm(struct sgx_encl *encl,
> unsigned long addr;
> unsigned long c;
> void *epc_virt;
> + int srcu_idx;
> int ret;
>
> memset(&secinfo, 0, sizeof(secinfo));
> @@ -960,6 +982,12 @@ static long sgx_enclave_restrict_perm(struct sgx_encl *encl,
> vm_prot = vm_prot_from_secinfo(secinfo_perm);
>
> for (c = 0 ; c < modp->length; c += PAGE_SIZE) {
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return -EBUSY;
> + }
> +
> addr = encl->base + modp->offset + c;
>
> sgx_direct_reclaim();
> @@ -1049,6 +1077,7 @@ static long sgx_enclave_restrict_perm(struct sgx_encl *encl,
>
> sgx_mark_page_reclaimable(entry->epc_page);
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> }
>
> ret = 0;
> @@ -1060,6 +1089,7 @@ static long sgx_enclave_restrict_perm(struct sgx_encl *encl,
> sgx_mark_page_reclaimable(entry->epc_page);
> out_unlock:
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> out:
> modp->count = c;
>
> @@ -1143,6 +1173,7 @@ static long sgx_enclave_modt(struct sgx_encl *encl,
> unsigned long addr;
> unsigned long c;
> void *epc_virt;
> + int srcu_idx;
> int ret;
>
> /*
> @@ -1156,6 +1187,12 @@ static long sgx_enclave_modt(struct sgx_encl *encl,
> secinfo.flags = page_type << 8;
>
> for (c = 0 ; c < modt->length; c += PAGE_SIZE) {
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return -EBUSY;
> + }
> +
> addr = encl->base + modt->offset + c;
>
> sgx_direct_reclaim();
> @@ -1255,6 +1292,7 @@ static long sgx_enclave_modt(struct sgx_encl *encl,
> entry->type = page_type;
>
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> }
>
> ret = 0;
> @@ -1265,6 +1303,7 @@ static long sgx_enclave_modt(struct sgx_encl *encl,
> entry->vm_run_prot_bits = run_prot_restore;
> out_unlock:
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> out:
> modt->count = c;
>
> @@ -1350,12 +1389,19 @@ static long sgx_encl_remove_pages(struct sgx_encl *encl,
> unsigned long addr;
> unsigned long c;
> void *epc_virt;
> + int srcu_idx;
> int ret;
>
> memset(&secinfo, 0, sizeof(secinfo));
> secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
>
> for (c = 0 ; c < params->length; c += PAGE_SIZE) {
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return -EBUSY;
> + }
> +
> addr = encl->base + params->offset + c;
>
> sgx_direct_reclaim();
> @@ -1411,6 +1457,7 @@ static long sgx_encl_remove_pages(struct sgx_encl *encl,
> kfree(entry);
>
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> }
>
> ret = 0;
> @@ -1418,6 +1465,7 @@ static long sgx_encl_remove_pages(struct sgx_encl *encl,
>
> out_unlock:
> mutex_unlock(&encl->lock);
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> out:
> params->count = c;
>
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 545da16bb3ea..99c86b77ca8f 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -23,6 +23,17 @@ static int sgx_nr_epc_sections;
> static struct task_struct *ksgxd_tsk;
> static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
> static DEFINE_XARRAY(sgx_epc_address_space);
> +/*
> + * The flag sgx_epc_locked prevents any new SGX flows that
> + * may attempt to allocate a new EPC page.
> + */
> +static bool __rcu sgx_epc_locked;
> +/*
> + * By synchronizing around sgx_epc_locked SRCU ensures that any executing
> + * SGX flows have completed before proceeding with an SVN update. New SGX flows
> + * will be prevented from starting during an SVN update.
> + */
> +DEFINE_SRCU(sgx_lock_epc_srcu);
>
> /*
> * These variables are part of the state of the reclaimer, and must be accessed
> @@ -378,6 +389,8 @@ void sgx_direct_reclaim(void)
>
> static int ksgxd(void *p)
> {
> + int srcu_idx;
> +
> set_freezable();
>
> /*
> @@ -398,9 +411,15 @@ static int ksgxd(void *p)
> kthread_should_stop() ||
> sgx_should_reclaim(SGX_NR_HIGH_PAGES));
>
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked())
> + goto maybe_resched;
> +
> if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
> sgx_reclaim_pages();
>
> +maybe_resched:
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> cond_resched();
> }
>
> @@ -943,3 +962,21 @@ static int __init sgx_init(void)
> }
>
> device_initcall(sgx_init);
> +
> +static void sgx_lock_epc(void)
> +{
> + sgx_epc_locked = true;
> + synchronize_srcu(&sgx_lock_epc_srcu);
> +}
> +
> +static void sgx_unlock_epc(void)
> +{
> + sgx_epc_locked = false;
> + synchronize_srcu(&sgx_lock_epc_srcu);
> +}
> +
> +bool sgx_epc_is_locked(void)
> +{
> + lockdep_assert_held(&sgx_lock_epc_srcu);
> + return sgx_epc_locked;
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> index 6a77a14eee38..e953816d7c8b 100644
> --- a/arch/x86/kernel/cpu/sgx/virt.c
> +++ b/arch/x86/kernel/cpu/sgx/virt.c
> @@ -75,10 +75,21 @@ static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> struct sgx_vepc *vepc = vma->vm_private_data;
> + int srcu_idx;
> int ret;
>
> mutex_lock(&vepc->lock);
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> +
> + if (sgx_epc_is_locked()) {
> + ret = -EBUSY;
> + goto out_unlock;
> + }
> +
> ret = __sgx_vepc_fault(vepc, vma, vmf->address);
> +
> +out_unlock:
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> mutex_unlock(&vepc->lock);
>
> if (!ret)
> @@ -331,6 +342,7 @@ int __init sgx_vepc_init(void)
> int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
> int *trapnr)
> {
> + int srcu_idx;
> int ret;
>
> /*
> @@ -347,6 +359,12 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
> if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
> return -EINVAL;
>
> + srcu_idx = srcu_read_lock(&sgx_lock_epc_srcu);
> + if (sgx_epc_is_locked()) {
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> + return -EBUSY;
> + }
> +
> __uaccess_begin();
> ret = __ecreate(pageinfo, (void *)secs);
> __uaccess_end();
> @@ -356,6 +374,8 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
> return -EFAULT;
> }
>
> + srcu_read_unlock(&sgx_lock_epc_srcu, srcu_idx);
> +
> /* ECREATE doesn't return an error code, it faults or succeeds. */
> WARN_ON_ONCE(ret);
> return 0;
> --
> 2.17.1
>
This series is broken: it does not apply on top of tip/x86/sgx. Also,
you forgot to CC me.

BR, Jarkko