Re: [PATCH v8 08/15] x86/sgx: Implement EPC reclamation flows for cgroup

From: Jarkko Sakkinen
Date: Thu Feb 01 2024 - 18:39:22 EST


On Tue Jan 30, 2024 at 4:09 AM EET, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@xxxxxxxxxxxxxxx>
>
> Implement the reclamation flow for cgroups, encapsulated in the top-level
> function sgx_epc_cgroup_reclaim_pages(). It does a pre-order walk of its
> subtree and calls sgx_reclaim_pages() at each node, passing in the LRU of
> that node. It keeps track of the total number of pages reclaimed and the
> number of pages left to attempt, and stops the walk once the desired
> number of pages has been attempted.
>
> In some contexts, e.g. page fault handling, only asynchronous
> reclamation is allowed. Create a workqueue, a corresponding work item and
> the function definitions needed to support asynchronous reclamation. Both
> the synchronous and asynchronous flows invoke the same top-level reclaim
> function, and will be triggered later by sgx_epc_cgroup_try_charge()
> when the cgroup's usage is at or near its limit.
>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Signed-off-by: Kristen Carlson Accardi <kristen@xxxxxxxxxxxxxxx>
> Co-developed-by: Haitao Huang <haitao.huang@xxxxxxxxxxxxxxx>
> Signed-off-by: Haitao Huang <haitao.huang@xxxxxxxxxxxxxxx>
> ---
> V8:
> - Remove alignment for substructure variables. (Jarkko)
>
> V7:
> - Split this out from the big patch, #10 in V6. (Dave, Kai)
> ---
> arch/x86/kernel/cpu/sgx/epc_cgroup.c | 174 ++++++++++++++++++++++++++-
> arch/x86/kernel/cpu/sgx/epc_cgroup.h | 3 +
> 2 files changed, 176 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> index eac8548164de..8858a0850f8a 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> @@ -7,9 +7,173 @@
>
> static struct sgx_epc_cgroup epc_cg_root;
>
> +static struct workqueue_struct *sgx_epc_cg_wq;

I'd document this with a comment.

I missed this for "epc_cg_root", which should get the same treatment.

Just a brief 1-2 line thing, no need for prose here. Something to remind
the reader what it is later on.
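
Something like this would do, for example (the names are from your patch,
the comment wording is just a sketch, use whatever you prefer):

	/* EPC cgroup instance bound to the misc cgroup root. */
	static struct sgx_epc_cgroup epc_cg_root;

	/* Workqueue serving the per-cgroup asynchronous reclamation work items. */
	static struct workqueue_struct *sgx_epc_cg_wq;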

> +
> +static inline u64 sgx_epc_cgroup_page_counter_read(struct sgx_epc_cgroup *epc_cg)
> +{
> + return atomic64_read(&epc_cg->cg->res[MISC_CG_RES_SGX_EPC].usage) / PAGE_SIZE;
> +}
> +
> +static inline u64 sgx_epc_cgroup_max_pages(struct sgx_epc_cgroup *epc_cg)
> +{
> + return READ_ONCE(epc_cg->cg->res[MISC_CG_RES_SGX_EPC].max) / PAGE_SIZE;
> +}
> +
> +/*
> + * Get the lower bound of limits of a cgroup and its ancestors. Used in
> + * sgx_epc_cgroup_reclaim_work_func() to determine if EPC usage of a cgroup is
> + * over its limit or its ancestors' hence reclamation is needed.
> + */
> +static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
> +{
> + struct misc_cg *i = epc_cg->cg;
> + u64 m = U64_MAX;
> +
> + while (i) {
> + m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
> + i = misc_cg_parent(i);
> + }
> +
> + return m / PAGE_SIZE;
> +}
> +
> /**
> - * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> + * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
> + * @root: Root of the tree to check
> *
> + * Return: %true if all cgroups under the specified root have empty LRU lists.
> + * Used to avoid livelocks due to a cgroup having a non-zero charge count but
> + * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
> + * because all pages in the cgroup are unreclaimable.
> + */
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
> +{
> + struct cgroup_subsys_state *css_root;
> + struct cgroup_subsys_state *pos;
> + struct sgx_epc_cgroup *epc_cg;
> + bool ret = true;
> +
> + /*
> + * The caller must ensure a ref on css_root is held.
> + */
> + css_root = &root->css;
> +
> + rcu_read_lock();
> + css_for_each_descendant_pre(pos, css_root) {
> + if (!css_tryget(pos))
> + break;
> +
> + rcu_read_unlock();
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +
> + spin_lock(&epc_cg->lru.lock);
> + ret = list_empty(&epc_cg->lru.reclaimable);
> + spin_unlock(&epc_cg->lru.lock);
> +
> + rcu_read_lock();
> + css_put(pos);
> + if (!ret)
> + break;
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +
> +/**
> + * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
> + * @root: Root of the tree to start walking
> + * Return: Number of pages reclaimed.
> + */
> +unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> +{
> + /*
> + * Attempting to reclaim only a few pages will often fail and is
> + * inefficient, while reclaiming a huge number of pages can result in
> + * soft lockups due to holding various locks for an extended duration.
> + */
> + unsigned int nr_to_scan = SGX_NR_TO_SCAN;
> + struct cgroup_subsys_state *css_root;
> + struct cgroup_subsys_state *pos;
> + struct sgx_epc_cgroup *epc_cg;
> + unsigned int cnt;
> +
> + /* The caller must ensure a ref on css_root is held. */
> + css_root = &root->css;
> +
> + cnt = 0;
> + rcu_read_lock();
> + css_for_each_descendant_pre(pos, css_root) {
> + if (!css_tryget(pos))
> + break;
> + rcu_read_unlock();
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> + cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
> +
> + rcu_read_lock();
> + css_put(pos);
> + if (!nr_to_scan)
> + break;
> + }
> +
> + rcu_read_unlock();
> + return cnt;
> +}
> +
> +/*
> + * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
> + * when the cgroup is at/near its maximum capacity
> + */
> +static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
> +{
> + struct sgx_epc_cgroup *epc_cg;
> + u64 cur, max;
> +
> + epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
> +
> + for (;;) {
> + max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
> +
> + /*
> + * Adjust the limit down by one page; the goal is to free up
> + * pages for fault allocations, not to simply obey the limit.
> + * Conditionally decrementing max also means the cur vs. max
> + * check will correctly handle the case where both are zero.
> + */
> + if (max)
> + max--;
> +
> + /*
> + * Unless the limit is extremely low, in which case forcing
> + * reclaim will likely cause thrashing, force the cgroup to
> + * reclaim at least once if it's operating *near* its maximum
> + * limit by adjusting @max down by half the min reclaim size.
> + * This work func is scheduled by sgx_epc_cgroup_try_charge
> + * when it cannot directly reclaim due to being in an atomic
> + * context, e.g. EPC allocation in a fault handler. Waiting
> + * to reclaim until the cgroup is actually at its limit is less
> + * performant as it means the faulting task is effectively
> + * blocked until a worker makes its way through the global work
> + * queue.
> + */
> + if (max > SGX_NR_TO_SCAN * 2)
> + max -= (SGX_NR_TO_SCAN / 2);
> +
> + cur = sgx_epc_cgroup_page_counter_read(epc_cg);
> +
> + if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
> + break;
> +
> + /* Keep reclaiming until above condition is met. */
> + sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
> + }
> +}
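
Just to double-check my reading of the heuristic above: with SGX_NR_TO_SCAN
being 16 (assuming I'm looking at the right definition), a cgroup limited to
e.g. 512 pages ends up with max = 512 - 1 - 8 = 503, so the loop keeps
reclaiming until usage drops to 503 pages or the LRUs go empty. With a tiny
limit like 24 pages only the minus one applies and reclaim stops once usage
is at 23 pages, which avoids the thrashing case mentioned in the comment.
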
> +
> +/**
> + * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> * @epc_cg: The EPC cgroup to be charged for the page.
> * Return:
> * * %0 - If successfully charged.
> @@ -37,6 +201,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
> if (!epc_cg)
> return;
>
> + cancel_work_sync(&epc_cg->reclaim_work);
> kfree(epc_cg);
> }
>
> @@ -49,6 +214,8 @@ const struct misc_res_ops sgx_epc_cgroup_ops = {
>
> static void sgx_epc_misc_init(struct misc_cg *cg, struct sgx_epc_cgroup *epc_cg)
> {
> + sgx_lru_init(&epc_cg->lru);
> + INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
> cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
> epc_cg->cg = cg;
> }
> @@ -68,6 +235,11 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
>
> void sgx_epc_cgroup_init(void)
> {
> + sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
> + WQ_UNBOUND | WQ_FREEZABLE,
> + WQ_UNBOUND_MAX_ACTIVE);
> + BUG_ON(!sgx_epc_cg_wq);
> +
> misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
> sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
> }
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> index 6b664b4c321f..e3c6a08f0ee8 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> @@ -34,6 +34,8 @@ static inline void sgx_epc_cgroup_init(void) { }
> #else
> struct sgx_epc_cgroup {
> struct misc_cg *cg;
> + struct sgx_epc_lru_list lru;
> + struct work_struct reclaim_work;
> };
>
> static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
> @@ -66,6 +68,7 @@ static inline void sgx_put_epc_cg(struct sgx_epc_cgroup *epc_cg)
>
> int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg);
> void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);
> void sgx_epc_cgroup_init(void);
>
> #endif


BR, Jarkko