Re: [RFC 13/17] cgroup/drm: Ability to periodically scan cgroups for over budget GPU usage

From: Tvrtko Ursulin
Date: Thu Oct 27 2022 - 10:45:45 EST

Next message: Mark Kettenis: "Re: [RFC PATCH] arm64: Enable data independent timing (DIT) in the kernel"
Previous message: Sean Christopherson: "Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On 21/10/2022 23:52, T.J. Mercier wrote:

On Wed, Oct 19, 2022 at 10:34 AM Tvrtko Ursulin
<tvrtko.ursulin@xxxxxxxxxxxxxxx> wrote:

From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

Add a scanning worker, which if enabled, periodically queries the cgroup
for GPU usage and if over budget (as configured by it's relative weight
share) notifies the drm core about the fact.

This is off by default and can be enabled by configuring a scanning
period using the drm.period_us cgroup control file.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
Documentation/admin-guide/cgroup-v2.rst | 35 +-
kernel/cgroup/drm.c | 426 +++++++++++++++++++++++-
2 files changed, 459 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1f3cca4e2572..318f463a1316 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2401,7 +2401,8 @@ HugeTLB Interface Files
DRM
---

-The DRM controller allows configuring static hierarchical scheduling priority.
+The DRM controller allows configuring static hierarchical scheduling priority
+and scheduling soft limits.

DRM static priority control
~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2458,6 +2459,38 @@ DRM static priority interface files
Read only integer showing the current effective priority level for the
group. Effective meaning taking into account the chain of inherited

+DRM scheduling soft limits
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Because of the heterogenous hardware and driver DRM capabilities, soft limits
+are implemented as a loose co-operative (bi-directional) interface between the
+controller and DRM core.
+
+The controller configures the GPU time allowed per group and periodically scans
+the belonging tasks to detect the over budget condition, at which point it
+invokes a callback notifying the DRM core of the condition.
+
+DRM core provides an API to query per process GPU utilization and 2nd API to
+receive notification from the cgroup controller when the group enters or exits
+the over budget condition.
+
+Individual DRM drivers which implement the interface are expected to act on this
+in the best-effort manner only. There are no guarantees that the soft limits
+will be respected.
+
+DRM scheduling soft limits interface files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ drm.weight
+ Standard cgroup weight based control [1, 10000] used to configure the
+ relative distributing of GPU time between the sibling groups.
+
+ drm.period_us
+ An integer representing the period with which the controller should look
+ at the GPU usage by the group and potentially send the over/under budget
+ signal.
+ Value of zero (defaul) disables the soft limit checking.
+
Misc
----

diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 48f1eaaa1c07..af50ead1564a 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -18,6 +18,29 @@ struct drm_cgroup_state {
int priority;
int effective_priority;
unsigned int weight;
+ unsigned int period_us;
+
+ bool scanning_suspended;
+ unsigned int suspended_period_us;
+
+ struct delayed_work scan_work;
+
+ /*
+ * Below fields are owned and updated by the scan worker. Either the
+ * worker accesses them, or worker needs to be suspended and synced
+ * before they can be touched from the outside.
+ */
+ bool scanned;
+
+ ktime_t prev_timestamp;
+
+ u64 sum_children_weights;
+ u64 children_active_us;
+ u64 per_s_budget_ns;
+ u64 prev_active_us;
+ u64 active_us;
+
+ bool over_budget;
};

static DEFINE_MUTEX(drmcg_mutex);
@@ -33,6 +56,31 @@ static inline struct drm_cgroup_state *get_task_drmcs(struct task_struct *task)
return css_to_drmcs(task_get_css(task, drm_cgrp_id));
}

+static u64 drmcs_get_active_time_us(struct drm_cgroup_state *drmcs)
+{
+ struct cgroup *cgrp = drmcs->css.cgroup;
+ struct task_struct *task;
+ struct css_task_iter it;
+ u64 total = 0;
+
+ css_task_iter_start(&cgrp->self,
+ CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED,
+ &it);
+ while ((task = css_task_iter_next(&it))) {
+ u64 time;
+
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ time = drm_pid_get_active_time_us(task_pid(task));
+ total += time;
+ }
+ css_task_iter_end(&it);
+
+ return total;
+}
+
int drmcgroup_lookup_effective_priority(struct task_struct *task)
{
struct drm_cgroup_state *drmcs = get_task_drmcs(task);
@@ -202,9 +250,301 @@ static int drmcs_online(struct cgroup_subsys_state *css)
return 0;
}

+static void
+signal_drm_budget(struct drm_cgroup_state *drmcs, u64 usage, u64 budget)
+{
+ struct cgroup *cgrp = drmcs->css.cgroup;
+ struct task_struct *task;
+ struct css_task_iter it;
+
+ css_task_iter_start(&cgrp->self,
+ CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED,
+ &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ drm_pid_signal_budget(task_pid(task), usage, budget);
+ }
+ css_task_iter_end(&it);
+}
+
+static bool __start_scanning(struct drm_cgroup_state *root)
+{
+ struct cgroup_subsys_state *node;
+ bool ok = true;
+
+ rcu_read_lock();
+ css_for_each_descendant_pre(node, &root->css) {
+ struct drm_cgroup_state *drmcs = css_to_drmcs(node);
+ unsigned long active;
+
+ if (!css_tryget_online(node)) {
+ ok = false;
+ continue;
+ }
+
+ drmcs->scanned = false;
+ drmcs->sum_children_weights = 0;
+ drmcs->children_active_us = 0;
+ if (node == &root->css)
+ drmcs->per_s_budget_ns = NSEC_PER_SEC;
+ else
+ drmcs->per_s_budget_ns = 0;
+
+ active = drmcs_get_active_time_us(drmcs);
+ if (active >= drmcs->prev_active_us)
+ drmcs->active_us = active - drmcs->prev_active_us;
+ else
+ drmcs->active_us = 0;
+ drmcs->prev_active_us = active;
+
+ css_put(node);
+ }
+ rcu_read_unlock();
+
+ return ok;
+}
+
+static void scan_worker(struct work_struct *work)
+{
+ struct drm_cgroup_state *root =
+ container_of(work, typeof(*root), scan_work.work);
+ struct cgroup_subsys_state *node;
+ unsigned int period_us;
+ ktime_t now;
+
+ rcu_read_lock();

Hi Tvrtko, I think this lock needs to come after the return for the
online check just below here to avoid missing the rcu_read_unlock at
out_retry. Although it doesn't look like this should ever run in the
first place if the DRM controller is disabled.

Yep - thank you. I've fixed it up locally.

Another TODO note for me is to see if I can hook into the cgroup offline hook to avoid all this checks by perhaps making sure scan worker is not running as transitions are happening. Will see if that works.

+
+ if (WARN_ON_ONCE(!css_tryget_online(&root->css)))
+ return;
+
+ /*
+ * 1st pass - reset accumulated values and update group GPU activity.
+ */
+ if (!__start_scanning(root))
+ goto out_retry; /*
+ * Always come back later if scanner races with
+ * core cgroup management. (Repeated pattern.)
+ */
+
+ now = ktime_get();
+ period_us = ktime_to_us(ktime_sub(now, root->prev_timestamp));
+ root->prev_timestamp = now;
+
+ /*
+ * 2nd pass - calculate accumulated GPU activity and relative weights
+ * for each parent's children.
+ */
+ css_for_each_descendant_pre(node, &root->css) {
+ struct drm_cgroup_state *drmcs = css_to_drmcs(node);
+
+ if (!css_tryget_online(node))
+ goto out_retry;
+
+ if (!drmcs->scanned) {
+ struct cgroup_subsys_state *css;
+
+ css_for_each_child(css, &drmcs->css) {
+ struct drm_cgroup_state *sibling =
+ css_to_drmcs(css);
+
+ if (!css_tryget_online(css)) {
+ css_put(node);
+ goto out_retry;
+ }
+
+ drmcs->children_active_us += sibling->active_us;
+ drmcs->sum_children_weights += sibling->weight;
+
+ css_put(css);
+ }
+
+ drmcs->scanned = true;
+ }
+
+ css_put(node);
+ }
+
+ /*
+ * 3rd pass - calculate relative budgets for each group based on
+ * relative weights and parent's budget.
+ *
+ * FIXME: This is for now incomplete in more than one way. There is
+ * no downward propagation of unused budgets, and even no utilisation of
+ * the unused budgets at all.
+ */
+ css_for_each_descendant_pre(node, &root->css) {
+ struct drm_cgroup_state *drmcs, *pdrmcs;
+ bool over, was_over;
+ u64 budget;
+
+ if (!css_tryget_online(node))
+ goto out_retry;
+ if (node->cgroup->level == 1) {
+ css_put(node);
+ continue;
+ }
+ if (!css_tryget_online(node->parent)) {
+ css_put(node);
+ goto out_retry;
+ }
+
+ drmcs = css_to_drmcs(node);
+ pdrmcs = css_to_drmcs(node->parent);
+
+ drmcs->per_s_budget_ns =
+ DIV_ROUND_UP_ULL(pdrmcs->per_s_budget_ns *
+ drmcs->weight,
+ pdrmcs->sum_children_weights);
+ budget = DIV_ROUND_UP_ULL(drmcs->per_s_budget_ns * period_us,
+ NSEC_PER_SEC);
+ over = drmcs->active_us > budget;
+ was_over = drmcs->over_budget;
+ drmcs->over_budget = over;
+ if (over || (!over && was_over))
+ signal_drm_budget(drmcs, drmcs->active_us, budget);
+
+ css_put(node);
+ css_put(node->parent);
+ }
+
+out_retry:
+ rcu_read_unlock();
+
+ period_us = READ_ONCE(root->period_us);
+ if (period_us)
+ schedule_delayed_work(&root->scan_work,
+ usecs_to_jiffies(period_us));
+
+ css_put(&root->css);
+}
+
+static void start_scanning(struct drm_cgroup_state *drmcs, u64 period_us)
+{
+ drmcs->period_us = (unsigned int)period_us;
+ WARN_ON_ONCE(!__start_scanning(drmcs));
+ drmcs->prev_timestamp = ktime_get();
+ mod_delayed_work(system_wq, &drmcs->scan_work,
+ usecs_to_jiffies(period_us));
+}
+
+static void stop_scanning(struct drm_cgroup_state *drmcs)
+{
+ drmcs->period_us = 0;
+ cancel_delayed_work_sync(&drmcs->scan_work);
+ if (drmcs->over_budget) {
+ /*
+ * Signal under budget when scanning goes off so drivers
+ * correctly update their state.
+ */
+ signal_drm_budget(drmcs, 0, drmcs->per_s_budget_ns);
+ drmcs->over_budget = false;
+ }
+}
+
+static struct drm_cgroup_state *drmcs_scanner(struct drm_cgroup_state *drmcs)
+{
+ while (drmcs->css.cgroup->level > 1)
+ drmcs = css_to_drmcs(drmcs->css.parent);
+
+ return drmcs;
+}
+
+static void start_suspend_scanning(struct drm_cgroup_state *drmcs)
+{
+ drmcs = drmcs_scanner(drmcs);
+
+ if (drmcs->scanning_suspended)
+ return;
+
+ drmcs->scanning_suspended = true;
+ drmcs->suspended_period_us = drmcs->period_us;
+ drmcs->period_us = 0;
+}
+
+static void finish_suspend_scanning(struct drm_cgroup_state *drmcs)
+{
+ drmcs = drmcs_scanner(drmcs);
+
+ if (drmcs->suspended_period_us)
+ cancel_delayed_work_sync(&drmcs->scan_work);
+}
+
+static void resume_scanning(struct drm_cgroup_state *drmcs)
+{
+ drmcs = drmcs_scanner(drmcs);
+
+ if (!drmcs->scanning_suspended)
+ return;
+
+ drmcs->scanning_suspended = false;
+ if (drmcs->suspended_period_us) {
+ start_scanning(drmcs, drmcs->suspended_period_us);
+ drmcs->suspended_period_us = 0;
+ }
+}
+
static void drmcs_free(struct cgroup_subsys_state *css)
{
- kfree(css_to_drmcs(css));
+ struct drm_cgroup_state *drmcs = css_to_drmcs(css);
+
+ stop_scanning(drmcs);
+
+ kfree(drmcs);
+}
+
+static int drmcs_can_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *new_css;
+ struct task_struct *task;
+ int ret;
+
+ /*
+ * As processes are getting moved between groups we need to ensure
+ * both that the old group does not see a sudden downward jump in the
+ * GPU utilisation, and that the new group does not see a sudden jump
+ * up with all the GPU time clients belonging to the migrated process
+ * have accumulated.
+ *
+ * To achieve that we suspend the scanner until the migration is
+ * completed where the resume at the end ensures both groups start
+ * observing GPU utilisation from a reset state.
+ */
+
+ ret = mutex_lock_interruptible(&drmcg_mutex);
+ if (ret)
+ return ret;
+
+ cgroup_taskset_for_each(task, new_css, tset) {
+ start_suspend_scanning(css_to_drmcs(task_css(task,
+ drm_cgrp_id)));
+ start_suspend_scanning(css_to_drmcs(new_css));
+ }
+
+ mutex_unlock(&drmcg_mutex);
+
+ cgroup_taskset_for_each(task, new_css, tset) {
+ finish_suspend_scanning(css_to_drmcs(task_css(task,
+ drm_cgrp_id)));
+ finish_suspend_scanning(css_to_drmcs(new_css));
+ }
+
+ return 0;
+}
+
+static void tset_resume_scanning(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *new_css;
+ struct task_struct *task;
+
+ mutex_lock(&drmcg_mutex);
+ cgroup_taskset_for_each(task, new_css, tset) {
+ resume_scanning(css_to_drmcs(task_css(task, drm_cgrp_id)));
+ resume_scanning(css_to_drmcs(new_css));
+ }
+ mutex_unlock(&drmcg_mutex);
}

static void drmcs_attach(struct cgroup_taskset *tset)
@@ -219,12 +559,86 @@ static void drmcs_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
drm_pid_update_priority(task_pid(task),
css_to_drmcs(css)->effective_priority);
+
+ tset_resume_scanning(tset);
+}
+
+static void drmcs_cancel_attach(struct cgroup_taskset *tset)
+{
+ tset_resume_scanning(tset);
+}
+
+static u64
+drmcs_read_period_us(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct drm_cgroup_state *drmcs = css_to_drmcs(css);
+
+ return drmcs->period_us;
+}
+
+static int
+drmcs_write_period_us(struct cgroup_subsys_state *css, struct cftype *cftype,
+ u64 period_us)
+{
+ struct drm_cgroup_state *drmcs = css_to_drmcs(css);
+ int ret;
+
+ if (WARN_ON_ONCE(!css->parent))
+ return -EINVAL;
+ if (css->cgroup->level != 1)
+ return -EINVAL;
+ if ((period_us && period_us < 500000) || period_us > USEC_PER_SEC * 60)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&drmcg_mutex);
+ if (ret)
+ return ret;
+
+ if (!drmcs->scanning_suspended) {
+ if (period_us)
+ start_scanning(drmcs, period_us);
+ else
+ stop_scanning(drmcs);
+ } else {
+ /*
+ * If scanning is temporarily suspended just update the period
+ * which will apply once resumed, or simply skip resuming in
+ * case of disabling.
+ */
+ drmcs->suspended_period_us = period_us;
+ if (!period_us)
+ drmcs->scanning_suspended = false;
+ }
+
+ mutex_unlock(&drmcg_mutex);
+
+ return 0;
}

void drmcgroup_client_exited(struct task_struct *task)
{
struct drm_cgroup_state *drmcs = get_task_drmcs(task);

+ /*
+ * Since we are not tracking accumulated GPU time for each cgroup,
+ * avoid jumps in group observed GPU usage by re-setting the scanner
+ * at a point when GPU usage can suddenly jump down.
+ *
+ * Downside is clients can influence the effectiveness of the over-
+ * budget scanning by continuosly closing DRM file descriptors but for

"continuously"

Thanks, fixed locally.

And I think also if a user has permission to create and migrate
processes between cgroups even just under the same parent, since
css_tryget_online failure would cause an early exit during scanning?

I will need to double check if migration causes online/offline events. I didn't think it does. I know it causes "attach" "family" of events which I hooked into to ensure no sudden jumps in usage during migration. See comment in drmcs_can_attach(). But that is a pretty rare event I think.

Unless you are thinking from the ChromeOS angle and browser foreground/backgroun tabs cgroups? Those are not doing GPU rendering so I did not envisage hooking up the DRM controller there. To start with I only planned to hook it up with the children of the vms cgroup. The model in that branch is not to migrate processes but to change cgroup CPU share based on ::OnWindowActivated hooks.

Regards,

Tvrtko

+ * now we do not worry about it.
+ */
+
+ mutex_lock(&drmcg_mutex);
+ start_suspend_scanning(drmcs);
+ mutex_unlock(&drmcg_mutex);
+
+ finish_suspend_scanning(drmcs);
+
+ mutex_lock(&drmcg_mutex);
+ resume_scanning(drmcs);
+ mutex_unlock(&drmcg_mutex);
+
css_put(&drmcs->css);
}
EXPORT_SYMBOL_GPL(drmcgroup_client_exited);
@@ -232,6 +646,7 @@ EXPORT_SYMBOL_GPL(drmcgroup_client_exited);
static struct drm_cgroup_state root_drmcs = {
.priority = DRM_CGROUP_PRIORITY_DEF,
.effective_priority = DRM_CGROUP_PRIORITY_DEF,
+ .weight = CGROUP_WEIGHT_DFL,
};

static struct cgroup_subsys_state *
@@ -247,6 +662,7 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);

drmcs->weight = CGROUP_WEIGHT_DFL;
+ INIT_DELAYED_WORK(&drmcs->scan_work, scan_worker);

return &drmcs->css;
}
@@ -274,6 +690,12 @@ struct cftype files[] = {
.read_u64 = drmcs_read_weight,
.write_u64 = drmcs_write_weight,
},
+ {
+ .name = "period_us",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = drmcs_read_period_us,
+ .write_u64 = drmcs_write_period_us,
+ },
{ } /* Zero entry terminates. */
};

@@ -281,7 +703,9 @@ struct cgroup_subsys drm_cgrp_subsys = {
.css_alloc = drmcs_alloc,
.css_free = drmcs_free,
.css_online = drmcs_online,
+ .can_attach = drmcs_can_attach,
.attach = drmcs_attach,
+ .cancel_attach = drmcs_cancel_attach,
.early_init = false,
.legacy_cftypes = files,
.dfl_cftypes = files,
--
2.34.1

Next message: Mark Kettenis: "Re: [RFC PATCH] arm64: Enable data independent timing (DIT) in the kernel"
Previous message: Sean Christopherson: "Re: [PATCH v2 03/16] KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]