Re: [PATCH] sched/schedutil: Fix deadlock between cpuset and cpu hotplug when using schedutil
From: Xuewen Yan
Date: Fri Jul 29 2022 - 03:33:39 EST
Hi Tejun
On Thu, Jul 28, 2022 at 4:09 AM Tejun Heo <tj@xxxxxxxxxx> wrote:
>
> Hello,
>
>
> Can youp please see whether the following patch fixes the problem?
>
I have tested the patch, sadly, although the original deadlock call
stack is gone, there seems to be a new problem.
I did a preliminary analysis, but no further in-depth analysis. I
haven't found the root cause yet, but I think I should let you know
the test results first.
Once I find the root cause, I'll report back immediately.
The thread-A is waiting for the cpu_hotplug_lock's writer, but the
writer is waiting for the readers:
PID: 1106 TASK: ffffff8097e90000 CPU: 7 COMMAND: "OomAdjuster"
#0 [ffffffc011a23850] __switch_to at ffffffc0081229b4
#1 [ffffffc011a238b0] __schedule at ffffffc009c824f8
#2 [ffffffc011a23910] schedule at ffffffc009c82b50
#3 [ffffffc011a23970] percpu_rwsem_wait at ffffffc00828fbf4
#4 [ffffffc011a239b0] __percpu_down_read at ffffffc0082901a8
#5 [ffffffc011a239e0] cpus_read_lock at ffffffc0081dadc8
#6 [ffffffc011a23a20] cpuset_attach at ffffffc008366f10
#7 [ffffffc011a23a90] cgroup_migrate_execute at ffffffc00834f808
#8 [ffffffc011a23b60] cgroup_attach_task at ffffffc0083539f0
#9 [ffffffc011a23bd0] __cgroup1_procs_write at ffffffc00835f6e8
#10 [ffffffc011a23c30] cgroup1_tasks_write at ffffffc00835f054
#11 [ffffffc011a23c60] cgroup_file_write at ffffffc008348b2c
#12 [ffffffc011a23c90] kernfs_fop_write_iter at ffffffc008754514
#13 [ffffffc011a23d50] vfs_write at ffffffc008607ac8
#14 [ffffffc011a23db0] ksys_write at ffffffc008607714
#15 [ffffffc011a23df0] __arm64_sys_write at ffffffc00860767c
#16 [ffffffc011a23e10] invoke_syscall at ffffffc00813dcac
#17 [ffffffc011a23e30] el0_svc_common at ffffffc00813dbc4
#18 [ffffffc011a23e70] do_el0_svc at ffffffc00813daa8
#19 [ffffffc011a23e80] el0_svc at ffffffc0098828d8
#20 [ffffffc011a23ea0] el0t_64_sync_handler at ffffffc00988284c
#21 [ffffffc011a23fe0] el0t_64_sync at ffffffc008091e44
cpu_hotplug_lock = $3 = {
rss = {
gp_state = 2,
gp_count = 1,
gp_wait = {
lock = {
{
rlock = {
raw_lock = {
{
val = {
counter = 0
},
{
locked = 0 '\000',
pending = 0 '\000'
},
{
locked_pending = 0,
tail = 0
}
}
}
}
}
},
head = {
next = 0xffffffc00b0eb6a0 <cpu_hotplug_lock+16>,
prev = 0xffffffc00b0eb6a0 <cpu_hotplug_lock+16>
}
},
cb_head = {
next = 0x0,
func = 0x0
}
},
read_count = 0xffffffc00b0a0808 <__percpu_rwsem_rc_cpu_hotplug_lock>,
writer = {
task = 0xffffff80f303ca00
},
{
waiters = {
lock = {
{
rlock = {
raw_lock = {
{
val = {
counter = 0
},
{
locked = 0 '\000',
pending = 0 '\000'
},
{
locked_pending = 0,
tail = 0
}
}
}
}
}
},
head = {
next = 0xffffffc011a23958,
prev = 0xffffffc01430bcb8
}
},
destroy_list_entry = {
next = 0x0,
prev = 0xffffffc011a23958
}
},
block = {
counter = 1
}
}
PID: 15760 TASK: ffffff80f303ca00 CPU: 5 COMMAND: "test.sh"
#0 [ffffffc0129639a0] __switch_to at ffffffc0081229b4
#1 [ffffffc012963a00] __schedule at ffffffc009c824f8
#2 [ffffffc012963a60] schedule at ffffffc009c82b50
#3 [ffffffc012963a90] percpu_down_write at ffffffc00828f9d8
#4 [ffffffc012963ae0] _cpu_down at ffffffc009884550
#5 [ffffffc012963b40] cpu_device_down at ffffffc0081df814
#6 [ffffffc012963b60] cpu_subsys_offline at ffffffc008d8dd8c
#7 [ffffffc012963b90] device_offline at ffffffc008d77124
#8 [ffffffc012963bd0] online_store at ffffffc008d76d44
#9 [ffffffc012963c30] dev_attr_store at ffffffc008d7ba30
#10 [ffffffc012963c60] sysfs_kf_write at ffffffc0087578d0
#11 [ffffffc012963c90] kernfs_fop_write_iter at ffffffc008754514
#12 [ffffffc012963d50] vfs_write at ffffffc008607ac8
#13 [ffffffc012963db0] ksys_write at ffffffc008607714
#14 [ffffffc012963df0] __arm64_sys_write at ffffffc00860767c
#15 [ffffffc012963e10] invoke_syscall at ffffffc00813dcac
#16 [ffffffc012963e30] el0_svc_common at ffffffc00813dbf4
#17 [ffffffc012963e70] do_el0_svc at ffffffc00813daa8
#18 [ffffffc012963e80] el0_svc at ffffffc0098828d8
#19 [ffffffc012963ea0] el0t_64_sync_handler at ffffffc00988284c
#20 [ffffffc012963fe0] el0t_64_sync at ffffffc008091e44
Thanks!
BR
--
xuewen.yan
> Thanks.
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 13c8e91d78620..7caefc8af9127 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -2345,6 +2345,38 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
> }
> EXPORT_SYMBOL_GPL(task_cgroup_path);
>
> +/**
> + * lock_threadgroup - Stabilize threadgroups
> + *
> + * cgroup migration operations need the threadgroups stabilized against forks
> + * and exits, which can be achieved by write-locking cgroup_threadgroup_rwsem.
> + *
> + * Note that write-locking threadgdroup_rwsem is nested inside CPU hotplug
> + * disable. This is because cpuset needs CPU hotplug disabled during ->attach()
> + * and bringing up a CPU may involve creating new tasks which can require
> + * read-locking threadgroup_rwsem. If we call cpuset's ->attach() with
> + * threadgroup_rwsem write-locked with hotplug enabled, we can deadlock by
> + * cpuset waiting for an on-going CPU hotplug operation which in turn is waiting
> + * for the threadgroup_rwsem to be released to create new tasks. See the
> + * following for more details:
> + *
> + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
> + */
> +static void lock_threadgroup(void)
> +{
> + cpus_read_lock();
> + percpu_down_write(&cgroup_threadgroup_rwsem);
> +}
> +
> +/**
> + * lock_threadgroup - Undo lock_threadgroup
> + */
> +static void unlock_threadgroup(void)
> +{
> + percpu_up_write(&cgroup_threadgroup_rwsem);
> + cpus_read_unlock();
> +}
> +
> /**
> * cgroup_migrate_add_task - add a migration target task to a migration context
> * @task: target task
> @@ -2822,7 +2854,6 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
>
> struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
> bool *locked)
> - __acquires(&cgroup_threadgroup_rwsem)
> {
> struct task_struct *tsk;
> pid_t pid;
> @@ -2840,7 +2871,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
> */
> lockdep_assert_held(&cgroup_mutex);
> if (pid || threadgroup) {
> - percpu_down_write(&cgroup_threadgroup_rwsem);
> + lock_threadgroup();
> *locked = true;
> } else {
> *locked = false;
> @@ -2876,7 +2907,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>
> out_unlock_threadgroup:
> if (*locked) {
> - percpu_up_write(&cgroup_threadgroup_rwsem);
> + unlock_threadgroup();
> *locked = false;
> }
> out_unlock_rcu:
> @@ -2885,7 +2916,6 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
> }
>
> void cgroup_procs_write_finish(struct task_struct *task, bool locked)
> - __releases(&cgroup_threadgroup_rwsem)
> {
> struct cgroup_subsys *ss;
> int ssid;
> @@ -2894,7 +2924,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
> put_task_struct(task);
>
> if (locked)
> - percpu_up_write(&cgroup_threadgroup_rwsem);
> + unlock_threadgroup();
> +
> for_each_subsys(ss, ssid)
> if (ss->post_attach)
> ss->post_attach();
> @@ -2953,7 +2984,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
>
> lockdep_assert_held(&cgroup_mutex);
>
> - percpu_down_write(&cgroup_threadgroup_rwsem);
> + lock_threadgroup();
>
> /* look up all csses currently attached to @cgrp's subtree */
> spin_lock_irq(&css_set_lock);
> @@ -2984,7 +3015,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
> ret = cgroup_migrate_execute(&mgctx);
> out_finish:
> cgroup_migrate_finish(&mgctx);
> - percpu_up_write(&cgroup_threadgroup_rwsem);
> + unlock_threadgroup();
> return ret;
> }
>