回复: [PATCH] pid: add handling of too many zombie processes

From: huyd12
Date: Thu Feb 09 2023 - 02:40:57 EST



Any comments will be appreciated.



-----邮件原件-----
发件人: liuq131@xxxxxxxxxxxxxxx <liuq131@xxxxxxxxxxxxxxx>
发送时间: 2023年2月8日 17:49
收件人: akpm@xxxxxxxxxxxxxxxxxxxx
抄送: agruenba@xxxxxxxxxx; linux-mm@xxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
huyd12@xxxxxxxxxxxxxxx; liuq <liuq131@xxxxxxxxxxxxxxx>
主题: [PATCH] pid: add handling of too many zombie processes

A common situation is that a parent process forks many child processes to
execute tasks but does not call wait/waitpid when the children exit, so a
large number of child processes become zombie processes.

At that point, if the number of processes in the system reaches
kernel.pid_max, any new fork syscall will fail, and the system will not be
able to execute any command (unless an old process exits).

eg:
[root@lq-workstation ~]# ls
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: Resource temporarily unavailable
[root@lq-workstation ~]# reboot
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: Resource temporarily unavailable

This patch handles the situation in the alloc_pid function: when PID
allocation fails because the PID space is exhausted, it finds the process
with the most zombie child processes and, if that process has more than 10
of them (or some other reasonable value?), tries to kill it in order to
release the pid resources.

Signed-off-by: liuq <liuq131@xxxxxxxxxxxxxxx>
---
include/linux/mm.h | 2 ++
kernel/pid.c | 6 +++-
mm/oom_kill.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f857163ac89..afcff08a3878 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1940,6 +1940,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
*/
extern void pagefault_out_of_memory(void);
+extern void pid_max_oom_check(struct pid_namespace *ns);
+

#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
diff --git a/kernel/pid.c b/kernel/pid.c
index 3fbc5e46b721..1a9a60e19ab6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -237,7 +237,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
idr_preload_end();

if (nr < 0) {
- retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+ retval = nr;
+ if (nr == -ENOSPC) {
+ retval = -EAGAIN;
+ pid_max_oom_check(tmp);
+ }
goto out_free;
}

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1276e49b31b0..18d05d706f48 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1260,3 +1260,73 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
return -ENOSYS;
#endif /* CONFIG_MMU */
}
+
+/*
+ * oom_pid_evaluate_task - count @p's zombie children and track the worst parent
+ * @p: candidate parent task
+ * @max_zombie_task: in/out, task with the most zombie children seen so far
+ * @max_zombie_num: in/out, zombie-child count belonging to *@max_zombie_task
+ *
+ * Both outputs are updated only when @p has strictly more zombie children
+ * than the current maximum, so on a tie the earlier task is kept.
+ */
+static void oom_pid_evaluate_task(struct task_struct *p,
+		struct task_struct **max_zombie_task, int *max_zombie_num)
+{
+	struct task_struct *child;
+	int zombie_num = 0;
+
+	/*
+	 * The ->children/->sibling list is maintained under tasklist_lock and
+	 * is not an RCU list (NOTE(review): confirm against kernel/fork.c and
+	 * kernel/exit.c), so being inside an RCU read-side section is not
+	 * enough to walk it. Hold tasklist_lock for reading while counting.
+	 */
+	read_lock(&tasklist_lock);
+	list_for_each_entry(child, &p->children, sibling) {
+		if (child->exit_state == EXIT_ZOMBIE)
+			zombie_num++;
+	}
+	read_unlock(&tasklist_lock);
+
+	if (zombie_num > *max_zombie_num) {
+		*max_zombie_num = zombie_num;
+		*max_zombie_task = p;
+	}
+}
+/* A parent is only picked as a victim with more than this many zombie children. */
+#define MAX_ZOMBIE_NUM 10
+
+/*
+ * pid_max_bad_process - pick the process with the most zombie children
+ * @ns: PID namespace used to report the chosen task's PID
+ *
+ * Returns the chosen task with a reference held (the caller must drop it or
+ * hand it to a function that does), or NULL when no eligible process has
+ * more than MAX_ZOMBIE_NUM zombie children.
+ *
+ * NOTE(review): the scan covers every process in the system; it is not
+ * restricted to tasks whose children consume PIDs from @ns.
+ */
+static struct task_struct *pid_max_bad_process(struct pid_namespace *ns)
+{
+	int max_zombie_num = 0;
+	struct task_struct *max_zombie_task = NULL;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	for_each_process(p) {
+		/* Never select global init or a kernel thread as the victim. */
+		if (is_global_init(p) || (p->flags & PF_KTHREAD))
+			continue;
+		oom_pid_evaluate_task(p, &max_zombie_task, &max_zombie_num);
+	}
+	/*
+	 * Pin the winner before leaving the RCU section; without a reference
+	 * the task_struct could be freed before the caller uses it.
+	 */
+	if (max_zombie_num > MAX_ZOMBIE_NUM)
+		get_task_struct(max_zombie_task);
+	else
+		max_zombie_task = NULL;
+	rcu_read_unlock();
+
+	if (!max_zombie_task)
+		return NULL;
+
+	pr_info("process %d has %d zombie child\n",
+		task_pid_nr_ns(max_zombie_task, ns), max_zombie_num);
+	return max_zombie_task;
+}
+
+/*
+ * pid_max_oom_kill_process - kill @task through the OOM-kill machinery
+ * @task: victim; the caller's reference is consumed on every path
+ */
+static void pid_max_oom_kill_process(struct task_struct *task)
+{
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.memcg = NULL,
+		.gfp_mask = 0,
+		.order = 0,
+		.chosen = task,
+	};
+
+	/*
+	 * NOTE(review): this check looks carried over from the page-fault
+	 * OOM path; confirm it is meaningful when called from fork.
+	 */
+	if (mem_cgroup_oom_synchronize(true))
+		goto out_put;
+
+	/* Do not block fork behind an OOM kill that is already in progress. */
+	if (!mutex_trylock(&oom_lock))
+		goto out_put;
+
+	/*
+	 * oom_kill_process() is expected to drop the reference on oc.chosen,
+	 * as it does for victims selected by the regular OOM path
+	 * (NOTE(review): confirm), so no put is needed after this call.
+	 */
+	oom_kill_process(&oc, "Out of pid max(oom_kill_allocating_task)");
+	mutex_unlock(&oom_lock);
+	return;
+
+out_put:
+	/* The kill was skipped: release the reference we were handed. */
+	put_task_struct(task);
+}
+
+/*
+ * pid_max_oom_check - react to PID exhaustion by killing a zombie hoarder
+ * @ns: PID namespace in which PID allocation failed
+ *
+ * Looks for the process with the most zombie children and, if one exceeds
+ * the MAX_ZOMBIE_NUM threshold, kills it so its children can be reaped and
+ * their PIDs released.
+ */
+void pid_max_oom_check(struct pid_namespace *ns)
+{
+	struct task_struct *p;
+
+	p = pid_max_bad_process(ns);
+	if (!p)
+		return;
+
+	pr_info("oom_kill process %d\n", task_pid_nr_ns(p, ns));
+	/* Hands over the reference taken by pid_max_bad_process(). */
+	pid_max_oom_kill_process(p);
+}
--
2.27.0