Re: [patch v2] mm, oom: fix concurrent munlock and oom reaper unmap
From: Tetsuo Handa
Date: Thu Apr 19 2018 - 07:52:10 EST
Michal Hocko wrote:
> > We need to teach the OOM reaper stop reaping as soon as entering exit_mmap().
> > Maybe let the OOM reaper poll for progress (e.g. none of get_mm_counter(mm, *)
> > decreased for last 1 second) ?
>
> Can we start simple and build a more elaborate heuristics on top _please_?
> In other words holding the mmap_sem for write for oom victims in
> exit_mmap should handle the problem. We can then enhance this to probe
> for progress or any other clever tricks if we find out that the race
> happens too often and we kill more than necessary.
>
> Let's not repeat the error of trying to be too clever from the beginning
> as we did previously. This are is just too subtle and obviously error
> prone.
>
Something like this?
---
mm/mmap.c | 41 +++++++++++++++++++++++------------------
mm/oom_kill.c | 29 +++++++++++------------------
2 files changed, 34 insertions(+), 36 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 188f195..3edb7da 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3015,6 +3015,28 @@ void exit_mmap(struct mm_struct *mm)
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Tell oom_reap_task() not to start reaping this mm.
+ *
+ * oom_reap_task() depends on a stable VM_LOCKED flag to
+ * indicate it should not unmap during munlock_vma_pages_all().
+ *
+ * Since MMF_UNSTABLE is set before calling down_write(),
+ * oom_reap_task() which calls down_read() before testing
+ * MMF_UNSTABLE will not run on this mm after up_write().
+ *
+ * mm_is_oom_victim() cannot be set from under us because
+ * victim->mm is already set to NULL under task_lock before
+ * calling mmput() and victim->signal->oom_mm is set by the oom
+ * killer only if victim->mm is non-NULL while holding
+ * task_lock().
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -3036,26 +3058,9 @@ void exit_mmap(struct mm_struct *mm)
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
-
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * mm_is_oom_victim() cannot be set from under us
- * either because victim->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if victim->mm
- * is found not NULL while holding the task_lock.
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
/*
* Walk the list again, actually closing and freeing it,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa..1fef1b6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -510,25 +510,16 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
/*
* If the mm has invalidate_{start,end}() notifiers that could block,
+ * or if the mm is in exit_mmap() which has unpredictable dependencies,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
* notifiers cannot block for unbounded amount of time
*/
- if (mm_has_blockable_invalidate_notifiers(mm)) {
- up_read(&mm->mmap_sem);
- schedule_timeout_idle(HZ);
- goto unlock_oom;
- }
-
- /*
- * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
- * work on the mm anymore. The check for MMF_OOM_SKIP must run
- * under mmap_sem for reading because it serializes against the
- * down_write();up_write() cycle in exit_mmap().
- */
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_has_blockable_invalidate_notifiers(mm) ||
+ test_bit(MMF_UNSTABLE, &mm->flags)) {
up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
+ schedule_timeout_idle(HZ);
goto unlock_oom;
}
@@ -590,11 +581,9 @@ static void oom_reap_task(struct task_struct *tsk)
while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
- if (attempts <= MAX_OOM_REAP_RETRIES ||
- test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (test_bit(MMF_UNSTABLE, &mm->flags))
goto done;
-
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
debug_show_all_locks();
@@ -603,8 +592,12 @@ static void oom_reap_task(struct task_struct *tsk)
tsk->oom_reaper_list = NULL;
/*
- * Hide this mm from OOM killer because it has been either reaped or
- * somebody can't call up_write(mmap_sem).
+ * Hide this mm from the OOM killer because:
+ * the OOM reaper completed reaping
+ * or
+ * exit_mmap() told the OOM reaper not to start reaping
+ * or
+ * neither exit_mmap() nor the OOM reaper started reaping
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
--
1.8.3.1