[patch] fix for OOM deadlock in swap_in (2.2.10) [Re: [test program]

Andrea Arcangeli (andrea@suse.de)
Sat, 3 Jul 1999 15:52:00 +0200 (CEST)


On Thu, 1 Jul 1999, Bernd Kaindl wrote:

>The program is quite effective in eating every byte of userspace memory:

I have a OOM patch for 2.2.10. Here it is a description of the patch:

o I discovered a deadlock in the current 2.2.10/2.3.x swapin code.
If the swapin was generated by a copy-user, then handle_mm_fault
was unconditionally returning 1 to do_page_fault that so was
returning to the copy-user code without doing progress and without
killing the task. I think this is the reason for the deadlocks
experienced by some people using 2.2.x.

o I also seen a cupole of problems for init. The first problem is
mapping the bad page in place of the swapped out page (init don't
like zeroed pages :). Then we could send a SIGKILL to init by
mistake in oom. The third problem is that we may force a sigsegv
from the signal code if we got a fault in the put_users. The
problem is that when we get a fault from a copy-user function we
can't know if we got it due OOM (no enough ram to allocate the
userspace memory) or due a real fault, this is not a big problem
for normal tasks but since we can't crash init due OOM we must
not send the sigsegv signal to init in all cases. The simpler way
to avoid init getting weird signals due OOM conditions is to
forbid force_sig_info to run if the pid of the task is 1.

o I made sure that we'll send a SIGKILL and not a SIGBUS if
want to kill a task because we are OOM.

o I included a longstanding lowest_bit/highest_bit fix
(mentioned some time on the list but never included).

o I allow only one refresh of the swap_cnt per swapout cycle. (bugfix)

o I stop kswapd for 10 seconds if do_try_to_free_pages failed, so
other tasks will be able to segfault. I run kswapd a bit more
in loop (probably a good idea for routers and when there is
plenty of memory in cache).
I wakeup kswapd from oom(). If oom() doesn't happens
(unlikely) kswapd will return to run after 10 sec (or you can
kill kswapd by hand to make it running again before).

o minor do_syslog fix (-EFAULT)

o set RT priority of sigkilled task (to be sure we are not going
to run another task before running the signal handler while
returning to userspace)

Index: linux/arch/i386/mm/fault.c
===================================================================
RCS file: /var/cvs/linux/arch/i386/mm/fault.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 fault.c
--- linux/arch/i386/mm/fault.c 1999/01/18 01:28:56 1.1.1.1
+++ linux/arch/i386/mm/fault.c 1999/07/02 17:00:29
@@ -50,7 +50,8 @@
start &= PAGE_MASK;

for (;;) {
- handle_mm_fault(current,vma, start, 1);
+ if (!handle_mm_fault(current,vma, start, 1))
+ goto bad_area;
if (!size)
break;
size--;
@@ -163,7 +164,7 @@
* the fault.
*/
if (!handle_mm_fault(tsk, vma, address, write))
- goto do_sigbus;
+ goto failed;

/*
* Did it hit the DOS screen memory VA from vm86 mode?
@@ -255,7 +256,7 @@
* We ran out of memory, or some other thing happened to us that made
* us unable to handle the page fault gracefully.
*/
-do_sigbus:
+failed:
up(&mm->mmap_sem);

/*
@@ -265,7 +266,6 @@
tsk->tss.cr2 = address;
tsk->tss.error_code = error_code;
tsk->tss.trap_no = 14;
- force_sig(SIGBUS, tsk);

/* Kernel mode? Handle exceptions or die */
if (!(error_code & 4))
Index: linux/include/linux/mm.h
===================================================================
RCS file: /var/cvs/linux/include/linux/mm.h,v
retrieving revision 1.1.1.10
diff -u -r1.1.1.10 mm.h
--- linux/include/linux/mm.h 1999/06/14 15:29:23 1.1.1.10
+++ linux/include/linux/mm.h 1999/07/03 13:10:09
@@ -300,7 +300,7 @@
extern unsigned long paging_init(unsigned long start_mem, unsigned long end_mem);
extern void mem_init(unsigned long start_mem, unsigned long end_mem);
extern void show_mem(void);
-extern void oom(struct task_struct * tsk);
+extern void oom(void);
extern void si_meminfo(struct sysinfo * val);

/* mmap.c */
Index: linux/include/linux/swap.h
===================================================================
RCS file: /var/cvs/linux/include/linux/swap.h,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 swap.h
--- linux/include/linux/swap.h 1999/06/14 15:29:38 1.1.1.5
+++ linux/include/linux/swap.h 1999/07/02 01:50:19
@@ -69,6 +69,7 @@
extern struct inode swapper_inode;
extern unsigned long page_cache_size;
extern int buffermem;
+extern struct wait_queue * kswapd_wait_oom;

/* Incomplete types for prototype declarations: */
struct task_struct;
@@ -91,8 +92,8 @@
extern void swap_after_unlock_page (unsigned long entry);

/* linux/mm/page_alloc.c */
-extern void swap_in(struct task_struct *, struct vm_area_struct *,
- pte_t *, unsigned long, int);
+extern int swap_in(struct task_struct *, struct vm_area_struct *,
+ pte_t *, unsigned long, int);


/* linux/mm/swap_state.c */
Index: linux/ipc/shm.c
===================================================================
RCS file: /var/cvs/linux/ipc/shm.c,v
retrieving revision 1.1.1.10
diff -u -r1.1.1.10 shm.c
--- linux/ipc/shm.c 1999/06/14 15:30:04 1.1.1.10
+++ linux/ipc/shm.c 1999/07/03 01:53:39
@@ -636,7 +636,7 @@
if (!pte_present(pte)) {
unsigned long page = get_free_page(GFP_USER);
if (!page) {
- oom(current);
+ oom();
return 0;
}
pte = __pte(shp->shm_pages[idx]);
Index: linux/kernel/printk.c
===================================================================
RCS file: /var/cvs/linux/kernel/printk.c,v
retrieving revision 1.1.1.6
diff -u -r1.1.1.6 printk.c
--- linux/kernel/printk.c 1999/06/14 15:30:05 1.1.1.6
+++ linux/kernel/printk.c 1999/07/01 19:31:06
@@ -147,7 +147,8 @@
log_size--;
log_start &= LOG_BUF_LEN-1;
sti();
- __put_user(c,buf);
+ if ((error = __put_user(c,buf)) == -EFAULT)
+ goto out;
buf++;
i++;
cli();
@@ -183,7 +184,8 @@
__restore_flags(flags);
for (i = 0; i < count; i++) {
c = *((char *) log_buf+(j++ & (LOG_BUF_LEN-1)));
- __put_user(c, buf++);
+ if ((error = __put_user(c, buf++)) == -EFAULT)
+ goto out;
}
if (do_clear)
logged_chars = 0;
Index: linux/kernel/signal.c
===================================================================
RCS file: /var/cvs/linux/kernel/signal.c,v
retrieving revision 1.1.1.9
diff -u -r1.1.1.9 signal.c
--- linux/kernel/signal.c 1999/06/14 15:30:06 1.1.1.9
+++ linux/kernel/signal.c 1999/07/03 02:56:30
@@ -409,6 +409,9 @@
{
unsigned long int flags;

+ if (t->pid == 1)
+ return 0;
+
spin_lock_irqsave(&t->sigmask_lock, flags);
if (t->sig == NULL) {
spin_unlock_irqrestore(&t->sigmask_lock, flags);
Index: linux/mm/filemap.c
===================================================================
RCS file: /var/cvs/linux/mm/filemap.c,v
retrieving revision 1.1.1.17
diff -u -r1.1.1.17 filemap.c
--- linux/mm/filemap.c 1999/06/14 15:30:07 1.1.1.17
+++ linux/mm/filemap.c 1999/07/03 01:52:48
@@ -939,7 +939,7 @@
new_page = 0;
offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ goto do_sigbus;

/*
* Do we have something in the page cache already?
@@ -958,7 +958,7 @@
if (no_share && !new_page) {
new_page = page_cache_alloc();
if (!new_page)
- goto failure;
+ goto release_and_oom;
}

if (PageLocked(page))
@@ -1006,7 +1006,7 @@
if (!new_page)
new_page = page_cache_alloc();
if (!new_page)
- goto no_page;
+ goto oom;

/*
* During getting the above page we might have slept,
@@ -1058,7 +1058,16 @@
page_cache_release(page);
if (new_page)
page_cache_free(new_page);
-no_page:
+ return 0;
+
+do_sigbus:
+ force_sig(SIGBUS, current);
+ return 0;
+
+release_and_oom:
+ page_cache_release(page);
+oom:
+ oom();
return 0;
}

Index: linux/mm/memory.c
===================================================================
RCS file: /var/cvs/linux/mm/memory.c,v
retrieving revision 1.1.1.8
diff -u -r1.1.1.8 memory.c
--- linux/mm/memory.c 1999/06/14 15:30:07 1.1.1.8
+++ linux/mm/memory.c 1999/07/03 02:56:27
@@ -65,10 +65,23 @@
* oom() prints a message (so that the user knows why the process died),
* and gives the process an untrappable SIGKILL.
*/
-void oom(struct task_struct * task)
+void oom(void)
{
- printk("\nOut of memory for %s.\n", task->comm);
- force_sig(SIGKILL, task);
+ if (current->pid == 1)
+ return;
+
+ /*
+ * Make sure the killed task will be scheduled ASAP. Otherwise
+ * we risk to reschedule another task before running the signal
+ * hander.
+ */
+ current->policy = SCHED_FIFO;
+ current->rt_priority = 1000000;
+
+ printk("\nOut of memory for %s.\n", current->comm);
+ force_sig(SIGKILL, current);
+
+ wake_up_interruptible(&kswapd_wait_oom);
}

/*
@@ -545,19 +558,6 @@
}

/*
- * sanity-check function..
- */
-static void put_page(pte_t * page_table, pte_t pte)
-{
- if (!pte_none(*page_table)) {
- free_page_and_swap_cache(pte_page(pte));
- return;
- }
-/* no need for flush_tlb */
- set_pte(page_table, pte);
-}
-
-/*
* This routine is used to map in a page into an address space: needed by
* execve() for the initial stack and environment pages.
*/
@@ -575,13 +575,13 @@
pmd = pmd_alloc(pgd, address);
if (!pmd) {
free_page(page);
- oom(tsk);
+ oom();
return 0;
}
pte = pte_alloc(pmd, address);
if (!pte) {
free_page(page);
- oom(tsk);
+ oom();
return 0;
}
if (!pte_none(*pte)) {
@@ -614,21 +614,15 @@
* and potentially makes it more efficient.
*/
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, pte_t *page_table)
+ unsigned long address, pte_t *page_table, pte_t pte)
{
- pte_t pte;
unsigned long old_page, new_page;
struct page * page_map;

- pte = *page_table;
new_page = __get_free_page(GFP_USER);
/* Did swap_out() unmapped the protected page while we slept? */
if (pte_val(*page_table) != pte_val(pte))
goto end_wp_page;
- if (!pte_present(pte))
- goto end_wp_page;
- if (pte_write(pte))
- goto end_wp_page;
old_page = pte_page(pte);
if (MAP_NR(old_page) >= max_mapnr)
goto bad_wp_page;
@@ -684,13 +678,16 @@
return 1;

bad_wp_page:
+ unlock_kernel();
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
send_sig(SIGKILL, tsk, 1);
-no_new_page:
- unlock_kernel();
if (new_page)
free_page(new_page);
return 0;
+no_new_page:
+ unlock_kernel();
+ oom();
+ return 0;
}

/*
@@ -787,8 +784,9 @@
struct vm_area_struct * vma, unsigned long address,
pte_t * page_table, pte_t entry, int write_access)
{
+ int ret = 1;
if (!vma->vm_ops || !vma->vm_ops->swapin) {
- swap_in(tsk, vma, page_table, pte_val(entry), write_access);
+ ret = swap_in(tsk, vma, page_table, pte_val(entry), write_access);
flush_page_to_ram(pte_page(*page_table));
} else {
pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
@@ -805,7 +803,7 @@
}
}
unlock_kernel();
- return 1;
+ return ret;
}

/*
@@ -817,15 +815,18 @@
if (write_access) {
unsigned long page = __get_free_page(GFP_USER);
if (!page)
- return 0;
+ goto oom;
clear_page(page);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
vma->vm_mm->rss++;
tsk->min_flt++;
flush_page_to_ram(page);
}
- put_page(page_table, entry);
+ set_pte(page_table, entry);
return 1;
+ oom:
+ oom();
+ return 0;
}

/*
@@ -882,7 +883,7 @@
} else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
- put_page(page_table, entry);
+ set_pte(page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
return 1;
}
@@ -916,7 +917,7 @@
flush_tlb_page(vma, address);
if (write_access) {
if (!pte_write(entry))
- return do_wp_page(tsk, vma, address, pte);
+ return do_wp_page(tsk, vma, address, pte, entry);

entry = pte_mkdirty(entry);
set_pte(pte, entry);
@@ -934,18 +935,23 @@
{
pgd_t *pgd;
pmd_t *pmd;
+ pte_t * pte;

pgd = pgd_offset(vma->vm_mm, address);
pmd = pmd_alloc(pgd, address);
- if (pmd) {
- pte_t * pte = pte_alloc(pmd, address);
- if (pte) {
- if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
- update_mmu_cache(vma, address, *pte);
- return 1;
- }
- }
- }
+ if (!pmd)
+ goto oom;
+ pte = pte_alloc(pmd, address);
+ if (!pte)
+ goto oom;
+ if (!handle_pte_fault(tsk, vma, address, write_access, pte))
+ goto fail;
+ update_mmu_cache(vma, address, *pte);
+ return 1;
+
+ oom:
+ oom();
+ fail:
return 0;
}

Index: linux/mm/page_alloc.c
===================================================================
RCS file: /var/cvs/linux/mm/page_alloc.c,v
retrieving revision 1.1.1.10
diff -u -r1.1.1.10 page_alloc.c
--- linux/mm/page_alloc.c 1999/06/14 15:30:08 1.1.1.10
+++ linux/mm/page_alloc.c 1999/07/03 13:12:22
@@ -373,8 +373,13 @@

/* Ok, do the async read-ahead now */
new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
- if (new_page != NULL)
- __free_page(new_page);
+ /*
+ * If we are OOM or if this isn't a contiguous swap cluster
+ * stop the readahead.
+ */
+ if (!new_page)
+ return;
+ __free_page(new_page);
offset++;
} while (--i);
return;
@@ -387,7 +392,7 @@
* Also, don't bother to add to the swap cache if this page-in
* was due to a write access.
*/
-void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
+int swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
pte_t * page_table, unsigned long entry, int write_access)
{
unsigned long page;
@@ -400,13 +405,11 @@
if (pte_val(*page_table) != entry) {
if (page_map)
free_page_and_swap_cache(page_address(page_map));
- return;
+ return 1;
}
if (!page_map) {
- set_pte(page_table, BAD_PAGE);
- swap_free(entry);
- oom(tsk);
- return;
+ oom();
+ return 0;
}

page = page_address(page_map);
@@ -416,7 +419,7 @@

if (!write_access || is_page_shared(page_map)) {
set_pte(page_table, mk_pte(page, vma->vm_page_prot));
- return;
+ return 1;
}

/*
@@ -426,5 +429,5 @@
*/
delete_from_swap_cache(page_map);
set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
- return;
+ return 1;
}
Index: linux/mm/swapfile.c
===================================================================
RCS file: /var/cvs/linux/mm/swapfile.c,v
retrieving revision 1.1.1.8
diff -u -r1.1.1.8 swapfile.c
--- linux/mm/swapfile.c 1999/06/14 15:30:09 1.1.1.8
+++ linux/mm/swapfile.c 1999/07/01 12:09:00
@@ -131,15 +131,17 @@
offset = SWP_OFFSET(entry);
if (offset >= p->max)
goto bad_offset;
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
if (!p->swap_map[offset])
goto bad_free;
if (p->swap_map[offset] < SWAP_MAP_MAX) {
if (!--p->swap_map[offset])
+ {
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
nr_swap_pages++;
+ }
}
#ifdef DEBUG_SWAP
printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
Index: linux/mm/vmscan.c
===================================================================
RCS file: /var/cvs/linux/mm/vmscan.c,v
retrieving revision 1.1.1.8
diff -u -r1.1.1.8 vmscan.c
--- linux/mm/vmscan.c 1999/06/14 15:30:09 1.1.1.8
+++ linux/mm/vmscan.c 1999/07/02 01:36:25
@@ -308,7 +308,8 @@
static int swap_out(unsigned int priority, int gfp_mask)
{
struct task_struct * p, * pbest;
- int counter, assign, max_cnt;
+ int assign = 0, counter;
+ unsigned long max_cnt;

/*
* We make one or two passes through the task list, indexed by
@@ -327,11 +328,8 @@
counter = nr_tasks / (priority+1);
if (counter < 1)
counter = 1;
- if (counter > nr_tasks)
- counter = nr_tasks;

for (; counter >= 0; counter--) {
- assign = 0;
max_cnt = 0;
pbest = NULL;
select:
@@ -343,7 +341,7 @@
if (p->mm->rss <= 0)
continue;
/* Refresh swap_cnt? */
- if (assign)
+ if (assign == 1)
p->mm->swap_cnt = p->mm->rss;
if (p->mm->swap_cnt > max_cnt) {
max_cnt = p->mm->swap_cnt;
@@ -351,6 +349,8 @@
}
}
read_unlock(&tasklist_lock);
+ if (assign == 1)
+ assign = 2;
if (!pbest) {
if (!assign) {
assign = 1;
@@ -435,7 +435,8 @@
printk ("Starting kswapd v%.*s\n", i, s);
}

-static struct task_struct *kswapd_process;
+static struct wait_queue * kswapd_wait = NULL;
+struct wait_queue * kswapd_wait_oom = NULL;

/*
* The background pageout daemon, started as a kernel thread
@@ -455,7 +456,6 @@
{
struct task_struct *tsk = current;

- kswapd_process = tsk;
tsk->session = 1;
tsk->pgrp = 1;
strcpy(tsk->comm, "kswapd");
@@ -484,16 +484,17 @@
* the processes needing more memory will wake us
* up on a more timely basis.
*/
- do {
- if (nr_free_pages >= freepages.high)
- break;
-
- if (!do_try_to_free_pages(GFP_KSWAPD))
- break;
- } while (!tsk->need_resched);
- run_task_queue(&tq_disk);
- tsk->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ);
+ interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+ while (nr_free_pages < freepages.high)
+ {
+ if (do_try_to_free_pages(GFP_KSWAPD))
+ {
+ if (tsk->need_resched)
+ schedule();
+ continue;
+ }
+ interruptible_sleep_on_timeout(&kswapd_wait_oom,10*HZ);
+ }
}
}

@@ -516,7 +517,7 @@
{
int retval = 1;

- wake_up_process(kswapd_process);
+ wake_up_interruptible(&kswapd_wait);
if (gfp_mask & __GFP_WAIT)
retval = do_try_to_free_pages(gfp_mask);
return retval;

With this patch applyed to 2.2.10 I can't deadlock the machine anymore
and I can't screw up init anymore.

I would like to get some feedback about the patch. Thanks :).

Andrea

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/