Hi,
The attached patch seems to fix all the reported deadlock
problems with the new VM. Basically they could be grouped
into 2 categories:
1) __GFP_IO-related locking issues
2) something sleeps waiting for a free/clean/inactive page
goal that nothing is actually working towards (see the
condensed example below)
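To give a feel for the second category, here is the fs/buffer.c
part of the fix in condensed form (the full hunk is in the patch
below): instead of recursing into try_to_free_pages(GFP_BUFFER),
which can end up sleeping on a goal nobody is working towards,
refill_freelist() now wakes up bdflush and yields the CPU so the
flush daemons can make progress:

static void refill_freelist(int size)
{
	if (!grow_buffers(size)) {
		/* ask bdflush to write out dirty buffers ... */
		wakeup_bdflush(1);
		/* ... and give up the CPU so it (and kswapd) can run */
		current->policy |= SCHED_YIELD;
		schedule();
	}
}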
The patch has survived some heavy stress testing on both
SMP and UP machines. I hope nobody will be able to find
a way to still crash this one ;)
A second change is a more dynamic free memory target
(now freepages.high + inactive_target / 3); this seems
to help a little under some workloads.
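For reference, the heart of that change in free_shortage()
(mm/vmscan.c, full hunk in the patch below) now reads:

	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;

so the target scales with the inactive page target instead of
being the fixed freepages.high value.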
If your mailer messes up the patch, you can grab it from
http://www.surriel.com/patches/2.4.0-t9p7-vmpatch
Linus, if this patch turns out to work fine for the people
testing it, could you please apply it to your tree?
regards,
Rik
-- "What you're running that piece of shit Gnome?!?!" -- Miguel de Icaza, UKUUG 2000http://www.conectiva.com/ http://www.surriel.com/
--- linux-2.4.0-test9-pre7/fs/buffer.c.orig	Sat Sep 30 18:09:18 2000
+++ linux-2.4.0-test9-pre7/fs/buffer.c	Mon Oct  2 00:19:41 2000
@@ -706,7 +706,9 @@
 
 static void refill_freelist(int size)
 {
 	if (!grow_buffers(size)) {
-		try_to_free_pages(GFP_BUFFER);
+		wakeup_bdflush(1);
+		current->policy |= SCHED_YIELD;
+		schedule();
 	}
 }
 
@@ -859,6 +861,7 @@
 int balance_dirty_state(kdev_t dev)
 {
 	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+	int shortage;
 
 	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 	tot = nr_free_buffer_pages();
@@ -869,21 +872,20 @@
 
 	/* First, check for the "real" dirty limit. */
 	if (dirty > soft_dirty_limit) {
-		if (dirty > hard_dirty_limit || inactive_shortage())
+		if (dirty > hard_dirty_limit)
 			return 1;
 		return 0;
 	}
 
 	/*
-	 * Then, make sure the number of inactive pages won't overwhelm
-	 * page replacement ... this should avoid stalls.
+	 * If we are about to get low on free pages and
+	 * cleaning the inactive_dirty pages would help
+	 * fix this, wake up bdflush.
 	 */
-	if (nr_inactive_dirty_pages >
-			nr_free_pages() + nr_inactive_clean_pages()) {
-		if (free_shortage() > freepages.min)
-			return 1;
+	if (free_shortage() && nr_inactive_dirty_pages > free_shortage() &&
+			nr_inactive_dirty_pages > freepages.high)
 		return 0;
-	}
+	return -1;
 }
 
@@ -2663,9 +2665,8 @@
 			CHECK_EMERGENCY_SYNC
 
 			flushed = flush_dirty_buffers(0);
-			if (nr_inactive_dirty_pages > nr_free_pages() +
-					nr_inactive_clean_pages())
-				flushed += page_launder(GFP_KSWAPD, 0);
+			if (free_shortage())
+				flushed += page_launder(GFP_BUFFER, 0);
 
 			/* If wakeup_bdflush will wakeup us
 			   after our bdflush_done wakeup, then
--- linux-2.4.0-test9-pre7/mm/filemap.c.orig	Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/filemap.c	Mon Oct  2 00:11:45 2000
@@ -44,7 +44,6 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
-struct list_head lru_cache;
 
 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
--- linux-2.4.0-test9-pre7/mm/page_alloc.c.orig	Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/page_alloc.c	Mon Oct  2 00:31:15 2000
@@ -258,13 +258,13 @@
 	 */
 	switch (limit) {
 		default:
-		case 0:
+		case PAGES_MIN:
 			water_mark = z->pages_min;
 			break;
-		case 1:
+		case PAGES_LOW:
 			water_mark = z->pages_low;
 			break;
-		case 2:
+		case PAGES_HIGH:
 			water_mark = z->pages_high;
 	}
 
@@ -318,10 +318,19 @@
 		direct_reclaim = 1;
 
 	/*
-	 * Are we low on inactive pages?
+	 * If we are about to get low on free pages and we also have
+	 * an inactive page shortage, wake up kswapd.
 	 */
 	if (inactive_shortage() > inactive_target / 2 && free_shortage())
 		wakeup_kswapd(0);
+	/*
+	 * If we are about to get low on free pages and cleaning
+	 * the inactive_dirty pages would fix the situation,
+	 * wake up bdflush.
+	 */
+	else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
+			&& nr_inactive_dirty_pages > freepages.high)
+		wakeup_bdflush(0);
 
 try_again:
 	/*
@@ -378,8 +387,23 @@
 	 *
 	 * We wake up kswapd, in the hope that kswapd will
 	 * resolve this situation before memory gets tight.
+	 *
+	 * We also yield the CPU, because that:
+	 * - gives kswapd a chance to do something
+	 * - slows down allocations, in particular the
+	 *   allocations from the fast allocator that's
+	 *   causing the problems ...
+	 * - ... which minimises the impact the "bad guys"
+	 *   have on the rest of the system
+	 * - if we don't have __GFP_IO set, kswapd may be
+	 *   able to free some memory we can't free ourselves
 	 */
 	wakeup_kswapd(0);
+	if (gfp_mask & __GFP_WAIT) {
+		__set_current_state(TASK_RUNNING);
+		current->policy |= SCHED_YIELD;
+		schedule();
+	}
 
 	/*
 	 * After waking up kswapd, we try to allocate a page
@@ -440,28 +464,43 @@
 	 * up again. After that we loop back to the start.
 	 *
 	 * We have to do this because something else might eat
-	 * the memory kswapd frees for us (interrupts, other
-	 * processes, etc).
+	 * the memory kswapd frees for us and we need to be
+	 * reliable. Note that we don't loop back for higher
+	 * order allocations since it is possible that kswapd
+	 * simply cannot free a large enough contiguous area
+	 * of memory *ever*.
 	 */
-	if (gfp_mask & __GFP_WAIT) {
-		/*
-		 * Give other processes a chance to run:
-		 */
-		if (current->need_resched) {
-			__set_current_state(TASK_RUNNING);
-			schedule();
-		}
+	if (gfp_mask & (__GFP_WAIT|__GFP_IO) == (__GFP_WAIT|__GFP_IO)) {
+		wakeup_kswapd(1);
+		memory_pressure++;
+		if (!order)
+			goto try_again;
+	/*
+	 * If __GFP_IO isn't set, we can't wait on kswapd because
+	 * kswapd just might need some IO locks /we/ are holding ...
+	 *
+	 * SUBTLE: The scheduling point above makes sure that
+	 * kswapd does get the chance to free memory we can't
+	 * free ourselves...
+	 */
+	} else if (gfp_mask & __GFP_WAIT) {
 		try_to_free_pages(gfp_mask);
 		memory_pressure++;
-		goto try_again;
+		if (!order)
+			goto try_again;
 	}
+	}
 
 	/*
 	 * Final phase: allocate anything we can!
 	 *
-	 * This is basically reserved for PF_MEMALLOC and
-	 * GFP_ATOMIC allocations...
+	 * Higher order allocations, GFP_ATOMIC allocations and
+	 * recursive allocations (PF_MEMALLOC) end up here.
+	 *
+	 * Only recursive allocations can use the very last pages
+	 * in the system, otherwise it would be just too easy to
+	 * deadlock the system...
 	 */
 	zone = zonelist->zones;
 	for (;;) {
@@ -472,8 +511,21 @@
 		if (!z->size)
 			BUG();
 
+		/*
+		 * SUBTLE: direct_reclaim is only possible if the task
+		 * becomes PF_MEMALLOC while looping above. This will
+		 * happen when the OOM killer selects this task for
+		 * instant execution...
+		 */
 		if (direct_reclaim)
 			page = reclaim_page(z);
+		if (page)
+			return page;
+
+		/* XXX: is pages_min/4 a good amount to reserve for this? */
+		if (z->free_pages < z->pages_min / 4 &&
+				!(current->flags & PF_MEMALLOC))
+			continue;
 		if (!page)
 			page = rmqueue(z, order);
 		if (page)
@@ -481,8 +533,7 @@
 	}
 
 	/* No luck.. */
-	if (!order)
-		show_free_areas();
+	printk(KERN_ERR "__alloc_pages: %d-order allocation failed.\n", order);
 	return NULL;
 }
 
@@ -572,6 +623,13 @@
 	sum = nr_free_pages();
 	sum += nr_inactive_clean_pages();
 	sum += nr_inactive_dirty_pages;
+
+	/*
+	 * Keep our write behind queue filled, even if
+	 * kswapd lags a bit right now.
+	 */
+	if (sum < freepages.high + inactive_target)
+		sum = freepages.high + inactive_target;
 	/*
 	 * We don't want dirty page writebehind to put too
 	 * much pressure on the working set, but we want it
--- linux-2.4.0-test9-pre7/mm/swap.c.orig	Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/swap.c	Sat Sep 30 20:49:27 2000
@@ -100,6 +100,15 @@
 		page->age = PAGE_AGE_MAX;
 }
 
+/*
+ * We use this (minimal) function in the case where we
+ * know we can't deactivate the page (yet).
+ */
+void age_page_down_ageonly(struct page * page)
+{
+	page->age /= 2;
+}
+
 void age_page_down_nolock(struct page * page)
 {
 	/* The actual page aging bit */
@@ -155,30 +164,39 @@
  */
 void deactivate_page_nolock(struct page * page)
 {
+	/*
+	 * One for the cache, one for the extra reference the
+	 * caller has and (maybe) one for the buffers.
+	 *
+	 * This isn't perfect, but works for just about everything.
+	 * Besides, as long as we don't move unfreeable pages to the
+	 * inactive_clean list it doesn't need to be perfect...
+	 */
+	int maxcount = (page->buffers ? 3 : 2);
 	page->age = 0;
 
 	/*
 	 * Don't touch it if it's not on the active list.
 	 * (some pages aren't on any list at all)
 	 */
-	if (PageActive(page) && (page_count(page) <= 2 || page->buffers) &&
+	if (PageActive(page) && page_count(page) <= maxcount &&
 			!page_ramdisk(page)) {
 		/*
 		 * We can move the page to the inactive_dirty list
-		 * if we know there is backing store available.
+		 * if we have the strong suspicion that they might
+		 * become freeable in the near future.
 		 *
-		 * We also move pages here that we cannot free yet,
-		 * but may be able to free later - because most likely
-		 * we're holding an extra reference on the page which
-		 * will be dropped right after deactivate_page().
+		 * That is, the page has buffer heads attached (that
+		 * need to be cleared away) and/or the function calling
+		 * us has an extra reference count on the page.
 		 */
 		if (page->buffers || page_count(page) == 2) {
 			del_page_from_active_list(page);
 			add_page_to_inactive_dirty_list(page);
 
 		/*
-		 * If the page is clean and immediately reusable,
-		 * we can move it to the inactive_clean list.
+		 * Only if we are SURE the page is clean and immediately
+		 * reusable, we move it to the inactive_clean list.
 		 */
 		} else if (page->mapping && !PageDirty(page) &&
 				!PageLocked(page)) {
@@ -215,6 +233,10 @@
 	 * not to do anything.
 	 */
 	}
+
+	/* Make sure the page gets a fair chance at staying active. */
+	if (page->age < PAGE_AGE_START)
+		page->age = PAGE_AGE_START;
 }
 
 void activate_page(struct page * page)
--- linux-2.4.0-test9-pre7/mm/vmscan.c.orig	Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/vmscan.c	Sun Oct  1 19:19:56 2000
@@ -74,7 +74,8 @@
 		goto out_failed;
 	}
 	if (!onlist)
-		age_page_down(page);
+		/* The page is still mapped, so it can't be freeable... */
+		age_page_down_ageonly(page);
 
 	/*
 	 * If the page is in active use by us, or if the page
@@ -419,7 +420,7 @@
 			continue;
 		/* Skip tasks which haven't slept long enough yet when idle-swapping. */
 		if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
-				time_before(p->sleep_time + idle_time * HZ, jiffies)))
+				time_after(p->sleep_time + idle_time * HZ, jiffies)))
 			continue;
 		found_task++;
 		/* Refresh swap_cnt? */
@@ -536,6 +537,7 @@
 found_page:
 	del_page_from_inactive_clean_list(page);
 	UnlockPage(page);
+	page->age = PAGE_AGE_START;
 	if (page_count(page) != 1)
 		printk("VM: reclaim_page, found page with count %d!\n",
 			page_count(page));
@@ -565,22 +567,24 @@
  * This code is heavily inspired by the FreeBSD source code. Thanks
  * go out to Matthew Dillon.
  */
-#define MAX_SYNC_LAUNDER	(1 << page_cluster)
-#define MAX_LAUNDER		(MAX_SYNC_LAUNDER * 4)
+#define MAX_LAUNDER		(4 * (1 << page_cluster))
 int page_launder(int gfp_mask, int sync)
 {
-	int synclaunder, launder_loop, maxscan, cleaned_pages, maxlaunder;
+	int launder_loop, maxscan, cleaned_pages, maxlaunder;
+	int can_get_io_locks;
 	struct list_head * page_lru;
 	struct page * page;
 
+	/*
+	 * We can only grab the IO locks (eg. for flushing dirty
+	 * buffers to disk) if __GFP_IO is set.
+	 */
+	can_get_io_locks = gfp_mask & __GFP_IO;
+
 	launder_loop = 0;
-	synclaunder = 0;
 	maxlaunder = 0;
 	cleaned_pages = 0;
 
-	if (!(gfp_mask & __GFP_IO))
-		return 0;
-
 dirty_page_rescan:
 	spin_lock(&pagemap_lru_lock);
 	maxscan = nr_inactive_dirty_pages;
@@ -638,7 +642,7 @@
 			spin_unlock(&pagemap_lru_lock);
 
 			/* Will we do (asynchronous) IO? */
-			if (launder_loop && synclaunder-- > 0)
+			if (launder_loop && maxlaunder == 0 && sync)
 				wait = 2;	/* Synchrounous IO */
 			else if (launder_loop && maxlaunder-- > 0)
 				wait = 1;	/* Async IO */
@@ -725,10 +729,11 @@
 	 * loads, flush out the dirty pages before we have to wait on
 	 * IO.
 	 */
-	if (!launder_loop && free_shortage()) {
+	if (can_get_io_locks && !launder_loop && free_shortage()) {
 		launder_loop = 1;
-		if (sync && !cleaned_pages)
-			synclaunder = MAX_SYNC_LAUNDER;
+		/* If we cleaned pages, never do synchronous IO. */
+		if (cleaned_pages)
+			sync = 0;
 		/* We only do a few "out of order" flushes. */
 		maxlaunder = MAX_LAUNDER;
 		/* Kflushd takes care of the rest. */
@@ -774,8 +779,23 @@
 			age_page_up_nolock(page);
 			page_active = 1;
 		} else {
-			age_page_down_nolock(page);
-			page_active = 0;
+			age_page_down_ageonly(page);
+			/*
+			 * Since we don't hold a reference on the page
+			 * ourselves, we have to do our test a bit more
+			 * strict then deactivate_page(). This is needed
+			 * since otherwise the system could hang shuffling
+			 * unfreeable pages from the active list to the
+			 * inactive_dirty list and back again...
+			 *
+			 * SUBTLE: we can have buffer pages with count 1.
+			 */
+			if (page_count(page) <= (page->buffers ? 2 : 1)) {
+				deactivate_page_nolock(page);
+				page_active = 0;
+			} else {
+				page_active = 1;
+			}
 		}
 		/*
 		 * If the page is still on the active list, move it
@@ -805,14 +825,11 @@
 	pg_data_t *pgdat = pgdat_list;
 	int sum = 0;
 	int freeable = nr_free_pages() + nr_inactive_clean_pages();
+	int freetarget = freepages.high + inactive_target / 3;
 
-	/* Are we low on truly free pages? */
-	if (nr_free_pages() < freepages.min)
-		return freepages.high - nr_free_pages();
-
-	/* Are we low on free pages over-all? */
-	if (freeable < freepages.high)
-		return freepages.high - freeable;
+	/* Are we low on free pages globally? */
+	if (freeable < freetarget)
+		return freetarget - freeable;
 
 	/* If not, are we very low on any particular zone? */
 	do {
@@ -1052,14 +1069,7 @@
 			/* Do we need to do some synchronous flushing? */
 			if (waitqueue_active(&kswapd_done))
 				wait = 1;
-			if (!do_try_to_free_pages(GFP_KSWAPD, wait)) {
-				/*
-				 * if (out_of_memory()) {
-				 *	try again a few times;
-				 *	oom_kill();
-				 * }
-				 */
-			}
+			do_try_to_free_pages(GFP_KSWAPD, wait);
 		}
 		/*
 		 * We go to sleep for one second, but if it's needed again or
@@ -1096,6 +1106,10 @@
 		 */
 		if (!free_shortage() || !inactive_shortage())
 			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+		/*
+		 * TODO: insert out of memory check & oom killer
+		 * invocation in an else branch here.
+		 */
 	}
 }
--- linux-2.4.0-test9-pre7/include/linux/swap.h.orig	Sat Sep 30 20:42:01 2000
+++ linux-2.4.0-test9-pre7/include/linux/swap.h	Sun Oct  1 15:35:03 2000
@@ -91,6 +91,7 @@
 extern void age_page_up_nolock(struct page *);
 extern void age_page_down(struct page *);
 extern void age_page_down_nolock(struct page *);
+extern void age_page_down_ageonly(struct page *);
 extern void deactivate_page(struct page *);
 extern void deactivate_page_nolock(struct page *);
 extern void activate_page(struct page *);