[PATCH] VM fixes + RSS limits 2.4.0-test13-pre5

From: Rik van Riel (riel@conectiva.com.br)
Date: Thu Dec 28 2000 - 17:48:53 EST


Hi Linus,

I know this is probably not the birthday present you've been
hoping for, but here is a patch agains 2.4.0-test13-pre5 which
does the following - trivial - things:

1. trivially implement RSS ulimit support, with
   p->rlim[RLIMIT_RSS].rlim_max treated as a hard limit
   and .rlim_cur treated as a soft limit (see the
   userspace sketch after this list)

2. fix try_to_swap_out() so it returns success whenever
   we make the RSS of a process smaller

3. clean up refill_inactive() ... now that try_to_swap_out()
   returns the expected result, things should be balanced again

4. only call deactivate_page() from generic_file_write() if we
   write right up to the end of the page, so partially written
   pages stay active and remain in memory longer (8% better
   dbench performance, as measured by Daniel Phillips)

5. (minor) s/unsigned int gfp_mask/int gfp_mask/ in vmscan.c
   ... both types were in use, which was rather inconsistent
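
For those who want to play with point 1: here is a minimal
userspace sketch (not part of the patch) that sets an RSS limit
with setrlimit() and then faults in more anonymous memory than
the hard limit allows. The 32/64/128 MB sizes are arbitrary.
With the patch applied, the kernel should trim the process back
towards rlim_max instead of letting the RSS grow unchecked:

/*
 * Minimal sketch, not part of the patch: set a 32 MB soft /
 * 64 MB hard RSS limit, then touch more anonymous memory than
 * the hard limit allows.  All sizes here are arbitrary.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	char *buf;
	size_t i, len = 128 * 1024 * 1024;	/* well past the hard limit */

	rl.rlim_cur = 32 * 1024 * 1024;		/* soft limit, in bytes */
	rl.rlim_max = 64 * 1024 * 1024;		/* hard limit, in bytes */
	if (setrlimit(RLIMIT_RSS, &rl) < 0) {
		perror("setrlimit");
		return 1;
	}

	buf = malloc(len);
	if (!buf)
		return 1;

	/* Touch each page so it gets faulted in and counted in mm->rss. */
	for (i = 0; i < len; i += 4096)
		buf[i] = 1;

	printf("done; check the RSS with ps or top\n");
	return 0;
}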

Please consider including this patch in the next 2.4 pre-patch;
IMHO all of these changes are fairly trivial, and the patch runs
very nicely on my test box ;)

regards,

Rik

--
Hollywood goes for world dumbination,
	Trailer at 11.

http://www.surriel.com/ http://www.conectiva.com/ http://distro.conectiva.com.br/

--- linux-2.4.0-test13-pre5/mm/filemap.c.orig	Thu Dec 28 19:11:39 2000
+++ linux-2.4.0-test13-pre5/mm/filemap.c	Thu Dec 28 19:28:06 2000
@@ -1912,7 +1912,7 @@
 
 	/* Make sure this doesn't exceed the process's max rss. */
 	error = -EIO;
-	rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+	rlim_rss = current->rlim ? (current->rlim[RLIMIT_RSS].rlim_cur >> PAGE_SHIFT) :
 			LONG_MAX; /* default: see resource.h */
 	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
 		return error;
@@ -2438,7 +2438,7 @@
 	}
 
 	while (count) {
-		unsigned long bytes, index, offset;
+		unsigned long bytes, index, offset, partial = 0;
 		char *kaddr;
 
 		/*
@@ -2448,8 +2448,10 @@
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
+		if (bytes > count) {
 			bytes = count;
+			partial = 1;
+		}
 
 		/*
 		 * Bring in the user page that we will copy from _first_.
@@ -2491,9 +2493,17 @@
 			buf += status;
 		}
 unlock:
-		/* Mark it unlocked again and drop the page.. */
+		/*
+		 * Mark it unlocked again and release the page.
+		 * In order to prevent large (fast) file writes
+		 * from causing too much memory pressure we move
+		 * completely written pages to the inactive list.
+		 * We do, however, try to keep the pages that may
+		 * still be written to (ie. partially written pages).
+		 */
 		UnlockPage(page);
-		deactivate_page(page);
+		if (!partial)
+			deactivate_page(page);
 		page_cache_release(page);
 
 		if (status < 0)
--- linux-2.4.0-test13-pre5/mm/memory.c.orig	Thu Dec 28 19:11:39 2000
+++ linux-2.4.0-test13-pre5/mm/memory.c	Thu Dec 28 19:12:04 2000
@@ -1198,6 +1198,12 @@
 	pgd = pgd_offset(mm, address);
 	pmd = pmd_alloc(pgd, address);
 
+	if (mm->rss >= (current->rlim[RLIMIT_RSS].rlim_max >> PAGE_SHIFT)) {
+		lock_kernel();
+		enforce_rss_limit(mm, GFP_HIGHUSER);
+		unlock_kernel();
+	}
+
 	if (pmd) {
 		pte_t * pte = pte_alloc(pmd, address);
 		if (pte)
--- linux-2.4.0-test13-pre5/mm/vmscan.c.orig	Thu Dec 28 19:11:40 2000
+++ linux-2.4.0-test13-pre5/mm/vmscan.c	Thu Dec 28 20:30:10 2000
@@ -49,7 +49,8 @@
 	if ((!VALID_PAGE(page)) || PageReserved(page))
 		goto out_failed;
 
-	if (mm->swap_cnt)
+	/* RSS trimming doesn't change the process' chances wrt. normal swap */
+	if (mm->swap_cnt && !(gfp_mask & __GFP_RSS_LIMIT))
 		mm->swap_cnt--;
 
 	onlist = PageActive(page);
@@ -58,7 +59,13 @@
 		age_page_up(page);
 		goto out_failed;
 	}
-	if (!onlist)
+	/*
+	 * SUBTLE: if the page is on the active list and we're not doing
+	 * RSS ulimit trimming, then we let refill_inactive_scan() take
+	 * care of the down aging. Always aging down here would severely
+	 * disadvantage shared mappings (of eg libc.so).
+	 */
+	if (!onlist || (gfp_mask & __GFP_RSS_LIMIT))
 		/* The page is still mapped, so it can't be freeable... */
 		age_page_down_ageonly(page);
 
@@ -85,8 +92,8 @@
 	 * we can just drop our reference to it without doing
 	 * any IO - it's already up-to-date on disk.
 	 *
-	 * Return 0, as we didn't actually free any real
-	 * memory, and we should just continue our scan.
+	 * Return success, we successfully stole a page from
+	 * this process.
 	 */
 	if (PageSwapCache(page)) {
 		entry.val = page->index;
@@ -101,8 +108,8 @@
 	flush_tlb_page(vma, address);
 	deactivate_page(page);
 	page_cache_release(page);
-out_failed:
-	return 0;
+
+	return 1;
 }
 
 /*
@@ -152,6 +159,7 @@
 out_unlock_restore:
 	set_pte(page_table, pte);
 	UnlockPage(page);
+out_failed:
 	return 0;
 }
 
@@ -192,7 +200,7 @@
 		int result;
 		mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
-		if (result)
+		if (result && !(gfp_mask & __GFP_RSS_LIMIT))
 			return result;
 		if (!mm->swap_cnt)
 			return 0;
@@ -303,6 +311,63 @@
 }
 
 /*
+ * This function is used to enforce RSS ulimits for a process. When a
+ * process gets an RSS larger than p->rlim[RLIMIT_RSS].rlim_max, this
+ * function will get called.
+ *
+ * The function is pretty similar to swap_out_mm, except for the fact
+ * that it scans the whole process regardless of return value and it
+ * keeps the swapout statistics intact to not disturb normal swapout.
+ *
+ * XXX: the caller must hold the kernel lock; this function cannot loop
+ * because mlock()ed memory could be bigger than the RSS limit.
+ */
+void enforce_rss_limit(struct mm_struct * mm, int gfp_mask)
+{
+	unsigned long address, old_swap_address;
+	struct vm_area_struct* vma;
+
+	/*
+	 * Go through process' page directory.
+	 */
+	old_swap_address = mm->swap_address;
+	address = mm->swap_address = 0;
+
+	/* Don't decrement mm->swap_cnt in try_to_swap_out */
+	gfp_mask |= __GFP_RSS_LIMIT;
+	if (!mm->swap_cnt)
+		mm->swap_cnt = 1;
+
+	/*
+	 * Find the proper vm-area after freezing the vma chain
+	 * and ptes.
+	 */
+	spin_lock(&mm->page_table_lock);
+	vma = find_vma(mm, address);
+	if (vma) {
+		if (address < vma->vm_start)
+			address = vma->vm_start;
+
+		for (;;) {
+			/*
+			 * Subtle: swap_out_pmd makes sure we scan the
+			 * whole VMA, that's a lot more efficient than
+			 * a while() loop here would ever be.
+			 */
+			swap_out_vma(mm, vma, address, gfp_mask);
+			vma = vma->vm_next;
+			if (!vma)
+				break;
+			address = vma->vm_start;
+		}
+	}
+	/* Reset swap_address, RSS enforcement shouldn't disturb normal swap */
+	mm->swap_address = old_swap_address;
+
+	spin_unlock(&mm->page_table_lock);
+}
+
+/*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1. Return values != 1 from
  * the lower level routines result in continued processing.
@@ -310,7 +375,7 @@
 #define SWAP_SHIFT 5
 #define SWAP_MIN 8
 
-static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
+static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p;
 	int counter;
@@ -350,14 +415,15 @@
 			continue;
 		if (mm->rss <= 0)
 			continue;
-		/* Skip tasks which haven't slept long enough yet when idle-swapping. */
-		if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
-				time_after(p->sleep_time + idle_time * HZ, jiffies)))
-			continue;
 		found_task++;
+		/* If the process' RSS is too big, make it smaller ;) */
+		if (mm->rss > (p->rlim[RLIMIT_RSS].rlim_max >> PAGE_SHIFT))
+			enforce_rss_limit(mm, gfp_mask);
 		/* Refresh swap_cnt? */
 		if (assign == 1) {
 			mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
+			if (mm->rss > (p->rlim[RLIMIT_RSS].rlim_cur >> PAGE_SHIFT))
+				mm->swap_cnt = mm->rss;
 			if (mm->swap_cnt < SWAP_MIN)
 				mm->swap_cnt = SWAP_MIN;
 		}
@@ -497,7 +563,7 @@
 #define MAX_LAUNDER (4 * (1 << page_cluster))
 int page_launder(int gfp_mask, int sync)
 {
-	int launder_loop, maxscan, cleaned_pages, maxlaunder;
+	int launder_loop, maxscan, cleaned_pages, maxlaunder, target;
 	int can_get_io_locks;
 	struct list_head * page_lru;
 	struct page * page;
@@ -508,6 +574,8 @@
 	 */
 	can_get_io_locks = gfp_mask & __GFP_IO;
 
+	target = free_shortage();
+
 	launder_loop = 0;
 	maxlaunder = 0;
 	cleaned_pages = 0;
@@ -538,6 +606,12 @@
 		}
 
 		/*
+		 * If we have enough free pages, stop doing (expensive) IO.
+		 */
+		if (cleaned_pages > target && !free_shortage())
+			break;
+
+		/*
 		 * The page is locked. IO in progress?
 		 * Move it to the back of the list.
 		 */
@@ -846,10 +920,9 @@
  * really care about latency. In that case we don't try
  * to free too many pages.
  */
-static int refill_inactive(unsigned int gfp_mask, int user)
+static int refill_inactive(int gfp_mask, int user)
 {
 	int priority, count, start_count, made_progress;
-	unsigned long idle_time;
 
 	count = inactive_shortage() + free_shortage();
 	if (user)
@@ -859,17 +932,6 @@
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	/*
-	 * Calculate the minimum time (in seconds) a process must
-	 * have slept before we consider it for idle swapping.
-	 * This must be the number of seconds it takes to go through
-	 * all of the cache. Doing this idle swapping makes the VM
-	 * smoother once we start hitting swap.
-	 */
-	idle_time = atomic_read(&page_cache_size);
-	idle_time += atomic_read(&buffermem_pages);
-	idle_time /= (inactive_target + 1);
-
 	priority = 6;
 	do {
 		made_progress = 0;
@@ -879,8 +941,11 @@
 			schedule();
 		}
 
-		while (refill_inactive_scan(priority, 1) ||
-				swap_out(priority, gfp_mask, idle_time)) {
+		/*
+		 * Reclaim old pages which aren't mapped into any
+		 * process.
+		 */
+		while (refill_inactive_scan(priority, 1)) {
 			made_progress = 1;
 			if (--count <= 0)
 				goto done;
@@ -895,9 +960,9 @@
 		shrink_icache_memory(priority, gfp_mask);
 
 		/*
-		 * Then, try to page stuff out..
+		 * Steal pages from processes.
 		 */
-		while (swap_out(priority, gfp_mask, 0)) {
+		while (swap_out(priority, gfp_mask)) {
 			made_progress = 1;
 			if (--count <= 0)
 				goto done;
@@ -930,7 +995,7 @@
 	return (count < start_count);
 }
 
-static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+static int do_try_to_free_pages(int gfp_mask, int user)
 {
 	int ret = 0;
 
@@ -1105,7 +1170,7 @@
  * memory but are unable to sleep on kswapd because
 * they might be holding some IO locks ...
 */
-int try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(int gfp_mask)
 {
 	int ret = 1;
 
--- linux-2.4.0-test13-pre5/include/linux/mm.h.orig	Thu Dec 28 19:11:45 2000
+++ linux-2.4.0-test13-pre5/include/linux/mm.h	Thu Dec 28 19:32:22 2000
@@ -460,6 +460,7 @@
 #else
 #define __GFP_HIGHMEM	0x0 /* noop */
 #endif
+#define __GFP_RSS_LIMIT	0x20
 
 #define GFP_BUFFER	(__GFP_HIGH | __GFP_WAIT)
--- linux-2.4.0-test13-pre5/include/linux/swap.h.orig	Thu Dec 28 19:11:48 2000
+++ linux-2.4.0-test13-pre5/include/linux/swap.h	Thu Dec 28 19:37:54 2000
@@ -108,7 +108,8 @@
 extern int free_shortage(void);
 extern int inactive_shortage(void);
 extern void wakeup_kswapd(int);
-extern int try_to_free_pages(unsigned int gfp_mask);
+extern int try_to_free_pages(int);
+extern void enforce_rss_limit(struct mm_struct *, int);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *, int);
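
For completeness: the rlimit fields hold byte counts while mm->rss
counts pages, which is why every check above shifts the limit right
by PAGE_SHIFT before comparing. A small illustration (not part of
the patch; assumes 4 kB pages, ie. PAGE_SHIFT == 12, as on i386):

/*
 * Illustration only, not in the patch: converting an RSS rlimit in
 * bytes into the page units used by mm->rss.
 */
#define PAGE_SHIFT	12	/* assumption: 4 kB pages */

static unsigned long rss_limit_pages(unsigned long rlim_bytes)
{
	/* eg. a 64 MB hard limit: (64 << 20) >> 12 == 16384 pages */
	return rlim_bytes >> PAGE_SHIFT;
}

/* The fault-path check in mm/memory.c then boils down to this: */
static int over_hard_limit(unsigned long rss_pages, unsigned long rlim_max)
{
	return rss_pages >= rss_limit_pages(rlim_max);
}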
