[PATCH] kswapd fully sysctl tunable

Rik van Riel (H.H.vanRiel@phys.uu.nl)
Sat, 2 May 1998 15:07:14 +0200 (MET DST)


Hi Linus,

I've made a patch (which doesn't change
_any_ kernel semantics) to make every
VM parameter sysctl tunable again.

It:
- puts the high/low water marks from free_memory_available()
in freepages.{low,high}
- makes all kswapd things tunable
- provides a way to do more aggressive dcache pruning
- makes the files in /proc/sys/vm/ world-readable (security
by obscurity doesn't work anyway, so why the annoying
permission denied?)
- updates the documentation
- limits freepages.min to a maximum of 256 pages (we
already had a 48 page minimum and since wasting memory
is the last thing we want...) If you need it bigger
(unlikely) you can always change it via sysctl.

Please apply this patch for 2.1.100,

Rik.
+-------------------------------------------+--------------------------+
| Linux: - LinuxHQ MM-patches page | Scouting webmaster |
| - kswapd ask-him & complain-to guy | Vries cubscout leader |
| http://www.phys.uu.nl/~riel/ | <H.H.vanRiel@phys.uu.nl> |
+-------------------------------------------+--------------------------+

--- linux/kernel/sysctl.c.99 Sat May 2 13:30:16 1998
+++ linux/kernel/sysctl.c Sat May 2 13:32:27 1998
@@ -189,20 +189,22 @@

static ctl_table vm_table[] = {
{VM_SWAPCTL, "swapctl",
- &swap_control, sizeof(swap_control_t), 0600, NULL, &proc_dointvec},
+ &swap_control, sizeof(swap_control_t), 0644, NULL, &proc_dointvec},
{VM_SWAPOUT, "swapout_interval",
- &swapout_interval, sizeof(int), 0600, NULL, &proc_dointvec},
+ &swapout_interval, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_FREEPG, "freepages",
- &freepages, sizeof(freepages_t), 0600, NULL, &proc_dointvec},
+ &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&bdflush_min, &bdflush_max},
{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
{VM_BUFFERMEM, "buffermem",
- &buffer_mem, sizeof(buffer_mem_t), 0600, NULL, &proc_dointvec},
+ &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
{VM_PAGECACHE, "pagecache",
- &page_cache, sizeof(buffer_mem_t), 0600, NULL, &proc_dointvec},
+ &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
+ {VM_PAGERDAEMON, "kswapd",
+ &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
{0}
};

--- linux/mm/page_alloc.c.99 Sat May 2 14:01:30 1998
+++ linux/mm/page_alloc.c Sat May 2 14:11:01 1998
@@ -125,7 +125,7 @@
* free unfragmented memory.
* Added low/high water marks to avoid thrashing -- Rik.
*/
- if (nr_free_pages > (num_physpages >> 5) + (nr ? 0 : num_physpages >> 6))
+ if (nr_free_pages > (nr ? freepages.low : freepages.high))
return nr+1;

list = free_area + NR_MEM_LISTS;
@@ -335,15 +335,19 @@
int i;

/*
- * select nr of pages we try to keep free for important stuff
- * with a minimum of 48 pages. This is totally arbitrary
+ * Select nr of pages we try to keep free for important stuff
+ * with a minimum of 48 pages and a maximum of 256 pages, so
+ * that we don't waste too much memory on large systems.
+ * This is totally arbitrary.
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
if (i < 48)
i = 48;
+ if (i > 256)
+ i = 256;
freepages.min = i;
- freepages.low = i + (i>>1);
- freepages.high = i + i;
+ freepages.low = i << 1;
+ freepages.high = freepages.low + i;
mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
p = mem_map + MAP_NR(end_mem);
start_mem = LONG_ALIGN((unsigned long) p);
--- linux/mm/swap.c.99 Sat May 2 14:01:06 1998
+++ linux/mm/swap.c Sat May 2 14:01:06 1998
@@ -44,8 +44,8 @@
*/
freepages_t freepages = {
48, /* freepages.min */
- 72, /* freepages.low */
- 96 /* freepages.high */
+ 96, /* freepages.low */
+ 144 /* freepages.high */
};

/* We track the number of pages currently being asynchronously swapped
@@ -76,4 +76,11 @@
10, /* minimum percent page cache */
30, /* borrow percent page cache */
75 /* maximum */
+};
+
+pager_daemon_t pager_daemon = {
+ 512, /* base number for calculating the number of tries */
+ SWAP_CLUSTER_MAX, /* minimum number of tries */
+ SWAP_CLUSTER_MAX, /* do swap I/O in clusters of this size */
+ 0 /* if nonzero, do extra agressive dcache pruning */
};
--- linux/mm/vmscan.c.99 Sat May 2 14:01:40 1998
+++ linux/mm/vmscan.c Sat May 2 14:20:43 1998
@@ -546,29 +546,31 @@
swapstats.wakeups++;

/* This will gently shrink the dcache.. */
- shrink_dcache_memory();
+ tries = pager_daemon.dcache_agressive + 1;
+ while (tries--)
+ shrink_dcache_memory();

/*
* Do the background pageout: be
* more aggressive if we're really
* low on free memory.
*
- * The number of tries is 512 divided by an
- * 'urgency factor'. In practice this will mean
- * a value of 512 / 8 = 64 pages at a time,
- * giving 64 * 4 (times/sec) * 4k (pagesize) =
- * 1 MB/s in lowest-priority background
+ * We try pager_daemon.tries_base times, divided by
+ * an 'urgency factor'. In practice this will mean
+ * a value of pager_daemon.tries_base / 8 or 4 = 64
+ * or 128 pages at a time.
+ * This gives us 64 or 128 * 4 (times/sec) * 4k (pagesize) =
+ * 1 or 2 MB/s in low-priority background
* paging. This number rises to 8 MB/s when the
* priority is highest (but then we'll be woken
* up more often and the rate will be even higher).
- * -- Should make this sysctl tunable...
*/
- tries = (512) >> free_memory_available(3);
+ tries = pager_daemon.tries_base >> free_memory_available(3);

while (tries--) {
int gfp_mask;

- if (++tried > SWAP_CLUSTER_MAX && free_memory_available(0))
+ if (++tried > pager_daemon.tries_min && free_memory_available(0))
break;
gfp_mask = __GFP_IO;
try_to_free_page(gfp_mask);
@@ -576,7 +578,7 @@
* Syncing large chunks is faster than swapping
* synchronously (less head movement). -- Rik.
*/
- if (atomic_read(&nr_async_pages) >= SWAP_CLUSTER_MAX)
+ if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
run_task_queue(&tq_disk);

}
--- linux/include/linux/sysctl.h.99 Sat May 2 13:20:58 1998
+++ linux/include/linux/sysctl.h Sat May 2 13:30:04 1998
@@ -84,7 +84,8 @@
VM_BDFLUSH, /* struct: Control buffer cache flushing */
VM_OVERCOMMIT_MEMORY, /* Turn off the virtual memory safety limit */
VM_BUFFERMEM, /* struct: Set buffer memory thresholds */
- VM_PAGECACHE /* struct: Set cache memory thresholds */
+ VM_PAGECACHE, /* struct: Set cache memory thresholds */
+ VM_PAGERDAEMON /* struct: Control kswapd behaviour */
};


--- linux/include/linux/swapctl.h.99 Sat May 2 13:32:43 1998
+++ linux/include/linux/swapctl.h Sat May 2 13:57:54 1998
@@ -50,6 +50,16 @@
typedef freepages_v1 freepages_t;
extern freepages_t freepages;

+typedef struct pager_daemon_v1
+{
+ unsigned int tries_base;
+ unsigned int tries_min;
+ unsigned int swap_cluster;
+ unsigned int dcache_agressive;
+} pager_daemon_v1;
+typedef pager_daemon_v1 pager_daemon_t;
+extern pager_daemon_t pager_daemon;
+
#define SC_VERSION 1
#define SC_MAX_VERSION 1

--- linux/Documentation/sysctl/vm.txt.99 Sat May 2 14:21:21 1998
+++ linux/Documentation/sysctl/vm.txt Sat May 2 14:48:05 1998
@@ -18,6 +18,7 @@
- bdflush
- buffermem
- freepages
+- kswapd
- overcommit_memory
- pagecache
- swapctl
@@ -112,9 +113,68 @@
This file contains the values in the struct freepages. That
struct contains three members: min, low and high.

-These variables are currently unused (?), but they're
-very likely to be abused for something else in the near
-future, so don't yet remove it from the source...
+Although the goal of the Linux memory management subsystem
+is to avoid fragmentation and make large chunks of free
+memory (so that we can hand out DMA buffers and such), there
+still are some page-based limits in the system, mainly to
+make sure we don't waste too much memory trying to get large
+free areas.
+
+The meaning of the numbers is:
+
+freepages.min When the number of free pages in the system
+ reaches this number, only the kernel can
+ allocate more memory.
+freepages.low If memory is too fragmented, the swapout
+ daemon is started, except when the number
+ of free pages is larger than freepages.low.
+freepages.high The swapping daemon exits when memory is
+ sufficiently defragmented, when the number
+ of free pages reaches freepages.high or when
+ it has tried the maximum number of times.
+
+==============================================================
+
+kswapd:
+
+Kswapd is the kernel swapout daemon. That is, kswapd is that
+piece of the kernel that frees memory when it gets fragmented
+or full. Since every system is different, you'll probably want
+some control over this piece of the system.
+
+The numbers in this page correspond to the numbers in the
+struct pager_daemon {tries_base, tries_min, swap_cluster,
+dcache_agressive}; The tries_base and swap_cluster probably
+have the largest influence on system performance, whereas
+dcache_agressive is only needed on machines with less than
+16 megabytes of RAM.
+
+tries_base The maximum number of pages kswapd tries to
+ free in one round is calculated from this
+ number. Usually this number will be divided
+ by 4 or 8 (see mm/vmscan.c), so it isn't as
+ big as it looks.
+ When you need to increase the bandwidth to/from
+ swap, you'll want to increase this number.
+tries_min This is the minimum number of times kswapd
+ tries to free a page each time it is called.
+ Basically it's just there to make sure that
+ kswapd frees some pages even when it's being
+ called with minimum priority.
+swap_cluster This is the number of pages kswapd writes in
+ one turn. You want this large so that kswapd
+ does its I/O in large chunks and the disk
+ doesn't have to seek often, but you don't want
+ it to be too large since that would flood the
+ request queue.
+dcache_agressive This option can be used when the dcache
+ is taking up too much memory. This usually
+ only happens on machines with less than 16
+ megabytes of RAM. Increasing this value makes
+ kswapd prune the dcache more aggressively, but
+ it makes name caching less effective, causing
+ more disk reads and decreased performance.
+ Use this when you need to, but use with care!

==============================================================

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu