[PATCH] per-zone kswapd process

From: Dave Hansen (haveblue@us.ibm.com)
Date: Thu Sep 12 2002 - 22:33:32 EST


This patch implements a kswapd process for each memory zone. The original code
came from Bill Irwin, but the current VM is quite a bit different from the one
that he wrote it for, so not much remains. The current kswapd interface is much
simpler than before because there is a single waitqueue and there is a single
place where it is emptied.

kswapd_can_sleep() and kswapd_balance() are simpler now that the extra pgdat
level of indirection is gone.

Tested on 8-way PIII with highmem off and then 4GB support. With 4GB support, I
did 20 parallel greps through a 10GB fileset while some other processes
allocated and freed 1-2GB chunks of memory. That gave kswapd a good workout,
and I observed it running the zone Highmem and zone Normal kswapd threads. So,
it survives my torture test. It also removes more code than it adds.

include/linux/mmzone.h | 2 +
include/linux/swap.h | 1
mm/page_alloc.c | 11 +++++-
mm/vmscan.c | 88 +++++++++++++++++--------------------------------
4 files changed, 42 insertions(+), 60 deletions(-)

-- 
Dave Hansen
haveblue@us.ibm.com

# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.625 -> 1.628 # include/linux/mmzone.h 1.19 -> 1.20 # include/linux/swap.h 1.57 -> 1.58 # mm/page_alloc.c 1.98 -> 1.101 # mm/vmscan.c 1.102 -> 1.105 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 02/09/12 haveblue@elm3b96.(none) 1.626 # add per-zone kswapd # -------------------------------------------- # 02/09/12 haveblue@elm3b96.(none) 1.627 # fix some wli-indicated formatting bits # -------------------------------------------- # 02/09/12 haveblue@elm3b96.(none) 1.628 # move waitqueue init to a more appropriate place # -------------------------------------------- # diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Thu Sep 12 20:24:39 2002 +++ b/include/linux/mmzone.h Thu Sep 12 20:24:39 2002 @@ -108,6 +108,8 @@ unsigned long wait_table_size; unsigned long wait_table_bits; + wait_queue_head_t kswapd_wait; + /* * Discontig memory support fields. 
*/ diff -Nru a/include/linux/swap.h b/include/linux/swap.h --- a/include/linux/swap.h Thu Sep 12 20:24:39 2002 +++ b/include/linux/swap.h Thu Sep 12 20:24:39 2002 @@ -162,7 +162,6 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern wait_queue_head_t kswapd_wait; extern int try_to_free_pages(struct zone *, unsigned int, unsigned int); /* linux/mm/page_io.c */ diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Thu Sep 12 20:24:39 2002 +++ b/mm/page_alloc.c Thu Sep 12 20:24:39 2002 @@ -345,8 +345,15 @@ classzone->need_balance = 1; mb(); /* we're somewhat low on memory, failed to find what we needed */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + /* We don't want to go swapping on zones that aren't actually + * low. This accounts for "incremental min" from last loop */ + if (z->free_pages <= z->pages_low && + waitqueue_active(&z->kswapd_wait)) + wake_up_interruptible(&z->kswapd_wait); + } /* Go through the zonelist again, taking __GFP_HIGH into account */ min = 1UL << order; @@ -874,6 +881,8 @@ for(i = 0; i < zone->wait_table_size; ++i) init_waitqueue_head(zone->wait_table + i); + init_waitqueue_head(&zone->kswapd_wait); + pgdat->nr_zones = j+1; mask = (realsize / zone_balance_ratio[j]); diff -Nru a/mm/vmscan.c b/mm/vmscan.c --- a/mm/vmscan.c Thu Sep 12 20:24:39 2002 +++ b/mm/vmscan.c Thu Sep 12 20:24:39 2002 @@ -713,8 +713,6 @@ return 0; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - static int check_classzone_need_balance(struct zone *classzone) { struct zone *first_classzone; @@ -728,71 +726,33 @@ return 1; } -static int kswapd_balance_pgdat(pg_data_t * pgdat) +static int kswapd_balance_zone(struct zone *zone) { - int need_more_balance = 0, i; - struct zone *zone; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; + int need_more_balance = 0; + + do { cond_resched(); if (!zone->need_balance) - continue; + 
break; if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); - continue; + break; } if (check_classzone_need_balance(zone)) need_more_balance = 1; else zone->need_balance = 0; - } - - return need_more_balance; -} - -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->pgdat_next)); } while (need_more_balance); -} -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - struct zone *zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; - } - - return 1; + return 0; } -static int kswapd_can_sleep(void) +static int kswapd_can_sleep_zone(struct zone *zone) { - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->pgdat_next)); - + if (zone->need_balance) + return 0; return 1; } @@ -809,13 +769,18 @@ * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. 
*/ -int kswapd(void *unused) +int kswapd_zone(void *p) { + struct zone *zone = (struct zone *)p; struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + + printk( "kswapd%d starting for %s\n", + zone - zone->zone_pgdat->node_zones, + zone->name); daemonize(); - strcpy(tsk->comm, "kswapd"); + sprintf(tsk->comm, "kswapd%d", zone - zone->zone_pgdat->node_zones); sigfillset(&tsk->blocked); /* @@ -839,30 +804,37 @@ if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + add_wait_queue(&zone->kswapd_wait, &wait); mb(); - if (kswapd_can_sleep()) + if (kswapd_can_sleep_zone(zone)) schedule(); __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + remove_wait_queue(&zone->kswapd_wait, &wait); /* * If we actually get into a low-memory situation, * the processes needing more memory will wake us * up on a more timely basis. */ - kswapd_balance(); + kswapd_balance_zone(zone); blk_run_queues(); } } static int __init kswapd_init(void) { + struct zone* zone; + printk("Starting kswapd\n"); swap_setup(); - kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + for_each_zone(zone) + if (zone->size) + kernel_thread(kswapd_zone, + zone, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + return 0; }

- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sun Sep 15 2002 - 22:00:31 EST