memory hotremove prototype, take 3

From: IWAMOTO Toshihiro
Date: Sun Nov 30 2003 - 22:43:46 EST


Hi,

This is a new version of my memory hotplug prototype patch, against
linux-2.6.0-test11.

Freeing 100% of a specified memory zone is non-trivial, and it is
necessary for memory hot removal.  This patch splits memory into 1GB
zones and implements complete freeing of a zone's memory using either
kswapd or page "remapping".

A more detailed explanation and some test scripts are available at:
http://people.valinux.co.jp/~iwamoto/mh.html
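
For reference, here is a minimal user-space sketch of driving the
/proc/memhotplug control file that this patch adds.  It is not one of
the scripts above; the command strings ("disable", "remap", "purge",
"enable") match mhtest_write() in the patch below, while the zone
index and the exact sequence used here are only illustrative.

#include <stdio.h>
#include <stdlib.h>

/* Write one command line, e.g. "disable 3", to /proc/memhotplug. */
static void mh_cmd(const char *cmd, int zone)
{
	FILE *f = fopen("/proc/memhotplug", "w");

	if (f == NULL) {
		perror("/proc/memhotplug");
		exit(1);
	}
	fprintf(f, "%s %d\n", cmd, zone);
	fclose(f);
}

int main(void)
{
	int zone = 3;	/* placeholder; pick a zone listed when
			   reading /proc/memhotplug */

	mh_cmd("disable", zone);	/* stop new allocations from the zone */
	mh_cmd("purge", zone);		/* wake kswapd, drain per-cpu pages */
	mh_cmd("remap", zone);		/* start remapd to move remaining pages */
	/* reading /proc/memhotplug shows per-zone free/active/present counts */
	return 0;
}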

The main changes from the previous versions are:
- Stability is greatly improved.  Kernel crashes (probably related
to kswapd) still happen, but they are now rare enough that I am
having difficulty reproducing them.
Page remapping works under a simultaneous tar + rm -rf workload.
- Implemented a solution to a deadlock caused by ext2_rename, which
increments the refcount of a directory page twice.

Questions and comments are welcome.
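
To make the PG_again hunks in mm/filemap.c and mm/truncate.c easier to
follow, here is the retry pattern in isolation: when remap_onepage()
has to back out (the unmapfail path), the replacement page it had
already inserted into the radix tree is marked PG_again, and anyone
who finds a PG_again page drops it and repeats the lookup so they pick
up whatever is now in the page cache.  This is a kernel-context sketch
only; the helper name is made up, and the patch open-codes the check
at each lookup site.

#include <linux/pagemap.h>	/* find_lock_page() */
#include <linux/mm.h>		/* page_cache_release() */
#include <linux/page-flags.h>	/* PageAgain(), added by this patch */

static struct page *find_lock_page_again(struct address_space *mapping,
					 unsigned long index)
{
	struct page *page;

repeat:
	page = find_lock_page(mapping, index);
	if (page != NULL && PageAgain(page)) {
		/* withdrawn by remap_onepage(); retry to get the live page */
		unlock_page(page);
		page_cache_release(page);
		goto repeat;
	}
	return page;
}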

$Id: memoryhotplug.patch,v 1.26 2003/11/28 09:12:12 iwamoto Exp $

diff -dpur linux-2.6.0-test11/arch/i386/Kconfig linux-2.6.0-test11-mh/arch/i386/Kconfig
--- linux-2.6.0-test11/arch/i386/Kconfig Thu Nov 27 05:43:07 2003
+++ linux-2.6.0-test11-mh/arch/i386/Kconfig Fri Nov 28 17:45:42 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)

+config MEMHOTPLUGTEST
+ bool "Memory hotplug test"
+ default n
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y

config HAVE_ARCH_BOOTMEM_NODE
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUGTEST
default y

config HIGHPTE
diff -dpur linux-2.6.0-test11/arch/i386/mm/discontig.c linux-2.6.0-test11-mh/arch/i386/mm/discontig.c
--- linux-2.6.0-test11/arch/i386/mm/discontig.c Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/arch/i386/mm/discontig.c Fri Nov 28 17:45:42 2003
@@ -28,6 +28,7 @@
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
+#include <linux/proc_fs.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
@@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void)
return 1;
}

+int __init get_memcfg_numa_blks(void)
+{
+ int i, pfn;
+
+ printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ find_max_pfn();
+ if (max_pfn & (PTRS_PER_PTE - 1)) {
+ pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+ printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn);
+ max_pfn = pfn;
+ }
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ pfn = PFN_DOWN(1 << 30) * i;
+ node_start_pfn[i] = pfn;
+ pfn += PFN_DOWN(1 << 30);
+ if (pfn < max_pfn)
+ node_end_pfn[i] = pfn;
+ else {
+ node_end_pfn[i] = max_pfn;
+ i++;
+ printk("total %d blocks, max %d\n", i, max_pfn);
+ break;
+ }
+ }
+
+ /* Fill in the physnode_map with our simplistic memory model,
+ * all memory is in node 0.
+ */
+ for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+ pfn += PAGES_PER_ELEMENT)
+ {
+ physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+ }
+
+ /* Indicate there is one node available. */
+ node_set_online(0);
+ numnodes = i;
+
+ return 1;
+}
+
/*
* Find the highest page frame number we have available for the node
*/
@@ -183,6 +227,8 @@ static void __init register_bootmem_low_
}
}

+static struct kcore_list numa_kc;
+
void __init remap_numa_kva(void)
{
void *vaddr;
@@ -196,7 +242,11 @@ void __init remap_numa_kva(void)
node_remap_start_pfn[node] + pfn,
PAGE_KERNEL_LARGE);
}
+ memset(node_remap_start_vaddr[node], 0,
+ node_remap_size[node] * PAGE_SIZE);
}
+ kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+ node_remap_offset[numnodes - 1] << PAGE_SHIFT);
}

static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0-test11/include/asm-i386/kmap_types.h linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h
--- linux-2.6.0-test11/include/asm-i386/kmap_types.h Thu Nov 27 05:44:56 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h Fri Nov 28 17:52:08 2003
@@ -24,7 +24,13 @@ D(10) KM_IRQ0,
D(11) KM_IRQ1,
D(12) KM_SOFTIRQ0,
D(13) KM_SOFTIRQ1,
+#ifdef CONFIG_MEMHOTPLUGTEST
+D(14) KM_REMAP0,
+D(15) KM_REMAP1,
+D(16) KM_TYPE_NR,
+#else
D(14) KM_TYPE_NR
+#endif
};

#undef D
diff -dpur linux-2.6.0-test11/include/asm-i386/mmzone.h linux-2.6.0-test11-mh/include/asm-i386/mmzone.h
--- linux-2.6.0-test11/include/asm-i386/mmzone.h Thu Nov 27 05:44:10 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/mmzone.h Fri Nov 28 17:45:42 2003
@@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to
#endif /* CONFIG_X86_NUMAQ */

extern int get_memcfg_numa_flat(void );
+#ifdef CONFIG_MEMHOTPLUGTEST
+extern int get_memcfg_numa_blks(void);
+#endif
+
/*
* This allows any one NUMA architecture to be compiled
* for, and still fall back to the flat function if it
@@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void)
return;
#endif

+#ifdef CONFIG_MEMHOTPLUGTEST
+ get_memcfg_numa_blks();
+ return;
+#endif
get_memcfg_numa_flat();
}

diff -dpur linux-2.6.0-test11/include/asm-i386/numnodes.h linux-2.6.0-test11-mh/include/asm-i386/numnodes.h
--- linux-2.6.0-test11/include/asm-i386/numnodes.h Thu Nov 27 05:43:09 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/numnodes.h Fri Nov 28 17:45:42 2003
@@ -13,6 +13,10 @@
/* Max 8 Nodes */
#define NODES_SHIFT 3

+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT 3
+
#endif /* CONFIG_X86_NUMAQ */

#endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0-test11/include/linux/mm.h linux-2.6.0-test11-mh/include/linux/mm.h
--- linux-2.6.0-test11/include/linux/mm.h Thu Nov 27 05:42:55 2003
+++ linux-2.6.0-test11-mh/include/linux/mm.h Fri Nov 28 17:45:42 2003
@@ -219,7 +219,14 @@ struct page {
*/
#define put_page_testzero(p) \
({ \
- BUG_ON(page_count(p) == 0); \
+ if (page_count(p) == 0) { \
+ int i; \
+ printk("Page: %lx ", (long)p); \
+ for(i = 0; i < sizeof(struct page); i++) \
+ printk(" %02x", ((unsigned char *)p)[i]); \
+ printk("\n"); \
+ BUG(); \
+ } \
atomic_dec_and_test(&(p)->count); \
})

diff -dpur linux-2.6.0-test11/include/linux/mmzone.h linux-2.6.0-test11-mh/include/linux/mmzone.h
--- linux-2.6.0-test11/include/linux/mmzone.h Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/include/linux/mmzone.h Fri Nov 28 17:45:42 2003
@@ -360,6 +360,10 @@ static inline unsigned int num_online_me
return num;
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */

#define node_online(node) \
diff -dpur linux-2.6.0-test11/include/linux/page-flags.h linux-2.6.0-test11-mh/include/linux/page-flags.h
--- linux-2.6.0-test11/include/linux/page-flags.h Thu Nov 27 05:44:52 2003
+++ linux-2.6.0-test11-mh/include/linux/page-flags.h Fri Nov 28 17:45:42 2003
@@ -76,6 +76,8 @@
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */

+#define PG_again 20
+

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -268,6 +270,10 @@ extern void get_full_page_state(struct p
#define PageCompound(page) test_bit(PG_compound, &(page)->flags)
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
+
+#define PageAgain(page) test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page) set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page) clear_bit(PG_again, &(page)->flags)

/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
diff -dpur linux-2.6.0-test11/mm/filemap.c linux-2.6.0-test11-mh/mm/filemap.c
--- linux-2.6.0-test11/mm/filemap.c Thu Nov 27 05:43:33 2003
+++ linux-2.6.0-test11-mh/mm/filemap.c Fri Nov 28 17:45:42 2003
@@ -448,7 +448,8 @@ repeat:
spin_lock(&mapping->page_lock);

/* Has the page been truncated while we slept? */
- if (page->mapping != mapping || page->index != offset) {
+ if (page->mapping != mapping || page->index != offset ||
+ PageAgain(page)) {
unlock_page(page);
page_cache_release(page);
goto repeat;
@@ -677,6 +678,12 @@ page_not_up_to_date:
goto page_ok;
}

+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+
readpage:
/* ... and start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
@@ -1120,6 +1127,12 @@ page_not_uptodate:
goto success;
}

+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry_find;
+ }
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1228,6 +1241,12 @@ page_not_uptodate:
goto success;
}

+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry_find;
+ }
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1436,6 +1455,11 @@ retry:
if (PageUptodate(page)) {
unlock_page(page);
goto out;
+ }
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry;
}
err = filler(data, page);
if (err < 0) {
diff -dpur linux-2.6.0-test11/mm/page_alloc.c linux-2.6.0-test11-mh/mm/page_alloc.c
--- linux-2.6.0-test11/mm/page_alloc.c Thu Nov 27 05:42:56 2003
+++ linux-2.6.0-test11-mh/mm/page_alloc.c Fri Nov 28 17:45:42 2003
@@ -31,6 +31,7 @@
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/proc_fs.h>

#include <asm/tlbflush.h>

@@ -52,6 +53,9 @@ EXPORT_SYMBOL(nr_swap_pages);
*/
struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+#endif

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -411,7 +415,9 @@ int is_head_of_free_region(struct page *
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
+#endif

+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
@@ -512,9 +518,28 @@ static struct page *buffered_rmqueue(str
mod_page_state(pgalloc, 1 << order);
prep_new_page(page, order);
}
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (page != NULL && ! zone_active[page->flags >> ZONE_SHIFT])
+ printk("alloc_page from disabled zone: %p\n", page);
+#endif
return page;
}

+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+ int i;
+
+ for(i = 0; ; i++) {
+ if (zone_table[i] == z)
+ return zone_active[i];
+ if (zone_table[i] == NULL)
+ BUG();
+ }
+}
+#endif
+
/*
* This is the 'heart' of the zoned buddy allocator.
*
@@ -562,6 +587,10 @@ __alloc_pages(unsigned int gfp_mask, uns
struct zone *z = zones[i];
unsigned long local_low;

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
@@ -590,6 +619,10 @@ __alloc_pages(unsigned int gfp_mask, uns
unsigned long local_min;
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
@@ -613,6 +646,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
page = buffered_rmqueue(z, order, cold);
if (page)
goto got_pg;
@@ -638,6 +675,10 @@ rebalance:
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];

+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(z))
+ continue;
+#endif
min += z->pages_min;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
@@ -1076,6 +1117,9 @@ static int __init build_zonelists_node(p
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct zone *zone;
+#endif

local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
@@ -1091,7 +1135,7 @@ static void __init build_zonelists(pg_da
k = ZONE_HIGHMEM;
if (i & __GFP_DMA)
k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1107,6 +1151,23 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);

zonelist->zones[j++] = NULL;
+#else
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ for (node = local_node + 1; node < numnodes; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ for (node = 0; node < local_node; node++) {
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ }
+ }
+#endif
}
}

@@ -1252,6 +1313,9 @@ static void __init free_area_init_core(s
unsigned long batch;

zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1644,3 +1708,145 @@ int min_free_kbytes_sysctl_handler(ctl_t
setup_per_zone_pages_min();
return 0;
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int mhtest_read(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ char *p;
+ int i, len;
+ const struct zone *z;
+
+ p = page;
+ for(i = 0; ; i++) {
+ z = zone_table[i];
+ if (z == NULL)
+ break;
+ if (! z->present_pages)
+ /* skip empty zone */
+ continue;
+ len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i,
+ zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+ z->present_pages);
+ p += len;
+ }
+ len = p - page;
+
+ if (len <= off + count)
+ *eof = 1;
+ *start = page + off;
+ len -= off;
+ if (len < 0)
+ len = 0;
+ if (len > count)
+ len = count;
+
+ return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ unsigned long idx;
+ char buf[64], *p;
+ int i;
+ struct list_head *l;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ p = strchr(buf, ' ');
+ if (p == NULL)
+ goto out;
+
+ *p++ = '\0';
+ idx = simple_strtoul(p, NULL, 0);
+
+ if (idx > MAX_NR_ZONES*MAX_NUMNODES) {
+ printk("Argument out of range\n");
+ goto out;
+ }
+ if (strcmp(buf, "disable") == 0) {
+ printk("disable %d\n", idx);
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = pcp->high = 0;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->low = pcp->high = 0;
+ }
+ zone_active[idx] = 0;
+ zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+ } else if (strcmp(buf, "purge") == 0) {
+ if (zone_active[idx])
+ printk("Zone %d still active (proceeding anyway)\n",
+ idx);
+ printk("purge %d\n", idx);
+ wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+ /* XXX overkill, but who cares? */
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ } else if (strcmp(buf, "enable") == 0) {
+ printk("enable %d\n", idx);
+ zone_active[idx] = 1;
+ zone_table[idx]->pages_high =
+ zone_table[idx]->pages_min * 3;
+ /* XXX */
+ for (i = 0; i < NR_CPUS; i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */
+ pcp->low = 2 * pcp->batch;
+ pcp->high = 6 * pcp->batch;
+
+ pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */
+ pcp->high = 2 * pcp->batch;
+ }
+ } else if (strcmp(buf, "remap") == 0) {
+ on_each_cpu(drain_local_pages, NULL, 1, 1);
+ kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+ } else if (strcmp(buf, "active") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ spin_lock_irq(&zone_table[idx]->lru_lock);
+ i = 0;
+ list_for_each(l, &zone_table[idx]->active_list) {
+ printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+ i++;
+ if (i == 10)
+ break;
+ }
+ spin_unlock_irq(&zone_table[idx]->lru_lock);
+ printk("\n");
+ } else if (strcmp(buf, "inuse") == 0) {
+ if (zone_table[idx] == NULL)
+ goto out;
+ for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+ if (page_count(&zone_table[idx]->zone_mem_map[i]))
+ printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+ printk("\n");
+ }
+out:
+ return count;
+}
+
+static int __init procmhtest_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("memhotplug", 0, NULL);
+ if (entry == NULL)
+ return -1;
+
+ entry->read_proc = &mhtest_read;
+ entry->write_proc = &mhtest_write;
+ return 0;
+}
+__initcall(procmhtest_init);
+#endif
diff -dpur linux-2.6.0-test11/mm/shmem.c linux-2.6.0-test11-mh/mm/shmem.c
--- linux-2.6.0-test11/mm/shmem.c Thu Nov 27 05:43:41 2003
+++ linux-2.6.0-test11-mh/mm/shmem.c Fri Nov 28 17:45:42 2003
@@ -80,7 +80,15 @@ static inline struct page *shmem_dir_all
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
*/
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+ printk("shmem_dir_alloc: %lx\n", (unsigned long)p);
+ return p;
+#else
+ return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
}

static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0-test11/mm/truncate.c linux-2.6.0-test11-mh/mm/truncate.c
--- linux-2.6.0-test11/mm/truncate.c Thu Nov 27 05:45:39 2003
+++ linux-2.6.0-test11-mh/mm/truncate.c Fri Nov 28 17:45:42 2003
@@ -132,6 +132,10 @@ void truncate_inode_pages(struct address
next++;
if (TestSetPageLocked(page))
continue;
+ if (PageAgain(page)) {
+ unlock_page(page);
+ continue;
+ }
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -165,6 +169,14 @@ void truncate_inode_pages(struct address
struct page *page = pvec.pages[i];

lock_page(page);
+ if (PageAgain(page)) {
+ unsigned long index = page->index;
+
+ unlock_page(page);
+ put_page(page);
+ page = find_lock_page(mapping, index);
+ pvec.pages[i] = page;
+ }
wait_on_page_writeback(page);
if (page->index > next)
next = page->index;
@@ -255,6 +267,14 @@ void invalidate_inode_pages2(struct addr
struct page *page = pvec.pages[i];

lock_page(page);
+ if (PageAgain(page)) {
+ unsigned long index = page->index;
+
+ unlock_page(page);
+ put_page(page);
+ page = find_lock_page(mapping, index);
+ pvec.pages[i] = page;
+ }
if (page->mapping == mapping) { /* truncate race? */
wait_on_page_writeback(page);
next = page->index + 1;
diff -dpur linux-2.6.0-test11/mm/vmalloc.c linux-2.6.0-test11-mh/mm/vmalloc.c
--- linux-2.6.0-test11/mm/vmalloc.c Thu Nov 27 05:44:23 2003
+++ linux-2.6.0-test11-mh/mm/vmalloc.c Fri Nov 28 17:45:42 2003
@@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
+#ifdef CONFIG_MEMHOTPLUGTEST
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
}

EXPORT_SYMBOL(vmalloc);
diff -dpur linux-2.6.0-test11/mm/vmscan.c linux-2.6.0-test11-mh/mm/vmscan.c
--- linux-2.6.0-test11/mm/vmscan.c Thu Nov 27 05:43:06 2003
+++ linux-2.6.0-test11-mh/mm/vmscan.c Fri Nov 28 17:55:35 2003
@@ -36,6 +36,9 @@
#include <asm/div64.h>

#include <linux/swapops.h>
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif

/*
* The "priority" of VM scanning is how much of the queues we will scan in one
@@ -285,6 +288,8 @@ shrink_list(struct list_head *page_list,
goto keep_locked;

pte_chain_lock(page);
+ if ((! zone_activep(page_zone(page))) && page_mapped(page))
+ page_referenced(page);
referenced = page_referenced(page);
if (referenced && page_mapping_inuse(page)) {
/* In active use or really unfreeable. Activate it. */
@@ -589,7 +594,7 @@ done:
* But we had to alter page->flags anyway.
*/
static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+refill_inactive_zone(struct zone *zone, int nr_pages_in,
struct page_state *ps, int priority)
{
int pgmoved;
@@ -607,6 +612,12 @@ refill_inactive_zone(struct zone *zone,

lru_add_drain();
pgmoved = 0;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone)) {
+ nr_pages = nr_pages_in = zone->present_pages - zone->free_pages;
+ printk("Purging active list of disabled zone\n");
+ }
+#endif
spin_lock_irq(&zone->lru_lock);
while (nr_pages && !list_empty(&zone->active_list)) {
page = list_entry(zone->active_list.prev, struct page, lru);
@@ -658,12 +669,20 @@ refill_inactive_zone(struct zone *zone,
*/
if (swap_tendency >= 100)
reclaim_mapped = 1;
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ reclaim_mapped = 1;
+#endif

while (!list_empty(&l_hold)) {
page = list_entry(l_hold.prev, struct page, lru);
list_del(&page->lru);
if (page_mapped(page)) {
pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ page_referenced(page); /* XXX */
+#endif
if (page_mapped(page) && page_referenced(page)) {
pte_chain_unlock(page);
list_add(&page->lru, &l_active);
@@ -767,6 +786,11 @@ shrink_zone(struct zone *zone, int max_s
ratio = (unsigned long)nr_pages * zone->nr_active /
((zone->nr_inactive | 1) * 2);
atomic_add(ratio+1, &zone->refill_counter);
+#ifdef CONFIG_MEMHOTPLUGTEST
+ if (! zone_activep(zone))
+ /* XXX */
+ atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter);
+#endif
if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
int count;

@@ -1048,6 +1072,439 @@ int kswapd(void *p)
balance_pgdat(pgdat, 0, &ps);
}
}
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void
+print_buffer(struct page* page)
+{
+ struct address_space* mapping = page->mapping;
+ struct buffer_head *bh, *head;
+
+ spin_lock(&mapping->private_lock);
+ bh = head = page_buffers(page);
+ printk("buffers:");
+ do {
+ printk(" %lx %d\n", bh->b_state, atomic_read(&bh->b_count));
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ printk("\n");
+ spin_unlock(&mapping->private_lock);
+}
+/* try to remap a page. returns non-zero on failure */
+int remap_onepage(struct page *page)
+{
+ struct page *newpage;
+ struct zone *zone;
+ struct address_space *mapping = page->mapping;
+ char *np, *op;
+ void *p;
+ int waitcnt, error = -1;
+
+ newpage = alloc_page(GFP_HIGHUSER);
+ if (newpage == NULL)
+ return -ENOMEM;
+ if (TestSetPageLocked(newpage))
+ BUG();
+ lock_page(page);
+
+ if (! PagePrivate(page) && PageWriteback(page))
+#ifdef CONFIG_KDB
+ KDB_ENTER();
+#else
+ BUG();
+#endif
+ if (PagePrivate(page)) {
+ waitcnt = 100;
+ while (PageWriteback(page)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ if (! --waitcnt)
+ goto radixfail;
+ }
+
+ /* XXX copied from shrink_list() */
+ if (PageDirty(page) &&
+ is_page_cache_freeable(page) &&
+ mapping != NULL &&
+ mapping->a_ops->writepage != NULL) {
+ spin_lock(&mapping->page_lock);
+ if (test_clear_page_dirty(page)) {
+ int res;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1,
+ };
+
+ list_move(&page->list, &mapping->locked_pages);
+ spin_unlock(&mapping->page_lock);
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+
+ if (res == WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ goto radixfail;
+ }
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ lock_page(page);
+ if (! PagePrivate(page))
+ goto bufferdone;
+ } else
+ spin_unlock(&mapping->page_lock);
+ }
+
+ waitcnt = 100;
+ while (1) {
+ if (try_to_release_page(page, GFP_KERNEL))
+ break;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ if (! --waitcnt) {
+ print_buffer(page);
+ goto radixfail;
+ }
+ }
+ }
+bufferdone:
+ if (mapping == NULL) {
+ /* The page is an anon page. Allocate swap entry. */
+ if (!add_to_swap(page))
+ goto radixfail;
+ mapping = page->mapping;
+ }
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ goto radixfail;
+ if (PagePrivate(page)) /* XXX */
+ BUG();
+
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock(&mapping->page_lock);
+ if (mapping != page->mapping)
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, page->mapping, page);
+ if (radix_tree_delete(&mapping->page_tree, page->index) == NULL) {
+ /* Page truncated. */
+ spin_unlock(&mapping->page_lock);
+ radix_tree_preload_end();
+ goto radixfail;
+ }
+ /* don't __put_page(page) here. truncate may be in progress */
+ newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+ ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+ ~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);
+
+ /* list_del(&page->list); XXX */
+ radix_tree_insert(&mapping->page_tree, page->index, newpage);
+ page_cache_get(newpage);
+ newpage->mapping = mapping;
+ newpage->index = page->index;
+ if (PageDirty(page))
+ list_add(&newpage->list, &mapping->dirty_pages);
+ else
+ list_add(&newpage->list, &mapping->clean_pages);
+ spin_unlock(&mapping->page_lock);
+ radix_tree_preload_end();
+
+ pte_chain_lock(page);
+ if (page_mapped(page)) {
+ while ((error = try_to_unmap(page)) == SWAP_AGAIN) {
+ pte_chain_unlock(page);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ __set_current_state(TASK_RUNNING);
+ pte_chain_lock(page);
+ }
+ if (error == SWAP_FAIL) {
+ pte_chain_unlock(page); /* XXX */
+ /* either during mremap or mlocked */
+ goto unmapfail;
+ }
+ }
+ pte_chain_unlock(page);
+ if (PagePrivate(page))
+ printk("buffer reappeared\n");
+
+ unlock_page(page); /* no lock needed while waiting page count */
+
+ waitcnt = 1;
+wait_again:
+ while (page_count(page) > 2) {
+ waitcnt++;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if ((waitcnt % 5000) == 0) {
+ printk("remap_onepage: still waiting on %p %d\n", page, waitcnt);
+ break;
+ }
+ if (PagePrivate(page))
+ break; /* see below */
+ }
+
+ lock_page(page);
+ BUG_ON(page_count(page) == 0);
+ if (PagePrivate(page))
+ try_to_release_page(page, GFP_KERNEL);
+ if (page_count(page) > 2) {
+ if (waitcnt > 50000)
+ goto unmapfail;
+ unlock_page(page);
+ goto wait_again;
+ }
+ if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+ KDB_ENTER();
+#else
+ BUG();
+#endif
+ if (page_count(page) == 1) {
+ /* page has been truncated. free both pages. */
+ spin_lock(&mapping->page_lock);
+ p = radix_tree_lookup(&mapping->page_tree, newpage->index);
+ if (p != NULL) {
+ /* new cache page appeared after truncation */
+ printk("page %p newpage %p radix %p\n",
+ page, newpage, p);
+ BUG_ON(p == newpage);
+ }
+ list_del(&newpage->list);
+ put_page(newpage);
+ if (page_count(newpage) != 1) {
+ printk("newpage count %d != 1, %p\n",
+ page_count(newpage), newpage);
+ BUG();
+ }
+ /* No need to do page->list. remove_from_page_cache did. */
+ newpage->mapping = page->mapping = NULL;
+ spin_unlock(&mapping->page_lock);
+ ClearPageActive(page);
+ ClearPageActive(newpage);
+ unlock_page(page);
+ unlock_page(newpage);
+ put_page(page);
+ put_page(newpage);
+ return 0;
+ }
+
+ spin_lock(&mapping->page_lock);
+ list_del(&page->list); /* XXX */
+ page->mapping = NULL;
+ spin_unlock(&mapping->page_lock);
+ unlock_page(page);
+
+ np = kmap_atomic(newpage, KM_REMAP0);
+ op = kmap_atomic(page, KM_REMAP1);
+ if (np == NULL || op == NULL) { /* XXX */
+ printk("%p %p %p %p\n", np, op, newpage, page);
+ BUG();
+ }
+ memcpy(np, op, PAGE_SIZE);
+ kunmap_atomic(page, KM_REMAP1);
+ kunmap_atomic(newpage, KM_REMAP0);
+ ClearPageActive(page);
+ __put_page(page);
+ put_page(page);
+
+ /* We are done. Finish and let the waiters run. */
+ SetPageUptodate(newpage);
+ /* XXX locking order correct? */
+ zone = page_zone(newpage);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(newpage)) {
+ list_add(&newpage->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&newpage->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ SetPageLRU(newpage);
+ spin_unlock_irq(&zone->lru_lock);
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ return 0;
+
+unmapfail:
+ /*
+ * Try to unwind by notifying waiters. If someone misbehaves,
+ * we die.
+ */
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ BUG();
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock(&mapping->page_lock);
+ /* list_del(&newpage->list); */
+ if (radix_tree_delete(&mapping->page_tree, page->index) == NULL)
+ /* Hold extra count to handle truncate */
+ page_cache_get(newpage);
+ radix_tree_insert(&mapping->page_tree, page->index, page);
+ /* no page_cache_get(page); needed */
+ radix_tree_preload_end();
+ spin_unlock(&mapping->page_lock);
+
+ SetPageAgain(newpage);
+ /* XXX unmap needed? No, it shouldn't. Handled by fault handlers. */
+ unlock_page(newpage);
+
+ waitcnt = 1;
+ for(; page_count(newpage) > 2; waitcnt++) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if ((waitcnt % 10000) == 0) {
+ printk("You are hosed.\n");
+ printk("newpage %p\n", newpage);
+ BUG();
+ }
+ }
+ BUG_ON(PageUptodate(newpage));
+ ClearPageDirty(newpage);
+ ClearPageActive(newpage);
+ spin_lock(&mapping->page_lock);
+ newpage->mapping = NULL;
+ if (page_count(newpage) == 1) {
+ printk("newpage %p truncated. page %p\n", newpage, page);
+ BUG();
+ }
+ list_del(&newpage->list);
+ spin_unlock(&mapping->page_lock);
+ unlock_page(page);
+ __put_page(newpage);
+ __free_page(newpage);
+ return 1;
+
+radixfail:
+ unlock_page(page);
+ unlock_page(newpage);
+ __free_page(newpage);
+ return 1;
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+ int cpu = get_cpu();
+
+ schedule_work(&lru_drain_wq[cpu]);
+ put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+ struct zone *zone = p;
+ struct page *page, *page1;
+ struct list_head *l;
+ int active, i, nr_failed = 0;
+ int fastmode = 100;
+ LIST_HEAD(failedp);
+
+ daemonize("remap%d", zone->zone_start_pfn);
+ if (atomic_read(&remapd_count) > 0) {
+ printk("remapd already running\n");
+ return 0;
+ }
+ atomic_inc(&remapd_count);
+ on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+ while(nr_failed < 100) {
+ spin_lock_irq(&zone->lru_lock);
+ for(active = 0; active < 2; active++) {
+ l = active ? &zone->active_list :
+ &zone->inactive_list;
+ for(i = 0; ! list_empty(l) && i < 10; i++) {
+ page = list_entry(l->prev, struct page, lru);
+ if (fastmode && PageLocked(page)) {
+ page1 = page;
+ while (fastmode && PageLocked(page)) {
+ page =
+ list_entry(page->lru.prev,
+ struct page, lru);
+ fastmode--;
+ if (&page->lru == l) {
+ /* scanned the whole
+ list */
+ page = page1;
+ break;
+ }
+ if (page == page1)
+ BUG();
+ }
+ if (! fastmode) {
+ printk("used up fastmode\n");
+ page = page1;
+ }
+ }
+ if (! TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (page_count(page) == 0) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ SetPageLRU(page);
+ list_add(&page->lru, l);
+ continue;
+ }
+ if (active)
+ zone->nr_active--;
+ else
+ zone->nr_inactive--;
+ page_cache_get(page);
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ break;
+
+ got_page:
+ if (remap_onepage(page)) {
+ nr_failed++;
+ list_add(&page->lru, &failedp);
+ }
+ }
+ if (list_empty(&failedp))
+ goto out;
+
+ while (! list_empty(&failedp)) {
+ spin_lock_irq(&zone->lru_lock);
+ page = list_entry(failedp.prev, struct page, lru);
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ if (TestSetPageLRU(page))
+ BUG();
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(page);
+ }
+out:
+ atomic_dec(&remapd_count);
+ return 0;
+}
+
+static int __init remapd_init(void)
+{
+ int i;
+
+ for(i = 0; i < NR_CPUS; i++)
+ INIT_WORK(&lru_drain_wq[i], lru_add_drain, NULL);
+ return 0;
+}
+
+module_init(remapd_init);
+#endif

/*
* A zone is low on free memory, so wake its kswapd task to service it.