new memory hotremoval patch

From: IWAMOTO Toshihiro
Date: Wed Jun 30 2004 - 06:19:30 EST


Hi,

This is an updated version of my memory hotremoval patch.
I'm only including the main patch, which contains the page remapping code.
The other two files, which haven't changed much since April, can be
found at http://people.valinux.co.jp/~iwamoto/mh.html.

Page "remapping" is a mechanism to free a specified page by copying the
page content to a newly allocated replacement page and redirecting
references to the original page to the new page.
This was designed to reliably free specified pages, unlike the swapout
code.
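
At a high level the sequence looks roughly like the following.  This is
a condensed sketch of the remap_onepage() path added in mm/memhotplug.c
below; the locking retries, the truncation race and the rollback path
are omitted, so it is not the literal code:

/*
 * Condensed sketch of remap_onepage() (see mm/memhotplug.c below).
 * Error handling, retries and the rollback path are omitted.
 */
static int remap_onepage_sketch(struct page *page, int nodeid)
{
        struct page *newpage = alloc_pages_node(nodeid, GFP_HIGHUSER, 0);
        struct address_space *mapping;
        LIST_HEAD(vlist);               /* mlocked mappings to re-establish */

        lock_page(newpage);
        lock_page(page);
        mapping = page_mapping(page);   /* anon pages get a swap entry first */

        /* 1. redirect new page/swap cache lookups to the replacement page */
        radix_tree_replace_pages(page, newpage, mapping);
        /* 2. remove all PTE references to the old page */
        unmap_page(page, &vlist);
        /* 3. wait for the remaining references to the old page to go away */
        /* 4. copy the contents and publish the replacement */
        copy_highpage(newpage, page);
        SetPageUptodate(newpage);
        unlock_page(newpage);           /* wakes up anyone blocked on newpage */
        page_cache_release(page);       /* the old page is now free */
        return 0;
}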

The patch is against linux-2.6.7 and fully supports objrmap.
Though there seems to be some lock-related lossage, the page remapping
code works fairly well.

Due to the struct page changes, the page->mapping == NULL predicate can
no longer be used to detect cancellation of an anonymous page remapping
operation, so the PG_again bit is in use again.
It may still be possible to kill the PG_again bit, but that is a low
priority.
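
Callers that find PG_again set on a page they looked up simply drop it
and repeat the lookup.  The idiom added at the swap cache lookup sites
(compare the do_swap_page() and try_to_unuse() hunks below) boils down
to this hypothetical helper:

/*
 * Sketch of the lookup-and-retry idiom used with PG_again.  A set
 * PG_again bit on a locked page means the page was withdrawn by a
 * cancelled remapping, so the lookup must be repeated.
 */
static struct page *lookup_swap_page_again(swp_entry_t entry)
{
        struct page *page;

again:
        page = lookup_swap_cache(entry);
        if (page == NULL)
                return NULL;
        lock_page(page);
        if (PageAgain(page)) {
                unlock_page(page);
                page_cache_release(page);
                goto again;
        }
        return page;                    /* returned locked */
}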

I will be working on the following items.

1. Prototype implementation of memsection support.
It seems some people want to hotremove small regions of memory
rather than zones or nodes. A prototype implementation will
show how Takahashi's hugetlb page code can be used for such a
purpose.

2. Handling of pages with dirty buffers without writing them back.
This is file system specific. I plan to implement it for ext2 and
ext3.


My patch supports remapping of normal pages; Takahashi's hugepage
remapping patch will be posted in a few days.
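
For normal pages the work is driven by the remapd() thread in
mm/memhotplug.c, which pulls pages off the target zone's LRU lists and
feeds them to remap_onepage().  A condensed sketch of that loop, with
the open-coded LRU isolation and the failed-page requeueing simplified
away (isolate_lru_page_from() is a hypothetical stand-in):

/*
 * Condensed sketch of the remapd() driver loop.  isolate_lru_page_from()
 * is a hypothetical helper standing in for the open-coded LRU isolation
 * in the real function; failed pages are actually retried and put back
 * on the LRU there.
 */
static int remapd_sketch(struct zone *zone)
{
        struct page *page;
        int nr_failed = 0;

        while (nr_failed < 100) {
                page = isolate_lru_page_from(zone);     /* hypothetical */
                if (page == NULL)
                        break;                          /* zone LRU is empty */
                if (remap_onepage(page, REMAP_ANYNODE, 0, &remap_ops))
                        nr_failed++;
        }
        return 0;
}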


$Id: memoryhotplug.patch,v 1.95 2004/06/30 07:31:37 iwamoto Exp $

--- linux-2.6.7.ORG/arch/i386/Kconfig 2004-06-16 14:18:59.000000000 +0900
+++ linux-2.6.7/arch/i386/Kconfig 2004-06-17 16:31:41.000000000 +0900
@@ -734,9 +734,19 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI)

+config MEMHOTPLUG
+ bool "Memory hotplug test"
+ depends on !X86_PAE
+ default n
+
+config MEMHOTPLUG_BLKSIZE
+ int "Size of a memory hotplug unit (in MB, must be multiple of 256)."
+ range 256 1024
+ depends on MEMHOTPLUG
+
config DISCONTIGMEM
bool
- depends on NUMA
+ depends on NUMA || MEMHOTPLUG
default y

config HAVE_ARCH_BOOTMEM_NODE
--- linux-2.6.7.ORG/include/linux/gfp.h 2004-06-16 14:19:02.000000000 +0900
+++ linux-2.6.7/include/linux/gfp.h 2004-06-17 11:39:11.000000000 +0900
@@ -11,9 +11,10 @@ struct vm_area_struct;
/*
* GFP bitmasks..
*/
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
-#define __GFP_DMA 0x01
-#define __GFP_HIGHMEM 0x02
+/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
+#define __GFP_DMA 0x01
+#define __GFP_HIGHMEM 0x02
+#define __GFP_HOTREMOVABLE 0x03

/*
* Action modifiers - doesn't change the zoning
@@ -51,7 +52,7 @@ struct vm_area_struct;
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_HOTREMOVABLE)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
--- linux-2.6.7.ORG/include/linux/mm.h 2004-06-16 14:18:56.000000000 +0900
+++ linux-2.6.7/include/linux/mm.h 2004-06-17 16:26:50.000000000 +0900
@@ -314,6 +314,11 @@ static inline void put_page(struct page

#endif /* CONFIG_HUGETLB_PAGE */

+static inline int is_page_cache_freeable(struct page *page)
+{
+ return page_count(page) - !!PagePrivate(page) == 2;
+}
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
--- linux-2.6.7.ORG/include/linux/mmzone.h 2004-06-16 14:19:36.000000000 +0900
+++ linux-2.6.7/include/linux/mmzone.h 2004-06-17 11:39:11.000000000 +0900
@@ -65,8 +65,10 @@ struct per_cpu_pageset {
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
+#define ZONE_HOTREMOVABLE 3 /* only for zonelists */

#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
+#define MAX_NR_ZONELISTS 4
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */

#define GFP_ZONEMASK 0x03
@@ -225,7 +227,7 @@ struct zonelist {
struct bootmem_data;
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
- struct zonelist node_zonelists[MAX_NR_ZONES];
+ struct zonelist node_zonelists[MAX_NR_ZONELISTS];
int nr_zones;
struct page *node_mem_map;
struct bootmem_data *bdata;
@@ -237,6 +239,7 @@ typedef struct pglist_data {
struct pglist_data *pgdat_next;
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
+ char removable, enabled;
} pg_data_t;

#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
--- linux-2.6.7.ORG/include/linux/page-flags.h 2004-06-16 14:19:42.000000000 +0900
+++ linux-2.6.7/include/linux/page-flags.h 2004-06-17 11:39:11.000000000 +0900
@@ -78,6 +78,8 @@

#define PG_anon 20 /* Anonymous: anon_vma in mapping */

+#define PG_again 21
+

/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -298,6 +300,10 @@ extern unsigned long __read_page_state(u
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)

+#define PageAgain(page) test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page) set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page) clear_bit(PG_again, &(page)->flags)
+
#define PageAnon(page) test_bit(PG_anon, &(page)->flags)
#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags)
#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags)
--- linux-2.6.7.ORG/include/linux/rmap.h 2004-06-16 14:18:57.000000000 +0900
+++ linux-2.6.7/include/linux/rmap.h 2004-06-17 11:39:11.000000000 +0900
@@ -96,7 +96,7 @@ static inline void page_dup_rmap(struct
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *);
-int try_to_unmap(struct page *);
+int try_to_unmap(struct page *, struct list_head *);

#else /* !CONFIG_MMU */

@@ -105,7 +105,7 @@ int try_to_unmap(struct page *);
#define anon_vma_link(vma) do {} while (0)

#define page_referenced(page) TestClearPageReferenced(page)
-#define try_to_unmap(page) SWAP_FAIL
+#define try_to_unmap(page, force) SWAP_FAIL

#endif /* CONFIG_MMU */

--- linux-2.6.7.ORG/mm/Makefile 2004-06-16 14:19:37.000000000 +0900
+++ linux-2.6.7/mm/Makefile 2004-06-17 11:39:11.000000000 +0900
@@ -15,3 +15,5 @@ obj-y := bootmem.o filemap.o mempool.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
+
+obj-$(CONFIG_MEMHOTPLUG) += memhotplug.o
--- linux-2.6.7.ORG/mm/filemap.c 2004-06-16 14:19:12.000000000 +0900
+++ linux-2.6.7/mm/filemap.c 2004-06-17 11:39:11.000000000 +0900
@@ -250,7 +250,8 @@ int filemap_write_and_wait(struct addres
int add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t offset, int gfp_mask)
{
- int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ int error = radix_tree_preload((gfp_mask & ~GFP_ZONEMASK) |
+ ((gfp_mask & GFP_ZONEMASK) == __GFP_DMA ? __GFP_DMA : 0));

if (error == 0) {
spin_lock_irq(&mapping->tree_lock);
@@ -495,6 +496,7 @@ repeat:
page_cache_release(page);
goto repeat;
}
+ BUG_ON(PageAgain(page));
}
}
spin_unlock_irq(&mapping->tree_lock);
@@ -738,6 +740,8 @@ page_not_up_to_date:
goto page_ok;
}

+ BUG_ON(PageAgain(page));
+
readpage:
/* ... and start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
@@ -1206,6 +1210,8 @@ page_not_uptodate:
goto success;
}

+ BUG_ON(PageAgain(page));
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1314,6 +1320,8 @@ page_not_uptodate:
goto success;
}

+ BUG_ON(PageAgain(page));
+
if (!mapping->a_ops->readpage(file, page)) {
wait_on_page_locked(page);
if (PageUptodate(page))
@@ -1518,6 +1526,8 @@ retry:
unlock_page(page);
goto out;
}
+ BUG_ON(PageAgain(page));
+
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
--- linux-2.6.7.ORG/mm/memory.c 2004-06-16 14:19:22.000000000 +0900
+++ linux-2.6.7/mm/memory.c 2004-06-17 16:26:50.000000000 +0900
@@ -1305,6 +1305,7 @@ static int do_swap_page(struct mm_struct

pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
+again:
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
@@ -1332,6 +1333,12 @@ static int do_swap_page(struct mm_struct

mark_page_accessed(page);
lock_page(page);
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ BUG_ON(PageAgain(page));

/*
* Back out if somebody else faulted in this pte while we
--- linux-2.6.7.ORG/mm/page_alloc.c 2004-06-16 14:18:57.000000000 +0900
+++ linux-2.6.7/mm/page_alloc.c 2004-06-17 16:31:41.000000000 +0900
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
+#include <linux/memhotplug.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
@@ -231,6 +232,7 @@ static inline void free_pages_check(cons
1 << PG_maplock |
1 << PG_anon |
1 << PG_swapcache |
+ 1 << PG_again |
1 << PG_writeback )))
bad_page(function, page);
if (PageDirty(page))
@@ -341,12 +343,13 @@ static void prep_new_page(struct page *p
1 << PG_maplock |
1 << PG_anon |
1 << PG_swapcache |
+ 1 << PG_again |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);

page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
- 1 << PG_checked | 1 << PG_mappedtodisk);
+ 1 << PG_checked | 1 << PG_mappedtodisk | 1 << PG_again);
page->private = 0;
set_page_refs(page, order);
}
@@ -404,7 +407,7 @@ static int rmqueue_bulk(struct zone *zon
return allocated;
}

-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMHOTPLUG)
static void __drain_pages(unsigned int cpu)
{
struct zone *zone;
@@ -447,7 +450,9 @@ int is_head_of_free_region(struct page *
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
+#endif

+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUG)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
@@ -847,7 +852,8 @@ unsigned int nr_free_pages(void)
struct zone *zone;

for_each_zone(zone)
- sum += zone->free_pages;
+ if (zone->zone_pgdat->enabled)
+ sum += zone->free_pages;

return sum;
}
@@ -860,7 +866,8 @@ unsigned int nr_used_zone_pages(void)
struct zone *zone;

for_each_zone(zone)
- pages += zone->nr_active + zone->nr_inactive;
+ if (zone->zone_pgdat->enabled)
+ pages += zone->nr_active + zone->nr_inactive;

return pages;
}
@@ -887,6 +894,8 @@ static unsigned int nr_free_zone_pages(i
struct zone **zonep = zonelist->zones;
struct zone *zone;

+ if (!pgdat->enabled)
+ continue;
for (zone = *zonep++; zone; zone = *zonep++) {
unsigned long size = zone->present_pages;
unsigned long high = zone->pages_high;
@@ -921,7 +930,8 @@ unsigned int nr_free_highpages (void)
unsigned int pages = 0;

for_each_pgdat(pgdat)
- pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ if (pgdat->enabled)
+ pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

return pages;
}
@@ -1171,13 +1181,21 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
+
+ if (!pgdat->enabled)
+ return j;
+ if (k != ZONE_HOTREMOVABLE &&
+ pgdat->removable)
+ return j;
+
switch (k) {
struct zone *zone;
default:
BUG();
case ZONE_HIGHMEM:
+ case ZONE_HOTREMOVABLE:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->present_pages) {
#ifndef CONFIG_HIGHMEM
@@ -1304,24 +1322,48 @@ static void __init build_zonelists(pg_da

#else /* CONFIG_NUMA */

-static void __init build_zonelists(pg_data_t *pgdat)
+static void build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
+ int hotremovable;
+#ifdef CONFIG_MEMHOTPLUG
+ struct zone *zone;
+#endif

local_node = pgdat->node_id;
- for (i = 0; i < MAX_NR_ZONES; i++) {
+ for (i = 0; i < MAX_NR_ZONELISTS; i++) {
struct zonelist *zonelist;

zonelist = pgdat->node_zonelists + i;
- memset(zonelist, 0, sizeof(*zonelist));
+ /* memset(zonelist, 0, sizeof(*zonelist)); */

j = 0;
k = ZONE_NORMAL;
- if (i & __GFP_HIGHMEM)
+ hotremovable = 0;
+ switch (i) {
+ default:
+ BUG();
+ return;
+ case 0:
+ k = ZONE_NORMAL;
+ break;
+ case __GFP_HIGHMEM:
k = ZONE_HIGHMEM;
- if (i & __GFP_DMA)
+ break;
+ case __GFP_DMA:
k = ZONE_DMA;
+ break;
+ case __GFP_HOTREMOVABLE:
+#ifdef CONFIG_MEMHOTPLUG
+ k = ZONE_HIGHMEM;
+#else
+ k = ZONE_HOTREMOVABLE;
+#endif
+ hotremovable = 1;
+ break;
+ }

+#ifndef CONFIG_MEMHOTPLUG
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
@@ -1335,19 +1377,54 @@ static void __init build_zonelists(pg_da
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
for (node = 0; node < local_node; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
-
- zonelist->zones[j] = NULL;
- }
+#else
+ while (hotremovable >= 0) {
+ for(; k >= 0; k--) {
+ zone = pgdat->node_zones + k;
+ for (node = local_node; ;) {
+ if (NODE_DATA(node) == NULL ||
+ !NODE_DATA(node)->enabled ||
+ (!!NODE_DATA(node)->removable) !=
+ (!!hotremovable))
+ goto next;
+ zone = NODE_DATA(node)->node_zones + k;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
+ next:
+ node = (node + 1) % numnodes;
+ if (node == local_node)
+ break;
+ }
+ }
+ if (hotremovable) {
+ /* place non-hotremovable after hotremovable */
+ k = ZONE_HIGHMEM;
+ }
+ hotremovable--;
+ }
+#endif
+ BUG_ON(j > sizeof(zonelist->zones) /
+ sizeof(zonelist->zones[0]) - 1);
+ for(; j < sizeof(zonelist->zones) /
+ sizeof(zonelist->zones[0]); j++)
+ zonelist->zones[j] = NULL;
+ }
}

#endif /* CONFIG_NUMA */

-void __init build_all_zonelists(void)
+#ifdef CONFIG_MEMHOTPLUG
+void
+#else
+void __init
+#endif
+build_all_zonelists(void)
{
int i;

for(i = 0 ; i < numnodes ; i++)
- build_zonelists(NODE_DATA(i));
+ if (NODE_DATA(i) != NULL)
+ build_zonelists(NODE_DATA(i));
printk("Built %i zonelists\n", numnodes);
}

@@ -1419,7 +1496,7 @@ static void __init calculate_zone_totalp
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
+void memmap_init_zone(struct page *start, unsigned long size, int nid,
unsigned long zone, unsigned long start_pfn)
{
struct page *page;
@@ -1457,10 +1534,13 @@ static void __init free_area_init_core(s
int cpu, nid = pgdat->node_id;
struct page *lmem_map = pgdat->node_mem_map;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+#ifdef CONFIG_MEMHOTPLUG
+ int cold = !nid;
+#endif

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
-
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
@@ -1530,6 +1610,13 @@ static void __init free_area_init_core(s
zone->wait_table_size = wait_table_size(size);
zone->wait_table_bits =
wait_table_bits(zone->wait_table_size);
+#ifdef CONFIG_MEMHOTPLUG
+ if (!cold)
+ zone->wait_table = (wait_queue_head_t *)
+ kmalloc(zone->wait_table_size
+ * sizeof(wait_queue_head_t), GFP_KERNEL);
+ else
+#endif
zone->wait_table = (wait_queue_head_t *)
alloc_bootmem_node(pgdat, zone->wait_table_size
* sizeof(wait_queue_head_t));
@@ -1584,6 +1671,13 @@ static void __init free_area_init_core(s
*/
bitmap_size = (size-1) >> (i+4);
bitmap_size = LONG_ALIGN(bitmap_size+1);
+#ifdef CONFIG_MEMHOTPLUG
+ if (!cold) {
+ zone->free_area[i].map =
+ (unsigned long *)kmalloc(bitmap_size, GFP_KERNEL);
+ memset(zone->free_area[i].map, 0, bitmap_size);
+ } else
+#endif
zone->free_area[i].map =
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
@@ -1901,7 +1995,7 @@ static void setup_per_zone_protection(vo
* that the pages_{min,low,high} values for each zone are set correctly
* with respect to min_free_kbytes.
*/
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
--- linux-2.6.7.ORG/mm/rmap.c 2004-06-16 14:20:03.000000000 +0900
+++ linux-2.6.7/mm/rmap.c 2004-06-17 11:39:12.000000000 +0900
@@ -30,6 +30,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/memhotplug.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
@@ -421,7 +422,8 @@ void page_remove_rmap(struct page *page)
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ struct list_head *force)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -429,6 +431,9 @@ static int try_to_unmap_one(struct page
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
+#ifdef CONFIG_MEMHOTPLUG
+ struct page_va_list *vlist;
+#endif
int ret = SWAP_AGAIN;

if (!mm->rss)
@@ -466,8 +471,22 @@ static int try_to_unmap_one(struct page
*/
if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
ptep_test_and_clear_young(pte)) {
- ret = SWAP_FAIL;
- goto out_unmap;
+ if (force == NULL || vma->vm_flags & VM_RESERVED) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+#ifdef CONFIG_MEMHOTPLUG
+ vlist = kmalloc(sizeof(struct page_va_list), GFP_KERNEL);
+ atomic_inc(&mm->mm_count);
+ vlist->mm = mmgrab(mm);
+ if (vlist->mm == NULL) {
+ mmdrop(mm);
+ kfree(vlist);
+ } else {
+ vlist->addr = address;
+ list_add(&vlist->list, force);
+ }
+#endif
}

/*
@@ -620,7 +639,7 @@ out_unlock:
return SWAP_AGAIN;
}

-static inline int try_to_unmap_anon(struct page *page)
+static inline int try_to_unmap_anon(struct page *page, struct list_head *force)
{
struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
struct vm_area_struct *vma;
@@ -629,7 +648,7 @@ static inline int try_to_unmap_anon(stru
spin_lock(&anon_vma->lock);
BUG_ON(list_empty(&anon_vma->head));
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- ret = try_to_unmap_one(page, vma);
+ ret = try_to_unmap_one(page, vma, force);
if (ret == SWAP_FAIL || !page->mapcount)
break;
}
@@ -649,7 +668,7 @@ static inline int try_to_unmap_anon(stru
* The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
* return a temporary error.
*/
-static inline int try_to_unmap_file(struct page *page)
+static inline int try_to_unmap_file(struct page *page, struct list_head *force)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -666,7 +685,7 @@ static inline int try_to_unmap_file(stru

while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
&iter, pgoff, pgoff)) != NULL) {
- ret = try_to_unmap_one(page, vma);
+ ret = try_to_unmap_one(page, vma, force);
if (ret == SWAP_FAIL || !page->mapcount)
goto out;
}
@@ -760,7 +779,7 @@ out:
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_FAIL - the page is unswappable
*/
-int try_to_unmap(struct page *page)
+int try_to_unmap(struct page *page, struct list_head *force)
{
int ret;

@@ -769,9 +788,9 @@ int try_to_unmap(struct page *page)
BUG_ON(!page->mapcount);

if (PageAnon(page))
- ret = try_to_unmap_anon(page);
+ ret = try_to_unmap_anon(page, force);
else
- ret = try_to_unmap_file(page);
+ ret = try_to_unmap_file(page, force);

if (!page->mapcount) {
if (page_test_and_clear_dirty(page))
--- linux-2.6.7.ORG/mm/swapfile.c 2004-06-16 14:19:01.000000000 +0900
+++ linux-2.6.7/mm/swapfile.c 2004-06-17 11:39:12.000000000 +0900
@@ -658,6 +658,7 @@ static int try_to_unuse(unsigned int typ
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
+ again:
page = read_swap_cache_async(entry, NULL, 0);
if (!page) {
/*
@@ -692,6 +693,11 @@ static int try_to_unuse(unsigned int typ
wait_on_page_locked(page);
wait_on_page_writeback(page);
lock_page(page);
+ if (PageAgain(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
wait_on_page_writeback(page);

/*
@@ -800,6 +806,7 @@ static int try_to_unuse(unsigned int typ

swap_writepage(page, &wbc);
lock_page(page);
+ BUG_ON(PageAgain(page));
wait_on_page_writeback(page);
}
if (PageSwapCache(page)) {
--- linux-2.6.7.ORG/mm/truncate.c 2004-06-16 14:20:04.000000000 +0900
+++ linux-2.6.7/mm/truncate.c 2004-06-17 11:39:12.000000000 +0900
@@ -132,6 +132,8 @@ void truncate_inode_pages(struct address
next++;
if (TestSetPageLocked(page))
continue;
+ /* no PageAgain(page) check; page->mapping check
+ * is done in truncate_complete_page */
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -165,6 +167,24 @@ void truncate_inode_pages(struct address
struct page *page = pvec.pages[i];

lock_page(page);
+ if (page->mapping == NULL) {
+ /* XXX Is page->index still valid? */
+ unsigned long index = page->index;
+ int again = PageAgain(page);
+
+ unlock_page(page);
+ put_page(page);
+ page = find_lock_page(mapping, index);
+ if (page == NULL) {
+ BUG_ON(again);
+ /* XXX */
+ if (page->index > next)
+ next = page->index;
+ next++;
+ }
+ BUG_ON(!again);
+ pvec.pages[i] = page;
+ }
wait_on_page_writeback(page);
if (page->index > next)
next = page->index;
@@ -257,14 +277,29 @@ void invalidate_inode_pages2(struct addr
struct page *page = pvec.pages[i];

lock_page(page);
- if (page->mapping == mapping) { /* truncate race? */
- wait_on_page_writeback(page);
- next = page->index + 1;
- if (page_mapped(page))
- clear_page_dirty(page);
- else
- invalidate_complete_page(mapping, page);
+ while (page->mapping != mapping) {
+ struct page *newpage;
+ unsigned long index = page->index;
+
+ BUG_ON(page->mapping != NULL);
+
+ unlock_page(page);
+ newpage = find_lock_page(mapping, index);
+ if (page == newpage) {
+ put_page(page);
+ break;
+ }
+ BUG_ON(!PageAgain(page));
+ pvec.pages[i] = newpage;
+ put_page(page);
+ page = newpage;
}
+ wait_on_page_writeback(page);
+ next = page->index + 1;
+ if (page_mapped(page))
+ clear_page_dirty(page);
+ else
+ invalidate_complete_page(mapping, page);
unlock_page(page);
}
pagevec_release(&pvec);
--- linux-2.6.7.ORG/mm/vmscan.c 2004-06-16 14:18:58.000000000 +0900
+++ linux-2.6.7/mm/vmscan.c 2004-06-17 11:39:12.000000000 +0900
@@ -32,6 +32,7 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/kthread.h>

#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -197,11 +198,6 @@ static inline int page_mapping_inuse(str
return mapping_mapped(mapping);
}

-static inline int is_page_cache_freeable(struct page *page)
-{
- return page_count(page) - !!PagePrivate(page) == 2;
-}
-
static int may_write_to_queue(struct backing_dev_info *bdi)
{
if (current_is_kswapd())
@@ -399,7 +395,7 @@ static int shrink_list(struct list_head
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
+ switch (try_to_unmap(page, NULL)) {
case SWAP_FAIL:
page_map_unlock(page);
goto activate_locked;
@@ -1134,6 +1130,8 @@ int kswapd(void *p)
if (current->flags & PF_FREEZE)
refrigerator(PF_FREEZE);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ return 0;
schedule();
finish_wait(&pgdat->kswapd_wait, &wait);

@@ -1217,4 +1215,14 @@ static int __init kswapd_init(void)
return 0;
}

+#ifdef CONFIG_MEMHOTPLUG
+void
+kswapd_start_one(pg_data_t *pgdat)
+{
+ pgdat->kswapd = kthread_create(kswapd, pgdat, "kswapd%d",
+ pgdat->node_id);
+ total_memory = nr_free_pagecache_pages();
+}
+#endif
+
module_init(kswapd_init)
--- linux-2.6.7.ORG/include/linux/memhotplug.h 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.7/include/linux/memhotplug.h 2004-06-17 11:39:12.000000000 +0900
@@ -0,0 +1,32 @@
+#ifndef _LINUX_MEMHOTPLUG_H
+#define _LINUX_MEMHOTPLUG_H
+
+#include <linux/config.h>
+#include <linux/mm.h>
+
+#ifdef __KERNEL__
+
+struct page_va_list {
+ struct mm_struct *mm;
+ unsigned long addr;
+ struct list_head list;
+};
+
+struct remap_operations {
+ struct page * (*remap_alloc_page)(int);
+ int (*remap_delete_page)(struct page *);
+ int (*remap_copy_page)(struct page *, struct page *);
+ int (*remap_lru_add_page)(struct page *, int);
+ int (*remap_release_buffers)(struct page *);
+ int (*remap_prepare)(struct page *page, int fastmode);
+ int (*remap_stick_page)(struct list_head *vlist);
+};
+
+extern int remapd(void *p);
+extern int remap_onepage(struct page *, int, int, struct remap_operations *);
+extern int remap_onepage_normal(struct page *, int, int);
+
+#define REMAP_ANYNODE (-1)
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_MEMHOTPLUG_H */
--- linux-2.6.7.ORG/mm/memhotplug.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.7/mm/memhotplug.c 2004-06-17 11:39:12.000000000 +0900
@@ -0,0 +1,708 @@
+/*
+ * linux/mm/memhotplug.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Support of memory hotplug, Iwamoto
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/rmap.h>
+#include <linux/memhotplug.h>
+
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif
+
+static void
+print_buffer(struct page* page)
+{
+ struct address_space* mapping = page_mapping(page);
+ struct buffer_head *bh, *head;
+
+ spin_lock(&mapping->private_lock);
+ bh = head = page_buffers(page);
+ printk("buffers:");
+ do {
+ printk(" %lx %d", bh->b_state, atomic_read(&bh->b_count));
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ printk("\n");
+ spin_unlock(&mapping->private_lock);
+}
+
+static int
+stick_mlocked_page(struct list_head *vlist)
+{
+ struct page_va_list *v1;
+ struct vm_area_struct *vma;
+ int error;
+
+ while(!list_empty(vlist)) {
+ v1 = list_entry(vlist->next, struct page_va_list, list);
+ list_del(&v1->list);
+ down_read(&v1->mm->mmap_sem);
+ vma = find_vma(v1->mm, v1->addr);
+ if (vma == NULL || !(vma->vm_flags & VM_LOCKED))
+ goto out;
+ error = get_user_pages(current, v1->mm, v1->addr, PAGE_SIZE,
+ (vma->vm_flags & VM_WRITE) != 0, 0, NULL, NULL);
+ out:
+ up_read(&v1->mm->mmap_sem);
+ mmput(v1->mm);
+ kfree(v1);
+ }
+ return 0;
+}
+
+/* helper function for remap_onepage */
+#define REMAPPREP_WB 1
+#define REMAPPREP_BUFFER 2
+
+/*
+ * Try to free buffers if "page" has them.
+ */
+static int
+remap_preparepage(struct page *page, int fastmode)
+{
+ struct address_space *mapping;
+ int waitcnt = fastmode ? 0 : 100;
+
+ BUG_ON(!PageLocked(page));
+
+ mapping = page_mapping(page);
+
+ if (!PagePrivate(page) && PageWriteback(page) &&
+ !PageSwapCache(page)) {
+ printk("remap_preparepage: mapping %p page %p\n",
+ page->mapping, page);
+ return -REMAPPREP_WB;
+ }
+
+ while (PageWriteback(page)) {
+ if (!waitcnt)
+ return -REMAPPREP_WB;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ waitcnt--;
+ }
+ if (PagePrivate(page)) {
+ /* XXX copied from shrink_list() */
+ if (PageDirty(page) &&
+ is_page_cache_freeable(page) &&
+ mapping != NULL &&
+ mapping->a_ops->writepage != NULL) {
+ spin_lock_irq(&mapping->tree_lock);
+ if (clear_page_dirty_for_io(page)) {
+ int res;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 1,
+ };
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+
+ if (res < 0)
+ /* not implemented. help */
+ BUG();
+ if (res == WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ return -REMAPPREP_WB;
+ }
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ lock_page(page);
+ mapping = page_mapping(page);
+ if (!PagePrivate(page))
+ return 0;
+ } else
+ spin_unlock_irq(&mapping->tree_lock);
+ }
+
+ while (1) {
+ if (try_to_release_page(page, GFP_KERNEL))
+ break;
+ if (!waitcnt)
+ return -REMAPPREP_BUFFER;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(10);
+ __set_current_state(TASK_RUNNING);
+ waitcnt--;
+ if (!waitcnt)
+ print_buffer(page);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Just assign swap space to a anonymous page if it doesn't have yet,
+ * so that the page can be handled like a page in the page cache
+ * since it in the swap cache.
+ */
+static struct address_space *
+make_page_mapped(struct page *page)
+{
+ if (!page_mapped(page)) {
+ if (page_count(page) > 1)
+ printk("page %p not mapped: count %d\n",
+ page, page_count(page));
+ return NULL;
+ }
+ /* The page is an anon page. Allocate its swap entry. */
+ page_map_unlock(page);
+ add_to_swap(page);
+ page_map_lock(page);
+ return page_mapping(page);
+}
+
+/*
+ * Replace "page" with "newpage" on the radix tree. After that, all
+ * new access to "page" will be redirected to "newpage" and it
+ * will be blocked until remapping has been done.
+ */
+static int
+radix_tree_replace_pages(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ if (radix_tree_preload(GFP_KERNEL))
+ return -1;
+
+ if (PagePrivate(page)) /* XXX */
+ BUG();
+
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock_irq(&mapping->tree_lock);
+ if (mapping != page_mapping(page))
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, page_mapping(page), page);
+ if (radix_tree_delete(&mapping->page_tree,
+ PageSwapCache(page) ? page->private : page->index) == NULL) {
+ /* Page truncated. */
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ return -1;
+ }
+ /* Don't __put_page(page) here. Truncate may be in progress. */
+ newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+ ~(1 << PG_highmem) & ~(1 << PG_anon) &
+ ~(1 << PG_maplock) &
+ ~(1 << PG_active) & ~(~0UL << NODEZONE_SHIFT);
+
+ /* list_del(&page->list); XXX */
+ radix_tree_insert(&mapping->page_tree,
+ PageSwapCache(page) ? page->private : page->index, newpage);
+ page_cache_get(newpage);
+ newpage->index = page->index;
+ if (PageSwapCache(page))
+ newpage->private = page->private;
+ else
+ newpage->mapping = page->mapping;
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ return 0;
+}
+
+/*
+ * Remove all PTE mappings to "page".
+ */
+static int
+unmap_page(struct page *page, struct list_head *vlist)
+{
+ int error = SWAP_SUCCESS;
+
+ page_map_lock(page);
+ while (page_mapped(page) &&
+ (error = try_to_unmap(page, vlist)) == SWAP_AGAIN) {
+ page_map_unlock(page);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ __set_current_state(TASK_RUNNING);
+ page_map_lock(page);
+ }
+ page_map_unlock(page);
+ if (error == SWAP_FAIL) {
+ /* either during mremap or mlocked */
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Wait for "page" to become free. Almost same as waiting for its
+ * page count to drop to 2, but truncated pages are special.
+ */
+static int
+wait_on_page_freeable(struct page *page, struct address_space *mapping,
+ struct list_head *vlist, int truncated,
+ int nretry, struct remap_operations *ops)
+{
+ struct address_space *mapping1;
+
+ while ((truncated + page_count(page)) > 2) {
+ if (nretry <= 0)
+ return -1;
+ /* no lock needed while waiting page count */
+ unlock_page(page);
+
+ while ((truncated + page_count(page)) > 2) {
+ nretry--;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ if ((nretry % 5000) == 0) {
+ printk("remap_onepage: still waiting on %p %d\n", page, nretry);
+ break;
+ }
+ if (PagePrivate(page) || page_mapped(page))
+ break; /* see below */
+ }
+
+ lock_page(page);
+ BUG_ON(page_count(page) == 0);
+ mapping1 = page_mapping(page);
+ if (mapping != mapping1 && mapping1 != NULL)
+ printk("mapping changed %p -> %p, page %p\n",
+ mapping, mapping1, page);
+ if (PagePrivate(page))
+ ops->remap_release_buffers(page);
+ unmap_page(page, vlist);
+ }
+ return nretry;
+}
+
+/*
+ * A file which "page" belongs to has been truncated. Free both pages.
+ */
+static void
+free_truncated_pages(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ void *p;
+ /* mapping->tree_lock must be held. */
+ p = radix_tree_lookup(&mapping->page_tree,
+ PageSwapCache(newpage) ? newpage->private : newpage->index);
+ if (p != NULL) {
+ /* new cache page appeared after truncation */
+ printk("page %p newpage %p radix %p\n",
+ page, newpage, p);
+ BUG_ON(p == newpage);
+ }
+ BUG_ON(page_mapping(page) != NULL);
+ put_page(newpage);
+ if (page_count(newpage) != 1) {
+ printk("newpage count %d != 1, %p\n",
+ page_count(newpage), newpage);
+ BUG();
+ }
+ /* No need to do page->list. remove_from_page_cache did. */
+ newpage->mapping = page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ ClearPageActive(page);
+ ClearPageActive(newpage);
+ ClearPageSwapCache(page);
+ ClearPageSwapCache(newpage);
+ unlock_page(page);
+ unlock_page(newpage);
+ put_page(newpage);
+}
+
+static inline int
+is_page_truncated(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ void *p;
+ spin_lock_irq(&mapping->tree_lock);
+ if (page_count(page) == 1) {
+ /* page has been truncated. */
+ return 0;
+ }
+ p = radix_tree_lookup(&mapping->page_tree,
+ PageSwapCache(newpage) ? newpage->private : newpage->index);
+ spin_unlock_irq(&mapping->tree_lock);
+ if (p == NULL) {
+ BUG_ON(page->mapping != NULL);
+ return -1;
+ }
+ return 1;
+}
+
+/*
+ * Replace "page" with "newpage" on the list of clean/dirty pages.
+ */
+static void
+remap_exchange_pages(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ if (PageDirty(page))
+ set_page_dirty(newpage);
+ page->mapping = NULL;
+ unlock_page(page);
+
+ __put_page(page);
+
+ /* We are done. Finish and let the waiters run. */
+ SetPageUptodate(newpage);
+}
+
+/*
+ * Roll back all remapping operations.
+ */
+static int
+radix_tree_rewind_page(struct page *page, struct page *newpage,
+ struct address_space *mapping)
+{
+ int waitcnt;
+ unsigned long index;
+
+ /*
+ * Try to unwind by notifying waiters. If someone misbehaves,
+ * we die.
+ */
+ if (radix_tree_preload(GFP_KERNEL))
+ BUG();
+ /* should {__add_to,__remove_from}_page_cache be used instead? */
+ spin_lock_irq(&mapping->tree_lock);
+ /* list_del(&newpage->list); */
+ index = PageSwapCache(page) ? page->private : page->index;
+ if (radix_tree_delete(&mapping->page_tree, index) == NULL)
+ /* Hold extra count to handle truncate */
+ page_cache_get(newpage);
+ radix_tree_insert(&mapping->page_tree, index, page);
+ /* no page_cache_get(page); needed */
+ radix_tree_preload_end();
+ spin_unlock_irq(&mapping->tree_lock);
+
+ SetPageAgain(newpage);
+ newpage->mapping = NULL;
+ /* XXX unmap needed? No, it shouldn't. Handled by fault handlers. */
+ unlock_page(newpage);
+
+ waitcnt = HZ;
+ for(; page_count(newpage) > 2; waitcnt--) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(10);
+ if (waitcnt == 0) {
+ printk("You are hosed.\n");
+ printk("newpage %p flags %lx %d %d, page %p flags %lx %d\n",
+ newpage, newpage->flags, page_count(newpage),
+ newpage->mapcount,
+ page, page->flags, page_count(page));
+ BUG();
+ }
+ }
+ BUG_ON(PageUptodate(newpage));
+ ClearPageDirty(newpage);
+ ClearPageActive(newpage);
+ spin_lock_irq(&mapping->tree_lock);
+ if (page_count(newpage) == 1) {
+ printk("newpage %p truncated. page %p\n", newpage, page);
+ BUG();
+ }
+ spin_unlock_irq(&mapping->tree_lock);
+ unlock_page(page);
+ BUG_ON(page_count(newpage) != 2);
+ ClearPageAgain(newpage);
+ __put_page(newpage);
+ return 1;
+}
+
+/*
+ * Allocate a new page from specified node.
+ */
+static struct page *
+remap_alloc_page(int nid)
+{
+ if (nid == REMAP_ANYNODE)
+ return alloc_page(GFP_HIGHUSER);
+ else
+ return alloc_pages_node(nid, GFP_HIGHUSER, 0);
+}
+
+static int
+remap_delete_page(struct page *page)
+{
+ BUG_ON(page_count(page) != 1);
+ put_page(page);
+ return 0;
+}
+
+static int
+remap_copy_page(struct page *to, struct page *from)
+{
+ copy_highpage(to, from);
+ return 0;
+}
+
+static int
+remap_lru_add_page(struct page *page, int active)
+{
+ if (active)
+ lru_cache_add_active(page);
+ else
+ lru_cache_add(page);
+ return 0;
+}
+
+static int
+remap_release_buffer(struct page *page)
+{
+ try_to_release_page(page, GFP_KERNEL);
+ return 0;
+}
+
+static struct remap_operations remap_ops = {
+ .remap_alloc_page = remap_alloc_page,
+ .remap_delete_page = remap_delete_page,
+ .remap_copy_page = remap_copy_page,
+ .remap_lru_add_page = remap_lru_add_page,
+ .remap_release_buffers = remap_release_buffer,
+ .remap_prepare = remap_preparepage,
+ .remap_stick_page = stick_mlocked_page
+};
+
+/*
+ * Try to remap a page. Returns non-zero on failure.
+ */
+int remap_onepage(struct page *page, int nodeid, int fastmode,
+ struct remap_operations *ops)
+{
+ struct page *newpage;
+ struct address_space *mapping;
+ LIST_HEAD(vlist);
+ int truncated = 0;
+ int nretry = fastmode ? HZ/50: HZ*10; /* XXXX */
+
+ if ((newpage = ops->remap_alloc_page(nodeid)) == NULL)
+ return -ENOMEM;
+ if (TestSetPageLocked(newpage))
+ BUG();
+ lock_page(page);
+
+ if (ops->remap_prepare && ops->remap_prepare(page, fastmode))
+ goto radixfail;
+ page_map_lock(page);
+ if (PageAnon(page) && !PageSwapCache(page))
+ make_page_mapped(page);
+ mapping = page_mapping(page);
+ page_map_unlock(page);
+ if (mapping == NULL)
+ goto radixfail;
+ if (radix_tree_replace_pages(page, newpage, mapping))
+ goto radixfail;
+ if (unmap_page(page, &vlist))
+ goto unmapfail;
+ if (PagePrivate(page))
+ printk("buffer reappeared\n");
+wait_again:
+ if ((nretry = wait_on_page_freeable(page, mapping, &vlist, truncated, nretry, ops)) < 0)
+ goto unmapfail;
+
+ if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+ KDB_ENTER();
+#else
+ BUG();
+#endif
+ switch (is_page_truncated(page, newpage, mapping)) {
+ case 0:
+ /* has been truncated */
+ free_truncated_pages(page, newpage, mapping);
+ ops->remap_delete_page(page);
+ return 0;
+ case -1:
+ /* being truncated */
+ truncated = 1;
+ BUG_ON(page_mapping(page) != NULL);
+ goto wait_again;
+ default:
+ /* through */
+ }
+
+ BUG_ON(mapping != page_mapping(page));
+
+ ops->remap_copy_page(newpage, page);
+ remap_exchange_pages(page, newpage, mapping);
+ if (ops->remap_lru_add_page)
+ ops->remap_lru_add_page(newpage, PageActive(page));
+ ClearPageActive(page);
+ ClearPageSwapCache(page);
+ ops->remap_delete_page(page);
+
+ /*
+ * Wake up all waiters which are waiting for completion
+ * of remapping operations.
+ */
+ unlock_page(newpage);
+
+ if (ops->remap_stick_page)
+ ops->remap_stick_page(&vlist);
+ page_cache_release(newpage);
+ return 0;
+
+unmapfail:
+ radix_tree_rewind_page(page, newpage, mapping);
+ if (ops->remap_stick_page)
+ ops->remap_stick_page(&vlist);
+ ClearPageActive(newpage);
+ ClearPageSwapCache(newpage);
+ ops->remap_delete_page(newpage);
+ return 1;
+
+radixfail:
+ unlock_page(page);
+ unlock_page(newpage);
+ if (ops->remap_stick_page)
+ ops->remap_stick_page(&vlist);
+ ops->remap_delete_page(newpage);
+ return 1;
+}
+
+int remap_onepage_normal(struct page *page, int nodeid, int fastmode)
+{
+ return remap_onepage(page, nodeid, fastmode, &remap_ops);
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+ int cpu = get_cpu();
+
+ schedule_work(&lru_drain_wq[cpu]);
+ put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+ struct zone *zone = p;
+ struct page *page, *page1;
+ struct list_head *l;
+ int active, i, nr_failed = 0;
+ int fastmode = 100;
+ LIST_HEAD(failedp);
+
+ daemonize("remap%d", zone->zone_start_pfn);
+ if (atomic_read(&remapd_count) > 0) {
+ printk("remapd already running\n");
+ return 0;
+ }
+ atomic_inc(&remapd_count);
+ on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+ while(nr_failed < 100) {
+ spin_lock_irq(&zone->lru_lock);
+ for(active = 0; active < 2; active++) {
+ l = active ? &zone->active_list :
+ &zone->inactive_list;
+ for(i = 0; !list_empty(l) && i < 10; i++) {
+ page = list_entry(l->prev, struct page, lru);
+ if (fastmode && PageLocked(page)) {
+ page1 = page;
+ while (fastmode && PageLocked(page)) {
+ page =
+ list_entry(page->lru.prev,
+ struct page, lru);
+ fastmode--;
+ if (&page->lru == l) {
+ /* scanned the whole
+ list */
+ page = page1;
+ break;
+ }
+ if (page == page1)
+ BUG();
+ }
+ if (!fastmode) {
+ printk("used up fastmode\n");
+ page = page1;
+ }
+ }
+ if (!TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /* the page is in pagevec_release();
+ shrink_cache says so. */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, l);
+ continue;
+ }
+ if (active)
+ zone->nr_active--;
+ else
+ zone->nr_inactive--;
+ spin_unlock_irq(&zone->lru_lock);
+ goto got_page;
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ break;
+
+ got_page:
+ if (remap_onepage(page, REMAP_ANYNODE, fastmode, &remap_ops)) {
+ nr_failed++;
+ if (fastmode)
+ fastmode--;
+ list_add(&page->lru, &failedp);
+ }
+ }
+ if (list_empty(&failedp))
+ goto out;
+
+ while (!list_empty(&failedp)) {
+ page = list_entry(failedp.prev, struct page, lru);
+ list_del(&page->lru);
+ if (!TestSetPageLocked(page)) {
+ if (remap_preparepage(page, 10 /* XXX */)) {
+ unlock_page(page);
+ } else {
+ ClearPageLocked(page); /* XXX */
+ if (!remap_onepage(page, REMAP_ANYNODE, 0, &remap_ops))
+ continue;
+ }
+ }
+ spin_lock_irq(&zone->lru_lock);
+ if (PageActive(page)) {
+ list_add(&page->lru, &zone->active_list);
+ zone->nr_active++;
+ } else {
+ list_add(&page->lru, &zone->inactive_list);
+ zone->nr_inactive++;
+ }
+ if (TestSetPageLRU(page))
+ BUG();
+ spin_unlock_irq(&zone->lru_lock);
+ page_cache_release(page);
+ }
+out:
+ atomic_dec(&remapd_count);
+ return 0;
+}
+
+static int __init remapd_init(void)
+{
+ int i;
+
+ for(i = 0; i < NR_CPUS; i++)
+ INIT_WORK(&lru_drain_wq[i], (void (*)(void *))lru_add_drain, NULL);
+ return 0;
+}
+
+module_init(remapd_init);