[PATCH] VM: Implements the swap-out page-clustering technique

From: Hamid R. Jahanjou
Date: Thu Sep 04 2008 - 07:03:32 EST


From: Hamid R. Jahanjou

Implements the idea of swap-out page clustering from *BSD for
Linux. Each time a candidate page is to be swapped out,
virtually-nearby pages are scanned to find eligible pages to be
swapped out too as a cluster. This technique increases the likelihood of
bringing in related data on a page fault and decreases swap space
fragmentation in the long run. Currently, Linux searches only
physically-nearby pages which is not optimal since, over time, physically-
adjacent pages may become unrelated.

The code can be statically tuned. No benchmarks. I'm not sure whether
the added complexity is acceptable.

Signed-off-by: Hamid R. Jahanjou <hamid.jahanjou@xxxxxxxxx>
---

diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h linux-2.6.26.1HJ/include/linux/pageout_clustering.h
--- linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h 1970-01-01 03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/pageout_clustering.h 2008-08-28 20:19:52.000000000 +0330
@@ -0,0 +1,12 @@
+#ifndef PAGEOUT_CLUSTERING_H_
+#define PAGEOUT_CLUSTERING_H_
+
+#include<linux/list.h>
+#define INITIAL_CLUSTER_SCAN_RANGE 4
+#define HALF_PAGEOUT_CLUSTER_SIZE 8 /* default number of pages to be clustered
+ * on either side of a given page */
+
+int try_to_cluster(struct page *, struct list_head *,
+ struct list_head *, int, int);
+
+#endif
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/include/linux/rmap.h linux-2.6.26.1HJ/include/linux/rmap.h
--- linux-2.6.26.1-vanilla/include/linux/rmap.h 2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/rmap.h 2008-08-28 20:19:52.000000000 +0330
@@ -102,6 +102,12 @@ pte_t *page_check_address(struct page *,
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
+ * Used by pageout clustering
+ */
+struct anon_vma *page_lock_anon_vma(struct page *);
+void page_unlock_anon_vma(struct anon_vma *);
+
+/*
* Cleans the PTEs of shared mappings.
* (and since clean PTEs should also be readonly, write protects them too)
*
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/Makefile linux-2.6.26.1HJ/mm/Makefile
--- linux-2.6.26.1-vanilla/mm/Makefile 2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/Makefile 2008-08-28 20:19:52.000000000 +0330
@@ -9,9 +9,9 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o pdflush.o \
- readahead.o swap.o truncate.o vmscan.o \
- prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o $(mmu-y)
+ readahead.o swap.o truncate.o pageout_clustering.o \
+ vmscan.o prio_tree.o util.o mmzone.o vmstat.o \
+ backing-dev.o page_isolation.o $(mmu-y)

obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/pageout_clustering.c linux-2.6.26.1HJ/mm/pageout_clustering.c
--- linux-2.6.26.1-vanilla/mm/pageout_clustering.c 1970-01-01 03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/mm/pageout_clustering.c 2008-09-04 09:26:40.000000000 +0330
@@ -0,0 +1,278 @@
+#include<linux/pageout_clustering.h>
+#include<linux/mm.h>
+#include<linux/pagemap.h>
+#include<linux/swap.h>
+#include<linux/swapops.h>
+#include<linux/slab.h>
+#include<linux/init.h>
+#include<linux/rmap.h>
+#include<linux/module.h>
+#include<linux/memcontrol.h>
+
+
+/*
+ * contains data related to pageout clustering
+ */
+struct clustering_info
+{
+
+ struct page *page; /* page descriptor of a page used for sampling cluster pages */
+ struct vm_area_struct *vma; /* VMA under consideration */
+ struct list_head *src; /* source page list used by shrink_list and friends */
+
+ union {
+ struct anon_vma *anon_vma; /* anon_vma object we're using for reverse mapping */
+ struct address_space *mapping; /* address_space object we're using for reverse mapping */
+};
+
+ struct page **cluster; /* our page cluster */
+ unsigned int cluster_size; /* cluster size */
+ unsigned int page_index; /* index of the sampling page within the cluster */
+ unsigned int index; /* slot in the cluster for the next page */
+ unsigned int range; /* maximum number of VMA's to scan */
+ unsigned int nr_collected; /* number of pages collected in the cluster */
+ unsigned int nr_sc; /* current number of VMA's scanned */
+ int mode; /* page-isolation mode */
+ int anonym; /* indicates whether we are to collect mapped pages (0)
+ * or anonymous pages (1) */
+ int (*continue_search)
+ (struct clustering_info *); /* returns true if the search can go on */
+};
+
+
+static inline int check_conditions(struct clustering_info *ci)
+{
+ return (ci->nr_sc < ci->range) && (ci->nr_collected < ci->cluster_size);
+}
+
+
+/* embodies page-selection-for-clustering policy */
+static inline int page_allowed_in_cluster(struct page *page, struct clustering_info *ci)
+{
+ int zone_id = page_zone_id(ci->page);
+
+
+ /* scanning the original page ? */
+ if (unlikely(page == ci->page))
+ return 0;
+
+ /* skip pages belonging to other zones */
+ if (unlikely(page_zone_id(page) != zone_id))
+ return 0;
+
+ /* skip pages not in LRU lists */
+ if (!PageLRU(page))
+ return 0;
+
+ /* skip active pages */
+ if (page->flags & PG_active)
+ return 0;
+
+ /* skip non-dirty pages */
+ if (!(page->flags & PG_dirty))
+ return 0;
+
+ return 1;
+}
+
+
+
+/* embodies vma-selection-for-clustering policy */
+static inline int vma_allowed_in_cluster(struct vm_area_struct *vma, struct clustering_info *ci)
+{
+ if ( (vma->vm_flags & VM_RESERVED) ||
+ (vma->vm_flags & VM_LOCKED) ||
+ (vma->vm_flags & VM_IO) ||
+ (vma->vm_flags & VM_NONLINEAR) )
+ return 0;
+
+ /* we ignore mapped VMA's when doing anonym clustering; */
+ /* similarly, we ignore anonym VMA's when doing mapped clustering */
+ if ( ((ci->anonym) && (vma->vm_file != NULL)) ||
+ (!(ci->anonym) && (vma->vm_file == NULL)) )
+ return 0;
+
+ return 1;
+}
+
+
+
+/*
+ * scans a process's vma structures, to find pages eligible for being added to cluster
+ */
+static void try_to_cluster_vma(struct clustering_info *ci)
+{
+ struct vm_area_struct *cursor_vma = ci->vma;
+ struct page *cursor_page;
+ struct mm_struct *cursor_mm;
+ unsigned long vm_address;
+
+
+ cursor_mm = cursor_vma->vm_mm;
+ if (!down_read_trylock(&cursor_mm->mmap_sem))
+ return;
+
+ do
+ {
+ if (!vma_allowed_in_cluster(cursor_vma, ci))
+ goto next_vma;
+
+ ci->nr_sc++;
+
+ for(vm_address = cursor_vma->vm_start;
+ vm_address < cursor_vma->vm_end && ci->nr_collected < ci->cluster_size;
+ vm_address += PAGE_SIZE)
+ {
+ cursor_page = virt_to_page(vm_address);
+ if (!page_allowed_in_cluster(cursor_page, ci))
+ continue;
+
+ switch (__isolate_lru_page(cursor_page, ci->mode))
+ {
+ case 0:
+ ci->cluster[ci->index] = cursor_page;
+ ci->nr_collected++;
+ ci->index = (++ci->index) % (ci->cluster_size);
+ break;
+
+ case -EBUSY:
+ /* it is being freed else where */
+ list_move(&cursor_page->lru, ci->src);
+ break;
+
+ default:
+ break;
+ };
+ }
+
+next_vma:
+ cursor_vma = cursor_vma->vm_next;
+ /* begin from the first VMA if the end of list is reached */
+ if (cursor_vma == NULL)
+ cursor_vma = cursor_mm->mmap;
+
+ } while (cursor_vma != ci->vma && ci->continue_search(ci));
+
+ up_read(&cursor_mm->mmap_sem);
+ return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles mapped pages
+ */
+static inline void try_to_cluster_mapped(struct clustering_info *ci)
+{
+ struct vm_area_struct *vma;
+ struct prio_tree_iter pst_itr;
+ const pgoff_t pgoff = ci->page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+
+ vma_prio_tree_foreach(vma, &pst_itr, &ci->mapping->i_mmap, pgoff, pgoff) {
+ if (!ci->continue_search(ci))
+ break;
+
+ ci->vma = vma;
+ try_to_cluster_vma(ci);
+ }
+ return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles anonymous pages
+ */
+static inline void try_to_cluster_anon(struct clustering_info *ci)
+{
+ struct list_head *list_item;
+ struct vm_area_struct *vma;
+
+
+ for (list_item = ci->anon_vma->head.next;
+ (list_item != &ci->anon_vma->head) && ci->continue_search(ci);
+ list_item = list_item->next)
+ {
+ vma = list_entry(list_item, struct vm_area_struct, anon_vma_node);
+ ci->vma = vma;
+ try_to_cluster_vma(ci);
+ }
+
+ return;
+}
+
+
+
+/*
+ * Searches in the virtual address space of the process possessing the page
+ * to find other suitable pages to be swapped out as well. This is known as
+ * page clustering. Not to be confused with other usages of the word in Linux.
+ *
+ * We adjust the range of our search by the value of the allocation order.
+ *
+ * @page: an originally-selected page
+ * @half_cluster_size: maximum number of pages to cluster on each side of the page
+ * @src: the list containing the page
+ * @dst: the list to insert pages to
+ * @order: allocation order
+ * @mode: page-isolation mode
+ *
+ * returns the number of pages added to the list, including the passed page.
+ */
+int try_to_cluster(struct page *page, struct list_head *src,
+ struct list_head *dst, int order, int mode)
+{
+ struct clustering_info cluster_info;
+ struct address_space *mapping;
+ struct anon_vma *anon_vma;
+ unsigned int range = INITIAL_CLUSTER_SCAN_RANGE << order;
+#define def_cluster_size (2 * HALF_PAGEOUT_CLUSTER_SIZE + 1)
+ struct page *cluster[def_cluster_size] = {NULL};
+ int i;
+
+
+ cluster[HALF_PAGEOUT_CLUSTER_SIZE] = page;
+ /* initialize commom fields only; other fields are
+ * set as we go on */
+ cluster_info.page = page;
+ cluster_info.vma = NULL;
+ cluster_info.src = src;
+ cluster_info.cluster = cluster;
+ cluster_info.cluster_size = def_cluster_size;
+ cluster_info.page_index = HALF_PAGEOUT_CLUSTER_SIZE;
+ cluster_info.index = HALF_PAGEOUT_CLUSTER_SIZE + 1;
+ cluster_info.range = range;
+ cluster_info.nr_collected = 1;
+ cluster_info.nr_sc = 0;
+ cluster_info.mode = mode;
+ cluster_info.continue_search = check_conditions;
+
+ /* try to cluster anonymous pages ? */
+ anon_vma = page_lock_anon_vma(page);
+ if (anon_vma != NULL)
+ {
+ cluster_info.anon_vma = anon_vma;
+ cluster_info.anonym = 1;
+ try_to_cluster_anon(&cluster_info);
+ page_unlock_anon_vma(anon_vma);
+ }
+
+ /* try to cluster mapped pages ? */
+ else if (page_mapped(page))
+ {
+ mapping = page->mapping;
+ spin_lock(&mapping->i_mmap_lock);
+ cluster_info.mapping = mapping;
+ cluster_info.anonym = 0;
+ try_to_cluster_mapped(&cluster_info);
+ spin_unlock(&mapping->i_mmap_lock);
+ }
+
+ /* add to destination list */
+ for (i = 0; i < def_cluster_size; i++)
+ if (cluster[i] != NULL)
+ list_move(&cluster[i]->lru, dst);
+
+ return cluster_info.nr_collected;
+}
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/rmap.c linux-2.6.26.1HJ/mm/rmap.c
--- linux-2.6.26.1-vanilla/mm/rmap.c 2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/rmap.c 2008-08-28 20:19:52.000000000 +0330
@@ -156,7 +156,7 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma;
unsigned long anon_mapping;
@@ -176,7 +176,7 @@ out:
return NULL;
}

-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/vmscan.c linux-2.6.26.1HJ/mm/vmscan.c
--- linux-2.6.26.1-vanilla/mm/vmscan.c 2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/vmscan.c 2008-08-28 20:19:52.000000000 +0330
@@ -38,6 +38,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/pageout_clustering.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -700,10 +701,7 @@ static unsigned long isolate_lru_pages(u

for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
- unsigned long pfn;
- unsigned long end_pfn;
- unsigned long page_pfn;
- int zone_id;
+ int nr_clustered;

page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
@@ -712,8 +710,7 @@ static unsigned long isolate_lru_pages(u

switch (__isolate_lru_page(page, mode)) {
case 0:
- list_move(&page->lru, dst);
- nr_taken++;
+ /* counted and added to dst in try_to_cluster() */
break;

case -EBUSY:
@@ -725,51 +722,13 @@ static unsigned long isolate_lru_pages(u
BUG();
}

- if (!order)
- continue;
-
- /*
- * Attempt to take all pages in the order aligned region
- * surrounding the tag page. Only take those pages of
- * the same active state as that tag page. We may safely
- * round the target page pfn down to the requested order
- * as the mem_map is guarenteed valid out to MAX_ORDER,
- * where that page is in a different zone we will detect
- * it from its zone id and abort this block scan.
- */
- zone_id = page_zone_id(page);
- page_pfn = page_to_pfn(page);
- pfn = page_pfn & ~((1 << order) - 1);
- end_pfn = pfn + (1 << order);
- for (; pfn < end_pfn; pfn++) {
- struct page *cursor_page;
-
- /* The target page is in the block, ignore it. */
- if (unlikely(pfn == page_pfn))
- continue;
-
- /* Avoid holes within the zone. */
- if (unlikely(!pfn_valid_within(pfn)))
- break;
-
- cursor_page = pfn_to_page(pfn);
- /* Check that we have not crossed a zone boundary. */
- if (unlikely(page_zone_id(cursor_page) != zone_id))
- continue;
- switch (__isolate_lru_page(cursor_page, mode)) {
- case 0:
- list_move(&cursor_page->lru, dst);
- nr_taken++;
- scan++;
- break;
-
- case -EBUSY:
- /* else it is being freed elsewhere */
- list_move(&cursor_page->lru, src);
- default:
- break;
- }
- }
+ /*
+ * scan virtually-nearby pages in process address space
+ * to find eligible pages to be swapped out as well
+ */
+ nr_clustered = try_to_cluster(page, src, dst, order, mode);
+ nr_taken += nr_clustered;
+ scan += nr_clustered - 1;
}

*scanned = scan;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/