[RFC][PATCH 8/8] RSS controller support reclamation

From: Balbir Singh
Date: Thu Nov 09 2006 - 14:38:54 EST




Reclaim memory as we hit the max_shares limit. The code for reclamation
is inspired by Dave Hansen's challenged memory controller and by the
shrink_all_memory() code.

Reclamation can be triggered from two paths

1. While incrementing the RSS, we hit the limit of the container
2. A container is resized, such that its new limit is below its current
RSS

In (1) reclamation takes place in the background.

TODOs

1. max_shares currently works like a soft limit. The RSS can grow beyond its
limit. One possible fix is to introduce a soft limit (reclaim when the
container hits the soft limit) and fail when we hit the hard limit.

Signed-off-by: Balbir Singh <balbir@xxxxxxxxxx>
---

include/linux/memctlr.h | 17 ++++++
kernel/fork.c | 1
kernel/res_group/memctlr.c | 116 ++++++++++++++++++++++++++++++++++++++-------
mm/rmap.c | 72 +++++++++++++++++++++++++++
mm/vmscan.c | 72 +++++++++++++++++++++++++++
5 files changed, 260 insertions(+), 18 deletions(-)

diff -puN mm/vmscan.c~container-memctlr-reclaim mm/vmscan.c
--- linux-2.6.19-rc2/mm/vmscan.c~container-memctlr-reclaim 2006-11-09 22:21:11.000000000 +0530
+++ linux-2.6.19-rc2-balbir/mm/vmscan.c 2006-11-09 22:21:11.000000000 +0530
@@ -36,6 +36,8 @@
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/container.h>
+#include <linux/memctlr.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -65,6 +67,9 @@ struct scan_control {
int swappiness;

int all_unreclaimable;
+
+ int overlimit;
+ void *container; /* Added as void * to avoid #ifdef's */
};

/*
@@ -811,6 +816,10 @@ force_reclaim_mapped:
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
+ if (!memctlr_page_reclaim(page, sc->container, sc->overlimit)) {
+ list_add(&page->lru, &l_active);
+ continue;
+ }
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
@@ -1008,6 +1017,8 @@ unsigned long try_to_free_pages(struct z
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .overlimit = SC_OVERLIMIT_NONE,
+ .container = NULL,
};

count_vm_event(ALLOCSTALL);
@@ -1104,6 +1115,8 @@ static unsigned long balance_pgdat(pg_da
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
+ .overlimit = SC_OVERLIMIT_NONE,
+ .container = NULL,
};

loop_again:
@@ -1324,7 +1337,7 @@ void wakeup_kswapd(struct zone *zone, in
wake_up_interruptible(&pgdat->kswapd_wait);
}

-#ifdef CONFIG_PM
+#if defined(CONFIG_PM) || defined(CONFIG_RES_GROUPS_MEMORY)
/*
* Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
* from LRU lists system-wide, for given pass and priority, and returns the
@@ -1368,7 +1381,60 @@ static unsigned long shrink_all_zones(un

return ret;
}
+#endif

+#ifdef CONFIG_RES_GROUPS_MEMORY
+/*
+ * Modelled after shrink_all_memory
+ */
+unsigned long memctlr_shrink_container_memory(unsigned long nr_pages,
+ struct container *container,
+ int overlimit)
+{
+ unsigned long lru_pages;
+ unsigned long ret = 0;
+ int pass;
+ struct zone *zone;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 0,
+ .swap_cluster_max = nr_pages,
+ .may_writepage = 1,
+ .swappiness = vm_swappiness,
+ .overlimit = overlimit,
+ .container = container,
+ };
+
+ lru_pages = 0;
+ for_each_zone(zone)
+ lru_pages += zone->nr_active + zone->nr_inactive;
+
+ for (pass = 0; pass < 5; pass++) {
+ int prio;
+
+ /* Force reclaiming mapped pages in the passes #3 and #4 */
+ if (pass > 2) {
+ sc.may_swap = 1;
+ sc.swappiness = 100;
+ }
+
+ for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+ unsigned long nr_to_scan = nr_pages - ret;
+
+ sc.nr_scanned = 0;
+ ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+ if (ret >= nr_pages)
+ break;
+
+ if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+ blk_congestion_wait(WRITE, HZ / 10);
+ }
+ }
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_PM
/*
* Try to free `nr_pages' of memory, system-wide, and return the number of
* freed pages.
@@ -1390,6 +1456,8 @@ unsigned long shrink_all_memory(unsigned
.swap_cluster_max = nr_pages,
.may_writepage = 1,
.swappiness = vm_swappiness,
+ .overlimit = SC_OVERLIMIT_NONE,
+ .container = NULL,
};

current->reclaim_state = &reclaim_state;
@@ -1585,6 +1653,8 @@ static int __zone_reclaim(struct zone *z
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
+ .overlimit = SC_OVERLIMIT_NONE,
+ .container = NULL,
};
unsigned long slab_reclaimable;

diff -puN kernel/res_group/memctlr.c~container-memctlr-reclaim kernel/res_group/memctlr.c
--- linux-2.6.19-rc2/kernel/res_group/memctlr.c~container-memctlr-reclaim 2006-11-09 22:21:11.000000000 +0530
+++ linux-2.6.19-rc2-balbir/kernel/res_group/memctlr.c 2006-11-09 22:21:11.000000000 +0530
@@ -33,6 +33,7 @@
#include <linux/memctlr.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/workqueue.h>
#include <asm/pgtable.h>

static const char res_ctlr_name[] = "memctlr";
@@ -40,7 +41,10 @@ static struct resource_group *root_rgrou
static const char version[] = "0.05";
static struct memctlr *memctlr_root;

-#define MEMCTLR_MAGIC 0xdededede
+static void memctlr_callback(void *data);
+static atomic_long_t failed_inc_rss;
+static atomic_long_t failed_dec_rss;
+

struct mem_counter {
atomic_long_t rss;
@@ -57,9 +61,12 @@ struct memctlr {
int magic;
spinlock_t lock;
long nr_pages;
+ int reclaim_in_progress;
};

struct res_controller memctlr_rg;
+static DECLARE_WORK(memctlr_work, memctlr_callback, NULL);
+#define MEMCTLR_MAGIC 0xdededede

static struct memctlr *get_memctlr_from_shares(struct res_shares *shares)
{
@@ -96,7 +103,7 @@ void mm_free_mem_counter(struct mm_struc
void mm_assign_container(struct mm_struct *mm, struct task_struct *p)
{
rcu_read_lock();
- mm->container = rcu_dereference(p->container);
+ rcu_assign_pointer(mm->container, rcu_dereference(p->container));
rcu_read_unlock();
}

@@ -123,38 +130,64 @@ static inline struct memctlr *get_task_m
return res;
}

-
-void memctlr_inc_rss_mm(struct page *page, struct mm_struct *mm)
+static void memctlr_callback(void *data)
{
- struct memctlr *res;
+ struct memctlr *res = (struct memctlr *)data;
+ long rss;
+ unsigned long nr_shrink = 0;

- res = get_task_memctlr(current);
- if (!res) {
- printk(KERN_INFO "inc_rss no res set *---*\n");
- return;
- }
+ BUG_ON(!res);

spin_lock(&res->lock);
- atomic_long_inc(&mm->counter->rss);
- atomic_long_inc(&res->counter.rss);
+ rss = atomic_long_read(&res->counter.rss);
+ if ((rss > res->nr_pages) && (res->nr_pages > 0))
+ nr_shrink = rss - ((res->nr_pages * 4) / 5);
+ spin_unlock(&res->lock);
+
+ if (nr_shrink)
+ memctlr_shrink_container_memory(nr_shrink, res->rgroup,
+ SC_OVERLIMIT_ONE);
+ spin_lock(&res->lock);
+ res->reclaim_in_progress = 0;
spin_unlock(&res->lock);
}

-void memctlr_inc_rss(struct page *page)
+void memctlr_inc_rss_mm(struct page *page, struct mm_struct *mm)
{
struct memctlr *res;
- struct mm_struct *mm = get_task_mm(current);
+ long rss;

res = get_task_memctlr(current);
if (!res) {
- printk(KERN_INFO "inc_rss no res set *---*\n");
+ atomic_long_inc(&failed_inc_rss);
return;
}

spin_lock(&res->lock);
atomic_long_inc(&mm->counter->rss);
atomic_long_inc(&res->counter.rss);
+ rss = atomic_long_read(&res->counter.rss);
+ if ((res->nr_pages < rss) && (res->nr_pages > 0)) {
+ /*
+ * Reclaim if we exceed our limit
+ * Schedule a job to do so
+ */
+ if (res->reclaim_in_progress)
+ goto done;
+ res->reclaim_in_progress = 1;
+ spin_unlock(&res->lock);
+ PREPARE_WORK(&memctlr_work, memctlr_callback, res);
+ schedule_work(&memctlr_work);
+ return;
+ }
+done:
spin_unlock(&res->lock);
+}
+
+void memctlr_inc_rss(struct page *page)
+{
+ struct mm_struct *mm = get_task_mm(current);
+ memctlr_inc_rss_mm(page, mm);
mmput(mm);
}

@@ -162,9 +195,9 @@ void memctlr_dec_rss(struct page *page,
{
struct memctlr *res;

- res = get_task_memctlr(current);
+ res = get_memctlr(mm->container);
if (!res) {
- printk(KERN_INFO "dec_rss no res set *---*\n");
+ atomic_long_inc(&failed_dec_rss);
return;
}

@@ -183,6 +216,7 @@ static void memctlr_init_new(struct memc

memctlr_init_mem_counter(&res->counter);
res->nr_pages = SHARE_DONT_CARE;
+ res->reclaim_in_progress = 0;
spin_lock_init(&res->lock);
}

@@ -200,6 +234,7 @@ static struct res_shares *memctlr_alloc_
root_rgroup = rgroup;
memctlr_root = res;
res->nr_pages = nr_free_pages();
+ res->shares.max_shares = SHARE_DEFAULT_DIVISOR;
printk("Memory Controller version %s\n", version);
}
return &res->shares;
@@ -355,6 +390,20 @@ static ssize_t memctlr_show_stats(struct
buf += i;
len -= i;
j += i;
+
+ i = snprintf(buf, len, "Failed INC RSS Pages %ld\n",
+ atomic_long_read(&failed_inc_rss));
+
+ buf += i;
+ len -= i;
+ j += i;
+
+ i = snprintf(buf, len, "Failed DEC RSS Pages %ld\n",
+ atomic_long_read(&failed_dec_rss));
+
+ buf += i;
+ len -= i;
+ j += i;
return j;
}

@@ -421,6 +470,8 @@ static void recalc_and_propagate(struct
int child_divisor;
u64 numerator;
struct memctlr *child_res;
+ long rss;
+ unsigned long nr_shrink = 0;

if (parres) {
if (res->shares.max_shares == SHARE_DONT_CARE ||
@@ -445,6 +496,35 @@ static void recalc_and_propagate(struct
recalc_and_propagate(child_res, res);
}

+ /*
+ * Reclaim if our limit was shrunk
+ */
+ spin_lock(&res->lock);
+ rss = atomic_long_read(&res->counter.rss);
+ if ((rss > res->nr_pages) && (res->nr_pages > 0))
+ nr_shrink = rss - ((res->nr_pages * 4) / 5);
+ spin_unlock(&res->lock);
+
+ if (nr_shrink)
+ memctlr_shrink_container_memory(nr_shrink, NULL,
+ SC_OVERLIMIT_ALL);
+}
+
+int memctlr_over_limit(struct container *container)
+{
+ struct resource_group *rgroup = container;
+ struct memctlr *res;
+ int ret = 0;
+
+ res = get_memctlr(rgroup);
+ if (!res)
+ return ret;
+
+ spin_lock(&res->lock);
+ if (atomic_long_read(&res->counter.rss) > res->nr_pages)
+ ret = 1;
+ spin_unlock(&res->lock);
+ return ret;
}

static void memctlr_shares_changed(struct res_shares *shares)
@@ -477,6 +557,8 @@ int __init memctlr_init(void)
{
if (memctlr_rg.ctlr_id != NO_RES_ID)
return -EBUSY; /* already registered */
+ atomic_long_set(&failed_inc_rss, 0);
+ atomic_long_set(&failed_dec_rss, 0);
return register_controller(&memctlr_rg);
}

diff -puN include/linux/memctlr.h~container-memctlr-reclaim include/linux/memctlr.h
--- linux-2.6.19-rc2/include/linux/memctlr.h~container-memctlr-reclaim 2006-11-09 22:21:11.000000000 +0530
+++ linux-2.6.19-rc2-balbir/include/linux/memctlr.h 2006-11-09 22:21:11.000000000 +0530
@@ -34,6 +34,12 @@ extern void memctlr_inc_rss_mm(struct pa
extern void memctlr_dec_rss(struct page *page, struct mm_struct *mm);
extern void mm_free_mem_counter(struct mm_struct *mm);
extern int proc_memacct(struct task_struct *task, char *buffer);
+extern unsigned long memctlr_shrink_container_memory(unsigned long nr_pages,
+ struct container *container,
+ int overlimit);
+extern int memctlr_page_reclaim(struct page *page, void *container,
+ int overlimit);
+extern int memctlr_over_limit(struct container *container);

#else /* CONFIG_RES_GROUPS_MEMORY */

@@ -54,9 +60,20 @@ int mm_init_mem_counter(struct mm_struct
void mm_assign_container(struct mm_struct *mm, struct task_struct *p)
{}

+int memctlr_page_reclaim(struct page *page, void *container, int overlimit)
+{
+ return 1;
+}
+
void mm_free_mem_counter(struct mm_struct *mm)
{}

#endif /* CONFIG_RES_GROUPS_MEMORY */

+enum {
+ SC_OVERLIMIT_NONE, /* The scan is container independent */
+ SC_OVERLIMIT_ONE, /* Scan the one container specified */
+ SC_OVERLIMIT_ALL, /* Scan all containers */
+};
+
#endif /* _LINUX_MEMCTRL_H */
diff -puN mm/rmap.c~container-memctlr-reclaim mm/rmap.c
--- linux-2.6.19-rc2/mm/rmap.c~container-memctlr-reclaim 2006-11-09 22:21:11.000000000 +0530
+++ linux-2.6.19-rc2-balbir/mm/rmap.c 2006-11-09 22:21:11.000000000 +0530
@@ -604,6 +604,78 @@ void page_remove_rmap(struct page *page,
memctlr_dec_rss(page, mm);
}

+#ifdef CONFIG_RES_GROUPS_MEMORY
+/*
+ * Can we push this code down to try_to_unmap()?
+ */
+int memctlr_page_reclaim(struct page *page, void *container, int overlimit)
+{
+ int ret = 0;
+
+ if (overlimit == SC_OVERLIMIT_NONE)
+ return 1;
+ if (container == NULL && overlimit != SC_OVERLIMIT_ALL)
+ return 1;
+
+ if (!page_mapped(page))
+ return 0;
+
+ if (PageAnon(page)) {
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ return 0;
+
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ if (memctlr_over_limit(vma->vm_mm->container) &&
+ ((container == vma->vm_mm->container) ||
+ (overlimit == SC_OVERLIMIT_ALL))) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&anon_vma->lock);
+ } else {
+ struct address_space *mapping = page_mapping(page);
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+
+ if (!mapping)
+ return 0;
+
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+ pgoff) {
+ if (memctlr_over_limit(vma->vm_mm->container) &&
+ ((container == vma->vm_mm->container) ||
+ (overlimit == SC_OVERLIMIT_ALL))) {
+ ret = 1;
+ break;
+ }
+ }
+ if (ret)
+ goto done;
+
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.vm_set.list) {
+ if (memctlr_over_limit(vma->vm_mm->container) &&
+ ((container == vma->vm_mm->container) ||
+ (overlimit == SC_OVERLIMIT_ALL))) {
+ ret = 1;
+ break;
+ }
+ }
+done:
+ spin_unlock(&mapping->i_mmap_lock);
+ }
+
+ return ret;
+}
+#endif
+
/*
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
diff -puN kernel/fork.c~container-memctlr-reclaim kernel/fork.c
--- linux-2.6.19-rc2/kernel/fork.c~container-memctlr-reclaim 2006-11-09 22:21:11.000000000 +0530
+++ linux-2.6.19-rc2-balbir/kernel/fork.c 2006-11-09 22:21:11.000000000 +0530
@@ -364,6 +364,7 @@ struct mm_struct * mm_alloc(void)
if (mm) {
memset(mm, 0, sizeof(*mm));
mm = mm_init(mm);
+ mm_assign_container(mm, current);
}
return mm;
}
_

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/