[PATCH(fixed) 11/12] free page cgroup from LRU in add

From: KAMEZAWA Hiroyuki
Date: Thu Sep 25 2008 - 22:22:20 EST


Hunks fixed to apply on top of 9/12 (fixed).
==
Delay add_to_lru() and do it in a batched manner, like pagevec.
To do this, two flags are used: PCG_USED and PCG_LRU.
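
For reference, the add side of the batching looks like this (condensed
from the diff below; page_cgroup entries are queued in a per-cpu vector
and flushed onto the memcg LRU with one lru_lock section per zone):

	static void set_page_cgroup_lru(struct page_cgroup *pc)
	{
		struct memcg_percpu_vec *mpv;

		mpv = &get_cpu_var(memcg_add_vec);
		mpv->vec[mpv->nr++] = pc;
		if (mpv->nr >= mpv->limit)	/* vector is full: drain in batch */
			__set_page_cgroup_lru(mpv);
		put_cpu_var(memcg_add_vec);
	}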

If PCG_LRU is set, the page is on an LRU and it is safe to access the
LRU via page_cgroup (under the appropriate lock).
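
As a minimal sketch (illustration only, not part of the patch), a
walker that wants to touch per-memcg LRU state is expected to do:

	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	unsigned long flags;

	spin_lock_irqsave(&mz->lru_lock, flags);
	if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
		/* pc is really on this memcg's LRU; pc->lru is valid here */
	}
	spin_unlock_irqrestore(&mz->lru_lock, flags);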

To avoid races, this patch uses TestSetPageCgroupUsed() and checks the
PCG_USED and PCG_LRU bits in the add/free vectors. With this,
lock_page_cgroup() in mem_cgroup_charge() is removed.

(I don't want to call lock_page_cgroup() under mz->lru_lock in the
add/free vector core logic, so the TestSetPageCgroupUsed() logic is
added. TestSet is an easy way to avoid unnecessary nesting of locks;
see the sketch below.)
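
With that, the charge-side claim reduces to one atomic test-and-set,
as in this excerpt from the mem_cgroup_charge_common() hunk below:

	preempt_disable();
	if (TestSetPageCgroupUsed(pc)) {
		/* someone else charged this page first; back out */
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		preempt_enable();
		goto done;
	}
	/* we now own pc; PCG_LRU is fixed up later under mz->lru_lock */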


Changelog: v3 -> v5
- removed css_get()/css_put() per page_cgroup.
Now the *new* force_empty checks whether any page_cgroup remains
attached to the memcg, so we don't need to be afraid of leaks.

Changelog: v2 -> v3
- added a TRANSIT flag and removed the lock from the core logic.
Changelog: v1 -> v2
- renamed use_page_cgroup() to set_page_cgroup_lru().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

include/linux/page_cgroup.h | 10 +++
mm/memcontrol.c | 121 +++++++++++++++++++++++++++++++-------------
2 files changed, 96 insertions(+), 35 deletions(-)

Index: mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
===================================================================
--- mmotm-2.6.27-rc7+.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
@@ -24,6 +24,7 @@ enum {
 	PCG_LOCK, /* page cgroup is locked */
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
+	PCG_LRU, /* this is on LRU */
 	/* flags for LRU placement */
 	PCG_ACTIVE, /* page is active in this cgroup */
 	PCG_FILE, /* page is file system backed */
@@ -42,11 +43,20 @@ static inline void SetPageCgroup##uname(
 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ clear_bit(PCG_##lname, &pc->flags); }
 
+#define TESTSETPCGFLAG(uname, lname)					\
+static inline int TestSetPageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_and_set_bit(PCG_##lname, &pc->flags); }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
 
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
+TESTSETPCGFLAG(Used, USED)
+
+TESTPCGFLAG(LRU, LRU)
+SETPCGFLAG(LRU, LRU)
+CLEARPCGFLAG(LRU, LRU)
 
 /* LRU management flags (from global-lru definition) */
 TESTPCGFLAG(File, FILE)
Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
===================================================================
--- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
+++ mmotm-2.6.27-rc7+/mm/memcontrol.c
@@ -149,9 +149,9 @@ enum charge_type {

 static const unsigned long
 pcg_default_flags[NR_CHARGE_TYPE] = {
-	(1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
-	(1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
-	(1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
+	(1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED),
+	(1 << PCG_ACTIVE) | (1 << PCG_USED),
+	(1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED),
 };

/*
@@ -193,7 +193,6 @@ page_cgroup_zoneinfo(struct page_cgroup
 	struct mem_cgroup *mem = pc->mem_cgroup;
 	int nid = page_cgroup_nid(pc);
 	int zid = page_cgroup_zid(pc);
-
 	return mem_cgroup_zoneinfo(mem, nid, zid);
 }

@@ -341,7 +340,7 @@ void mem_cgroup_move_lists(struct page *
 	if (!trylock_page_cgroup(pc))
 		return;
 
-	if (PageCgroupUsed(pc)) {
+	if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
 		mem = pc->mem_cgroup;
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
@@ -500,6 +499,9 @@ int mem_cgroup_move_account(struct page
 	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
 	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
 
+	if (!PageCgroupLRU(pc))
+		return ret;
+
 	if (res_counter_charge(&to->res, PAGE_SIZE)) {
 		/* Now, we assume no_limit...no failure here. */
 		return ret;
@@ -516,10 +518,8 @@ int mem_cgroup_move_account(struct page

 	if (spin_trylock(&to_mz->lru_lock)) {
 		__mem_cgroup_remove_list(from_mz, pc);
-		css_put(&from->css);
 		res_counter_uncharge(&from->res, PAGE_SIZE);
 		pc->mem_cgroup = to;
-		css_get(&to->css);
 		__mem_cgroup_add_list(to_mz, pc);
 		ret = 0;
 		spin_unlock(&to_mz->lru_lock);
@@ -540,6 +540,7 @@ struct memcg_percpu_vec {
 	struct page_cgroup *vec[MEMCG_PCPVEC_SIZE];
 };
 static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_free_vec);
+static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_add_vec);
 
 static void
 __release_page_cgroup(struct memcg_percpu_vec *mpv)
@@ -555,7 +556,6 @@ __release_page_cgroup(struct memcg_percp
 	prev_mz = NULL;
 	for (i = nr - 1; i >= 0; i--) {
 		pc = mpv->vec[i];
-		VM_BUG_ON(PageCgroupUsed(pc));
 		mz = page_cgroup_zoneinfo(pc);
 		if (prev_mz != mz) {
 			if (prev_mz)
@@ -563,9 +563,10 @@ __release_page_cgroup(struct memcg_percp
 			prev_mz = mz;
 			spin_lock(&mz->lru_lock);
 		}
-		__mem_cgroup_remove_list(mz, pc);
-		css_put(&pc->mem_cgroup->css);
-		pc->mem_cgroup = NULL;
+		if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
+			__mem_cgroup_remove_list(mz, pc);
+			ClearPageCgroupLRU(pc);
+		}
 	}
 	if (prev_mz)
 		spin_unlock(&prev_mz->lru_lock);
@@ -574,10 +575,43 @@ __release_page_cgroup(struct memcg_percp
 }
 
 static void
+__set_page_cgroup_lru(struct memcg_percpu_vec *mpv)
+{
+	unsigned long flags;
+	struct mem_cgroup_per_zone *mz, *prev_mz;
+	struct page_cgroup *pc;
+	int i, nr;
+
+	local_irq_save(flags);
+	nr = mpv->nr;
+	mpv->nr = 0;
+	prev_mz = NULL;
+
+	for (i = nr - 1; i >= 0; i--) {
+		pc = mpv->vec[i];
+		mz = page_cgroup_zoneinfo(pc);
+		if (prev_mz != mz) {
+			if (prev_mz)
+				spin_unlock(&prev_mz->lru_lock);
+			prev_mz = mz;
+			spin_lock(&mz->lru_lock);
+		}
+		if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
+			SetPageCgroupLRU(pc);
+			__mem_cgroup_add_list(mz, pc);
+		}
+	}
+
+	if (prev_mz)
+		spin_unlock(&prev_mz->lru_lock);
+	local_irq_restore(flags);
+
+}
+
+static void
 release_page_cgroup(struct page_cgroup *pc)
 {
 	struct memcg_percpu_vec *mpv;
-
 	mpv = &get_cpu_var(memcg_free_vec);
 	mpv->vec[mpv->nr++] = pc;
 	if (mpv->nr >= mpv->limit)
@@ -585,11 +619,25 @@ release_page_cgroup(struct page_cgroup *
 	put_cpu_var(memcg_free_vec);
 }
 
+static void
+set_page_cgroup_lru(struct page_cgroup *pc)
+{
+	struct memcg_percpu_vec *mpv;
+
+	mpv = &get_cpu_var(memcg_add_vec);
+	mpv->vec[mpv->nr++] = pc;
+	if (mpv->nr >= mpv->limit)
+		__set_page_cgroup_lru(mpv);
+	put_cpu_var(memcg_add_vec);
+}
+
 static void page_cgroup_start_cache_cpu(int cpu)
 {
 	struct memcg_percpu_vec *mpv;
 	mpv = &per_cpu(memcg_free_vec, cpu);
 	mpv->limit = MEMCG_PCPVEC_SIZE;
+	mpv = &per_cpu(memcg_add_vec, cpu);
+	mpv->limit = MEMCG_PCPVEC_SIZE;
 }

#ifdef CONFIG_HOTPLUG_CPU
@@ -598,6 +646,8 @@ static void page_cgroup_stop_cache_cpu(i
 	struct memcg_percpu_vec *mpv;
 	mpv = &per_cpu(memcg_free_vec, cpu);
 	mpv->limit = 0;
+	mpv = &per_cpu(memcg_add_vec, cpu);
+	mpv->limit = 0;
 }
 #endif

@@ -611,6 +661,9 @@ static DEFINE_MUTEX(memcg_force_drain_mu
 static void drain_page_cgroup_local(struct work_struct *work)
 {
 	struct memcg_percpu_vec *mpv;
+	mpv = &get_cpu_var(memcg_add_vec);
+	__set_page_cgroup_lru(mpv);
+	put_cpu_var(mpv);
 	mpv = &get_cpu_var(memcg_free_vec);
 	__release_page_cgroup(mpv);
 	put_cpu_var(mpv);
@@ -677,14 +730,9 @@ static int mem_cgroup_charge_common(stru
 			rcu_read_unlock();
 			return 0;
 		}
-		/*
-		 * For every charge from the cgroup, increment reference count
-		 */
-		css_get(&mem->css);
 		rcu_read_unlock();
 	} else {
 		mem = memcg;
-		css_get(&memcg->css);
 	}
 
 	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
@@ -711,33 +759,36 @@ static int mem_cgroup_charge_common(stru
 	}
 
 	preempt_disable();
-	lock_page_cgroup(pc);
-	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
+	if (TestSetPageCgroupUsed(pc)) {
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		css_put(&mem->css);
 		preempt_enable();
 		goto done;
 	}
-	pc->mem_cgroup = mem;
 	/*
-	 * If a page is accounted as a page cache, insert to inactive list.
-	 * If anon, insert to active list.
-	 */
-	pc->flags = pcg_default_flags[ctype];
-
-	mz = page_cgroup_zoneinfo(pc);
+	 * page cgroup is *unused* now....but....
+	 * We can assume old mem_cgroup's metadata is still available
+	 * because pc is not on stale LRU after force_empty() is called.
+	 */
+	if (likely(!PageCgroupLRU(pc)))
+		pc->flags = pcg_default_flags[ctype];
+	else {
+		mz = page_cgroup_zoneinfo(pc);
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		if (PageCgroupLRU(pc)) {
+			__mem_cgroup_remove_list(mz, pc);
+			ClearPageCgroupLRU(pc);
+		}
+		pc->flags = pcg_default_flags[ctype];
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+	}
 
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_add_list(mz, pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
-	unlock_page_cgroup(pc);
+	pc->mem_cgroup = mem;
+	set_page_cgroup_lru(pc);
 	preempt_enable();
 
 done:
 	return 0;
 out:
-	css_put(&mem->css);
 	return -ENOMEM;
 }

@@ -823,12 +874,12 @@ __mem_cgroup_uncharge_common(struct page
 	preempt_disable();
 	lock_page_cgroup(pc);
 	ClearPageCgroupUsed(pc);
+	mem = pc->mem_cgroup;
 	unlock_page_cgroup(pc);
 	preempt_enable();
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
 
-	mem = pc->mem_cgroup;
 	release_page_cgroup(pc);
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
 
 	return;
 }

--