Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

From: Daisuke Nishimura
Date: Wed Mar 03 2010 - 01:07:44 EST


On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> On Wed, 3 Mar 2010 11:12:38 +0900
> Daisuke Nishimura <nishimura@xxxxxxxxxxxxxxxxx> wrote:
>
> > > diff --git a/mm/filemap.c b/mm/filemap.c
> > > index fe09e51..f85acae 100644
> > > --- a/mm/filemap.c
> > > +++ b/mm/filemap.c
> > > @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
> > > * having removed the page entirely.
> > > */
> > > if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
> > > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
> > > dec_zone_page_state(page, NR_FILE_DIRTY);
> > > dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > > }
> > (snip)
> > > @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
> > > void account_page_dirtied(struct page *page, struct address_space *mapping)
> > > {
> > > if (mapping_cap_account_dirty(mapping)) {
> > > + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
> > > __inc_zone_page_state(page, NR_FILE_DIRTY);
> > > __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
> > > task_dirty_inc(current);
> > As far as I can see, those two functions (at least) call mem_cgroup_update_stat(),
> > which acquires the page cgroup lock, under mapping->tree_lock.
> > But as I fixed before in commit e767e056, the page cgroup lock must not be acquired
> > under mapping->tree_lock.
> > Hmm, we should either call mem_cgroup_update_stat() outside mapping->tree_lock,
> > or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid deadlock.
> >
> Ah, good catch! But hmmmmmm...
> This account_page_dirtied() seems to be called with IRQs already disabled.
> As for __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag;
> then mem_cgroup_uncharge_page() can handle it automatically.
>
> But there is no guarantee that the following never happens:
>     lock_page_cgroup()
>         <=== interrupt
>             -> mapping->tree_lock()
> even if mapping->tree_lock itself is always taken with IRQs disabled.
> So, if we add local_irq_save(), we have to add it to every lock_page_cgroup() call.
>
> Then, hm... maybe some kind of new trick, like the following?
> (The following patch is not tested!!)
>
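To restate the problematic nesting (a rough sketch of the call chain, not an actual
trace; mem_cgroup_update_stat() is the helper patch 3/3 adds):

	remove_from_page_cache()
	  spin_lock_irq(&mapping->tree_lock)
	    __remove_from_page_cache(page)
	      mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1)
	        lock_page_cgroup(pc)   <=== page cgroup lock taken under tree_lock
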
If we can verify that every caller of mem_cgroup_update_stat() consistently either holds
or does not hold mapping->tree_lock, this direction will work fine.
But if we can't, we have to add local_irq_save() to lock_page_cgroup(), like below.

===
include/linux/page_cgroup.h | 8 ++++++--
mm/memcontrol.c | 43 +++++++++++++++++++++++++------------------
2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..51da916 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -83,15 +83,19 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
return page_zonenum(pc->page);
}

-static inline void lock_page_cgroup(struct page_cgroup *pc)
+static inline void __lock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_lock(PCG_LOCK, &pc->flags);
}
+#define lock_page_cgroup(pc, flags) \
+ do { local_irq_save(flags); __lock_page_cgroup(pc); } while (0)

-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline void __unlock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
+#define unlock_page_cgroup(pc, flags) \
+ do { __unlock_page_cgroup(pc); local_irq_restore(flags); } while (0)

#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00ed4b1..40b9be4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1327,12 +1327,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
+ unsigned long flags;

pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;

- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
mem = pc->mem_cgroup;
if (!mem)
goto done;
@@ -1346,7 +1347,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);

done:
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
}

/*
@@ -1680,11 +1681,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
struct page_cgroup *pc;
unsigned short id;
swp_entry_t ent;
+ unsigned long flags;

VM_BUG_ON(!PageLocked(page));

pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
if (mem && !css_tryget(&mem->css))
@@ -1698,7 +1700,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
mem = NULL;
rcu_read_unlock();
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
return mem;
}

@@ -1711,13 +1713,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
struct page_cgroup *pc,
enum charge_type ctype)
{
+ unsigned long flags;
+
/* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem)
return;

- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (unlikely(PageCgroupUsed(pc))) {
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
mem_cgroup_cancel_charge(mem);
return;
}
@@ -1747,7 +1751,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,

mem_cgroup_charge_statistics(mem, pc, true);

- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1817,12 +1821,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
int ret = -EINVAL;
- lock_page_cgroup(pc);
+ unsigned long flags;
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
__mem_cgroup_move_account(pc, from, to, uncharge);
ret = 0;
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
/*
* check events
*/
@@ -1949,17 +1954,17 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
*/
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
-
+ unsigned long flags;

pc = lookup_page_cgroup(page);
if (!pc)
return 0;
- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc)) {
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
return 0;
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
}

if (unlikely(!mm && !mem))
@@ -2141,6 +2146,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
struct mem_cgroup_per_zone *mz;
+ unsigned long flags;

if (mem_cgroup_disabled())
return NULL;
@@ -2155,7 +2161,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (unlikely(!pc || !PageCgroupUsed(pc)))
return NULL;

- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);

mem = pc->mem_cgroup;

@@ -2194,7 +2200,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
*/

mz = page_cgroup_zoneinfo(pc);
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);

memcg_check_events(mem, page);
/* at swapout, this memcg will be accessed to record to swap */
@@ -2204,7 +2210,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
return mem;

unlock_out:
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);
return NULL;
}

@@ -2392,17 +2398,18 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
int ret = 0;
+ unsigned long flags;

if (mem_cgroup_disabled())
return 0;

pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
+ lock_page_cgroup(pc, flags);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
css_get(&mem->css);
}
- unlock_page_cgroup(pc);
+ unlock_page_cgroup(pc, flags);

if (mem) {
ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
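
With the change above, a stat update helper like the one patch 3/3 introduces would be
safe to call even with mapping->tree_lock held. Just a sketch, not the actual 3/3 code;
I'm guessing the exact signature of mem_cgroup_update_stat() here:

/* signature guessed from how patch 3/3 calls it */
void mem_cgroup_update_stat(struct page *page, enum mem_cgroup_stat_index idx, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long flags;

	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;

	/*
	 * lock_page_cgroup() now disables IRQs, so even if we are called
	 * with mapping->tree_lock held, an interrupt on this CPU cannot
	 * come in and spin on the page cgroup lock we already hold
	 * (e.g. via a mapping->tree_lock path).
	 */
	lock_page_cgroup(pc, flags);
	mem = pc->mem_cgroup;
	if (mem && PageCgroupUsed(pc))
		__this_cpu_add(mem->stat->count[idx], val);
	unlock_page_cgroup(pc, flags);
}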


> ==
> ---
> include/linux/page_cgroup.h | 14 ++++++++++++++
> mm/memcontrol.c | 27 +++++++++++++++++----------
> 2 files changed, 31 insertions(+), 10 deletions(-)
>
> Index: mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/include/linux/page_cgroup.h
> +++ mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
> @@ -39,6 +39,7 @@ enum {
> PCG_CACHE, /* charged as cache */
> PCG_USED, /* this object is in use. */
> PCG_ACCT_LRU, /* page has been accounted for */
> + PCG_MIGRATE, /* page cgroup is under memcg account migration */
> };
>
> #define TESTPCGFLAG(uname, lname) \
> @@ -73,6 +74,8 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
> TESTPCGFLAG(AcctLRU, ACCT_LRU)
> TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
>
> +TESTPCGFLAG(Migrate, MIGRATE)
> +
> static inline int page_cgroup_nid(struct page_cgroup *pc)
> {
> return page_to_nid(pc->page);
> @@ -93,6 +96,19 @@ static inline void unlock_page_cgroup(st
> bit_spin_unlock(PCG_LOCK, &pc->flags);
> }
>
> +static inline unsigned long page_cgroup_migration_lock(struct page_cgroup *pc)
> +{
> + unsigned long flags;
> + local_irq_save(flags);
> + bit_spin_lock(PCG_MIGRATE, &pc->flags);
> + return flags;
> +}
> +static inline void
> +page_cgroup_migration_unlock(struct page_cgroup *pc, unsigned long flags)
> +{
> + bit_spin_unlock(PCG_MIGRATE, &pc->flags);
> + local_irq_restore(flags);
> +}
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
> +++ mmotm-2.6.33-Feb11/mm/memcontrol.c
> @@ -1321,7 +1321,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
> * Currently used to update mapped file statistics, but the routine can be
> * generalized to update other statistics as well.
> */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_file_mapped(struct page *page, int val, int locked)
> {
> struct mem_cgroup *mem;
> struct page_cgroup *pc;
> @@ -1329,22 +1329,27 @@ void mem_cgroup_update_file_mapped(struc
> pc = lookup_page_cgroup(page);
> if (unlikely(!pc))
> return;
> -
> - lock_page_cgroup(pc);
> + /*
> + * If locked==1, mapping->tree_lock is held, so we don't have to
> + * worry about charge/uncharge; we only need to care about migration.
> + */
> + if (!locked)
> + lock_page_cgroup(pc);
> + else
> + page_cgroup_migration_lock(pc);
> mem = pc->mem_cgroup;
> - if (!mem)
> + if (!mem || !PageCgroupUsed(pc))
> goto done;
> -
> - if (!PageCgroupUsed(pc))
> - goto done;
> -
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
>
> done:
> - unlock_page_cgroup(pc);
> + if (!locked)
> + unlock_page_cgroup(pc);
> + else
> + page_cgroup_migration_unlock(pc);
> }
>
> /*
> @@ -1785,7 +1790,8 @@ static void __mem_cgroup_move_account(st
> VM_BUG_ON(!PageCgroupLocked(pc));
> VM_BUG_ON(!PageCgroupUsed(pc));
> VM_BUG_ON(pc->mem_cgroup != from);
> -
> +
> + page_cgroup_migration_lock(pc);
> page = pc->page;
> if (page_mapped(page) && !PageAnon(page)) {
> /* Update mapped_file data for mem_cgroup */
> @@ -1802,6 +1808,7 @@ static void __mem_cgroup_move_account(st
> /* caller should have done css_get */
> pc->mem_cgroup = to;
> mem_cgroup_charge_statistics(to, pc, true);
> + page_cgroup_migration_unlock(pc);
> /*
> * We charges against "to" which may not have any tasks. Then, "to"
> * can be under rmdir(). But in current implementation, caller of
>
>
>
>
> Thanks,
> -Kame
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/