Re: [patch 01/10] mm: memcg: fix compaction/migration failing due tomemcg limits

From: Wanpeng Li
Date: Thu Jul 12 2012 - 05:10:49 EST


On Thu, Jul 12, 2012 at 04:54:07PM +0800, Wanpeng Li wrote:
>On Wed, Jul 11, 2012 at 07:02:13PM +0200, Johannes Weiner wrote:
>>Compaction (and page migration in general) can currently be hindered
>>through pages being owned by memory cgroups that are at their limits
>>and unreclaimable.
>>
>>The reason is that the replacement page is being charged against the
>>limit while the page being replaced is also still charged. But this
>>seems unnecessary, given that only one of the two pages will still be
>>in use after migration finishes.
>>
>>This patch changes the memcg migration sequence so that the
>>replacement page is not charged. Whatever page is still in use after
>>successful or failed migration gets to keep the charge of the page
>>that was going to be replaced.
>>
>>The replacement page will still show up temporarily in the rss/cache
>>statistics, this can be fixed in a later patch as it's less urgent.
>>
>
>So I want to know after this patch be merged if mem_cgroup_wait_acct_move
>still make sense, if the answer is no, I will send a patch to remove it.

And if this still make sense, I want to change check in
mem_cgroup_do_charge:

if (mem_cgroup_wait_acct_move(mem_over_limit))
return CHARGE_RETRY;

=>

if (mem_cgroup_wait_acct_move(mem_over_limit) &&
mem_cgroup_margin(mem_over_limit) >= nr_pages)
return CHARGE_RETRY;

Since mem_cgroup_relcaim can reclaim some pages, but in
mem_cgroup_reclaim function there are some exit condition:

total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
if(total && (flag & MEM_CGROUP_RECLAIM_SHRINK))
break;

and

if (mem_cgroup_margin(memcg))
break;

So maybe mem_cgroup_reclaim not reclaim enough pages >= nr_pages, this
time we should go to mem_cgroup_handle_oom instead of return
CHARGE_RETRY.

Hopefully, you can verify if my idea make sense.


>>Reported-by: David Rientjes <rientjes@xxxxxxxxxx>
>>Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
>>Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>>Acked-by: Michal Hocko <mhocko@xxxxxxx>
>>---
>> include/linux/memcontrol.h | 11 +++----
>> mm/memcontrol.c | 67 +++++++++++++++++++++++--------------------
>> mm/migrate.c | 27 ++++--------------
>> 3 files changed, 47 insertions(+), 58 deletions(-)
>>
>>diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>index 5a3ee64..8d9489f 100644
>>--- a/include/linux/memcontrol.h
>>+++ b/include/linux/memcontrol.h
>>@@ -98,9 +98,9 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
>>
>> extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
>>
>>-extern int
>>-mem_cgroup_prepare_migration(struct page *page,
>>- struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask);
>>+extern void
>>+mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
>>+ struct mem_cgroup **memcgp);
>> extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>> struct page *oldpage, struct page *newpage, bool migration_ok);
>>
>>@@ -276,11 +276,10 @@ static inline struct cgroup_subsys_state
>> return NULL;
>> }
>>
>>-static inline int
>>+static inline void
>> mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
>>- struct mem_cgroup **memcgp, gfp_t gfp_mask)
>>+ struct mem_cgroup **memcgp)
>> {
>>- return 0;
>> }
>>
>> static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>>diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>index e8ddc00..12ee2de 100644
>>--- a/mm/memcontrol.c
>>+++ b/mm/memcontrol.c
>>@@ -2977,7 +2977,8 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
>> * uncharge if !page_mapped(page)
>> */
>> static struct mem_cgroup *
>>-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
>>+ bool end_migration)
>> {
>> struct mem_cgroup *memcg = NULL;
>> unsigned int nr_pages = 1;
>>@@ -3021,7 +3022,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>> /* fallthrough */
>> case MEM_CGROUP_CHARGE_TYPE_DROP:
>> /* See mem_cgroup_prepare_migration() */
>>- if (page_mapped(page) || PageCgroupMigration(pc))
>>+ if (page_mapped(page))
>>+ goto unlock_out;
>>+ /*
>>+ * Pages under migration may not be uncharged. But
>>+ * end_migration() /must/ be the one uncharging the
>>+ * unused post-migration page and so it has to call
>>+ * here with the migration bit still set. See the
>>+ * res_counter handling below.
>>+ */
>>+ if (!end_migration && PageCgroupMigration(pc))
>> goto unlock_out;
>> break;
>> case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
>>@@ -3055,7 +3065,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>> mem_cgroup_swap_statistics(memcg, true);
>> mem_cgroup_get(memcg);
>> }
>>- if (!mem_cgroup_is_root(memcg))
>>+ /*
>>+ * Migration does not charge the res_counter for the
>>+ * replacement page, so leave it alone when phasing out the
>>+ * page that is unused after the migration.
>>+ */
>>+ if (!end_migration && !mem_cgroup_is_root(memcg))
>> mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
>>
>> return memcg;
>>@@ -3071,14 +3086,14 @@ void mem_cgroup_uncharge_page(struct page *page)
>> if (page_mapped(page))
>> return;
>> VM_BUG_ON(page->mapping && !PageAnon(page));
>>- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON);
>>+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
>> }
>>
>> void mem_cgroup_uncharge_cache_page(struct page *page)
>> {
>> VM_BUG_ON(page_mapped(page));
>> VM_BUG_ON(page->mapping);
>>- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
>>+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
>> }
>>
>> /*
>>@@ -3142,7 +3157,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
>> if (!swapout) /* this was a swap cache but the swap is unused ! */
>> ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
>>
>>- memcg = __mem_cgroup_uncharge_common(page, ctype);
>>+ memcg = __mem_cgroup_uncharge_common(page, ctype, false);
>>
>> /*
>> * record memcg information, if swapout && memcg != NULL,
>>@@ -3232,19 +3247,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
>> * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
>> * page belongs to.
>> */
>>-int mem_cgroup_prepare_migration(struct page *page,
>>- struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
>>+void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
>>+ struct mem_cgroup **memcgp)
>> {
>> struct mem_cgroup *memcg = NULL;
>> struct page_cgroup *pc;
>> enum charge_type ctype;
>>- int ret = 0;
>>
>> *memcgp = NULL;
>>
>> VM_BUG_ON(PageTransHuge(page));
>> if (mem_cgroup_disabled())
>>- return 0;
>>+ return;
>>
>> pc = lookup_page_cgroup(page);
>> lock_page_cgroup(pc);
>>@@ -3289,24 +3303,9 @@ int mem_cgroup_prepare_migration(struct page *page,
>> * we return here.
>> */
>> if (!memcg)
>>- return 0;
>>+ return;
>>
>> *memcgp = memcg;
>>- ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
>>- css_put(&memcg->css);/* drop extra refcnt */
>>- if (ret) {
>>- if (PageAnon(page)) {
>>- lock_page_cgroup(pc);
>>- ClearPageCgroupMigration(pc);
>>- unlock_page_cgroup(pc);
>>- /*
>>- * The old page may be fully unmapped while we kept it.
>>- */
>>- mem_cgroup_uncharge_page(page);
>>- }
>>- /* we'll need to revisit this error code (we have -EINTR) */
>>- return -ENOMEM;
>>- }
>> /*
>> * We charge new page before it's used/mapped. So, even if unlock_page()
>> * is called before end_migration, we can catch all events on this new
>>@@ -3319,8 +3318,12 @@ int mem_cgroup_prepare_migration(struct page *page,
>> ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
>> else
>> ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
>>+ /*
>>+ * The page is committed to the memcg, but it's not actually
>>+ * charged to the res_counter since we plan on replacing the
>>+ * old one and only one page is going to be left afterwards.
>>+ */
>> __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
>>- return ret;
>> }
>>
>> /* remove redundant charge if migration failed*/
>>@@ -3342,6 +3345,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>> used = newpage;
>> unused = oldpage;
>> }
>>+ anon = PageAnon(used);
>>+ __mem_cgroup_uncharge_common(unused,
>>+ anon ? MEM_CGROUP_CHARGE_TYPE_ANON
>>+ : MEM_CGROUP_CHARGE_TYPE_CACHE,
>>+ true);
>>+ css_put(&memcg->css);
>> /*
>> * We disallowed uncharge of pages under migration because mapcount
>> * of the page goes down to zero, temporarly.
>>@@ -3351,10 +3360,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>> lock_page_cgroup(pc);
>> ClearPageCgroupMigration(pc);
>> unlock_page_cgroup(pc);
>>- anon = PageAnon(used);
>>- __mem_cgroup_uncharge_common(unused,
>>- anon ? MEM_CGROUP_CHARGE_TYPE_ANON
>>- : MEM_CGROUP_CHARGE_TYPE_CACHE);
>>
>> /*
>> * If a page is a file cache, radix-tree replacement is very atomic
>>diff --git a/mm/migrate.c b/mm/migrate.c
>>index 8137aea..aa06bf4 100644
>>--- a/mm/migrate.c
>>+++ b/mm/migrate.c
>>@@ -687,7 +687,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> {
>> int rc = -EAGAIN;
>> int remap_swapcache = 1;
>>- int charge = 0;
>> struct mem_cgroup *mem;
>> struct anon_vma *anon_vma = NULL;
>>
>>@@ -729,12 +728,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> }
>>
>> /* charge against new page */
>>- charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
>>- if (charge == -ENOMEM) {
>>- rc = -ENOMEM;
>>- goto unlock;
>>- }
>>- BUG_ON(charge);
>>+ mem_cgroup_prepare_migration(page, newpage, &mem);
>>
>> if (PageWriteback(page)) {
>> /*
>>@@ -824,8 +818,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> put_anon_vma(anon_vma);
>>
>> uncharge:
>>- if (!charge)
>>- mem_cgroup_end_migration(mem, page, newpage, rc == 0);
>>+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
>> unlock:
>> unlock_page(page);
>> out:
>>@@ -1519,10 +1512,9 @@ migrate_misplaced_page(struct page *page, struct mm_struct *mm, int node)
>> {
>> struct page *oldpage = page, *newpage;
>> struct address_space *mapping = page_mapping(page);
>>- struct mem_cgroup *mcg;
>>+ struct mem_cgroup *memcg;
>> unsigned int gfp;
>> int rc = 0;
>>- int charge = -ENOMEM;
>>
>> VM_BUG_ON(!PageLocked(page));
>> VM_BUG_ON(page_mapcount(page));
>>@@ -1556,12 +1548,7 @@ migrate_misplaced_page(struct page *page, struct mm_struct *mm, int node)
>> if (!trylock_page(newpage))
>> BUG(); /* new page should be unlocked!!! */
>>
>>- // XXX hnaz, is this right?
>>- charge = mem_cgroup_prepare_migration(page, newpage, &mcg, gfp);
>>- if (charge == -ENOMEM) {
>>- rc = charge;
>>- goto out;
>>- }
>>+ mem_cgroup_prepare_migration(page, newpage, &memcg);
>>
>> newpage->index = page->index;
>> newpage->mapping = page->mapping;
>>@@ -1581,11 +1568,9 @@ migrate_misplaced_page(struct page *page, struct mm_struct *mm, int node)
>> page = newpage;
>> }
>>
>>+ mem_cgroup_end_migration(memcg, oldpage, newpage, !rc);
>> out:
>>- if (!charge)
>>- mem_cgroup_end_migration(mcg, oldpage, newpage, !rc);
>>-
>>- if (oldpage != page)
>>+ if (oldpage != page)
>> put_page(oldpage);
>>
>> if (rc) {
>>--
>>1.7.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/