[patch] mm: memcontrol: zap outstanding cache/swap references during unbind

From: Johannes Weiner
Date: Sun Jan 11 2015 - 10:29:05 EST


Cgroup core assumes that any outstanding css references after
offlining are temporary in nature, and e.g. mount waits for them to
disappear and release the root cgroup. But leftover page cache and
swapout records in an offlined memcg are only dropped when the pages
get reclaimed under pressure or the swapped out pages get faulted in
from other cgroups, and so those cgroup operations can hang forever.

Implement the ->unbind() callback to actively get rid of outstanding
references when cgroup core wants them gone. Swap out records are
deleted, such that the swap-in path will charge those pages to the
faulting task. Page cache pages are moved to the root memory cgroup.

Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
include/linux/swap_cgroup.h | 6 +++
mm/memcontrol.c | 126 ++++++++++++++++++++++++++++++++++++++++++++
mm/swap_cgroup.c | 38 +++++++++++++
3 files changed, 170 insertions(+)

diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index 145306bdc92f..ffe0866d2997 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -9,6 +9,7 @@ extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new);
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
+extern unsigned long swap_cgroup_zap_records(unsigned short id);
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
extern void swap_cgroup_swapoff(int type);

@@ -26,6 +27,11 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
return 0;
}

+static inline unsigned long swap_cgroup_zap_records(unsigned short id)
+{
+ return 0;
+}
+
static inline int
swap_cgroup_swapon(int type, unsigned long max_pages)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 692e96407627..40c426add613 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5197,6 +5197,131 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}

+static void unbind_lru_list(struct mem_cgroup *memcg,
+ struct zone *zone, enum lru_list lru)
+{
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ struct list_head *list = &lruvec->lists[lru];
+
+ while (!list_empty(list)) {
+ unsigned int nr_pages;
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ if (list_empty(list)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ break;
+ }
+ page = list_last_entry(list, struct page, lru);
+ if (!get_page_unless_zero(page)) {
+ list_move(&page->lru, list);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ continue;
+ }
+ BUG_ON(!PageLRU(page));
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, lru);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+ compound_lock(page);
+ nr_pages = hpage_nr_pages(page);
+
+ if (!mem_cgroup_move_account(page, nr_pages,
+ memcg, root_mem_cgroup)) {
+ /*
+ * root_mem_cgroup page counters are not used,
+ * otherwise we'd have to charge them here.
+ */
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_swap_account)
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+ css_put_many(&memcg->css, nr_pages);
+ }
+
+ compound_unlock(page);
+
+ putback_lru_page(page);
+ }
+}
+
+static void unbind_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css;
+retry:
+ drain_all_stock(root_mem_cgroup);
+
+ rcu_read_lock();
+ css_for_each_child(css, &root_mem_cgroup->css) {
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ /* Drop references from swap-out records */
+ if (do_swap_account) {
+ long zapped;
+
+ zapped = swap_cgroup_zap_records(memcg->css.id);
+ page_counter_uncharge(&memcg->memsw, zapped);
+ css_put_many(&memcg->css, zapped);
+ }
+
+ /* Drop references from leftover LRU pages */
+ css_get(css);
+ rcu_read_unlock();
+ atomic_inc(&memcg->moving_account);
+ synchronize_rcu();
+ while (page_counter_read(&memcg->memory) -
+ page_counter_read(&memcg->kmem) > 0) {
+ struct zone *zone;
+ enum lru_list lru;
+
+ lru_add_drain_all();
+
+ for_each_zone(zone)
+ for_each_lru(lru)
+ unbind_lru_list(memcg, zone, lru);
+
+ cond_resched();
+ }
+ atomic_dec(&memcg->moving_account);
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+ /*
+ * Swap-in is racy:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget_online()
+ * rcu_read_unlock()
+ * cgroup_kill_sb()
+ * !css_has_online_children()
+ * ->unbind()
+ * page_counter_try_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * Loop until until all references established from previously
+ * existing swap-out records have been transferred to pages on
+ * the LRU and then uncharged from there.
+ */
+ if (!list_empty(&root_mem_cgroup->css.children)) {
+ msleep(10);
+ goto retry;
+ }
+}
+
+static DECLARE_WORK(unbind_work, unbind_work_fn);
+
+static void mem_cgroup_unbind(struct cgroup_subsys_state *root_css)
+{
+ schedule_work(&unbind_work);
+}
+
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5360,6 +5485,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.bind = mem_cgroup_bind,
+ .unbind = mem_cgroup_unbind,
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
.early_init = 0,
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index b5f7f24b8dd1..665923a558c4 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -140,6 +140,44 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
return lookup_swap_cgroup(ent, NULL)->id;
}

+/**
+ * swap_cgroup_zap_records - delete all swapout records of one cgroup
+ * @id: memcg id
+ *
+ * Returns the number of deleted records.
+ */
+unsigned long swap_cgroup_zap_records(unsigned short id)
+{
+ unsigned long zapped = 0;
+ unsigned int type;
+
+ for (type = 0; type < MAX_SWAPFILES; type++) {
+ struct swap_cgroup_ctrl *ctrl;
+ unsigned long flags;
+ unsigned int page;
+
+ ctrl = &swap_cgroup_ctrl[type];
+ spin_lock_irqsave(&ctrl->lock, flags);
+ for (page = 0; page < ctrl->length; page++) {
+ struct swap_cgroup *base;
+ pgoff_t offset;
+
+ base = page_address(ctrl->map[page]);
+ for (offset = 0; offset < SC_PER_PAGE; offset++) {
+ struct swap_cgroup *sc;
+
+ sc = base + offset;
+ if (sc->id == id) {
+ sc->id = 0;
+ zapped++;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ }
+ return zapped;
+}
+
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
void *array;
--
2.2.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/