Re: mmotm 2014-02-05 list_lru_add lockdep splat

From: Hugh Dickins
Date: Thu Feb 06 2014 - 17:19:27 EST


On Thu, 6 Feb 2014, Johannes Weiner wrote:
> On Wed, Feb 05, 2014 at 07:50:10PM -0800, Hugh Dickins wrote:
> > ======================================================
> > [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]
> > 3.14.0-rc1-mm1 #1 Not tainted
> > ------------------------------------------------------
> > kswapd0/48 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
> > (&(&lru->node[i].lock)->rlock){+.+.-.}, at: [<ffffffff81117064>] list_lru_add+0x80/0xf4
> >
> > and this task is already holding:
> > (&(&mapping->tree_lock)->rlock){..-.-.}, at: [<ffffffff81108c63>] __remove_mapping+0x3b/0x12d
> > which would create a new lock dependency:
> > (&(&mapping->tree_lock)->rlock){..-.-.} -> (&(&lru->node[i].lock)->rlock){+.+.-.}
>
> Thanks for the report. The first time I saw this on my own machine, I
> misinterpreted it as a false positive (could have sworn the "possible
> unsafe scenario" section looked different, too).
>
> Looking at it again, there really is a deadlock scenario when the
> shadow shrinker races with a page cache insertion or deletion and is
> interrupted by the IO completion handler while holding the list_lru
> lock:
>
> > Possible interrupt unsafe locking scenario:
> >
> >        CPU0                    CPU1
> >        ----                    ----
> >   lock(&(&lru->node[i].lock)->rlock);
> >                                local_irq_disable();
> >                                lock(&(&mapping->tree_lock)->rlock);
> >                                lock(&(&lru->node[i].lock)->rlock);
> >   <Interrupt>
> >     lock(&(&mapping->tree_lock)->rlock);
>
> Could you please try with the following patch?

Sure, that fixes it for me (with one trivial correction appended), thanks.
But don't imagine I've given it anything as demanding as thought!
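
To restate Johannes's scenario in kernel-ish C rather than lockdep-ese (a
made-up sketch, not the real call chains; only the ordering comes from his
analysis above):

	/* CPU0: shadow shrinker, IRQs still enabled */
	spin_lock(&lru->node[i].lock);
	...
		/* <IRQ> IO completion handler */
		spin_lock_irqsave(&mapping->tree_lock, flags);	/* spins on CPU1 */

	/* CPU1: page cache insertion or deletion */
	spin_lock_irq(&mapping->tree_lock);
	...
	spin_lock(&lru->node[i].lock);		/* spins waiting for CPU0: deadlock */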

Hugh

>
> ---
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
> Subject: [patch] mm: keep page cache radix tree nodes in check fix
>
> Hugh Dickins reports the following lockdep splat:
>
> ======================================================
> [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]
> 3.14.0-rc1-mm1 #1 Not tainted
> ------------------------------------------------------
> kswapd0/48 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
> (&(&lru->node[i].lock)->rlock){+.+.-.}, at: [<ffffffff81117064>] list_lru_add+0x80/0xf4
>
> and this task is already holding:
> (&(&mapping->tree_lock)->rlock){..-.-.}, at: [<ffffffff81108c63>] __remove_mapping+0x3b/0x12d
> which would create a new lock dependency:
> (&(&mapping->tree_lock)->rlock){..-.-.} -> (&(&lru->node[i].lock)->rlock){+.+.-.}
>
> lru->node[i].lock nests inside the mapping->tree_lock when page cache
> insertions and deletions add or remove radix tree nodes to the shadow
> LRU list.
>
> However, paths that only hold the IRQ-unsafe lru->node[i].lock, like
> the shadow shrinker, can be interrupted at any time by the IO
> completion handler, which in turn acquires the mapping->tree_lock.
> This is a simple locking order inversion and can deadlock like so:
>
> CPU#0: shadow shrinker                 CPU#1: page cache modification
>
> lru->node[i].lock
>                                        mapping->tree_lock
>                                        lru->node[i].lock
> <interrupt>
> mapping->tree_lock
>
> Make the shadow lru->node[i].lock IRQ-safe to remove the order
> dictated by interruption. This slightly increases the IRQ-disabled
> section in the shadow shrinker, but it still drops all locks and
> enables IRQ after every reclaimed shadow radix tree node.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
> include/linux/list_lru.h | 6 +++++-
> mm/list_lru.c | 4 +++-
> mm/workingset.c | 24 ++++++++++++++++++++----
> 3 files changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
> index b02fc233eadd..f3434533fbf8 100644
> --- a/include/linux/list_lru.h
> +++ b/include/linux/list_lru.h
> @@ -34,7 +34,11 @@ struct list_lru {
> };
>
> void list_lru_destroy(struct list_lru *lru);
> -int list_lru_init(struct list_lru *lru);
> +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
> +static inline int list_lru_init(struct list_lru *lru)
> +{
> + return list_lru_init_key(lru, NULL);
> +}
>
> /**
> * list_lru_add: add an element to the lru list's tail
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 7f5b73e2513b..2a5b8fd45669 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -124,7 +124,7 @@ restart:
> }
> EXPORT_SYMBOL_GPL(list_lru_walk_node);
>
> -int list_lru_init(struct list_lru *lru)
> +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
> {
> int i;
> size_t size = sizeof(*lru->node) * nr_node_ids;
> @@ -136,6 +136,8 @@ int list_lru_init(struct list_lru *lru)
> nodes_clear(lru->active_nodes);
> for (i = 0; i < nr_node_ids; i++) {
> spin_lock_init(&lru->node[i].lock);
> + if (key)
> + lockdep_set_class(&lru->node[i].lock, key);
> INIT_LIST_HEAD(&lru->node[i].list);
> lru->node[i].nr_items = 0;
> }
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 33429c7ddec5..20aa16754305 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -273,7 +273,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
> unsigned long max_nodes;
> unsigned long pages;
>
> + local_irq_disable();
> shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
> + local_irq_enable();
> +
> pages = node_present_pages(sc->nid);
> /*
> * Active cache pages are limited to 50% of memory, and shadow
> @@ -322,7 +325,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
> mapping = node->private_data;
>
> /* Coming from the list, invert the lock order */
> - if (!spin_trylock_irq(&mapping->tree_lock)) {
> + if (!spin_trylock(&mapping->tree_lock)) {
> spin_unlock(lru_lock);
> ret = LRU_RETRY;
> goto out;
> @@ -355,10 +358,12 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
> if (!__radix_tree_delete_node(&mapping->page_tree, node))
> BUG();
>
> - spin_unlock_irq(&mapping->tree_lock);
> + spin_unlock(&mapping->tree_lock);
> ret = LRU_REMOVED_RETRY;
> out:
> + local_irq_enable();
> cond_resched();
> + local_irq_disable();
> spin_lock(lru_lock);
> return ret;
> }
> @@ -366,8 +371,13 @@ out:
> static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
> struct shrink_control *sc)
> {
> - return list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
> + unsigned long ret;
> +
> + local_irq_disable();
> + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
> shadow_lru_isolate, NULL, &sc->nr_to_scan);
> + local_irq_enable();
> + return ret;
> }
>
> static struct shrinker workingset_shadow_shrinker = {
> @@ -377,11 +387,17 @@ static struct shrinker workingset_shadow_shrinker = {
> .flags = SHRINKER_NUMA_AWARE,
> };
>
> +/*
> + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
> + * mapping->tree_lock.
> + */
> +static struct lock_class_key shadow_nodes_key;
> +
> static int __init workingset_init(void)
> {
> int ret;
>
> - ret = list_lru_init(&workingset_shadow_nodes);
> + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
> if (ret)
> goto err;
> ret = register_shrinker(&workingset_shadow_shrinker);
> --
> 1.8.5.3

--- hannes/mm/list_lru.c 2014-02-06 08:50:25.104032277 -0800
+++ hughd/mm/list_lru.c 2014-02-06 08:58:36.884043965 -0800
@@ -143,7 +143,7 @@ int list_lru_init_key(struct list_lru *l
}
return 0;
}
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);

void list_lru_destroy(struct list_lru *lru)
{
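
For what it's worth, this is how another list_lru user would adopt the new
interface (names below are invented; only list_lru_init_key(), struct
list_lru and struct lock_class_key come from the patch):

	/* Give this lru's node locks their own lockdep class, e.g. because
	 * they are only ever taken with IRQs disabled.
	 */
	static struct lock_class_key foo_lru_key;
	static struct list_lru foo_lru;

	static int __init foo_init(void)
	{
		return list_lru_init_key(&foo_lru, &foo_lru_key);
	}

Existing callers are unaffected: list_lru_init() is now an inline wrapper
that passes a NULL key.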