[RFC 3/4] mm: support remove_mapping in irqcontext

From: Minchan Kim
Date: Sun May 12 2013 - 22:11:07 EST


This patch makes to use remove_mapping in irq context.
For it, this patch adds irqcontext argument in some functions
(ex, remove_maping and __remove_mapping) but these functions
are not hot path so i believe it's not a problem.

And it exports swap_info_get and check that we can get a
swap_info_struct->s lock in advance in irq context.
Because __swapcache_free must be succesful.
Otherwise, If __swapcache_free can be failed, we should rollback
things done by __delete_from_swap_cache and it could need memory
allocation for radix tree node in irq context.

More concern is handling for mem_cgroup_uncharge_swapcache in
irqcontext, which isn't aware of irqcontext at the moment and it
should be successful like above reason.

After I review that code, I think it's not a big challenge if I missed
somethings.

My rough plan is following as,

1. Make mctz->lock beging aware of irq by changing spin_lock with
spin_lock_irqsave.
2. Introuduce new argument "locked" in __mem_cgroup_uncharge_common
so that __mem_cgroup_uncharge_common can avoid lock_page_cgroup in
irqcontext to avoid deadlock but caller in irqcontext should be held
it in advance by next patch.
3. Introduce try_lock_page_cgroup, which will be used __swapcache_free.
4. __remove_mapping can held a page_cgroup lock in advance before calling
__swapcache_free

I'd like to listen memcg people's opinions before diving into coding.

Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
fs/splice.c | 2 +-
include/linux/swap.h | 12 ++++++++++-
mm/swapfile.c | 2 +-
mm/truncate.c | 2 +-
mm/vmscan.c | 56 +++++++++++++++++++++++++++++++++++++++++++---------
5 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index e6b2559..db77694 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -70,7 +70,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
* If we succeeded in removing the mapping, set LRU flag
* and return good.
*/
- if (remove_mapping(mapping, page)) {
+ if (remove_mapping(mapping, page, false)) {
buf->flags |= PIPE_BUF_FLAG_LRU;
return 0;
}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ca031f7..eb126d2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -274,7 +274,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
-extern int remove_mapping(struct address_space *mapping, struct page *page);
+extern int remove_mapping(struct address_space *mapping, struct page *page,
+ bool irqcontext);
extern unsigned long vm_total_pages;

#ifdef CONFIG_NUMA
@@ -407,6 +408,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
}
#endif

+
+extern struct swap_info_struct *swap_info_get(swp_entry_t entry);
+
#else /* CONFIG_SWAP */

#define get_nr_swap_pages() 0L
@@ -430,6 +434,12 @@ static inline void show_swap_cache_info(void)
#define free_swap_and_cache(swp) is_migration_entry(swp)
#define swapcache_prepare(swp) is_migration_entry(swp)

+
+struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+ return NULL;
+}
+
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
return 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 33ebdd5..8a425d4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -505,7 +505,7 @@ swp_entry_t get_swap_page_of_type(int type)
return (swp_entry_t) {0};
}

-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset, type;
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736..fa1dc60 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -131,7 +131,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;

- ret = remove_mapping(mapping, page);
+ ret = remove_mapping(mapping, page, false);

return ret;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a853..d14c9be 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -450,12 +450,18 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
*/
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+ bool irqcontext)
{
+ unsigned long flags;
+
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));

- spin_lock_irq(&mapping->tree_lock);
+ if (irqcontext)
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ else
+ spin_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
@@ -490,17 +496,45 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
}

if (PageSwapCache(page)) {
+ struct swap_info_struct *p;
swp_entry_t swap = { .val = page_private(page) };
+ p = swap_info_get(swap);
+ /*
+ * If we are irq context, check that we can get a
+ * swap_info_strcut->lock before removing the page from
+ * swap cache. Because __swapcache_free must be successful.
+ * If __swapcache_free can be failed, we should rollback
+ * things done by __delete_from_swap_cache and it needs
+ * memory allocation for radix tree node in irqcontext
+ * That's thing we really want to avoid.
+ * TODO : memcg mem_cgroup_uncharge_swapcache handling
+ * in irqcontext
+ */
+ if (irqcontext && p && !spin_trylock(&p->lock)) {
+ page_unfreeze_refs(page, 2);
+ goto cannot_free;
+ }
+
__delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
- swapcache_free(swap, page);
+ if (irqcontext) {
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ if (p)
+ __swapcache_free(p, swap, page);
+ spin_unlock(&p->lock);
+ } else {
+ spin_unlock_irq(&mapping->tree_lock);
+ swapcache_free(swap, page);
+ }
} else {
void (*freepage)(struct page *);

freepage = mapping->a_ops->freepage;

__delete_from_page_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ if (irqcontext)
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ else
+ spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);

if (freepage != NULL)
@@ -510,7 +544,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
return 1;

cannot_free:
- spin_unlock_irq(&mapping->tree_lock);
+ if (irqcontext)
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ else
+ spin_unlock_irq(&mapping->tree_lock);
return 0;
}

@@ -520,9 +557,10 @@ cannot_free:
* successfully detached, return 1. Assumes the caller has a single ref on
* this page.
*/
-int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page,
+ bool irqcontext)
{
- if (__remove_mapping(mapping, page)) {
+ if (__remove_mapping(mapping, page, irqcontext)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
@@ -904,7 +942,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}

- if (!mapping || !__remove_mapping(mapping, page))
+ if (!mapping || !__remove_mapping(mapping, page, false))
goto keep_locked;

/*
--
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/