[DEBUG PATCH 1/2] mm, debug: report when GFP_NO{FS,IO} is used explicitly from memalloc_no{fs,io}_{save,restore} context

From: Michal Hocko
Date: Fri Dec 16 2016 - 07:47:18 EST


From: Michal Hocko <mhocko@xxxxxxxx>

THIS PATCH IS FOR TESTING ONLY AND NOT MEANT TO HIT LINUS TREE

It is desirable to reduce the direct GFP_NO{FS,IO} usage at minimum and
prefer scope usage defined by memalloc_no{fs,io}_{save,restore} API.

Let's help this process and add a debugging tool to catch when an
explicit allocation request for GFP_NO{FS,IO} is done from the scope
context. The printed stacktrace should help to identify the caller
and evaluate whether it can be changed to use a wider context or whether
it is called from another potentially dangerous context which needs
a scope protection as well.

The checks have to be enabled explicitly by debug_scope_gfp kernel
command line parameter.

Signed-off-by: Michal Hocko <mhocko@xxxxxxxx>
---
include/linux/sched.h | 14 +++++++++++--
include/linux/slab.h | 3 +++
mm/page_alloc.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c9fbcbcfcc8..288946bfc326 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1988,6 +1988,8 @@ struct task_struct {
/* A live task holds one reference. */
atomic_t stack_refcount;
#endif
+ unsigned long nofs_caller;
+ unsigned long noio_caller;
/* CPU-specific state of this task */
struct thread_struct thread;
/*
@@ -2345,6 +2347,8 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)

+extern void debug_scope_gfp_context(gfp_t gfp_mask);
+
/*
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
@@ -2363,25 +2367,31 @@ static inline gfp_t current_gfp_context(gfp_t flags)
return flags;
}

-static inline unsigned int memalloc_noio_save(void)
+static inline unsigned int __memalloc_noio_save(unsigned long caller)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
current->flags |= PF_MEMALLOC_NOIO;
+ current->noio_caller = caller;
return flags;
}

+#define memalloc_noio_save() __memalloc_noio_save(_RET_IP_)
+
static inline void memalloc_noio_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}

-static inline unsigned int memalloc_nofs_save(void)
+static inline unsigned int __memalloc_nofs_save(unsigned long caller)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
current->flags |= PF_MEMALLOC_NOFS;
+ current->nofs_caller = caller;
return flags;
}

+#define memalloc_nofs_save() __memalloc_nofs_save(_RET_IP_)
+
static inline void memalloc_nofs_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 084b12bad198..6559668e29db 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -477,6 +477,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
*/
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
+ debug_scope_gfp_context(flags);
if (__builtin_constant_p(size)) {
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
@@ -517,6 +518,7 @@ static __always_inline int kmalloc_size(int n)

static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
+ debug_scope_gfp_context(flags);
#ifndef CONFIG_SLOB
if (__builtin_constant_p(size) &&
size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
@@ -575,6 +577,7 @@ int memcg_update_all_caches(int num_memcgs);
*/
static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
+ debug_scope_gfp_context(flags);
if (size != 0 && n > SIZE_MAX / size)
return NULL;
if (__builtin_constant_p(n) && __builtin_constant_p(size))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e701be6b930a..9e35fb2a8681 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3734,6 +3734,63 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
return page;
}

+static bool debug_scope_gfp;
+
+static int __init enable_debug_scope_gfp(char *unused)
+{
+ debug_scope_gfp = true;
+ return 0;
+}
+
+/*
+ * spit the stack trace if the given gfp_mask clears flags which are context
+ * wide cleared. Such a caller can remove special flags clearing and rely on
+ * the context wide mask.
+ */
+void debug_scope_gfp_context(gfp_t gfp_mask)
+{
+ gfp_t restrict_mask;
+
+ if (likely(!debug_scope_gfp))
+ return;
+
+ /* both NOFS, NOIO are irrelevant when direct reclaim is disabled */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return;
+
+ if (current->flags & PF_MEMALLOC_NOIO)
+ restrict_mask = __GFP_IO;
+ else if ((current->flags & PF_MEMALLOC_NOFS) && (gfp_mask & __GFP_IO))
+ restrict_mask = __GFP_FS;
+ else
+ return;
+
+ if ((gfp_mask & restrict_mask) != restrict_mask) {
+ /*
+ * If you see this this warning then the code does:
+ * memalloc_no{fs,io}_save()
+ * ...
+ * foo()
+ * alloc_page(GFP_NO{FS,IO})
+ * ...
+ * memalloc_no{fs,io}_restore()
+ *
+ * allocation which is unnecessary because the scope gfp
+ * context will do that for all allocation requests already.
+ * If foo() is called from multiple contexts then make sure other
+ * contexts are safe wrt. GFP_NO{FS,IO} semantic and either add
+ * scope protection into particular paths or change the gfp mask
+ * to GFP_KERNEL.
+ */
+ pr_info("Unnecesarily specific gfp mask:%#x(%pGg) for the %s task wide context from %ps\n", gfp_mask, &gfp_mask,
+ (current->flags & PF_MEMALLOC_NOIO)?"NOIO":"NOFS",
+ (void*)((current->flags & PF_MEMALLOC_NOIO)?current->noio_caller:current->nofs_caller));
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(debug_scope_gfp_context);
+early_param("debug_scope_gfp", enable_debug_scope_gfp);
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -3798,6 +3855,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
}

/* First allocation attempt */
+ debug_scope_gfp_context(gfp_mask);
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;
--
2.10.2