[PATCH] mm/debug: Change BUG_ON() crashes to survivable WARN_ON() warnings

From: Ingo Molnar
Date: Thu Sep 07 2017 - 02:44:13 EST


So a VM_BUG_ON() that triggered with the following bug:

72c0098d92ce: ("x86/mm: Reinitialize TLB state on hotplug and resume")

... crashed and made Linus's laptop totally undebuggable, because when
it triggered there was no screen up yet. It looked like a total lockup
on resume - although we produced a warning that could have helped
narrowing down the problem.

Thus instead of being able to report the warning, Linus had to bisect
the bug the hard way in the middle of the merge window - which is beyond
most users' capability and won't work with regular distro kernels anyway.

To make matters worse, a BUG_ON() done when Xorg is active is utterly
undebuggable anyway in most cases, because it won't be printed on the
framebuffer, and because the BUG_ON() prevents the system log to be
synced to disk.

The symptoms, typically, are similar to what Linus saw: a hard lockup
followed by a bootup that shows nothing in the logs ...

Utterly crazy behavior from the kernel, IMHO!

So instead of crashing the system with a BUG_ON(), use a WARN_ON()
instead. In the above situation the kernel would probably have survived
long enough to produce a kernel log.

I realize that in principle there might be bugs where it's better to stop,
i.e. crash the kernel intentionally.

But I argue that most of the kernel bugs are _not_ such bugs, and being
able to get a log out trumps that concern - because the people who run
new kernels early are not crazy enough to _depend_ on that kernel, and
the ability to get logs off is actually more important.

People wanting to crash the kernel here and now have the burden of proof
and we should not make it the default for any widely used assert to crash
the kernel ...

To not have to do a mass rename this patch simply reuses the existing
VM_BUG_ON() which becomes somewhat of a misnomer after this change.

I will send a rename patch as well after the merge window, separately.

Note that I also made mmdebug.h a bit more readable:

- align the various constructs coherently and separate them
visually a bit better

- use consistent definitions. I mean, half the functions have
externs, half don't - what the heck?

- add a bit of description what this is about

Plus, for consistency's sake, VIRTUAL_BUG_ON() is changed as well,
but it's not a widespread primitive.

Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/mmdebug.h | 56 +++++++++++++++++++++++++++++--------------------
1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 451a811f48f2..ad127a020c3f 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -1,6 +1,11 @@
#ifndef LINUX_MM_DEBUG_H
#define LINUX_MM_DEBUG_H 1

+/*
+ * Various VM related debug assert helper functions.
+ * On perfect kernels they should never trigger.
+ */
+
#include <linux/bug.h>
#include <linux/stringify.h>

@@ -8,59 +13,64 @@ struct page;
struct vm_area_struct;
struct mm_struct;

-extern void dump_page(struct page *page, const char *reason);
+extern void dump_page(struct page *page, const char *reason);
extern void __dump_page(struct page *page, const char *reason);
-void dump_vma(const struct vm_area_struct *vma);
-void dump_mm(const struct mm_struct *mm);
+extern void dump_vma(const struct vm_area_struct *vma);
+extern void dump_mm(const struct mm_struct *mm);

#ifdef CONFIG_DEBUG_VM
-#define VM_BUG_ON(cond) BUG_ON(cond)
+
+#define VM_BUG_ON(cond) WARN_ON(cond)
+
#define VM_BUG_ON_PAGE(cond, page) \
do { \
if (unlikely(cond)) { \
dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\
- BUG(); \
+ WARN_ON(1); \
} \
} while (0)
+
#define VM_BUG_ON_VMA(cond, vma) \
do { \
if (unlikely(cond)) { \
dump_vma(vma); \
- BUG(); \
+ WARN_ON(1); \
} \
} while (0)
+
#define VM_BUG_ON_MM(cond, mm) \
do { \
if (unlikely(cond)) { \
dump_mm(mm); \
- BUG(); \
+ WARN_ON(1); \
} \
} while (0)
-#define VM_WARN_ON(cond) WARN_ON(cond)
-#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
-#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format)
-#define VM_WARN(cond, format...) WARN(cond, format)
+
+#define VM_WARN_ON(cond) WARN_ON(cond)
+#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
+#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format)
+#define VM_WARN(cond, format...) WARN(cond, format)
#else
-#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
-#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
-#define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
-#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
-#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
-#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
-#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
-#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
+#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
+#define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond)
+#define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond)
+#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif

#ifdef CONFIG_DEBUG_VIRTUAL
-#define VIRTUAL_BUG_ON(cond) BUG_ON(cond)
+#define VIRTUAL_BUG_ON(cond) WARN_ON(cond)
#else
-#define VIRTUAL_BUG_ON(cond) do { } while (0)
+#define VIRTUAL_BUG_ON(cond) do { } while (0)
#endif

#ifdef CONFIG_DEBUG_VM_PGFLAGS
-#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
+#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
#else
-#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
+#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
#endif

#endif