Re: ARM board lockups/hangs triggered by locks and mutexes

From: Rafał Miłecki
Date: Fri Aug 04 2023 - 07:08:03 EST


On 2.08.2023 00:10, Rafał Miłecki wrote:
Unfortunately enabling *any* of the following options:
CONFIG_DEBUG_RT_MUTEXES=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
seems to make lockups/hangs go away. I tried for a few hours.

I decided to find out why enabling CONFIG_DEBUG_MUTEXES "fixes" kernel /
device stability for me. I tried manually enabling the code that normally
hides behind the #ifdef CONFIG_DEBUG_MUTEXES.

Attached to this e-mail is a small patch that is enough to make my
kernel stable (mutex-fix-bcm53573.diff).

#####

That's not the most interesting part, though. What really doesn't make
sense anymore is that the diff below (on top of the attached one) brings back
hangs/lockups.

I triple checked that. Dropping a single unused function breaks kernel /
device stability on BCM53573!

AFAIK the only thing the diff below actually affects is the location of symbols
(I actually verified that by comparing System.map before and after -
over 22,000 relocated symbols).

Can some unfortunate location of symbols cause those hangs/lockups?


diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 4fe40910f..c440222a4 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -34,6 +34,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
INIT_LIST_HEAD(&waiter->list);
}

+/* Dropping below function brings back hangs/lockups & reboots */
+#if 0
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
lockdep_assert_held(&lock->wait_lock);
@@ -41,6 +43,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
}
+#endif

void debug_mutex_free_waiter(struct mutex_waiter *waiter)
{
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 479bc96c3..15bd4691b 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -57,9 +57,7 @@ struct mutex {
struct optimistic_spin_queue osq; /* Spinner MCS lock */
#endif
struct list_head wait_list;
-#ifdef CONFIG_DEBUG_MUTEXES
void *magic;
-#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -73,12 +71,10 @@ struct mutex_waiter {
struct list_head list;
struct task_struct *task;
struct ww_acquire_ctx *ww_ctx;
-#ifdef CONFIG_DEBUG_MUTEXES
void *magic;
-#endif
};

-#ifdef CONFIG_DEBUG_MUTEXES
+#if 1 //def CONFIG_DEBUG_MUTEXES

#define __DEBUG_MUTEX_INITIALIZER(lockname) \
, .magic = &lockname
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0e639497..8fef4485e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -958,10 +958,8 @@ struct task_struct {
struct rt_mutex_waiter *pi_blocked_on;
#endif

-#ifdef CONFIG_DEBUG_MUTEXES
/* Mutex deadlock detection: */
struct mutex_waiter *blocked_on;
-#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
int non_block_count;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 45452facf..b22e6ecd8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif

-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-y += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index b02fff282..6dc3f80a3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -946,9 +946,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,

might_sleep();

-#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif

ww = container_of(lock, struct ww_mutex, base);
if (ww_ctx) {
@@ -1417,9 +1415,7 @@ int __sched mutex_trylock(struct mutex *lock)
{
bool locked;

-#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-#endif

locked = __mutex_trylock(lock);
if (locked)