Re: [PATCH] bpf: don't call mmap_read_trylock() from IRQ context

From: Tetsuo Handa
Date: Sat Jun 15 2024 - 07:00:10 EST


Hello.

On 2024/06/15 1:40, Alexei Starovoitov wrote:
> On Fri, Jun 14, 2024 at 8:15 AM Tetsuo Handa
> <penguin-kernel@xxxxxxxxxxxxxxxxxxx> wrote:
>> "inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
>> was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
>
> imo the issue is elsewhere. syzbot reports:
> local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
> __mmap_lock_do_trace_released+0x9c/0x620 mm/mmap_lock.c:243
> __mmap_lock_trace_released include/linux/mmap_lock.h:42 [inline]
>
> it complains about:
> local_lock(&memcg_paths.lock);
> in TRACE_MMAP_LOCK_EVENT.
> which looks like a false positive.

Commit 2b5067a8143e ("mm: mmap_lock: add tracepoints around lock
acquisition") introduced TRACE_MMAP_LOCK_EVENT() macro as below.

#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
preempt_disable(); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
preempt_enable(); \
} while (0)

Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") moved the location of this macro.

Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling
preemption") replaced preempt_disable() with local_lock(&memcg_paths.lock)
based on an argument that preempt_disable() has to be avoided because
get_mm_memcg_path() might sleep if PREEMPT_RT=y.

The local_lock() macro is defined as

#define local_lock(lock) __local_lock(lock)

in include/linux/local_lock.h and __local_lock() macro is defined as

#define __local_lock(lock) \
do { \
preempt_disable(); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)

if PREEMPT_RT=n or

#define __local_lock(__lock) \
do { \
migrate_disable(); \
spin_lock(this_cpu_ptr((__lock))); \
} while (0)

if PREEMPT_RT=y in include/linux/local_lock_internal.h .
Note that the difference between preempt_disable() and migrate_disable().

Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") by error replaced local_lock() with preempt_disable(),
and commit e904c2ccf9b5 ("mm: mmap_lock: fix disabling preemption
directly") replaced preempt_disable() with local_lock().



Now, syzbot started reporting

inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.

and

inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.

messages, for local_lock() does not disable IRQ.

I think we can replace local_lock() with local_lock_irqsave() like

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1901ad22ccbd 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -156,14 +156,15 @@ static inline void put_memcg_path_buf(void)
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
+ unsigned long flags; \
+ local_lock_irqsave(&memcg_paths.lock, flags); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
+ local_unlock_irqrestore(&memcg_paths.lock, flags); \
} while (0)

#else /* !CONFIG_MEMCG */

because local_lock_irqsave() is defined as

#define local_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags)

in include/linux/local_lock.h and __local_lock_irqsave() macro is defined as

#define __local_lock_irqsave(lock, flags) \
do { \
local_irq_save(flags); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)

if PREEMPT_RT=n or

#define __local_lock_irqsave(lock, flags) \
do { \
typecheck(unsigned long, flags); \
flags = 0; \
__local_lock(lock); \
} while (0)

if PREEMPT_RT=y in include/linux/local_lock_internal.h .



But a fundamental question arises here; why do we need to hold
memcg_paths.lock here, for as of commit 44ef20baed8e ("Merge tag
's390-6.10-4' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux"),
"git grep -nF memcg_paths" indicates that memcg_paths.lock is used within
only TRACE_MMAP_LOCK_EVENT() macro.

mm/mmap_lock.c:48:static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
mm/mmap_lock.c:63: memcg_path = per_cpu_ptr(&memcg_paths, cpu);
mm/mmap_lock.c:101: rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
mm/mmap_lock.c:135: struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
mm/mmap_lock.c:152: local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
mm/mmap_lock.c:159: local_lock(&memcg_paths.lock); \
mm/mmap_lock.c:166: local_unlock(&memcg_paths.lock); \

That is, what is the reason we can't revert commit 832b50725373 and
make the TRACE_MMAP_LOCK_EVENT() macro to branch directly like below?

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1e440c7d48e6 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -153,19 +153,35 @@ static inline void put_memcg_path_buf(void)
rcu_read_unlock();
}

+#ifndef CONFIG_PREEMPT_RT
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
+ preempt_disable(); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
+ preempt_enable(); \
} while (0)

+#else
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ const char *memcg_path; \
+ migrate_disable(); \
+ memcg_path = get_mm_memcg_path(mm); \
+ trace_mmap_lock_##type(mm, \
+ memcg_path != NULL ? memcg_path : "", \
+ ##__VA_ARGS__); \
+ if (likely(memcg_path != NULL)) \
+ put_memcg_path_buf(); \
+ migrate_enable(); \
+ } while (0)
+#endif
+
#else /* !CONFIG_MEMCG */

int trace_mmap_lock_reg(void)

Is the reason because &buf[idx] in get_memcg_path_buf() might become out of
bounds due to preemption in normal context if PREEMPT_RT=y ? If so, can't we
add "idx >=0 && idx < CONTEXT_COUNT" check into get_memcg_path_buf() and
return NULL if preemption (or interrupt or recursion if any) exhausted the per
cpu buffer?

/*
* How many contexts our trace events might be called in: normal, softirq, irq,
* and NMI.
*/
#define CONTEXT_COUNT 4