Re: [BUG RT] dump-capture kernel not executed for panic in interrupt context

From: Eric W. Biederman
Date: Fri Sep 11 2020 - 18:49:29 EST


Joerg Vehlow <lkml@xxxxxxxxxxx> writes:

> Hi,
>
> here is the new version of the patch, based on Peter's suggestion.
> It looks like it works fine. I added the BUG_ON to __crash_kexec because it is
> a precondition that panic_cpu is set correctly; otherwise the whole locking
> logic fails.
>
> The mutex_trylock can still be used, because it is only taken in syscall
> context and never in interrupt context.

What is this patch supposed to be doing?

What bug is it fixing?

A BUG_ON that triggers while we are already in the middle of handling a
panic seems not just suspect but outright impossible to make use of.


I get the feeling, skimming this, that it is time to sort out and simplify
the locking here, rather than make it more complex and more likely to
fail.

I get the feeling that, over the years, we have somehow lost the assumption
that the rest of the kernel is broken and that we need to get out of the
broken kernel as quickly and as simply as possible.

Eric



> ---
>  kernel/kexec.c          |  8 ++--
>  kernel/kexec_core.c     | 86 +++++++++++++++++++++++++++--------------
>  kernel/kexec_file.c     |  4 +-
>  kernel/kexec_internal.h |  6 ++-
>  4 files changed, 69 insertions(+), 35 deletions(-)
>
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index f977786fe498..118a012aeac2 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -255,12 +255,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
>       *
>       * KISS: always take the mutex.
>       */
> -    if (!mutex_trylock(&kexec_mutex))
> +    if (!kexec_trylock())
>          return -EBUSY;
>
>      result = do_kexec_load(entry, nr_segments, segments, flags);
>
> -    mutex_unlock(&kexec_mutex);
> +    kexec_unlock();
>
>      return result;
>  }
> @@ -309,12 +309,12 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
>       *
>       * KISS: always take the mutex.
>       */
> -    if (!mutex_trylock(&kexec_mutex))
> +    if (!kexec_trylock())
>          return -EBUSY;
>
>      result = do_kexec_load(entry, nr_segments, ksegments, flags);
>
> -    mutex_unlock(&kexec_mutex);
> +    kexec_unlock();
>
>      return result;
>  }
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..71682a33b1ba 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -45,7 +45,7 @@
>  #include <crypto/sha.h>
>  #include "kexec_internal.h"
>
> -DEFINE_MUTEX(kexec_mutex);
> +static DEFINE_MUTEX(kexec_mutex);
>
>  /* Per cpu memory for storing cpu states in case of system crash. */
>  note_buf_t __percpu *crash_notes;
> @@ -70,6 +70,43 @@ struct resource crashk_low_res = {
>      .desc  = IORES_DESC_CRASH_KERNEL
>  };
>
> +void kexec_lock(void)
> +{
> +    /*
> +     * LOCK kexec_mutex        cmpxchg(&panic_cpu, INVALID, cpu)
> +     *   MB                  MB
> +     * panic_cpu == INVALID        kexec_mutex == LOCKED
> +     *
> +     * Ensures either we observe the cmpxchg, or crash_kexec() observes
> +     * our lock acquisition.
> +     */
> +    mutex_lock(&kexec_mutex);
> +    smp_mb();
> +    atomic_cond_read_acquire(&panic_cpu, VAL == PANIC_CPU_INVALID);
> +}
> +
> +int kexec_trylock(void)
> +{
> +    if (!mutex_trylock(&kexec_mutex))
> +        return 0;
> +    smp_mb();
> +    if (atomic_read(&panic_cpu) != PANIC_CPU_INVALID) {
> +        mutex_unlock(&kexec_mutex);
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +void kexec_unlock(void)
> +{
> +    mutex_unlock(&kexec_mutex);
> +}
> +
> +int kexec_is_locked(void)
> +{
> +    return mutex_is_locked(&kexec_mutex);
> +}
> +
>  int kexec_should_crash(struct task_struct *p)
>  {
>      /*
> @@ -943,24 +980,15 @@ int kexec_load_disabled;
>   */
>  void __noclone __crash_kexec(struct pt_regs *regs)
>  {
> -    /* Take the kexec_mutex here to prevent sys_kexec_load
> -     * running on one cpu from replacing the crash kernel
> -     * we are using after a panic on a different cpu.
> -     *
> -     * If the crash kernel was not located in a fixed area
> -     * of memory the xchg(&kexec_crash_image) would be
> -     * sufficient.  But since I reuse the memory...
> -     */
> -    if (mutex_trylock(&kexec_mutex)) {
> -        if (kexec_crash_image) {
> -            struct pt_regs fixed_regs;
> -
> -            crash_setup_regs(&fixed_regs, regs);
> -            crash_save_vmcoreinfo();
> -            machine_crash_shutdown(&fixed_regs);
> -            machine_kexec(kexec_crash_image);
> -        }
> -        mutex_unlock(&kexec_mutex);
> +    BUG_ON(atomic_read(&panic_cpu) != raw_smp_processor_id());
> +
> +    if (!kexec_is_locked() && kexec_crash_image) {
> +        struct pt_regs fixed_regs;
> +
> +        crash_setup_regs(&fixed_regs, regs);
> +        crash_save_vmcoreinfo();
> +        machine_crash_shutdown(&fixed_regs);
> +        machine_kexec(kexec_crash_image);
>      }
>  }
>  STACK_FRAME_NON_STANDARD(__crash_kexec);
> @@ -977,9 +1005,11 @@ void crash_kexec(struct pt_regs *regs)
>      this_cpu = raw_smp_processor_id();
>      old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
>      if (old_cpu == PANIC_CPU_INVALID) {
> -        /* This is the 1st CPU which comes here, so go ahead. */
> -        printk_safe_flush_on_panic();
> -        __crash_kexec(regs);
> +        if (!kexec_is_locked()) {
> +            /* This is the 1st CPU which comes here, so go ahead. */
> +            printk_safe_flush_on_panic();
> +            __crash_kexec(regs);
> +        }
>
>          /*
>           * Reset panic_cpu to allow another panic()/crash_kexec()
> @@ -993,10 +1023,10 @@ size_t crash_get_memory_size(void)
>  {
>      size_t size = 0;
>
> -    mutex_lock(&kexec_mutex);
> +    kexec_lock();
>      if (crashk_res.end != crashk_res.start)
>          size = resource_size(&crashk_res);
> -    mutex_unlock(&kexec_mutex);
> +    kexec_unlock();
>      return size;
>  }
>
> @@ -1016,7 +1046,7 @@ int crash_shrink_memory(unsigned long new_size)
>      unsigned long old_size;
>      struct resource *ram_res;
>
> -    mutex_lock(&kexec_mutex);
> +    kexec_lock();
>
>      if (kexec_crash_image) {
>          ret = -ENOENT;
> @@ -1054,7 +1084,7 @@ int crash_shrink_memory(unsigned long new_size)
>      insert_resource(&iomem_resource, ram_res);
>
>  unlock:
> -    mutex_unlock(&kexec_mutex);
> +    kexec_unlock();
>      return ret;
>  }
>
> @@ -1126,7 +1156,7 @@ int kernel_kexec(void)
>  {
>      int error = 0;
>
> -    if (!mutex_trylock(&kexec_mutex))
> +    if (!kexec_trylock())
>          return -EBUSY;
>      if (!kexec_image) {
>          error = -EINVAL;
> @@ -1203,7 +1233,7 @@ int kernel_kexec(void)
>  #endif
>
>   Unlock:
> -    mutex_unlock(&kexec_mutex);
> +    kexec_unlock();
>      return error;
>  }
>
> diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
> index ca40bef75a61..d40b0aedc187 100644
> --- a/kernel/kexec_file.c
> +++ b/kernel/kexec_file.c
> @@ -362,7 +362,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
>
>      image = NULL;
>
> -    if (!mutex_trylock(&kexec_mutex))
> +    if (!kexec_trylock())
>          return -EBUSY;
>
>      dest_image = &kexec_image;
> @@ -434,7 +434,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
>      if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
>          arch_kexec_protect_crashkres();
>
> -    mutex_unlock(&kexec_mutex);
> +    kexec_unlock();
>      kimage_free(image);
>      return ret;
>  }
> diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
> index 39d30ccf8d87..2c1683cb1082 100644
> --- a/kernel/kexec_internal.h
> +++ b/kernel/kexec_internal.h
> @@ -15,7 +15,11 @@ int kimage_is_destination_range(struct kimage *image,
>
>  int machine_kexec_post_load(struct kimage *image);
>
> -extern struct mutex kexec_mutex;
> +void kexec_lock(void);
> +int kexec_trylock(void);
> +void kexec_unlock(void);
> +int kexec_is_locked(void);
> +
>
>  #ifdef CONFIG_KEXEC_FILE
>  #include <linux/purgatory.h>