Re: Linux 6.3-rc1

From: Guenter Roeck
Date: Mon Mar 06 2023 - 14:53:41 EST


On Mon, Mar 06, 2023 at 10:12:01AM -0800, Linus Torvalds wrote:
> On Mon, Mar 6, 2023 at 8:52 AM Guenter Roeck <linux@xxxxxxxxxxxx> wrote:
> >
> > Various crashes, affecting several architectures.
> >
> > [ 11.664666] BUG: unable to handle page fault for address: 000000000002d3db
> > [ 11.667043] lock_acquire+0xcb/0x330
> > [ 11.667163] _raw_spin_lock+0x2b/0x40
> > [ 11.667255] add_timer_on+0x92/0x150
> > [ 11.667302] try_to_generate_entropy+0x256/0x280
>
> Ok, this seems to be due to the issue discussed here
>
> https://lore.kernel.org/all/CAHk-=witXXeQuP9fgs4dDL2Ex0meXQiHJs+3JEfNdaPwngMVEg@xxxxxxxxxxxxxx/
>
> and I think the fix is as per the appended patch (that particular
> crash is due to the random.c part, but I did the others that 'git
> grep' picked out too).
>
> Can you run your test-runs on this patch?
>

With the patch below applied on top of v6.3-rc1, the crashes are gone:

Qemu test results:
total: 511 pass: 511 fail: 0
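
For anyone else chasing one of these crashes: my reading of the thread
linked above is that the cpumask scanning functions now only promise to
return some value >= nr_cpu_ids once the mask is exhausted, rather than
exactly nr_cpumask_bits, so a strict equality check can miss the end
marker and hand an out-of-range CPU number to code such as add_timer_on()
(which matches the page fault above). A minimal sketch of the broken and
fixed idioms; timer_mask is a placeholder for illustration, not a name
from the patch below:

	/* Sketch only: assume timer_mask was populated elsewhere. */
	struct cpumask timer_mask;
	int cpu = -1;

	/* Broken idiom: the "no more CPUs" return value is no longer
	 * guaranteed to be exactly nr_cpumask_bits, so this equality
	 * test can fail to match and leave cpu out of range.
	 */
	cpu = cpumask_next(cpu, &timer_mask);
	if (cpu == nr_cpumask_bits)
		cpu = cpumask_first(&timer_mask);

	/* Fixed idiom: anything >= nr_cpu_ids means end-of-mask. */
	cpu = cpumask_next(cpu, &timer_mask);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(&timer_mask);

Where the scan is a plain walk over the mask, for_each_cpu() sidesteps
the hand-rolled end check entirely.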

Tested-by: Guenter Roeck <linux@xxxxxxxxxxxx>

Guenter

> Linus
>
> arch/powerpc/xmon/xmon.c         |  2 +-
> drivers/char/random.c            |  2 +-
> drivers/net/wireguard/queueing.h |  2 +-
> drivers/scsi/lpfc/lpfc_init.c    | 14 +++++++-------
> 4 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 73c620c2a3a1..e753a6bd4888 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -1275,7 +1275,7 @@ static int xmon_batch_next_cpu(void)
>  	while (!cpumask_empty(&xmon_batch_cpus)) {
>  		cpu = cpumask_next_wrap(smp_processor_id(), &xmon_batch_cpus,
>  					xmon_batch_start_cpu, true);
> -		if (cpu == nr_cpumask_bits)
> +		if (cpu >= nr_cpu_ids)
>  			break;
>  		if (xmon_batch_start_cpu == -1)
>  			xmon_batch_start_cpu = cpu;
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index ce3ccd172cc8..253f2ddb8913 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -1311,7 +1311,7 @@ static void __cold try_to_generate_entropy(void)
>  			/* Basic CPU round-robin, which avoids the current CPU. */
>  			do {
>  				cpu = cpumask_next(cpu, &timer_cpus);
> -				if (cpu == nr_cpumask_bits)
> +				if (cpu >= nr_cpu_ids)
>  					cpu = cpumask_first(&timer_cpus);
>  			} while (cpu == smp_processor_id() && num_cpus > 1);
>
> diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h
> index 583adb37ee1e..125284b346a7 100644
> --- a/drivers/net/wireguard/queueing.h
> +++ b/drivers/net/wireguard/queueing.h
> @@ -106,7 +106,7 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
>  {
>  	unsigned int cpu = *stored_cpu, cpu_index, i;
> 
> -	if (unlikely(cpu == nr_cpumask_bits ||
> +	if (unlikely(cpu >= nr_cpu_ids ||
>  		     !cpumask_test_cpu(cpu, cpu_online_mask))) {
>  		cpu_index = id % cpumask_weight(cpu_online_mask);
>  		cpu = cpumask_first(cpu_online_mask);
> diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
> index 61958a24a43d..73b544bfbb2e 100644
> --- a/drivers/scsi/lpfc/lpfc_init.c
> +++ b/drivers/scsi/lpfc/lpfc_init.c
> @@ -12563,7 +12563,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  					goto found_same;
>  				new_cpu = cpumask_next(
>  						new_cpu, cpu_present_mask);
> -				if (new_cpu == nr_cpumask_bits)
> +				if (new_cpu >= nr_cpu_ids)
>  					new_cpu = first_cpu;
>  			}
>  			/* At this point, we leave the CPU as unassigned */
> @@ -12577,7 +12577,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  			 * selecting the same IRQ.
>  			 */
>  			start_cpu = cpumask_next(new_cpu, cpu_present_mask);
> -			if (start_cpu == nr_cpumask_bits)
> +			if (start_cpu >= nr_cpu_ids)
>  				start_cpu = first_cpu;
> 
>  			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
> @@ -12613,7 +12613,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  					goto found_any;
>  				new_cpu = cpumask_next(
>  						new_cpu, cpu_present_mask);
> -				if (new_cpu == nr_cpumask_bits)
> +				if (new_cpu >= nr_cpu_ids)
>  					new_cpu = first_cpu;
>  			}
>  			/* We should never leave an entry unassigned */
> @@ -12631,7 +12631,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  			 * selecting the same IRQ.
>  			 */
>  			start_cpu = cpumask_next(new_cpu, cpu_present_mask);
> -			if (start_cpu == nr_cpumask_bits)
> +			if (start_cpu >= nr_cpu_ids)
>  				start_cpu = first_cpu;
> 
>  			lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
> @@ -12704,7 +12704,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  				goto found_hdwq;
>  			}
>  			new_cpu = cpumask_next(new_cpu, cpu_present_mask);
> -			if (new_cpu == nr_cpumask_bits)
> +			if (new_cpu >= nr_cpu_ids)
>  				new_cpu = first_cpu;
>  		}
> 
> @@ -12719,7 +12719,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  				goto found_hdwq;
> 
>  			new_cpu = cpumask_next(new_cpu, cpu_present_mask);
> -			if (new_cpu == nr_cpumask_bits)
> +			if (new_cpu >= nr_cpu_ids)
>  				new_cpu = first_cpu;
>  		}
> 
> @@ -12730,7 +12730,7 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
>  found_hdwq:
>  		/* We found an available entry, copy the IRQ info */
>  		start_cpu = cpumask_next(new_cpu, cpu_present_mask);
> -		if (start_cpu == nr_cpumask_bits)
> +		if (start_cpu >= nr_cpu_ids)
>  			start_cpu = first_cpu;
>  		cpup->hdwq = new_cpup->hdwq;
>  logit:
>