Re: [PATCHv6 5/7] printk: report lost messages in printk safe/nmi contexts

From: Petr Mladek
Date: Fri Dec 23 2016 - 05:54:40 EST


On Wed 2016-12-21 23:36:03, Sergey Senozhatsky wrote:
> Account lost messages in pritk-safe and printk-safe-nmi
> contexts and report those numbers during printk_safe_flush().
>
> The patch also moves lost message counter to struct
> `printk_safe_seq_buf' instead of having dedicated static
> counters - this simplifies the code.
>
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@xxxxxxxxx>
> Reviewed-by: Petr Mladek <pmladek@xxxxxxxx>
> ---
> kernel/printk/internal.h | 17 -----------------
> kernel/printk/printk.c | 10 ----------
> kernel/printk/printk_safe.c | 18 +++++++++++++++---
> 3 files changed, 15 insertions(+), 30 deletions(-)
>
> diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
> index 97cee4f482f4..d2456020120c 100644
> --- a/kernel/printk/internal.h
> +++ b/kernel/printk/internal.h
> @@ -16,23 +16,6 @@
> */
> #include <linux/percpu.h>
>
> -#ifdef CONFIG_PRINTK_NMI
> -
> -extern atomic_t nmi_message_lost;
> -static inline int get_nmi_message_lost(void)
> -{
> - return atomic_xchg(&nmi_message_lost, 0);
> -}
> -
> -#else /* CONFIG_PRINTK_NMI */
> -
> -static inline int get_nmi_message_lost(void)
> -{
> - return 0;
> -}
> -
> -#endif /* CONFIG_PRINTK_NMI */
> -
> #ifdef CONFIG_PRINTK
>
> #define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index 335b0fcb538e..6ec61ae0d122 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -1677,7 +1677,6 @@ asmlinkage int vprintk_emit(int facility, int level,
> unsigned long flags;
> int this_cpu;
> int printed_len = 0;
> - int nmi_message_lost;
> bool in_sched = false;
> /* cpu currently holding logbuf_lock in this function */
> static unsigned int logbuf_cpu = UINT_MAX;
> @@ -1728,15 +1727,6 @@ asmlinkage int vprintk_emit(int facility, int level,
> strlen(recursion_msg));
> }
>
> - nmi_message_lost = get_nmi_message_lost();
> - if (unlikely(nmi_message_lost)) {
> - text_len = scnprintf(textbuf, sizeof(textbuf),
> - "BAD LUCK: lost %d message(s) from NMI context!",
> - nmi_message_lost);
> - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
> - NULL, 0, textbuf, text_len);
> - }
> -
> /*
> * The printf needs to come first; we need the syslog
> * prefix which might be passed-in as a parameter.
> diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
> index 9e8ef6a2070c..cd5160112374 100644
> --- a/kernel/printk/printk_safe.c
> +++ b/kernel/printk/printk_safe.c
> @@ -42,10 +42,13 @@
> static int printk_safe_irq_ready;
>
> #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
> - sizeof(atomic_t) - sizeof(struct irq_work))
> + sizeof(atomic_t) - \
> + sizeof(atomic_t) - \
> + sizeof(struct irq_work))
>
> struct printk_safe_seq_buf {
> atomic_t len; /* length of written data */
> + atomic_t message_lost;
> struct irq_work work; /* IRQ work that flushes the buffer */
> unsigned char buffer[SAFE_LOG_BUF_LEN];
> };
> @@ -55,7 +58,6 @@ static DEFINE_PER_CPU(int, printk_context);
>
> #ifdef CONFIG_PRINTK_NMI
> static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
> -atomic_t nmi_message_lost;
> #endif
>
> /*
> @@ -79,7 +81,7 @@ static int printk_safe_log_store(struct printk_safe_seq_buf *s,
>
> /* The trailing '\0' is not counted into len. */
> if (len >= sizeof(s->buffer) - 1) {
> - atomic_inc(&nmi_message_lost);
> + atomic_inc(&s->message_lost);
> return 0;
> }
>
> @@ -168,6 +170,14 @@ static int printk_safe_flush_buffer(const char *start, size_t len)
> return len;
> }
>
> +static void report_message_lost(struct printk_safe_seq_buf *s)
> +{
> + int lost = atomic_xchg(&s->message_lost, 0);
> +
> + if (lost)
> + printk_deferred("Lost %d message(s)!\n", lost);
> +}
> +
> /*
> * Flush data from the associated per-CPU buffer. The function
> * can be called either via IRQ work or independently.
> @@ -224,6 +234,8 @@ static void __printk_safe_flush(struct irq_work *work)
> if (atomic_cmpxchg(&s->len, len, 0) != len)
> goto more;
>
> + report_message_lost(s);

There is a potential race:

CPU0 CPU1

printk_safe_log_store()
len = atomic_read(&s->len);

__printk_save_flush()

atomic_cmpxchg(&s->len, len, 0)

report_message_lost(s);

if (len >= sizeof(s->buffer) - 1) {
atomic_inc(&s->message_lost);
return 0;

We check the outdated len, account lost message, but it will not
be reported until some other message appears in the log buffer.

> +
> out:

I would make sense to move report_message_lost(s) here, after
the out: label.

Also it would make sense to queue the irq_work also when we
bump s->message lost in printk_save_log_store().

It still will not be perfect because the "racy" warning about the lost
messages will not be printed together with the messages that fit
the buffer. Also there might appear new messages in the meantime.
But it is a real corner case. We do the best effort. I think
that it is not worth more complications than what I suggested above.
We still have much better results than the original code.
And the new code looks much better.

Best Regards,
Petr