Re: [RFC PATCH v4 9/9] printk: use a new ringbuffer implementation

From: Linus Torvalds
Date: Thu Aug 08 2019 - 15:14:54 EST


On Wed, Aug 7, 2019 at 3:27 PM John Ogness <john.ogness@xxxxxxxxxxxxx> wrote:
>
> 2. For the CONFIG_PPC_POWERNV powerpc platform, kernel log buffer
> registration is no longer available because there is no longer
> a single contigous block of memory to represent all of the
> ringbuffer.

So this is tangential, but I've actually been wishing for a special
"raw dump" format that has absolutely *no* structure to it at all, and
is as a result not necessarily strictly reliable, but is a lot more
robust.

The background for that is that we have a class of bugs that are
really hard to debug "in the wild", because people don't have access
to serial consoles or any kind of special hardware at all (ie forget
things like nvram etc), and when the machine locks up you're happy to
just have a reset button (but more likely you have to turn power off
and on).

End result: a DRAM buffer can work, but is not "reliable".
Particularly if you turn power on and off, data retention of DRAM is
iffy. But it's possible, at least in theory.

So I have a patch that implements a "stupid ring buffer" for thisa
case, with absolutely zero data structures (because in the presense of
DRAM corruption, all you can get is "hopefully only slightly garbled
ASCII".

It actually does work. It's a complete hack, but I have used this on
real hardware to see dumps that happened after the machine could no
longer send them to any device.

I actually suspect that this kind of "stupid non-structured secondary
log" can often be much more useful than the existing nvram special
cases - yes the output can be garbled for multi-cpu cases because it
not only is lockless, it's lockess without even any data structures -
but it also works somewhat reliably when the machine is _really_
borked. Which is exactly when you want a log that isn't just the
normal "working machine syslog".

NOTE! This is *not* a replacement for a lockless printk. This is very
much an _additional_ "low overhead buffer in RAM" for post-mortem
analysis when anything fancier doesn't work.

So I'm throwing this patch out there in case people have interest in
looking at that very special case. Also note how right now the example
code just steals a random physical memory area at roughly physical
location 12GB - this is a hack and would need to be configurable
obviously in real life, but it worked for the machines I tested (which
both happened to have 16GB of RAM).

Those parts are marked with "// HACK HACK HACK" and just a hardcoded
physical address (0x320000000).

Linus
From 074ea67afcaba37996a615c41685cd72b088f583 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Thu, 30 May 2019 19:56:13 -0700
Subject: [PATCH] Trial "power off buffer" for printk data retention

This circumvents ACPI and just forces a random physical address (which
happens to be at 0x320000000) to contain a 64kB buffer that we take
over.

Not-yet-signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
arch/x86/kernel/setup.c | 7 ++
include/linux/printk.h | 3 +
init/main.c | 11 ++
kernel/printk/Makefile | 2 +-
kernel/printk/poweroff_buffer.c | 179 ++++++++++++++++++++++++++++++++
kernel/printk/printk.c | 2 +
6 files changed, 203 insertions(+), 1 deletion(-)
create mode 100644 kernel/printk/poweroff_buffer.c

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 08a5f4a131f5..2a1d7d7f3f4f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1042,6 +1042,13 @@ void __init setup_arch(char **cmdline_p)
early_gart_iommu_check();
#endif

+ // HACK HACK HACK
+ // Magic "this RAM survives boot" fake
+ e820__range_update(0x320000000, 65536, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820__print_table("fake boot-safe buffers");
+
/*
* partially used pages are not usable - thus
* we are rounding upwards:
diff --git a/include/linux/printk.h b/include/linux/printk.h
index cefd374c47b1..905c47efb98c 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -170,6 +170,9 @@ int vprintk(const char *fmt, va_list args);
asmlinkage __printf(1, 2) __cold
int printk(const char *fmt, ...);

+void poweroff_buffer_log(const char *buf, size_t len);
+void poweroff_buffer_register(char *buf, size_t size);
+
/*
* Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
*/
diff --git a/init/main.c b/init/main.c
index 66a196c5e4c3..232778603490 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1100,6 +1100,17 @@ static int __ref kernel_init(void *unused)
system_state = SYSTEM_RUNNING;
numa_default_policy();

+ //
+ // HACK HACK HACK
+ //
+ {
+ void *base = ioremap_cache(0x320000000,65536);
+ if (base)
+ poweroff_buffer_register(base, 65536);
+ else
+ printk("ioremap failed\n");
+ }
+
rcu_end_inkernel_boot();

if (ramdisk_execute_command) {
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..7ca11d92f280 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o poweroff_buffer.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/poweroff_buffer.c b/kernel/printk/poweroff_buffer.c
new file mode 100644
index 000000000000..79023b49470b
--- /dev/null
+++ b/kernel/printk/poweroff_buffer.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/printk.h>
+#include <linux/string.h>
+
+#include <asm/unaligned.h>
+
+/*
+ * The local buffer log is incredibly simplistic: it's just
+ * a circular buffer that gets filled with text (from printk
+ * or explicit local buffer logging), and that intentionally
+ * does not perform any locking (although printk will itself
+ * serialize things a bit - other uses may not).
+ *
+ * The buffer - if it exists - is intended to survive a power
+ * event - although possibly corrupted by lack of DRAM refresh,
+ * and has absolutely no structure. It gets logged at boot, so
+ * that the last messages from a previous kernel that crashed
+ * may be recovered.
+ *
+ * NOTE! The existence of the buffer is indicated by a non-zero
+ * log position. A zero 'pos' means "no buffer", while a 'pos'
+ * that is at the buffer size means "next write goes to the start
+ * of the buffer".
+ *
+ * NOTE2! This is not even trying to be thread-safe. Multiple
+ * CPU's writing at the same time may overwrite each other.
+ * It's not trying to be less confusing than what might show
+ * up on the screen, it's trying to be simpler.
+ *
+ * But while the output may be garbled in the presense of
+ * multiple threads logging concurrently, we're still thread-
+ * safe enough to not use a buffer before it's initialized,
+ * and not write to beyond the buffer. So the contents may
+ * be garbled, but not other memory.
+ */
+static struct {
+ void *base;
+ unsigned long size;
+ unsigned long pos;
+} buffer;
+
+// Optimistic end marker so that people can
+// try to make sense of the circular part
+// of the buffer, even if we can't put real
+// reliable data in it.
+//
+// Note that power_buffer_register() has explictly
+// left space for this, and 'size' is not the
+// actual full physical size of the buffer. The
+// end marker does not wrap.
+static void set_end_marker(void *base, unsigned long pos)
+{
+ put_unaligned(-1, (unsigned int *)(base + pos));
+}
+
+/*
+ * If we have a local CPU buffer that survives a power cycle (possibly
+ * corrupt due to lack of refresh), emit log messages into it that the
+ * next boot may replay.
+ *
+ * Note that the buffer base or size will not change once the buffer
+ * exists, so the only thing that needs some care is 'pos'.
+ */
+void poweroff_buffer_log(const char *buf, size_t len)
+{
+ void *base;
+ unsigned long pos, size, left;
+
+ if (!len)
+ return;
+ pos = READ_ONCE(buffer.pos);
+ if (!pos)
+ return;
+ smp_rmb();
+ base = buffer.base;
+ size = buffer.size;
+
+ /* Must not happen! But let's check and abort */
+ if (pos > size)
+ return;
+ left = size - pos;
+ if (len > left) {
+ memcpy(base + pos, buf, left);
+ buf += left;
+ len -= left;
+
+ // Did we have more data than the whole buffer?
+ // Just truncate it if so.
+ if (len > pos)
+ len = pos;
+ pos = 0;
+ }
+ memcpy(base + pos, buf, len);
+
+ // NOTE! "len+pos" is not zero, and not larger than 'size'.
+ // Also note that READ_ONCE -> WRITE_ONCE isn't atomic, so
+ // in the presense of races, one thread will win (but it
+ // might not be the one that won the set_end_marker and the
+ // memcpy races.
+ pos += len;
+ set_end_marker(base, pos);
+ WRITE_ONCE(buffer.pos, pos);
+}
+
+#define MAXDMPLINE 80
+// Dump the possibly corrupted text in the area.
+static void dump_area(unsigned char *p, size_t size)
+{
+ unsigned char line[MAXDMPLINE+1];
+ int len = 0, i;
+
+ for (i = 0; i < size; i++) {
+ // Ignore high bit
+ unsigned char c = p[i] & 0x7f;
+ // Ignore NUL or all bits set bytes
+ if (!c || c == 0x7f)
+ continue;
+ // Otherwise try to turn it printable
+ if (c < 32) {
+ if (c == '\n') {
+ if (!len)
+ continue;
+ printk("BDMP: %s\n", line);
+ len = 0;
+ continue;
+ }
+ if (c != '\t')
+ continue;
+ }
+ line[len++] = c;
+ line[len] = 0;
+ if (len < MAXDMPLINE)
+ continue;
+ printk("BDMP: %s+\n", line);
+ len = 0;
+ }
+ if (len)
+ printk("BDMP: %s\n", line);
+}
+
+static void buffer_dump(unsigned char *p, size_t size)
+{
+ unsigned int marker;
+
+ // See if at least one of the end marker bytes have
+ // survived.
+ for (marker = 0; marker < size; marker++) {
+ if (p[marker] == 0xff) {
+ dump_area(p+marker, size - marker);
+ size = marker;
+ break;
+ }
+ }
+ dump_area(p, size);
+}
+
+void poweroff_buffer_register(char *buf, size_t size)
+{
+ if (size < 1024)
+ return;
+
+ if (READ_ONCE(buffer.pos))
+ return;
+
+ printk("Registering poweroff buffer\n");
+
+ // Dump and _then_ clear the contents from the previous boot
+ buffer_dump(buf, size);
+ memset(buf, 0, size);
+
+ // Leave room for end marker
+ size -= sizeof(unsigned int);
+
+ buffer.base = buf;
+ buffer.size = size;
+ set_end_marker(buf, 0);
+ smp_wmb();
+ WRITE_ONCE(buffer.pos, size);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1888f6a3b694..65f17638b944 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1906,6 +1906,8 @@ int vprintk_store(int facility, int level,
*/
text_len = vscnprintf(text, sizeof(textbuf), fmt, args);

+ poweroff_buffer_log(text, text_len);
+
/* mark and strip a trailing newline */
if (text_len && text[text_len-1] == '\n') {
text_len--;
--
2.22.0.457.gf7a37b2d76