[PATCH printk v1 05/18] printk: Add non-BKL console basic infrastructure

From: John Ogness
Date: Thu Mar 02 2023 - 14:58:32 EST


From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>

The current console/printk subsystem is protected by a Big Kernel Lock,
(aka console_lock) which has ill defined semantics and is more or less
stateless. This puts severe limitations on the console subsystem and
makes forced takeover and output in emergency and panic situations a
fragile endavour which is based on try and pray.

The goal of non-BKL consoles is to break out of the console lock jail
and to provide a new infrastructure that avoids the pitfalls and
allows console drivers to be gradually converted over.

The proposed infrastructure aims for the following properties:

- Per console locking instead of global locking
- Per console state which allows to make informed decisions
- Stateful handover and takeover

As a first step state is added to struct console. The per console state
is an atomic_long_t with a 32bit bit field and on 64bit also a 32bit
sequence for tracking the last printed ringbuffer sequence number. On
32bit the sequence is separate from state for obvious reasons which
requires handling a few extra race conditions.

Reserve state bits, which will be populated later in the series. Wire
it up into the console register/unregister functionality and exclude
such consoles from being handled in the console BKL mechanisms. Since
the non-BKL consoles will not depend on the console lock/unlock dance
for printing, only perform said dance if a BKL console is registered.

The decision to use a bitfield was made as using a plain u32 with
mask/shift operations turned out to result in uncomprehensible code.

Co-developed-by: John Ogness <john.ogness@xxxxxxxxxxxxx>
Signed-off-by: John Ogness <john.ogness@xxxxxxxxxxxxx>
Signed-off-by: Thomas Gleixner (Intel) <tglx@xxxxxxxxxxxxx>
---
fs/proc/consoles.c | 1 +
include/linux/console.h | 33 +++++++++
kernel/printk/Makefile | 2 +-
kernel/printk/internal.h | 10 +++
kernel/printk/printk.c | 50 +++++++++++--
kernel/printk/printk_nobkl.c | 137 +++++++++++++++++++++++++++++++++++
6 files changed, 226 insertions(+), 7 deletions(-)
create mode 100644 kernel/printk/printk_nobkl.c

diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index e0758fe7936d..9ce506866e60 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NO_BKL, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
diff --git a/include/linux/console.h b/include/linux/console.h
index f7967fb238e0..b9d2ad580128 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -155,6 +155,8 @@ static inline int con_debug_leave(void)
* /dev/kmesg which requires a larger output buffer.
* @CON_SUSPENDED: Indicates if a console is suspended. If true, the
* printing callbacks must not be called.
+ * @CON_NO_BKL: Console can operate outside of the BKL style console_lock
+ * constraints.
*/
enum cons_flags {
CON_PRINTBUFFER = BIT(0),
@@ -165,6 +167,32 @@ enum cons_flags {
CON_BRL = BIT(5),
CON_EXTENDED = BIT(6),
CON_SUSPENDED = BIT(7),
+ CON_NO_BKL = BIT(8),
+};
+
+/**
+ * struct cons_state - console state for NOBKL consoles
+ * @atom: Compound of the state fields for atomic operations
+ * @seq: Sequence for record tracking (64bit only)
+ * @bits: Compound of the state bits below
+ *
+ * To be used for state read and preparation of atomic_long_cmpxchg()
+ * operations.
+ */
+struct cons_state {
+ union {
+ unsigned long atom;
+ struct {
+#ifdef CONFIG_64BIT
+ u32 seq;
+#endif
+ union {
+ u32 bits;
+ struct {
+ };
+ };
+ };
+ };
};

/**
@@ -186,6 +214,8 @@ enum cons_flags {
* @dropped: Number of unreported dropped ringbuffer records
* @data: Driver private data
* @node: hlist node for the console list
+ *
+ * @atomic_state: State array for NOBKL consoles; real and handover
*/
struct console {
char name[16];
@@ -205,6 +235,9 @@ struct console {
unsigned long dropped;
void *data;
struct hlist_node node;
+
+ /* NOBKL console specific members */
+ atomic_long_t __private atomic_state[2];
};

#ifdef CONFIG_LOCKDEP
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index f5b388e810b9..b36683bd2f82 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o printk_nobkl.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK_INDEX) += index.o

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a17704136f1..da380579263b 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,6 +3,7 @@
* internal.h - printk internal definitions
*/
#include <linux/percpu.h>
+#include <linux/console.h>

#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
void __init printk_sysctl_init(void);
@@ -61,6 +62,10 @@ void defer_console_output(void);

u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
+
+void cons_nobkl_cleanup(struct console *con);
+void cons_nobkl_init(struct console *con);
+
#else

#define PRINTK_PREFIX_MAX 0
@@ -76,8 +81,13 @@ u16 printk_parse_prefix(const char *text, int *level,
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)

static inline bool printk_percpu_data_ready(void) { return false; }
+static inline void cons_nobkl_init(struct console *con) { }
+static inline void cons_nobkl_cleanup(struct console *con) { }
+
#endif /* CONFIG_PRINTK */

+extern bool have_boot_console;
+
/**
* struct printk_buffers - Buffers to read/format/output printk messages.
* @outbuf: After formatting, contains text to output.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 626d467c7e9b..b2c7c92c3d79 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -446,6 +446,19 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);

+/*
+ * Specifies if a BKL console was ever registered. Used to determine if the
+ * console lock/unlock dance is needed for console printing.
+ */
+static bool have_bkl_console;
+
+/*
+ * Specifies if a boot console is registered. Used to determine if NOBKL
+ * consoles may be used since NOBKL consoles cannot synchronize with boot
+ * consoles.
+ */
+bool have_boot_console;
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
@@ -2301,7 +2314,7 @@ asmlinkage int vprintk_emit(int facility, int level,
printed_len = vprintk_store(facility, level, dev_info, fmt, args);

/* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
+ if (!in_sched && have_bkl_console) {
/*
* The caller may be holding system-critical or
* timing-sensitive locks. Disable preemption during
@@ -2624,7 +2637,7 @@ void resume_console(void)
*/
static int console_cpu_notify(unsigned int cpu)
{
- if (!cpuhp_tasks_frozen) {
+ if (!cpuhp_tasks_frozen && have_bkl_console) {
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
@@ -3098,6 +3111,9 @@ void console_unblank(void)
struct console *c;
int cookie;

+ if (!have_bkl_console)
+ return;
+
/*
* Stop console printing because the unblank() callback may
* assume the console is not within its write() callback.
@@ -3135,6 +3151,9 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ if (!have_bkl_console)
+ return;
+
/*
* If someone else is holding the console lock, trylock will fail
* and may_schedule may be set. Ignore and proceed to unlock so
@@ -3310,9 +3329,10 @@ static void try_enable_default_console(struct console *newcon)
newcon->flags |= CON_CONSDEV;
}

-#define con_printk(lvl, con, fmt, ...) \
- printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \
- (con->flags & CON_BOOT) ? "boot" : "", \
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NO_BKL) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
con->name, con->index, ##__VA_ARGS__)

static void console_init_seq(struct console *newcon, bool bootcon_registered)
@@ -3472,6 +3492,14 @@ void register_console(struct console *newcon)
newcon->dropped = 0;
console_init_seq(newcon, bootcon_registered);

+ if (!(newcon->flags & CON_NO_BKL))
+ have_bkl_console = true;
+ else
+ cons_nobkl_init(newcon);
+
+ if (newcon->flags & CON_BOOT)
+ have_boot_console = true;
+
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
@@ -3515,6 +3543,9 @@ void register_console(struct console *newcon)
if (con->flags & CON_BOOT)
unregister_console_locked(con);
}
+
+ /* All boot consoles have been unregistered. */
+ have_boot_console = false;
}
unlock:
console_list_unlock();
@@ -3563,6 +3594,9 @@ static int unregister_console_locked(struct console *console)
*/
synchronize_srcu(&console_srcu);

+ if (console->flags & CON_NO_BKL)
+ cons_nobkl_cleanup(console);
+
console_sysfs_notify();

if (console->exit)
@@ -3866,11 +3900,15 @@ void wake_up_klogd(void)
*/
void defer_console_output(void)
{
+ int val = PRINTK_PENDING_WAKEUP;
+
/*
* New messages may have been added directly to the ringbuffer
* using vprintk_store(), so wake any waiters as well.
*/
- __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+ if (have_bkl_console)
+ val |= PRINTK_PENDING_OUTPUT;
+ __wake_up_klogd(val);
}

void printk_trigger_flush(void)
diff --git a/kernel/printk/printk_nobkl.c b/kernel/printk/printk_nobkl.c
new file mode 100644
index 000000000000..8df3626808dd
--- /dev/null
+++ b/kernel/printk/printk_nobkl.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include "internal.h"
+/*
+ * Printk implementation for consoles that do not depend on the BKL style
+ * console_lock mechanism.
+ *
+ * Console is locked on a CPU when state::locked is set and state:cpu ==
+ * current CPU. This is valid for the current execution context.
+ *
+ * Nesting execution contexts on the same CPU can carefully take over
+ * if the driver allows reentrancy via state::unsafe = false. When the
+ * interrupted context resumes it checks the state before entering
+ * an unsafe region and aborts the operation if it detects a takeover.
+ *
+ * In case of panic or emergency the nesting context can take over the
+ * console forcefully. The write callback is then invoked with the unsafe
+ * flag set in the write context data, which allows the driver side to avoid
+ * locks and to evaluate the driver state so it can use an emergency path
+ * or repair the state instead of blindly assuming that it works.
+ *
+ * If the interrupted context touches the assigned record buffer after
+ * takeover, it does not cause harm because at the same execution level
+ * there is no concurrency on the same CPU. A threaded printer always has
+ * its own record buffer so it can never interfere with any of the per CPU
+ * record buffers.
+ *
+ * A concurrent writer on a different CPU can request to take over the
+ * console by:
+ *
+ * 1) Carefully writing the desired state into state[REQ]
+ * if there is no same or higher priority request pending.
+ * This locks state[REQ] except for higher priority
+ * waiters.
+ *
+ * 2) Setting state[CUR].req_prio unless a same or higher
+ * priority waiter won the race.
+ *
+ * 3) Carefully spin on state[CUR] until that is locked with the
+ * expected state. When the state is not the expected one then it
+ * has to verify that state[REQ] is still the same and that
+ * state[CUR] has not been taken over or unlocked.
+ *
+ * The unlocker hands over to state[REQ], but only if state[CUR]
+ * matches.
+ *
+ * In case that the owner does not react on the request and does not make
+ * observable progress, the waiter will timeout and can then decide to do
+ * a hostile takeover.
+ */
+
+#define copy_full_state(_dst, _src) do { _dst = _src; } while (0)
+#define copy_bit_state(_dst, _src) do { _dst.bits = _src.bits; } while (0)
+
+#ifdef CONFIG_64BIT
+#define copy_seq_state64(_dst, _src) do { _dst.seq = _src.seq; } while (0)
+#else
+#define copy_seq_state64(_dst, _src) do { } while (0)
+#endif
+
+enum state_selector {
+ CON_STATE_CUR,
+ CON_STATE_REQ,
+};
+
+/**
+ * cons_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @which: Selects real state or handover state
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use cons_state_try_cmpxchg().
+ */
+static inline void cons_state_set(struct console *con, enum state_selector which,
+ struct cons_state *new)
+{
+ atomic_long_set(&ACCESS_PRIVATE(con, atomic_state[which]), new->atom);
+}
+
+/**
+ * cons_state_read - Helper function to read the console state
+ * @con: Console to update
+ * @which: Selects real state or handover state
+ * @state: The state to store the result
+ */
+static inline void cons_state_read(struct console *con, enum state_selector which,
+ struct cons_state *state)
+{
+ state->atom = atomic_long_read(&ACCESS_PRIVATE(con, atomic_state[which]));
+}
+
+/**
+ * cons_state_try_cmpxchg() - Helper function for atomic_long_try_cmpxchg() on console state
+ * @con: Console to update
+ * @which: Selects real state or handover state
+ * @old: Old/expected state
+ * @new: New state
+ *
+ * Returns: True on success, false on fail
+ */
+static inline bool cons_state_try_cmpxchg(struct console *con,
+ enum state_selector which,
+ struct cons_state *old,
+ struct cons_state *new)
+{
+ return atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, atomic_state[which]),
+ &old->atom, new->atom);
+}
+
+/**
+ * cons_nobkl_init - Initialize the NOBKL console specific data
+ * @con: Console to initialize
+ */
+void cons_nobkl_init(struct console *con)
+{
+ struct cons_state state = { };
+
+ cons_state_set(con, CON_STATE_CUR, &state);
+ cons_state_set(con, CON_STATE_REQ, &state);
+}
+
+/**
+ * cons_nobkl_cleanup - Cleanup the NOBKL console specific data
+ * @con: Console to cleanup
+ */
+void cons_nobkl_cleanup(struct console *con)
+{
+ struct cons_state state = { };
+
+ cons_state_set(con, CON_STATE_CUR, &state);
+ cons_state_set(con, CON_STATE_REQ, &state);
+}
--
2.30.2