[ANNOUNCE] v5.10.1-rt20

From: Sebastian Andrzej Siewior
Date: Fri Dec 18 2020 - 13:17:25 EST


Dear RT folks!

I'm pleased to announce the v5.10.1-rt20 patch set.

Changes since v5.10.1-rt19:

- Fix a compile failure in the trace branch profiler introduced in the
  previous release. Reported by the kernel test robot.

- Mike Galbraith reported errors in the z3fold code. Patches by Vitaly
  Wool.

- Update John Ogness' printk patches:
  - vprintk_store() synced with upstream.
  - pr_flush() now auto-detects whether it may sleep (a short
    call-site sketch follows this list).
  - pr_flush() is now called for all panic and oops scenarios.
  - @clear_seq changed from atomic64_t to a latched_seq to allow
    lockless reader access.
  - @devkmsg_user.seq changed to atomic64_t to guarantee
    consistent values.
  - All printing threads are now woken for new messages, instead of
    just one.
  - kmsg dumpers are now called with a copy of the iterator.
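
For reference, here is roughly how a pr_flush() call site changes with
the new prototype; the may_sleep argument is gone and the function now
derives it from the calling context, as can be seen in the delta below.
The values are purely illustrative:

    /* v5.10.1-rt19 */
    pr_flush(true, 1000, true);   /* may_sleep, timeout_ms, reset_on_progress */

    /* v5.10.1-rt20 */
    pr_flush(1000, true);         /* timeout_ms, reset_on_progress */

    /* internally, pr_flush() now decides on its own: */
    may_sleep = (preemptible() && !in_softirq());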

The issues listed in the "Known issues" section were not introduced by
this release; they were noticed by John during his printk rework.


Known issues
- kdb/kgdb can easily deadlock.
- kmsg dumpers expecting not to be called in parallel can clobber
  their temp buffer.
- netconsole triggers WARN.

The delta patch against v5.10.1-rt19 is appended below and can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.10/incr/patch-5.10.1-rt19-rt20.patch.xz

You can get this release via the git tree at:

git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.10.1-rt20

The RT patch against v5.10.1 can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patch-5.10.1-rt20.patch.xz

The split quilt queue is available at:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.1-rt20.tar.xz

Sebastian

diff --git a/include/linux/printk.h b/include/linux/printk.h
index a964b42ccb974..2408f6c956deb 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -476,7 +476,7 @@ extern int kptr_restrict;
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

-bool pr_flush(bool may_sleep, int timeout_ms, bool reset_on_progress);
+bool pr_flush(int timeout_ms, bool reset_on_progress);

/*
* ratelimited messages with local ratelimit_state,
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 515e552879dd3..b5f9604d0c98a 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -457,7 +457,6 @@ void __init mount_block_root(char *name, int flags)
printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify "
"explicit textual name for \"root=\" boot option.\n");
#endif
- pr_flush(true, 1000, true);
panic("VFS: Unable to mount root fs on %s", b);
}
if (!(flags & SB_RDONLY)) {
diff --git a/kernel/panic.c b/kernel/panic.c
index c48865044949b..a14e2f5a9f550 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,12 +177,28 @@ static void panic_print_sys_info(void)
void panic(const char *fmt, ...)
{
static char buf[1024];
+ va_list args2;
va_list args;
long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;

+ console_verbose();
+ pr_emerg("Kernel panic - not syncing:\n");
+ va_start(args2, fmt);
+ va_copy(args, args2);
+ vprintk(fmt, args2);
+ va_end(args2);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ /*
+ * Avoid nested stack-dumping if a panic occurs during oops processing
+ */
+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ dump_stack();
+#endif
+ pr_flush(1000, true);
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -213,24 +229,13 @@ void panic(const char *fmt, ...)
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();

- console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);

if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';

- pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
- /*
- * Avoid nested stack-dumping if a panic occurs during oops processing
- */
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
- dump_stack();
-#endif
-
/*
* If kgdb is enabled, give it a chance to run before we stop all
* the other CPUs or else we won't be able to debug processes left
@@ -554,6 +559,7 @@ static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+ pr_flush(1000, true);
}

/*
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index dc5da374a005f..56d84a28ca079 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -337,7 +337,7 @@ enum log_flags {
};

#ifdef CONFIG_PRINTK
-/* The syslog_lock protects syslog_* variables. */
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_SPINLOCK(syslog_lock);

/* Set to enable sync mode. Once set, it is never cleared. */
@@ -350,8 +350,21 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;

-/* the next printk record to read after the last 'clear' command */
-static atomic64_t clear_seq = ATOMIC64_INIT(0);
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
+
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @syslog_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};

#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
@@ -400,6 +413,31 @@ static bool printk_percpu_data_ready(void)
return __printk_percpu_data_ready;
}

+/* Must be called under syslog_lock. */
+void latched_seq_write(struct latched_seq *ls, u64 val)
+{
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[0] = val;
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
+}
+
+/* Can be called from any context. */
+u64 latched_seq_read_nolock(struct latched_seq *ls)
+{
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
+
+ do {
+ seq = raw_read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
+
+ return val;
+}
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -567,7 +605,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,

/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
+ atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
@@ -668,21 +706,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;

- if (!prb_read_valid(prb, user->seq, r)) {
+ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
goto out;
}

ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb, user->seq, r));
+ prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
}

- if (user->seq < prb_first_valid_seq(prb)) {
+ if (atomic64_read(&user->seq) < prb_first_valid_seq(prb)) {
/* our last seen message is gone, return error and reset */
- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
ret = -EPIPE;
goto out;
}
@@ -692,7 +730,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
&r->text_buf[0], r->info->text_len,
&r->info->dev_info);

- user->seq = r->info->seq + 1;
+ atomic64_set(&user->seq, r->info->seq + 1);

if (len > count) {
ret = -EINVAL;
@@ -730,7 +768,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_SET:
/* the first record */
- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -738,11 +776,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->seq = atomic64_read(&clear_seq);
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->seq = prb_next_seq(prb);
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
@@ -760,9 +798,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)

poll_wait(file, &log_wait, wait);

- if (prb_read_valid(prb, user->seq, NULL)) {
+ if (prb_read_valid(prb, atomic64_read(&user->seq), NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < prb_first_valid_seq(prb))
+ if (atomic64_read(&user->seq) < prb_first_valid_seq(prb))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
@@ -799,7 +837,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));

- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));

file->private_data = user;
return 0;
@@ -850,9 +888,6 @@ void log_buf_vmcoreinfo_setup(void)
* parse it and detect any changes to structure down the line.
*/

- VMCOREINFO_SIZE(atomic64_t);
- VMCOREINFO_TYPE_OFFSET(atomic64_t, counter);
-
VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
@@ -894,6 +929,9 @@ void log_buf_vmcoreinfo_setup(void)

VMCOREINFO_SIZE(atomic_long_t);
VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif

@@ -1341,11 +1379,11 @@ static int syslog_print(char __user *buf, int size)
char *text;
int len = 0;

- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;

- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);

while (size > 0) {
size_t n;
@@ -1414,12 +1452,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 seq;
bool time;

- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;

time = printk_time;
- clr_seq = atomic64_read(&clear_seq);
+ clr_seq = latched_seq_read_nolock(&clear_seq);

/*
* Find first record that fits, including all following records,
@@ -1430,24 +1468,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
len += get_record_print_text_size(&info, line_count, true, time);

/*
- * Keep track of the latest in case new records are coming in fast
- * and overwriting the older records.
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is the best effort
+ * that prevents an infinite loop.
*/
newest_seq = seq;
-
- /*
- * Move first record forward until length fits into the buffer. This
- * is a best effort attempt. If @newest_seq is reached because the
- * ringbuffer is wrapping too fast, just start filling the buffer
- * from there.
- */
prb_for_each_info(clr_seq, prb, seq, &info, &line_count) {
if (len <= size || info.seq > newest_seq)
break;
len -= get_record_print_text_size(&info, line_count, true, time);
}

- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);

len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -1469,8 +1502,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}

- if (clear)
- atomic64_set(&clear_seq, seq);
+ if (clear) {
+ spin_lock_irq(&syslog_lock);
+ latched_seq_write(&clear_seq, seq);
+ spin_unlock_irq(&syslog_lock);
+ }

kfree(text);
return len;
@@ -1478,7 +1514,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear)

static void syslog_clear(void)
{
- atomic64_set(&clear_seq, prb_next_seq(prb));
+ spin_lock_irq(&syslog_lock);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
+ spin_unlock_irq(&syslog_lock);
}

int do_syslog(int type, char __user *buf, int len, int source)
@@ -1504,9 +1542,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
+
+ /* Get a consistent copy of @syslog_seq. */
spin_lock_irq(&syslog_lock);
seq = syslog_seq;
spin_unlock_irq(&syslog_lock);
+
error = wait_event_interruptible(log_wait,
prb_read_valid(prb, seq, NULL));
if (error)
@@ -1743,7 +1784,7 @@ struct printk_recursion {
static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
static char printk_recursion_count[NUM_RECURSION_CTX];

-static char *get_printk_count(void)
+static char *printk_recursion_counter(void)
{
struct printk_recursion *rec;
char *count;
@@ -1764,12 +1805,12 @@ static char *get_printk_count(void)
return count;
}

-static bool printk_enter(unsigned long *flags)
+static bool printk_enter_irqsave(unsigned long *flags)
{
char *count;

local_irq_save(*flags);
- count = get_printk_count();
+ count = printk_recursion_counter();
/* Only 1 level of recursion allowed. */
if (*count > 1) {
local_irq_restore(*flags);
@@ -1780,11 +1821,11 @@ static bool printk_enter(unsigned long *flags)
return true;
}

-static void printk_exit(unsigned long flags)
+static void printk_exit_irqrestore(unsigned long flags)
{
char *count;

- count = get_printk_count();
+ count = printk_recursion_counter();
(*count)--;
local_irq_restore(flags);
}
@@ -1795,10 +1836,53 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}

+/**
+ * parse_prefix - Parse level and control flags.
+ *
+ * @text: The terminated text message.
+ * @level: A pointer to the current level value, will be updated.
+ * @lflags: A pointer to the current log flags, will be updated.
+ *
+ * @level may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @level must be set to
+ * LOGLEVEL_DEFAULT in order to be updated with the parsed value.
+ *
+ * @lflags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @lflags will be OR'd with the parsed
+ * value.
+ *
+ * Return: The length of the parsed level and control flags.
+ */
+static u16 parse_prefix(char *text, int *level, enum log_flags *lflags)
+{
+ u16 prefix_len = 0;
+ int kern_level;
+
+ while (*text) {
+ kern_level = printk_get_level(text);
+ if (!kern_level)
+ break;
+
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level && *level == LOGLEVEL_DEFAULT)
+ *level = kern_level - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ if (lflags)
+ *lflags |= LOG_CONT;
+ }
+
+ prefix_len += 2;
+ text += 2;
+ }
+
+ return prefix_len;
+}
+
static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags,
const char *fmt, va_list args)
{
- char *orig_text = text;
u16 text_len;

text_len = vscnprintf(text, size, fmt, args);
@@ -1809,15 +1893,15 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl
*lflags |= LOG_NEWLINE;
}

- /* Strip kernel syslog prefix. */
+ /* Strip log level and control flags. */
if (facility == 0) {
- while (text_len >= 2 && printk_get_level(text)) {
- text_len -= 2;
- text += 2;
- }
+ u16 prefix_len;

- if (text != orig_text)
- memmove(orig_text, text, text_len);
+ prefix_len = parse_prefix(text, NULL, NULL);
+ if (prefix_len) {
+ text_len -= prefix_len;
+ memmove(text, text + prefix_len, text_len);
+ }
}

return text_len;
@@ -1835,47 +1919,41 @@ static int vprintk_store(int facility, int level,
struct printk_record r;
unsigned long irqflags;
u16 trunc_msg_len = 0;
- char lvlbuf[8];
+ char prefix_buf[8];
+ u16 reserve_size;
va_list args2;
u16 text_len;
int ret = 0;
u64 ts_nsec;
u64 seq;

+ /*
+ * Since the duration of printk() can vary depending on the message
+ * and state of the ringbuffer, grab the timestamp now so that it is
+ * close to the call of printk(). This provides a more deterministic
+ * timestamp with respect to the caller.
+ */
ts_nsec = local_clock();

- if (!printk_enter(&irqflags))
+ if (!printk_enter_irqsave(&irqflags))
return 0;

- va_copy(args2, args);
-
/*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
+ * The sprintf needs to come first since the syslog prefix might be
+ * passed in as a parameter. An extra byte must be reserved so that
+ * later the vscnprintf() into the reserved buffer has room for the
+ * terminating '\0', which is not counted by vsnprintf().
*/
- text_len = vsnprintf(&lvlbuf[0], sizeof(lvlbuf), fmt, args) + 1;
- if (text_len > CONSOLE_LOG_MAX)
- text_len = CONSOLE_LOG_MAX;
+ va_copy(args2, args);
+ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
+ va_end(args2);
+
+ if (reserve_size > LOG_LINE_MAX)
+ reserve_size = LOG_LINE_MAX;

/* Extract log level or control flags. */
- if (facility == 0) {
- int kern_level;
- int i;
-
- for (i = 0; i < sizeof(lvlbuf); i += 2) {
- kern_level = printk_get_level(&lvlbuf[i]);
- if (!kern_level)
- break;
- switch (kern_level) {
- case '0' ... '7':
- if (level == LOGLEVEL_DEFAULT)
- level = kern_level - '0';
- break;
- case 'c': /* KERN_CONT */
- lflags |= LOG_CONT;
- }
- }
- }
+ if (facility == 0)
+ parse_prefix(&prefix_buf[0], &level, &lflags);

if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
@@ -1884,11 +1962,11 @@ static int vprintk_store(int facility, int level,
lflags |= LOG_NEWLINE;

if (lflags & LOG_CONT) {
- prb_rec_init_wr(&r, text_len);
+ prb_rec_init_wr(&r, reserve_size);
if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
seq = r.info->seq;
- text_len = printk_sprint(&r.text_buf[r.info->text_len], text_len,
- facility, &lflags, fmt, args2);
+ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
+ facility, &lflags, fmt, args);
r.info->text_len += text_len;

if (lflags & LOG_NEWLINE) {
@@ -1904,12 +1982,17 @@ static int vprintk_store(int facility, int level,
}
}

- prb_rec_init_wr(&r, text_len);
+ /*
+ * Explicitly initialize the record before every prb_reserve() call.
+ * prb_reserve_in_last() and prb_reserve() purposely invalidate the
+ * structure when they fail.
+ */
+ prb_rec_init_wr(&r, reserve_size);
if (!prb_reserve(&e, prb, &r)) {
/* truncate the message if it is too long for empty buffer */
- truncate_msg(&text_len, &trunc_msg_len);
+ truncate_msg(&reserve_size, &trunc_msg_len);

- prb_rec_init_wr(&r, text_len + trunc_msg_len);
+ prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
goto out;
}
@@ -1917,7 +2000,7 @@ static int vprintk_store(int facility, int level,
seq = r.info->seq;

/* fill message */
- text_len = printk_sprint(&r.text_buf[0], text_len, facility, &lflags, fmt, args2);
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args);
if (trunc_msg_len)
memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
r.info->text_len = text_len + trunc_msg_len;
@@ -1943,8 +2026,7 @@ static int vprintk_store(int facility, int level,
if (facility == 0 && final_commit && any_console_can_sync())
print_sync_until(NULL, seq + 1);

- va_end(args2);
- printk_exit(irqflags);
+ printk_exit_irqrestore(irqflags);
return ret;
}

@@ -2174,8 +2256,6 @@ static void console_try_thread(struct console *con)

#else /* CONFIG_PRINTK */

-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
#define printk_time false

#define prb_read_valid(rb, seq, r) false
@@ -2930,7 +3010,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
int pending = __this_cpu_xchg(printk_pending, 0);

if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
+ wake_up_interruptible_all(&log_wait);
}

static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
@@ -3107,15 +3187,16 @@ void kmsg_dump(enum kmsg_dump_reason reason)
}

/*
- * Give the printing threads time to flush, allowing up to 1
- * second of no printing forward progress before giving up.
+ * Give the printing threads time to flush, allowing up to
+ * 1s of no printing forward progress before giving up.
*/
- pr_flush(false, 100, true);
+ pr_flush(1000, true);
}

rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
enum kmsg_dump_reason max_reason = dumper->max_reason;
+ struct kmsg_dumper dumper_copy;

/*
* If client has not provided a specific max_reason, default
@@ -3128,16 +3209,18 @@ void kmsg_dump(enum kmsg_dump_reason reason)
if (reason > max_reason)
continue;

- /* initialize iterator with data about the stored records */
- dumper->active = true;
+ /*
+ * Invoke a copy of the dumper to iterate over the records.
+ * This allows kmsg_dump() to be called simultaneously on
+ * multiple CPUs.
+ */

- kmsg_dump_rewind_nolock(dumper);
+ memcpy(&dumper_copy, dumper, sizeof(dumper_copy));
+ INIT_LIST_HEAD(&dumper_copy.list);
+ dumper_copy.active = true;

- /* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
+ kmsg_dump_rewind_nolock(&dumper_copy);
+ dumper_copy.dump(&dumper_copy, reason);
}
rcu_read_unlock();
}
@@ -3277,10 +3360,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
}

/*
- * Move first record forward until length fits into the buffer. This
- * is a best effort attempt. If @dumper->next_seq is reached because
- * the ringbuffer is wrapping too fast, just start filling the buffer
- * from there.
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is the best effort
+ * that prevents an infinite loop.
*/
prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
if (len <= size || info.seq >= dumper->next_seq)
@@ -3320,10 +3403,12 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
+ *
+ * The function is similar to kmsg_dump_rewind(), but grabs no locks.
*/
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
- dumper->cur_seq = atomic64_read(&clear_seq);
+ dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
dumper->next_seq = prb_next_seq(prb);
}

@@ -3456,7 +3541,6 @@ static void pr_msleep(bool may_sleep, int ms)
/**
* pr_flush() - Wait for printing threads to catch up.
*
- * @may_sleep: Context allows msleep() calls.
* @timeout_ms: The maximum time (in ms) to wait.
* @reset_on_progress: Reset the timeout if forward progress is seen.
*
@@ -3466,19 +3550,21 @@ static void pr_msleep(bool may_sleep, int ms)
* If @reset_on_progress is true, the timeout will be reset whenever any
* printer has been seen to make some forward progress.
*
- * Context: Any context if @timeout_ms is 0 or @may_sleep is false. Otherwise
- * process context.
+ * Context: Any context.
* Return: true if all enabled printers are caught up.
*/
-bool pr_flush(bool may_sleep, int timeout_ms, bool reset_on_progress)
+bool pr_flush(int timeout_ms, bool reset_on_progress)
{
int remaining = timeout_ms;
struct console *con;
u64 last_diff = 0;
+ bool may_sleep;
u64 printk_seq;
u64 diff;
u64 seq;

+ may_sleep = (preemptible() && !in_softirq());
+
seq = prb_next_seq(prb);

for (;;) {
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa27..0de1e9c4d381b 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
- int pc;
+ unsigned int trace_ctx;
const char *p;

if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
if (atomic_read(&data->disabled))
goto out;

- pc = preempt_count();
+ trace_ctx = _tracing_gen_ctx_flags(flags);
buffer = tr->array_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
- sizeof(*entry), flags, pc);
+ sizeof(*entry), trace_ctx);
if (!event)
goto out;

diff --git a/lib/bug.c b/lib/bug.c
index 7103440c0ee1a..baf61c307a6a2 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -205,6 +205,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
else
pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
(void *)bugaddr);
+ pr_flush(1000, true);

return BUG_TRAP_TYPE_BUG;
}
diff --git a/localversion-rt b/localversion-rt
index 483ad771f201a..e095ab8197147 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt19
+-rt20
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 18feaa0bc5377..dacb0d70fa61c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -90,7 +90,7 @@ struct z3fold_buddy_slots {
* be enough slots to hold all possible variants
*/
unsigned long slot[BUDDY_MASK + 1];
- unsigned long pool; /* back link + flags */
+ unsigned long pool; /* back link */
rwlock_t lock;
};
#define HANDLE_FLAG_MASK (0x03)
@@ -185,7 +185,7 @@ enum z3fold_page_flags {
* handle flags, go under HANDLE_FLAG_MASK
*/
enum z3fold_handle_flags {
- HANDLES_ORPHANED = 0,
+ HANDLES_NOFREE = 0,
};

/*
@@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr)
z3fold_page_unlock(zhdr);
}

-static inline void free_handle(unsigned long handle)
+static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
{
struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
int i;
bool is_free;

@@ -316,22 +315,19 @@ static inline void free_handle(unsigned long handle)
if (WARN_ON(*(unsigned long *)handle == 0))
return;

- zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
write_lock(&slots->lock);
*(unsigned long *)handle = 0;
- if (zhdr->slots == slots) {
+
+ if (test_bit(HANDLES_NOFREE, &slots->pool)) {
write_unlock(&slots->lock);
return; /* simple case, nothing else to do */
}

- /* we are freeing a foreign handle if we are here */
- zhdr->foreign_handles--;
+ if (zhdr->slots != slots)
+ zhdr->foreign_handles--;
+
is_free = true;
- if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
- write_unlock(&slots->lock);
- return;
- }
for (i = 0; i <= BUDDY_MASK; i++) {
if (slots->slot[i]) {
is_free = false;
@@ -343,6 +339,8 @@ static inline void free_handle(unsigned long handle)
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);

+ if (zhdr->slots == slots)
+ zhdr->slots = NULL;
kmem_cache_free(pool->c_handle, slots);
}
}
@@ -525,8 +523,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- bool is_free = true;
- int i;

WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
@@ -536,21 +532,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&page->lru);
spin_unlock(&pool->lock);

- /* If there are no foreign handles, free the handles array */
- read_lock(&zhdr->slots->lock);
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (zhdr->slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- if (!is_free)
- set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
- read_unlock(&zhdr->slots->lock);
-
- if (is_free)
- kmem_cache_free(pool->c_handle, zhdr->slots);
-
if (locked)
z3fold_page_unlock(zhdr);

@@ -642,17 +623,41 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
}
}

+static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
+{
+ enum buddy bud = HEADLESS;
+
+ if (zhdr->middle_chunks) {
+ if (!zhdr->first_chunks &&
+ chunks <= zhdr->start_middle - ZHDR_CHUNKS)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ } else {
+ if (!zhdr->first_chunks)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ else
+ bud = MIDDLE;
+ }
+
+ return bud;
+}
+
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -714,18 +719,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
if (WARN_ON(new_zhdr == zhdr))
goto out_fail;

- if (new_zhdr->first_chunks == 0) {
- if (new_zhdr->middle_chunks != 0 &&
- chunks >= new_zhdr->start_middle) {
- new_bud = LAST;
- } else {
- new_bud = FIRST;
- }
- } else if (new_zhdr->last_chunks == 0) {
- new_bud = LAST;
- } else if (new_zhdr->middle_chunks == 0) {
- new_bud = MIDDLE;
- }
+ new_bud = get_free_buddy(new_zhdr, chunks);
q = new_zhdr;
switch (new_bud) {
case FIRST:
@@ -847,9 +841,8 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}

- if (unlikely(PageIsolated(page) ||
- test_bit(PAGE_CLAIMED, &page->private) ||
- test_bit(PAGE_STALE, &page->private))) {
+ if (test_bit(PAGE_STALE, &page->private) ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -858,13 +851,16 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
- else
+ else {
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
+ }
return;
}

z3fold_compact_page(zhdr);
add_to_unbuddied(pool, zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}

@@ -886,8 +882,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
int chunks = size_to_chunks(size), i;

lookup:
+ migrate_disable();
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];

@@ -905,7 +902,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -919,7 +916,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -934,7 +931,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();

if (!zhdr) {
int cpu;
@@ -973,6 +970,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
}
}

+ if (zhdr && !zhdr->slots)
+ zhdr->slots = alloc_slots(pool,
+ can_sleep ? GFP_NOIO : GFP_ATOMIC);
return zhdr;
}

@@ -1109,17 +1109,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
retry:
zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
- if (zhdr->first_chunks == 0) {
- if (zhdr->middle_chunks != 0 &&
- chunks >= zhdr->start_middle)
- bud = LAST;
- else
- bud = FIRST;
- } else if (zhdr->last_chunks == 0)
- bud = LAST;
- else if (zhdr->middle_chunks == 0)
- bud = MIDDLE;
- else {
+ bud = get_free_buddy(zhdr, chunks);
+ if (bud == HEADLESS) {
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
@@ -1265,12 +1256,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
put_z3fold_header(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
return;
}

if (!page_claimed)
- free_handle(handle);
+ free_handle(handle, zhdr);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -1280,8 +1270,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
z3fold_page_unlock(zhdr);
return;
}
- if (unlikely(PageIsolated(page)) ||
- test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
return;
@@ -1345,6 +1334,10 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
struct page *page = NULL;
struct list_head *pos;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
+ struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
+
+ rwlock_init(&slots.lock);
+ slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);

spin_lock(&pool->lock);
if (!pool->ops || !pool->ops->evict || retries == 0) {
@@ -1359,35 +1352,36 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
list_for_each_prev(pos, &pool->lru) {
page = list_entry(pos, struct page, lru);

- /* this bit could have been set by free, in which case
- * we pass over to the next page in the pool.
- */
- if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- page = NULL;
- continue;
- }
-
- if (unlikely(PageIsolated(page))) {
- clear_bit(PAGE_CLAIMED, &page->private);
- page = NULL;
- continue;
- }
zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
break;

+ if (kref_get_unless_zero(&zhdr->refcount) == 0) {
+ zhdr = NULL;
+ break;
+ }
if (!z3fold_page_trylock(zhdr)) {
- clear_bit(PAGE_CLAIMED, &page->private);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
zhdr = NULL;
continue; /* can't evict at this point */
}
- if (zhdr->foreign_handles) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
+
+ /* test_and_set_bit is of course atomic, but we still
+ * need to do it under page lock, otherwise checking
+ * that bit in __z3fold_alloc wouldn't make sense
+ */
+ if (zhdr->foreign_handles ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
+ else
+ z3fold_page_unlock(zhdr);
zhdr = NULL;
continue; /* can't evict such page */
}
- kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
break;
@@ -1409,12 +1403,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
first_handle = 0;
last_handle = 0;
middle_handle = 0;
+ memset(slots.slot, 0, sizeof(slots.slot));
if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
+ first_handle = __encode_handle(zhdr, &slots,
+ FIRST);
if (zhdr->middle_chunks)
- middle_handle = encode_handle(zhdr, MIDDLE);
+ middle_handle = __encode_handle(zhdr, &slots,
+ MIDDLE);
if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
+ last_handle = __encode_handle(zhdr, &slots,
+ LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
@@ -1429,19 +1427,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
ret = pool->ops->evict(pool, middle_handle);
if (ret)
goto next;
- free_handle(middle_handle);
}
if (first_handle) {
ret = pool->ops->evict(pool, first_handle);
if (ret)
goto next;
- free_handle(first_handle);
}
if (last_handle) {
ret = pool->ops->evict(pool, last_handle);
if (ret)
goto next;
- free_handle(last_handle);
}
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
@@ -1455,9 +1450,11 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
spin_unlock(&pool->lock);
clear_bit(PAGE_CLAIMED, &page->private);
} else {
+ struct z3fold_buddy_slots *slots = zhdr->slots;
z3fold_page_lock(zhdr);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
+ kmem_cache_free(pool->c_handle, slots);
atomic64_dec(&pool->pages_nr);
return 0;
}
@@ -1573,8 +1570,7 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);

- if (test_bit(PAGE_HEADLESS, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private))
+ if (test_bit(PAGE_HEADLESS, &page->private))
return false;

zhdr = page_address(page);
@@ -1586,6 +1582,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
goto out;

+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ goto out;
pool = zhdr_to_pool(zhdr);
spin_lock(&pool->lock);
if (!list_empty(&zhdr->buddy))
@@ -1612,16 +1610,17 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa

VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);

- if (!z3fold_page_trylock(zhdr)) {
+ if (!z3fold_page_trylock(zhdr))
return -EAGAIN;
- }
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return -EBUSY;
}
if (work_pending(&zhdr->work)) {
@@ -1663,6 +1662,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

page_mapcount_reset(page);
+ clear_bit(PAGE_CLAIMED, &page->private);
put_page(page);
return 0;
}
@@ -1686,6 +1686,7 @@ static void z3fold_page_putback(struct page *page)
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}