[ANNOUNCE] v5.14-rc5-rt8

From: Sebastian Andrzej Siewior
Date: Tue Aug 10 2021 - 12:37:38 EST


Dear RT folks!

I'm pleased to announce the v5.14-rc5-rt8 patch set.

Changes since v5.14-rc5-rt7:

- The locking bits have been updated by Thomas Gleixner.

- The SLUB series by Vlastimil Babka has been updated to v4r1.

Known issues
- netconsole triggers WARN.

- The "Memory controller" (CONFIG_MEMCG) has been disabled.

- An RCU and ARM64 warning has been fixed by Valentin Schneider. It is
still not clear if the RCU-related change is correct.

The delta patch against v5.14-rc5-rt7 is appended below and can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/incr/patch-5.14-rc5-rt7-rt8.patch.xz

You can get this release via the git tree at:

git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.14-rc5-rt8

The RT patch against v5.14-rc5 can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patch-5.14-rc5-rt8.patch.xz

The split quilt queue is available at:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patches-5.14-rc5-rt8.tar.xz

Sebastian

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 65fa7498a4d2c..531cb503d4c49 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -16,6 +16,7 @@
#include <linux/linkage.h>
#include <linux/rbtree_types.h>
#include <linux/spinlock_types_raw.h>
+#include <linux/compiler.h>

extern int max_lock_depth; /* for sysctl */

@@ -40,7 +41,7 @@ struct rt_mutex_base {
*/
static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock)
{
- return lock->owner != NULL;
+ return READ_ONCE(lock->owner) != NULL;
}

extern void rt_mutex_base_init(struct rt_mutex_base *rtb);
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index 7ed2a33df7cc7..4fc72199cc9d2 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -112,7 +112,7 @@ static __always_inline void spin_unlock_irq(spinlock_t *lock)
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock,
unsigned long flags)
{
- spin_unlock(lock);
+ rt_spin_unlock(lock);
}

#define spin_trylock(lock) \
diff --git a/kernel/futex.c b/kernel/futex.c
index 41e3d63160a78..fcc0570868b7b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1357,27 +1357,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
return 0;
}

-static int lookup_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- struct futex_q *top_waiter = futex_top_waiter(hb, key);
-
- /*
- * If there is a waiter on that futex, validate it and
- * attach to the pi_state when the validation succeeds.
- */
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * We are the first waiter - try to look up the owner based on
- * @uval and attach to it.
- */
- return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
-}
-
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
int err;
@@ -2134,13 +2113,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
if (uaddr1 == uaddr2)
return -EINVAL;

- /*
- * requeue_pi requires a pi_state, try to allocate it now
- * without any locks in case it fails.
- */
- if (refill_pi_state_cache())
- return -ENOMEM;
-
/*
* futex_requeue() allows the caller to define the number
* of waiters to wake up via the @nr_wake argument. With
@@ -2164,6 +2136,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
*/
if (nr_wake != 1)
return -EINVAL;
+
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
}

retry:
@@ -2213,7 +2192,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}
}

- if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ if (requeue_pi) {
struct task_struct *exiting = NULL;

/*
@@ -2232,18 +2211,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, ret contains the
- * vpid of the top waiter task.
+ * reference to it. If the lock was taken, @ret contains the
+ * VPID of the top waiter task.
* If the lock was not taken, we have pi_state and an initial
* refcount on it. In case of an error we have nothing.
*
* The top waiter's requeue_state is up to date:
*
* - If the lock was acquired atomically (ret > 0), then
- * the state is Q_REQUEUE_PI_LOCKED. No matter whether
- * the below lookup_pi_state() fails or not requeue_state
- * is correct because that waiter is dequeued and woken
- * up and nothing can hold it up.
+ * the state is Q_REQUEUE_PI_LOCKED.
*
* - If the trylock failed with an error (ret < 0) then
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
@@ -2262,19 +2238,25 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
WARN_ON(pi_state);
task_count++;
/*
- * If we acquired the lock, then the user space value
- * of uaddr2 should be vpid. It cannot be changed by
- * the top waiter as it is blocked on hb2 lock if it
- * tries to do so. If something fiddled with it behind
- * our back the pi state lookup might unearth it. So
- * we rather use the known value than rereading and
- * handing potential crap to lookup_pi_state.
+ * If futex_proxy_trylock_atomic() acquired the
+ * user space futex, then the user space value
+ * @uaddr2 has been set to the @hb1's top waiter
+ * task VPID. This task is guaranteed to be alive
+ * and cannot be exiting because it is either
+ * sleeping or blocked on @hb2 lock.
*
- * If that call succeeds then we have pi_state and an
- * initial refcount on it.
+ * The @uaddr2 futex cannot have waiters either as
+ * otherwise futex_proxy_trylock_atomic() would not
+ * have succeeded.
+ *
+ * In order to requeue waiters to @hb2, pi state is
+ * required. Hand in the VPID value (@ret) and
+ * allocate PI state with an initial refcount on
+ * it.
*/
- ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
- &pi_state, &exiting);
+ ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
+ &exiting);
+ WARN_ON(ret);
}

switch (ret) {
@@ -2413,9 +2395,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}

/*
- * We took an extra initial reference to the pi_state either
- * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
- * need to drop it here again.
+ * We took an extra initial reference to the pi_state either in
+ * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need
+ * to drop it here again.
*/
put_pi_state(pi_state);

@@ -2594,7 +2576,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* Modifying pi_state _before_ the user space value would leave the
* pi_state in an inconsistent state when we fault here, because we
* need to drop the locks to handle the fault. This might be observed
- * in the PID check in lookup_pi_state.
+ * in the PID checks when attaching to PI state.
*/
retry:
if (!argowner) {
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7522c3abacb6c..44472115aaf66 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -284,11 +284,28 @@ static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
}
#endif

+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ waiter->prio = __waiter_prio(task);
+ waiter->deadline = task->dl.deadline;
+}
+
/*
* Only use with rt_mutex_waiter_{less,equal}()
*/
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }

static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -356,11 +373,15 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod

if (rt_mutex_waiter_less(aw, bw))
return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
if (rt_mutex_waiter_less(bw, aw))
return 0;

/* NOTE: relies on waiter->ww_ctx being set before insertion */
- if (build_ww_mutex() && aw->ww_ctx) {
+ if (aw->ww_ctx) {
if (!bw->ww_ctx)
return 1;

@@ -775,8 +796,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* serializes all pi_waiters access and rb_erase() does not care about
* the values of the node being removed.
*/
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);

rt_mutex_enqueue(lock, waiter);

@@ -1045,8 +1065,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
- waiter->deadline = task->dl.deadline;
+ waiter_update_prio(waiter, task);

/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -1284,27 +1303,34 @@ static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
}

#ifdef CONFIG_SMP
-/*
- * Note that owner is a speculative pointer and dereferencing relies
- * on rcu_read_lock() and the check against the lock owner.
- */
-static bool rtmutex_adaptive_spinwait(struct rt_mutex_base *lock,
- struct task_struct *owner)
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
bool res = true;

rcu_read_lock();
for (;;) {
- /* Owner changed. Trylock again */
+ /* If owner changed, trylock again. */
if (owner != rt_mutex_owner(lock))
break;
/*
- * Ensure that owner->on_cpu is dereferenced _after_
- * checking the above to be valid.
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
*/
barrier();
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is no longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner->on_cpu || waiter != rt_mutex_top_waiter(lock) ||
+ need_resched() || vcpu_is_preempted(task_cpu(owner))) {
res = false;
break;
}
@@ -1314,8 +1340,9 @@ static bool rtmutex_adaptive_spinwait(struct rt_mutex_base *lock,
return res;
}
#else
-static bool rtmutex_adaptive_spinwait(struct rt_mutex_base *lock,
- struct task_struct *owner)
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
{
return false;
}
@@ -1434,7 +1461,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = NULL;
raw_spin_unlock_irq(&lock->wait_lock);

- if (!owner || !rtmutex_adaptive_spinwait(lock, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
schedule();

raw_spin_lock_irq(&lock->wait_lock);
@@ -1616,7 +1643,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
owner = NULL;
raw_spin_unlock_irq(&lock->wait_lock);

- if (!owner || !rtmutex_adaptive_spinwait(lock, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
schedule_rtlock();

raw_spin_lock_irq(&lock->wait_lock);
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 3ca0f167df544..a077079e387ca 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
int a_prio = a->task->prio;
int b_prio = b->task->prio;

- if (dl_prio(a_prio) || dl_prio(b_prio)) {
+ if (rt_prio(a_prio) || rt_prio(b_prio)) {

if (a_prio > b_prio)
return true;
diff --git a/localversion-rt b/localversion-rt
index 045478966e9f1..700c857efd9ba 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt7
+-rt8
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1c673c323baf2..ec2bb0beed757 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -502,6 +502,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;

+ cpus_read_lock();
mutex_lock(&slab_mutex);

s->refcount--;
@@ -516,6 +517,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
}
out_unlock:
mutex_unlock(&slab_mutex);
+ cpus_read_unlock();
}
EXPORT_SYMBOL(kmem_cache_destroy);

diff --git a/mm/slub.c b/mm/slub.c
index 5d775fafd2160..ef022fe159c65 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -463,7 +463,8 @@ static inline bool ___cmpxchg_double_slab(struct kmem_cache *s, struct page *pag
} else
#endif
{
- unsigned long flags;
+ /* init to 0 to prevent spurious warnings */
+ unsigned long flags = 0;

__slab_lock(page, &flags, disable_irqs);
if (page->freelist == freelist_old &&
@@ -2636,13 +2637,13 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s)
static DEFINE_MUTEX(flush_lock);
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);

-static void flush_all(struct kmem_cache *s)
+static void flush_all_cpus_locked(struct kmem_cache *s)
{
struct slub_flush_work *sfw;
unsigned int cpu;

+ lockdep_assert_cpus_held();
mutex_lock(&flush_lock);
- cpus_read_lock();

for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
@@ -2663,10 +2664,16 @@ static void flush_all(struct kmem_cache *s)
flush_work(&sfw->work);
}

- cpus_read_unlock();
mutex_unlock(&flush_lock);
}

+static void flush_all(struct kmem_cache *s)
+{
+ cpus_read_lock();
+ flush_all_cpus_locked(s);
+ cpus_read_unlock();
+}
+
/*
* Use the cpu notifier to insure that the cpu slabs are flushed when
* necessary.
@@ -4236,7 +4243,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
int node;
struct kmem_cache_node *n;

- flush_all(s);
+ flush_all_cpus_locked(s);
/* Attempt to free all objects */
for_each_kmem_cache_node(s, node, n) {
free_partial(s, n);
@@ -4512,7 +4519,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_do_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -4524,7 +4531,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
unsigned long flags;
int ret = 0;

- flush_all(s);
for_each_kmem_cache_node(s, node, n) {
INIT_LIST_HEAD(&discard);
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
@@ -4574,13 +4580,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return ret;
}

+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+ flush_all(s);
+ return __kmem_cache_do_shrink(s);
+}
+
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;

mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ list_for_each_entry(s, &slab_caches, list) {
+ flush_all_cpus_locked(s);
+ __kmem_cache_do_shrink(s);
+ }
mutex_unlock(&slab_mutex);

return 0;