[RFC PATCH] sched/proxy_exec: Extend PE blocked_on chain to rwsem write-side

From: soolaugust

Date: Wed Mar 04 2026 - 03:27:19 EST


From: zhidao su <suzhidao@xxxxxxxxxx>

Proxy Execution currently tracks blocked_on chains only through
struct mutex. This patch extends the infrastructure to support
rw_semaphore write-side blocking, allowing PE to eliminate priority
inversion where a high-priority writer waits for a low-priority
write lock holder.

Changes:

1. include/linux/sched.h: Generalise blocked_on from struct mutex *
to void *, and add a 2-bit blocked_on_type field encoding the
primitive type (BLOCKED_ON_NONE/MUTEX/RWSEM). All existing mutex
helpers are renamed to _mutex suffix; compatibility wrappers
preserve the old names so that mutex.c requires no change.
New __set/clear_task_blocked_on_rwsem() helpers are added
(void * parameter avoids pulling rwsem.h into sched.h).

2. kernel/locking/rwsem.c: In rwsem_down_write_slowpath(), call
__set_task_blocked_on_rwsem() after entering the wait queue
(wait_lock held), re-set it after each schedule() wakeup, and
clear it on lock acquisition and on signal-interrupted exit.
Pattern mirrors the existing mutex slowpath.

3. kernel/sched/core.c: find_proxy_task() now dispatches on
blocked_on_type. The BLOCKED_ON_RWSEM branch acquires
sem->wait_lock, re-validates blocked_on, then calls
rwsem_owner() to retrieve the owner; a NULL owner terminates
the chain. (Note: rwsem_owner() only masks out the owner flag
bits, so a reader-owned sem can yield a stale task pointer
rather than NULL — an RWSEM_READER_OWNED check is needed before
the returned owner may be dereferenced.) Owner validity checks
(on_rq, sched_delayed, cpu, migrating) are shared between both
branches.

4. tools/testing/selftests/sched/proxy_exec_test.c: Add TC-4
(single-level rwsem write PE) and TC-5 (mixed rwsem->mutex
chain). TAP plan updated from 3 to 5.

PREEMPT_RT limitation: rwsem is backed by rwbase_rt/rt_mutex under
CONFIG_PREEMPT_RT, so the new code paths are not compiled on RT
kernels. On RT, blocked_on is never set for rwsems, so
find_proxy_task() never encounters a BLOCKED_ON_RWSEM node.

Signed-off-by: zhidao su <suzhidao@xxxxxxxxxx>
---
include/linux/sched.h | 110 ++-
kernel/locking/rwsem.c | 9 +
kernel/sched/core.c | 106 ++-
.../testing/selftests/sched/proxy_exec_test.c | 763 ++++++++++++++++++
4 files changed, 943 insertions(+), 45 deletions(-)
create mode 100644 tools/testing/selftests/sched/proxy_exec_test.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7b4a980eb2..4bef3618889 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1237,7 +1237,8 @@ struct task_struct {
struct rt_mutex_waiter *pi_blocked_on;
#endif

- struct mutex *blocked_on; /* lock we're blocked on */
+ void *blocked_on; /* lock we're blocked on */
+ unsigned int blocked_on_type : 2; /* enum blocked_on_type */

#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
/*
@@ -2178,8 +2179,21 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
__cond_resched_rwlock_write(lock); \
})

+/*
+ * Type tag for task_struct::blocked_on. Allows PE chain traversal
+ * to handle different lock primitives (mutex, rwsem write-side).
+ */
+enum blocked_on_type {
+ BLOCKED_ON_NONE = 0,
+ BLOCKED_ON_MUTEX = 1,
+ BLOCKED_ON_RWSEM = 2,
+};
+
#ifndef CONFIG_PREEMPT_RT
-static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
+/* --- mutex blocked_on helpers --- */
+
+static inline struct mutex *
+__get_task_blocked_on_mutex(struct task_struct *p)
{
struct mutex *m = p->blocked_on;

@@ -2188,7 +2202,8 @@ static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
return m;
}

-static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+__set_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
{
struct mutex *blocked_on = READ_ONCE(p->blocked_on);

@@ -2204,15 +2219,18 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
*/
WARN_ON_ONCE(blocked_on && blocked_on != m);
WRITE_ONCE(p->blocked_on, m);
+ p->blocked_on_type = BLOCKED_ON_MUTEX;
}

-static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+set_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
{
guard(raw_spinlock_irqsave)(&m->wait_lock);
- __set_task_blocked_on(p, m);
+ __set_task_blocked_on_mutex(p, m);
}

-static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+__clear_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
{
if (m) {
struct mutex *blocked_on = READ_ONCE(p->blocked_on);
@@ -2227,21 +2245,91 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
WARN_ON_ONCE(blocked_on && blocked_on != m);
}
WRITE_ONCE(p->blocked_on, NULL);
+ p->blocked_on_type = BLOCKED_ON_NONE;
}

-static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+static inline void
+clear_task_blocked_on_mutex(struct task_struct *p, struct mutex *m)
{
guard(raw_spinlock_irqsave)(&m->wait_lock);
- __clear_task_blocked_on(p, m);
+ __clear_task_blocked_on_mutex(p, m);
}
-#else
-static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+
+/* Compatibility wrappers — keep mutex.c callers unchanged */
+static inline struct mutex *
+__get_task_blocked_on(struct task_struct *p)
+{
+ return __get_task_blocked_on_mutex(p);
+}
+
+static inline void
+__set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
+ __set_task_blocked_on_mutex(p, m);
}

-static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+static inline void
+set_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+ set_task_blocked_on_mutex(p, m);
+}
+
+static inline void
+__clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
+ __clear_task_blocked_on_mutex(p, m);
}
+
+static inline void
+clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+ clear_task_blocked_on_mutex(p, m);
+}
+
+/* --- rwsem write-side blocked_on helpers --- */
+
+/*
+ * __set/clear_task_blocked_on_rwsem: called with sem->wait_lock held.
+ * Uses void* to avoid pulling struct rw_semaphore into sched.h.
+ * Callers (rwsem.c) cast sem to void* before passing.
+ */
+static inline void
+__set_task_blocked_on_rwsem(struct task_struct *p, void *sem)
+{
+ void *blocked_on = READ_ONCE(p->blocked_on);
+
+ WARN_ON_ONCE(!sem);
+ /* The task should only be setting itself as blocked */
+ WARN_ON_ONCE(p != current);
+ WARN_ON_ONCE(blocked_on && blocked_on != sem);
+ WRITE_ONCE(p->blocked_on, sem);
+ p->blocked_on_type = BLOCKED_ON_RWSEM;
+}
+
+static inline void
+__clear_task_blocked_on_rwsem(struct task_struct *p, void *sem)
+{
+ if (sem) {
+ void *blocked_on = READ_ONCE(p->blocked_on);
+
+ WARN_ON_ONCE(blocked_on && blocked_on != sem);
+ }
+ WRITE_ONCE(p->blocked_on, NULL);
+ p->blocked_on_type = BLOCKED_ON_NONE;
+}
+
+#else /* CONFIG_PREEMPT_RT */
+
+static inline void
+__clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
+static inline void
+clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
#endif /* !CONFIG_PREEMPT_RT */

static __always_inline bool need_resched(void)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 24df4d98f7d..4ef9893a3e4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1154,6 +1154,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)

if (state == TASK_UNINTERRUPTIBLE)
hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+ /* PE: mark this task as blocked on the rwsem write lock */
+ __set_task_blocked_on_rwsem(current, sem);

for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
@@ -1187,8 +1189,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
+ /* PE: re-set blocked_on after wakeup re-acquires wait_lock */
+ __set_task_blocked_on_rwsem(current, sem);
}

+ /* PE: clear blocked_on — lock acquired, wait_lock still held */
+ __clear_task_blocked_on_rwsem(current, sem);
+
if (state == TASK_UNINTERRUPTIBLE)
hung_task_clear_blocker();

@@ -1201,6 +1208,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
+ /* PE: clear blocked_on on signal-interrupted exit */
+ __clear_task_blocked_on_rwsem(current, sem);
rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
trace_contention_end(sem, -EINTR);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dc9f17b35e4..d50c8a90908 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -54,6 +54,7 @@
#include <linux/mmu_context.h>
#include <linux/mmzone.h>
#include <linux/mutex_api.h>
+#include <linux/rwsem.h>
#include <linux/nmi.h>
#include <linux/nospec.h>
#include <linux/perf_event_api.h>
@@ -6594,35 +6595,69 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
struct task_struct *owner = NULL;
int this_cpu = cpu_of(rq);
struct task_struct *p;
- struct mutex *mutex;

/* Follow blocked_on chain. */
for (p = donor; task_is_blocked(p); p = owner) {
- mutex = p->blocked_on;
+ void *blocked_lock = READ_ONCE(p->blocked_on);
+ enum blocked_on_type btype = p->blocked_on_type;
+
/* Something changed in the chain, so pick again */
- if (!mutex)
+ if (!blocked_lock)
return NULL;
- /*
- * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
- * and ensure @owner sticks around.
- */
- guard(raw_spinlock)(&mutex->wait_lock);

- /* Check again that p is blocked with wait_lock held */
- if (mutex != __get_task_blocked_on(p)) {
+ if (btype == BLOCKED_ON_MUTEX) {
+ struct mutex *mutex = blocked_lock;
+
/*
- * Something changed in the blocked_on chain and
- * we don't know if only at this level. So, let's
- * just bail out completely and let __schedule()
- * figure things out (pick_again loop).
+ * By taking mutex->wait_lock we hold off concurrent
+ * mutex_unlock() and ensure @owner sticks around.
*/
- return NULL;
- }
+ guard(raw_spinlock)(&mutex->wait_lock);

- owner = __mutex_owner(mutex);
- if (!owner) {
- __clear_task_blocked_on(p, mutex);
- return p;
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain
+ * and we don't know if only at this level.
+ * Bail out and let __schedule() figure things
+ * out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+ } else if (btype == BLOCKED_ON_RWSEM) {
+ struct rw_semaphore *sem = blocked_lock;
+
+ /*
+ * Take sem->wait_lock to serialise against concurrent
+ * up_write() and ensure the owner pointer is stable.
+ */
+ guard(raw_spinlock)(&sem->wait_lock);
+
+ /*
+ * Re-check after acquiring wait_lock: blocked_on
+ * could have been cleared by a concurrent wakeup.
+ */
+ if (sem != READ_ONCE(p->blocked_on))
+ return NULL;
+
+ owner = rwsem_owner(sem);
+ if (!owner) {
+ /*
+ * No writer owner — treat as terminal node.
+ * NOTE(review): reader-owned sems may leave a
+ * stale task in sem->owner, and p is still
+ * blocked here (mutex path clears blocked_on).
+ */
+ return p;
+ }
+ } else {
+ /* Unknown blocked_on type — bail */
+ return NULL;
}

if (!READ_ONCE(owner->on_rq)) {
@@ -6630,7 +6665,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
* Owner is off the runqueue; proxy execution cannot
* proceed through it. Deactivate the donor so it will
* be properly re-enqueued when the owner eventually
- * wakes and releases the mutex.
+ * wakes and releases the lock.
*/
return proxy_deactivate(rq, donor);
}
@@ -6658,12 +6693,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)

if (task_on_rq_migrating(owner)) {
/*
- * One of the chain of mutex owners is currently migrating to this
- * CPU, but has not yet been enqueued because we are holding the
- * rq lock. As a simple solution, just schedule rq->idle to give
- * the migration a chance to complete. Much like the migrate_task
- * case we should end up back in find_proxy_task(), this time
- * hopefully with all relevant tasks already enqueued.
+ * One of the chain of lock owners is currently
+ * migrating to this CPU, but has not yet been
+ * enqueued because we are holding the rq lock. As a
+ * simple solution, just schedule rq->idle to give
+ * the migration a chance to complete. Much like the
+ * migrate_task case we should end up back in
+ * find_proxy_task(), this time hopefully with all
+ * relevant tasks already enqueued.
*/
return proxy_resched_idle(rq);
}
@@ -6683,8 +6720,8 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
/*
* It's possible we interleave with mutex_unlock like:
*
- * lock(&rq->lock);
- * find_proxy_task()
+ * lock(&rq->lock);
+ * find_proxy_task()
* mutex_unlock()
* lock(&wait_lock);
* donor(owner) = current->blocked_donor;
@@ -6694,13 +6731,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
* ...
* ttwu_runnable()
* __task_rq_lock()
- * lock(&wait_lock);
- * owner == p
+ * lock(&wait_lock);
+ * owner == p
*
- * Which leaves us to finish the ttwu_runnable() and make it go.
+ * Which leaves us to finish the ttwu_runnable() and
+ * make it go.
*
- * So schedule rq->idle so that ttwu_runnable() can get the rq
- * lock and mark owner as running.
+ * So schedule rq->idle so that ttwu_runnable() can
+ * get the rq lock and mark owner as running.
*/
return proxy_resched_idle(rq);
}
diff --git a/tools/testing/selftests/sched/proxy_exec_test.c b/tools/testing/selftests/sched/proxy_exec_test.c
new file mode 100644
index 00000000000..30fc58b9738
--- /dev/null
+++ b/tools/testing/selftests/sched/proxy_exec_test.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Proxy Execution (PE) selftest
+ *
+ * Tests for the sched_proxy_exec feature. Verifies that the kernel
+ * correctly handles RT priority inheritance through proxy execution.
+ *
+ * TC-1: Basic PE activation — low-prio mutex holder releases in bound.
+ * TC-2: Three-level blocked_on chain — PE chains through B->C so that
+ * A eventually acquires its mutex.
+ * TC-3: PE deactivate path — SIGSTOP/SIGCONT on the mutex holder.
+ * TC-4: rwsem write-side PE — single-level writer/writer contention.
+ * TC-5: Mixed chain — rwsem write -> mutex (A->rwlock->B->mutex->C).
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+#include <signal.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+static int test_count;
+
+/* ------------------------------------------------------------------ */
+/* Helpers */
+/* ------------------------------------------------------------------ */
+
+/*
+ * is_proxy_exec_enabled - check whether CONFIG_SCHED_PROXY_EXEC is active
+ *
+ * Try to read /proc/sys/kernel/sched_proxy_exec. If the file exists and
+ * contains a non-zero value the feature is considered enabled. Returns 1
+ * when enabled, 0 otherwise.
+ */
+static int is_proxy_exec_enabled(void)
+{
+ FILE *f;
+ int val = 0;
+
+ f = fopen("/proc/sys/kernel/sched_proxy_exec", "r");
+ if (!f)
+ return 0;
+
+ if (fscanf(f, "%d", &val) != 1)
+ val = 0;
+
+ fclose(f);
+ return val != 0;
+}
+
+/*
+ * set_rt_prio - set the calling thread to SCHED_FIFO at the given priority
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_rt_prio(int prio)
+{
+ struct sched_param sp = { .sched_priority = prio };
+
+ if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
+ return -1;
+ return 0;
+}
+
+/*
+ * print_result - emit a single TAP result line
+ *
+ * Increments the global test counter and prints either "ok N - name" or
+ * "not ok N - name".
+ */
+static void print_result(const char *name, int pass)
+{
+ ++test_count;
+ if (pass)
+ printf("ok %d - %s\n", test_count, name);
+ else
+ printf("not ok %d - %s\n", test_count, name);
+}
+
+/*
+ * elapsed_ms - compute elapsed wall-clock milliseconds between two
+ * CLOCK_MONOTONIC timestamps.
+ */
+static long elapsed_ms(const struct timespec *start, const struct timespec *end)
+{
+ long diff_sec = (long)(end->tv_sec - start->tv_sec);
+ long diff_nsec = (long)(end->tv_nsec - start->tv_nsec);
+
+ return diff_sec * 1000L + diff_nsec / 1000000L;
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-1: Basic PE activation */
+/* ------------------------------------------------------------------ */
+
+struct tc1_args {
+ pthread_mutex_t *mutex;
+ int hold_ms; /* how long to sleep in critical section */
+};
+
+static void *tc1_holder_thread(void *arg)
+{
+ struct tc1_args *a = arg;
+ struct timespec ts = { 0, (long)a->hold_ms * 1000000L };
+
+ /* Become a low-prio RT thread so PE applies. */
+ set_rt_prio(20);
+
+ pthread_mutex_lock(a->mutex);
+ nanosleep(&ts, NULL);
+ pthread_mutex_unlock(a->mutex);
+
+ return NULL;
+}
+
+static void test_basic_pe_activation(void)
+{
+ pthread_t holder;
+ pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+ struct tc1_args args = { .mutex = &mutex, .hold_ms = 200 };
+ struct timespec t0, t1;
+ long ms;
+ int pass;
+
+ printf("# TC-1: basic PE activation\n");
+
+ /* Spawn the low-prio holder first; let it grab the mutex. */
+ if (pthread_create(&holder, NULL, tc1_holder_thread, &args) != 0) {
+ printf("# TC-1: pthread_create failed: %s\n", strerror(errno));
+ print_result("basic_pe_activation", 0);
+ return;
+ }
+
+ /*
+ * Give the holder a moment to actually lock the mutex before this
+ * (main) thread — soon to be prio 80 — tries to acquire it.
+ */
+ usleep(20000); /* 20 ms */
+
+ /* Raise our own priority so we become the blocked high-prio waiter. */
+ if (set_rt_prio(80) != 0) {
+ printf("# TC-1: set_rt_prio(80) failed: %s\n", strerror(errno));
+ pthread_join(holder, NULL);
+ pthread_mutex_destroy(&mutex);
+ print_result("basic_pe_activation", 0);
+ return;
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ pthread_mutex_lock(&mutex);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ pthread_mutex_unlock(&mutex);
+
+ /* Restore to SCHED_OTHER for the remaining tests. */
+ {
+ struct sched_param sp = { .sched_priority = 0 };
+
+ sched_setscheduler(0, SCHED_OTHER, &sp);
+ }
+
+ pthread_join(holder, NULL);
+ pthread_mutex_destroy(&mutex);
+
+ ms = elapsed_ms(&t0, &t1);
+ pass = (ms < 300L);
+ printf("# TC-1: acquired mutex in %ld ms (limit 300 ms)\n", ms);
+ print_result("basic_pe_activation", pass);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-2: Three-level blocked_on chain */
+/* ------------------------------------------------------------------ */
+
+struct tc2_shared {
+ pthread_mutex_t mutex1;
+ pthread_mutex_t mutex2;
+
+ /* Synchronisation: holders signal when they have grabbed the lock. */
+ pthread_mutex_t sync_mutex;
+ pthread_cond_t sync_cond;
+ int holders_ready; /* incremented by each holder */
+};
+
+struct tc2_b_args {
+ struct tc2_shared *s;
+};
+
+struct tc2_c_args {
+ struct tc2_shared *s;
+ int hold_ms;
+};
+
+/* Thread C: holds mutex2, sleeps, releases. */
+static void *tc2_c_thread(void *arg)
+{
+ struct tc2_c_args *a = arg;
+ struct tc2_shared *s = a->s;
+ struct timespec ts = { 0, (long)a->hold_ms * 1000000L };
+
+ set_rt_prio(20);
+
+ pthread_mutex_lock(&s->mutex2);
+
+ /* Signal that we are ready. */
+ pthread_mutex_lock(&s->sync_mutex);
+ s->holders_ready++;
+ pthread_cond_broadcast(&s->sync_cond);
+ pthread_mutex_unlock(&s->sync_mutex);
+
+ nanosleep(&ts, NULL);
+ pthread_mutex_unlock(&s->mutex2);
+
+ return NULL;
+}
+
+/* Thread B: holds mutex1, then blocks on mutex2. */
+static void *tc2_b_thread(void *arg)
+{
+ struct tc2_b_args *a = arg;
+ struct tc2_shared *s = a->s;
+
+ set_rt_prio(50);
+
+ pthread_mutex_lock(&s->mutex1);
+
+ /* Signal that we are ready. */
+ pthread_mutex_lock(&s->sync_mutex);
+ s->holders_ready++;
+ pthread_cond_broadcast(&s->sync_cond);
+ pthread_mutex_unlock(&s->sync_mutex);
+
+ /* Now block on C. */
+ pthread_mutex_lock(&s->mutex2);
+ pthread_mutex_unlock(&s->mutex2);
+
+ pthread_mutex_unlock(&s->mutex1);
+
+ return NULL;
+}
+
+static void test_three_level_chain(void)
+{
+ struct tc2_shared shared;
+ struct tc2_b_args b_args;
+ struct tc2_c_args c_args;
+ pthread_t b_tid, c_tid;
+ struct timespec t0, t1;
+ long ms;
+ int pass;
+
+ printf("# TC-2: three-level blocked_on chain\n");
+
+ memset(&shared, 0, sizeof(shared));
+ pthread_mutex_init(&shared.mutex1, NULL);
+ pthread_mutex_init(&shared.mutex2, NULL);
+ pthread_mutex_init(&shared.sync_mutex, NULL);
+ pthread_cond_init(&shared.sync_cond, NULL);
+ shared.holders_ready = 0;
+
+ c_args.s = &shared;
+ c_args.hold_ms = 100;
+ b_args.s = &shared;
+
+ /* Start C first so it grabs mutex2 before B tries. */
+ if (pthread_create(&c_tid, NULL, tc2_c_thread, &c_args) != 0) {
+ printf("# TC-2: pthread_create C failed\n");
+ goto cleanup;
+ }
+
+ /* Wait until C holds mutex2. */
+ pthread_mutex_lock(&shared.sync_mutex);
+ while (shared.holders_ready < 1)
+ pthread_cond_wait(&shared.sync_cond, &shared.sync_mutex);
+ pthread_mutex_unlock(&shared.sync_mutex);
+
+ /* Now start B so it grabs mutex1 then blocks on mutex2. */
+ if (pthread_create(&b_tid, NULL, tc2_b_thread, &b_args) != 0) {
+ printf("# TC-2: pthread_create B failed\n");
+ pthread_join(c_tid, NULL);
+ goto cleanup;
+ }
+
+ /* Wait until B holds mutex1. */
+ pthread_mutex_lock(&shared.sync_mutex);
+ while (shared.holders_ready < 2)
+ pthread_cond_wait(&shared.sync_cond, &shared.sync_mutex);
+ pthread_mutex_unlock(&shared.sync_mutex);
+
+ /* Small delay to let B actually block on mutex2. */
+ usleep(10000); /* 10 ms */
+
+ /* Raise our (A's) priority and try to acquire mutex1. */
+ if (set_rt_prio(80) != 0) {
+ printf("# TC-2: set_rt_prio(80) failed: %s\n", strerror(errno));
+ pthread_join(b_tid, NULL);
+ pthread_join(c_tid, NULL);
+ goto cleanup;
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ pthread_mutex_lock(&shared.mutex1);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ pthread_mutex_unlock(&shared.mutex1);
+
+ /* Restore scheduling. */
+ {
+ struct sched_param sp = { .sched_priority = 0 };
+
+ sched_setscheduler(0, SCHED_OTHER, &sp);
+ }
+
+ pthread_join(b_tid, NULL);
+ pthread_join(c_tid, NULL);
+
+ ms = elapsed_ms(&t0, &t1);
+ pass = (ms < 200L);
+ printf("# TC-2: acquired mutex1 in %ld ms (limit 200 ms)\n", ms);
+ print_result("three_level_chain", pass);
+
+cleanup:
+ pthread_mutex_destroy(&shared.mutex1);
+ pthread_mutex_destroy(&shared.mutex2);
+ pthread_mutex_destroy(&shared.sync_mutex);
+ pthread_cond_destroy(&shared.sync_cond);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-3: PE deactivate path (owner SIGSTOP) */
+/* ------------------------------------------------------------------ */
+
+/* Flag set by SIGALRM handler so we can detect timeout. */
+static volatile sig_atomic_t tc3_alarm_fired;
+
+static void tc3_alarm_handler(int sig)
+{
+ (void)sig;
+ tc3_alarm_fired = 1;
+}
+
+struct tc3_args {
+ pthread_mutex_t *mutex;
+ /* tid for SIGSTOP/SIGCONT from the main thread */
+ pid_t tid;
+ pthread_mutex_t ready_mutex;
+ pthread_cond_t ready_cond;
+ int ready;
+};
+
+static void *tc3_holder_thread(void *arg)
+{
+ struct tc3_args *a = arg;
+ struct timespec ts = { 0, 50L * 1000000L }; /* 50 ms sleep */
+
+ set_rt_prio(20);
+
+ /* Record our own tid so main can send signals. */
+ a->tid = (pid_t)syscall(SYS_gettid);
+
+ pthread_mutex_lock(a->mutex);
+
+ /* Signal main that we hold the mutex. */
+ pthread_mutex_lock(&a->ready_mutex);
+ a->ready = 1;
+ pthread_cond_signal(&a->ready_cond);
+ pthread_mutex_unlock(&a->ready_mutex);
+
+ /*
+ * Sleep inside the critical section. The main thread will SIGSTOP
+ * us while we are in here.
+ */
+ nanosleep(&ts, NULL);
+
+ pthread_mutex_unlock(a->mutex);
+
+ return NULL;
+}
+
+/* Arguments for the SIGCONT helper thread. */
+struct tc3_cont_args {
+ pid_t tid;
+ pid_t pid;
+};
+
+/*
+ * tc3_cont_thread - sleep 1 second then send SIGCONT to the stopped holder.
+ */
+static void *tc3_cont_thread(void *arg)
+{
+ struct tc3_cont_args *ca = arg;
+ struct timespec ts = { 1, 0 }; /* 1 second */
+
+ nanosleep(&ts, NULL);
+ syscall(SYS_tgkill, ca->pid, ca->tid, SIGCONT);
+ return NULL;
+}
+
+static void test_pe_deactivate_sigstop(void)
+{
+ pthread_t holder;
+ pthread_t cont_tid;
+ pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+ struct tc3_args args;
+ struct tc3_cont_args cont_args;
+ struct sigaction sa_alarm, sa_old;
+ struct timespec t0, t1;
+ long ms;
+ int pass = 0;
+
+ printf("# TC-3: PE deactivate path (owner SIGSTOP/SIGCONT)\n");
+
+ memset(&args, 0, sizeof(args));
+ args.mutex = &mutex;
+ pthread_mutex_init(&args.ready_mutex, NULL);
+ pthread_cond_init(&args.ready_cond, NULL);
+
+ /* Install SIGALRM handler. */
+ memset(&sa_alarm, 0, sizeof(sa_alarm));
+ sa_alarm.sa_handler = tc3_alarm_handler;
+ sigemptyset(&sa_alarm.sa_mask);
+ sa_alarm.sa_flags = 0;
+ sigaction(SIGALRM, &sa_alarm, &sa_old);
+ tc3_alarm_fired = 0;
+
+ if (pthread_create(&holder, NULL, tc3_holder_thread, &args) != 0) {
+ printf("# TC-3: pthread_create holder failed: %s\n",
+ strerror(errno));
+ goto cleanup_sig;
+ }
+
+ /* Wait until the holder has the mutex. */
+ pthread_mutex_lock(&args.ready_mutex);
+ while (!args.ready)
+ pthread_cond_wait(&args.ready_cond, &args.ready_mutex);
+ pthread_mutex_unlock(&args.ready_mutex);
+
+ /* SIGSTOP the holder — it is now off the run queue. */
+ syscall(SYS_tgkill, getpid(), args.tid, SIGSTOP);
+
+ /* Raise our priority so we are the high-prio blocked waiter. */
+ if (set_rt_prio(80) != 0) {
+ printf("# TC-3: set_rt_prio(80) failed: %s\n", strerror(errno));
+ syscall(SYS_tgkill, getpid(), args.tid, SIGCONT);
+ pthread_join(holder, NULL);
+ goto cleanup_sig;
+ }
+
+ /* Spawn the SIGCONT helper before blocking; it fires after 1 second. */
+ cont_args.tid = args.tid;
+ cont_args.pid = getpid();
+
+ if (pthread_create(&cont_tid, NULL, tc3_cont_thread, &cont_args) != 0) {
+ printf("# TC-3: pthread_create cont failed: %s\n",
+ strerror(errno));
+ syscall(SYS_tgkill, getpid(), args.tid, SIGCONT);
+ pthread_join(holder, NULL);
+ goto cleanup_prio;
+ }
+
+ /* Set a 5-second alarm as overall watchdog. */
+ alarm(5);
+
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ pthread_mutex_lock(&mutex);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ pthread_mutex_unlock(&mutex);
+
+ alarm(0); /* cancel watchdog */
+
+ pthread_join(cont_tid, NULL);
+
+ ms = elapsed_ms(&t0, &t1);
+ pass = (!tc3_alarm_fired && ms < 5000L);
+ printf("# TC-3: acquired mutex in %ld ms (limit 5000 ms, alarm=%d)\n",
+ ms, (int)tc3_alarm_fired);
+ print_result("pe_deactivate_sigstop", pass);
+
+cleanup_prio:
+ /* Restore scheduling. */
+ {
+ struct sched_param sp = { .sched_priority = 0 };
+
+ sched_setscheduler(0, SCHED_OTHER, &sp);
+ }
+
+ pthread_join(holder, NULL);
+
+cleanup_sig:
+ sigaction(SIGALRM, &sa_old, NULL);
+ pthread_mutex_destroy(&mutex);
+ pthread_mutex_destroy(&args.ready_mutex);
+ pthread_cond_destroy(&args.ready_cond);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-4: rwsem write-side PE — basic activation */
+/* ------------------------------------------------------------------ */
+
+/*
+ * NOTE(review): pthread_rwlock_t is futex-based in glibc; a contended
+ * writer sleeps in futex_wait(), not in the kernel's
+ * rwsem_down_write_slowpath(), so this test does not directly
+ * exercise the new rwsem blocked_on path — it only bounds end-to-end
+ * acquisition latency from user space. Contending a real kernel
+ * rw_semaphore (e.g. mmap_lock) would be needed to cover the new code.
+ *
+ * The timing assertion (< 300 ms) matches TC-1: with the holder's
+ * 200 ms critical section, the write lock must be released and
+ * re-acquired within the window.
+ */
+
+struct tc4_args {
+ pthread_rwlock_t *rwlock;
+ int hold_ms;
+};
+
+static void *tc4_holder_thread(void *arg)
+{
+ struct tc4_args *a = arg;
+ struct timespec ts = { 0, (long)a->hold_ms * 1000000L };
+
+ /* Become a low-prio RT thread so PE applies. */
+ set_rt_prio(20);
+
+ pthread_rwlock_wrlock(a->rwlock);
+ nanosleep(&ts, NULL);
+ pthread_rwlock_unlock(a->rwlock);
+
+ return NULL;
+}
+
+static void test_rwsem_write_pe_basic(void)
+{
+ pthread_t holder;
+ pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
+ struct tc4_args args = { .rwlock = &rwlock, .hold_ms = 200 };
+ struct timespec t0, t1;
+ long ms;
+ int pass;
+
+ printf("# TC-4: rwsem write-side PE activation\n");
+
+ if (pthread_create(&holder, NULL, tc4_holder_thread, &args) != 0) {
+ printf("# TC-4: pthread_create failed: %s\n",
+ strerror(errno));
+ print_result("rwsem_write_pe_basic", 0);
+ return;
+ }
+
+ /* Give the holder time to acquire the write lock. */
+ usleep(20000); /* 20 ms */
+
+ /* Raise to high prio — we become the blocked writer. */
+ if (set_rt_prio(80) != 0) {
+ printf("# TC-4: set_rt_prio(80) failed: %s\n",
+ strerror(errno));
+ pthread_join(holder, NULL);
+ pthread_rwlock_destroy(&rwlock);
+ print_result("rwsem_write_pe_basic", 0);
+ return;
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ pthread_rwlock_wrlock(&rwlock);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ pthread_rwlock_unlock(&rwlock);
+
+ {
+ struct sched_param sp = { .sched_priority = 0 };
+
+ sched_setscheduler(0, SCHED_OTHER, &sp);
+ }
+
+ pthread_join(holder, NULL);
+ pthread_rwlock_destroy(&rwlock);
+
+ ms = elapsed_ms(&t0, &t1);
+ pass = (ms < 300L);
+ printf("# TC-4: acquired write lock in %ld ms (limit 300 ms)\n",
+ ms);
+ print_result("rwsem_write_pe_basic", pass);
+}
+
+/* ------------------------------------------------------------------ */
+/* TC-5: Mixed chain — rwsem write → mutex */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Chain layout:
+ * A (prio=80) waiting for rwlock write
+ * B (prio=50) holds rwlock write, waiting for mutex
+ * C (prio=20) holds mutex, sleeping 300 ms
+ *
+ * With rwsem PE support the chain A→rwlock→B→mutex→C is fully
+ * traversed. A should acquire the rwlock within 500 ms.
+ */
+
+struct tc5_shared {
+ pthread_rwlock_t rwlock;
+ pthread_mutex_t mutex;
+
+ pthread_mutex_t sync_mu;
+ pthread_cond_t sync_cv;
+ int ready; /* incremented by B and C when ready */
+};
+
+struct tc5_b_args { struct tc5_shared *s; };
+struct tc5_c_args { struct tc5_shared *s; int hold_ms; };
+
+static void *tc5_c_thread(void *arg)
+{
+ struct tc5_c_args *a = arg;
+ struct tc5_shared *s = a->s;
+ struct timespec ts = { 0, (long)a->hold_ms * 1000000L };
+
+ set_rt_prio(20);
+
+ pthread_mutex_lock(&s->mutex);
+
+ pthread_mutex_lock(&s->sync_mu);
+ s->ready++;
+ pthread_cond_signal(&s->sync_cv);
+ pthread_mutex_unlock(&s->sync_mu);
+
+ nanosleep(&ts, NULL);
+ pthread_mutex_unlock(&s->mutex);
+ return NULL;
+}
+
+static void *tc5_b_thread(void *arg)
+{
+ struct tc5_b_args *a = arg;
+ struct tc5_shared *s = a->s;
+
+ set_rt_prio(50);
+
+ pthread_rwlock_wrlock(&s->rwlock);
+
+ pthread_mutex_lock(&s->sync_mu);
+ s->ready++;
+ pthread_cond_signal(&s->sync_cv);
+ pthread_mutex_unlock(&s->sync_mu);
+
+ /* Block on the mutex — this is the middle of the PE chain. */
+ pthread_mutex_lock(&s->mutex);
+ pthread_mutex_unlock(&s->mutex);
+
+ pthread_rwlock_unlock(&s->rwlock);
+ return NULL;
+}
+
+static void test_rwsem_mutex_chain(void)
+{
+ pthread_t tb, tc;
+ struct tc5_shared s = {
+ .rwlock = PTHREAD_RWLOCK_INITIALIZER,
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+ .sync_mu = PTHREAD_MUTEX_INITIALIZER,
+ .sync_cv = PTHREAD_COND_INITIALIZER,
+ .ready = 0,
+ };
+ struct tc5_b_args bargs = { .s = &s };
+ struct tc5_c_args cargs = { .s = &s, .hold_ms = 300 };
+ struct timespec t0, t1;
+ long ms;
+ int pass;
+
+ printf("# TC-5: mixed rwsem-write -> mutex PE chain\n");
+
+ if (pthread_create(&tc, NULL, tc5_c_thread, &cargs) != 0 ||
+ pthread_create(&tb, NULL, tc5_b_thread, &bargs) != 0) {
+ printf("# TC-5: pthread_create failed: %s\n",
+ strerror(errno));
+ print_result("rwsem_mutex_chain", 0);
+ return;
+ }
+
+ /* Wait until both B and C have grabbed their locks. */
+ pthread_mutex_lock(&s.sync_mu);
+ while (s.ready < 2)
+ pthread_cond_wait(&s.sync_cv, &s.sync_mu);
+ pthread_mutex_unlock(&s.sync_mu);
+
+ if (set_rt_prio(80) != 0) {
+ printf("# TC-5: set_rt_prio(80) failed: %s\n",
+ strerror(errno));
+ pthread_join(tb, NULL);
+ pthread_join(tc, NULL);
+ print_result("rwsem_mutex_chain", 0);
+ return;
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t0);
+ pthread_rwlock_wrlock(&s.rwlock);
+ clock_gettime(CLOCK_MONOTONIC, &t1);
+ pthread_rwlock_unlock(&s.rwlock);
+
+ {
+ struct sched_param sp = { .sched_priority = 0 };
+
+ sched_setscheduler(0, SCHED_OTHER, &sp);
+ }
+
+ pthread_join(tb, NULL);
+ pthread_join(tc, NULL);
+ pthread_rwlock_destroy(&s.rwlock);
+ pthread_mutex_destroy(&s.mutex);
+ pthread_mutex_destroy(&s.sync_mu);
+ pthread_cond_destroy(&s.sync_cv);
+
+ ms = elapsed_ms(&t0, &t1);
+ pass = (ms < 500L);
+ printf("# TC-5: acquired write lock in %ld ms (limit 500 ms)\n",
+ ms);
+ print_result("rwsem_mutex_chain", pass);
+}
+
+/* ------------------------------------------------------------------ */
+/* main */
+/* ------------------------------------------------------------------ */
+
+int main(void)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+
+ /*
+ * Capability check: attempt to raise to SCHED_FIFO prio 1. A plain
+ * EPERM means we lack CAP_SYS_NICE; skip gracefully in that case.
+ */
+ if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
+ if (errno == EPERM) {
+ printf("1..0 # SKIP: requires CAP_SYS_NICE\n");
+ return 0;
+ }
+ /* Unexpected error — restore SCHED_OTHER and continue. */
+ } else {
+ /* Restore normal scheduling before running tests. */
+ sp.sched_priority = 0;
+ sched_setscheduler(0, SCHED_OTHER, &sp);
+ }
+
+ if (!is_proxy_exec_enabled()) {
+ printf("1..0 # SKIP: CONFIG_SCHED_PROXY_EXEC not enabled\n");
+ return 0;
+ }
+
+ /* TAP plan: five test cases (3 original + 2 rwsem) */
+ printf("1..5\n");
+
+ test_basic_pe_activation();
+ test_three_level_chain();
+ test_pe_deactivate_sigstop();
+ test_rwsem_write_pe_basic();
+ test_rwsem_mutex_chain();
+
+ return 0;
+}
--
2.43.0