[PATCH 5/5] futex: fix misordered wakeups

From: Daniel Walker
Date: Wed Jun 11 2008 - 16:55:09 EST


Adds an additional function call to sched_setscheduler() to update the
waiter position of a task if it happens to be waiting on a futex. This
ensures that the kernel level waiter ordering is correctly maintained
based on the changed priority of the task.

I fixed the locking issue noticed by Thomas Gleixner.

This doesn't address userspace at all, only the kernel level wakeups and
kernel level ordering.

The additional locking added to the futex_wait function has no visible speed
impact, and only affects waiters which actually enter the kernel.

Signed-off-by: Daniel Walker <dwalker@xxxxxxxxxx>

---
include/linux/sched.h | 4 ++++
kernel/futex.c | 41 +++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 1 +
3 files changed, 46 insertions(+)

Index: linux-2.6.25/include/linux/sched.h
===================================================================
--- linux-2.6.25.orig/include/linux/sched.h
+++ linux-2.6.25/include/linux/sched.h
@@ -1027,6 +1027,7 @@ struct sched_rt_entity {
enum lock_waiter_type {
MUTEX_WAITER = 1,
RT_MUTEX_WAITER,
+ FUTEX_WAITER
};

struct lock_waiter_state {
@@ -1034,6 +1035,7 @@ struct lock_waiter_state {
union {
struct mutex_waiter *mutex_blocked_on;
struct rt_mutex_waiter *rt_blocked_on;
+ union futex_key *futex_blocked_on;
};
};

@@ -1675,6 +1677,8 @@ static inline int rt_mutex_getprio(struc
# define rt_mutex_adjust_pi(p) do { } while (0)
#endif

+extern void futex_adjust_waiters(struct task_struct *p);
+
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
extern int task_nice(const struct task_struct *p);
Index: linux-2.6.25/kernel/futex.c
===================================================================
--- linux-2.6.25.orig/kernel/futex.c
+++ linux-2.6.25/kernel/futex.c
@@ -327,6 +327,38 @@ static int get_futex_value_locked(u32 *d
return ret ? -EFAULT : 0;
}

+void futex_adjust_waiters(struct task_struct *p)
+{
+ /* Re-queue @p's futex_q in its hash bucket using @p's current normal_prio; does nothing unless @p is marked as a FUTEX_WAITER. */
+ if (p->blocked_on) {
+ struct futex_hash_bucket *hb;
+ struct futex_q *q, *next;
+ union futex_key key;
+ /* The check above is made without a lock; re-check under pi_lock and copy the key by value before dropping the lock. */
+ spin_lock_irq(&p->pi_lock); /* NOTE(review): the _irq unlock below enables IRQs unconditionally - confirm no caller runs with IRQs disabled */
+ if (p->blocked_on && p->blocked_on->lock_type == FUTEX_WAITER) {
+ key = *p->blocked_on->futex_blocked_on;
+ spin_unlock_irq(&p->pi_lock);
+ } else {
+ spin_unlock_irq(&p->pi_lock);
+ return;
+ }
+ /* Look up the bucket for the copied key and, under the bucket lock, find the entry that belongs to @p. */
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ plist_for_each_entry_safe(q, next, &hb->chain, list) {
+ if (match_futex(&q->key, &key) && q->task == p) {
+ int prio = min(p->normal_prio, MAX_RT_PRIO); /* value is capped at MAX_RT_PRIO */
+ plist_del(&q->list, &hb->chain); /* remove, re-initialise with the new prio, then re-add */
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ break; /* only the first matching entry is re-queued */
+ }
+ }
+ spin_unlock(&hb->lock);
+ }
+}
+
/*
* Fault handling.
* if fshared is non NULL, current->mm->mmap_sem is already held
@@ -1159,6 +1191,8 @@ static int futex_wait(u32 __user *uaddr,
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
+ struct lock_waiter_state blocked_on = {
+ .lock_type = FUTEX_WAITER, { .futex_blocked_on = &q.key } };
u32 uval;
int ret;
struct hrtimer_sleeper t;
@@ -1176,6 +1210,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret != 0))
goto out_release_sem;

+ set_blocked_on(current, &blocked_on);
+
hb = queue_lock(&q);

/*
@@ -1203,6 +1239,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret)) {
queue_unlock(&q, hb);

+ set_blocked_on(current, NULL);
+
/*
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
@@ -1276,6 +1314,8 @@ static int futex_wait(u32 __user *uaddr,
}
__set_current_state(TASK_RUNNING);

+ set_blocked_on(current, NULL);
+
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
@@ -1310,6 +1350,7 @@ static int futex_wait(u32 __user *uaddr,

out_unlock_release_sem:
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);

out_release_sem:
futex_unlock_mm(fshared);
Index: linux-2.6.25/kernel/sched.c
===================================================================
--- linux-2.6.25.orig/kernel/sched.c
+++ linux-2.6.25/kernel/sched.c
@@ -5209,6 +5209,7 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);

rt_mutex_adjust_pi(p);
+ futex_adjust_waiters(p);

return 0;
}

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/