Re: [RFC] RT-patch update to remove the global pi_lock

From: Steven Rostedt
Date: Thu Aug 25 2005 - 09:48:46 EST


OK, here it is. Against 2.6.13-rc7-rt1.

Please test it well. Make sure to turn off any debugging, especially
CONFIG_RT_DEADLOCK_DETECT, since that uses a global trace_lock that
defeats the purpose of removing the pi_lock.
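
For review purposes, here is an illustrative sketch (not part of the patch,
and not meant to compile on its own) of the nesting order documented in the
comment added near the top of rt.c: the blocking task's pi_lock, then the
lock's wait_lock, then the owner's pi_lock. The helper function below is
hypothetical; it only reuses the patch's own names (pi_lock, wait_lock,
lock_owner()) to show the order:

	/* Sketch only: the order a task blocking on "lock" takes the locks. */
	static void blocked_task_lock_order(struct task_struct *task,
					    struct rt_mutex *lock)
	{
		__raw_spin_lock(&task->pi_lock);	/* 1: the blocking task */
		__raw_spin_lock(&lock->wait_lock);	/* 2: the lock itself */

		/* queue the waiter here, then boost the owner if needed */
		if (lock_owner(lock) &&
		    task->prio < lock_owner(lock)->task->prio) {
			/* 3: the owner task */
			__raw_spin_lock(&lock_owner(lock)->task->pi_lock);
			/* pi_setprio() style boosting would happen here */
			__raw_spin_unlock(&lock_owner(lock)->task->pi_lock);
		}

		__raw_spin_unlock(&lock->wait_lock);
		__raw_spin_unlock(&task->pi_lock);
	}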

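The trickiest case is in pick_new_owner() (below, in the patch): the next
owner is still a blocked task, so its pi_lock nests before lock->wait_lock.
The wait_lock therefore has to be dropped, the new owner's pi_lock taken,
the wait_lock retaken, and the top waiter rechecked, since a higher-priority
task may have blocked in that window. A compressed, illustration-only version
of that retry pattern (same names as the patch, written as a loop instead of
a goto):

	for (;;) {
		waiter = plist_first_entry(&lock->wait_list,
					   struct rt_mutex_waiter, list);
		new_owner = waiter->ti;

		/* pi_lock must nest before wait_lock: drop and retake */
		__raw_spin_unlock(&lock->wait_lock);
		__raw_spin_lock(&new_owner->task->pi_lock);
		__raw_spin_lock(&lock->wait_lock);

		/* did a higher priority waiter show up in the window? */
		waiter = plist_first_entry(&lock->wait_list,
					   struct rt_mutex_waiter, list);
		if (likely(waiter->ti == new_owner))
			break;

		/* yes: release the stale pi_lock and look again */
		__raw_spin_unlock(&new_owner->task->pi_lock);
	}
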
The patch to remove the single pi_lock:

-- Steve

Signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx>

Index: linux_realtime_goliath/kernel/rt.c
===================================================================
--- linux_realtime_goliath/kernel/rt.c (revision 304)
+++ linux_realtime_goliath/kernel/rt.c (working copy)
@@ -22,6 +22,9 @@
*
* Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
*
+ * (also by Steven Rostedt)
+ * - Converted single pi_lock to individual task locks.
+ *
*/
#include <linux/config.h>
#include <linux/sched.h>
@@ -36,6 +39,26 @@
#define CAPTURE_LOCK

/*
+ * Lock order:
+ *
+ * To keep from having a single lock for PI, each task and each lock
+ * has its own lock. The order is as follows:
+ *
+ * blocked task->pi_lock -> lock->wait_lock -> owner task->pi_lock.
+ *
+ * This is safe since an owner task should never block on a lock that
+ * is owned by a blocking task. Otherwise there would already be a
+ * deadlock in the normal system.
+ * The same goes for the locks: a lock held by one task should not be
+ * taken by a task that holds a lock that is blocking this lock's owner.
+ *
+ * A task that is about to grab a lock is first considered to be a
+ * blocking task, even if it ends up acquiring the lock successfully.
+ * This is because the taking of these locks happens before the
+ * task becomes the owner.
+ */
+
+/*
* These flags are used for allowing of stealing of ownerships.
*/
#define RT_PENDOWNER 1 /* pending owner on a lock */
@@ -83,14 +106,6 @@
*/
//#define ALL_TASKS_PI

-/*
- * We need a global lock for priority inheritance handling.
- * This is only for the slow path, but still, we might want
- * to optimize it later to be more scalable.
- */
-static __cacheline_aligned_in_smp raw_spinlock_t pi_lock =
- RAW_SPIN_LOCK_UNLOCKED;
-
#ifdef CONFIG_RT_DEADLOCK_DETECT
# define __EIP_DECL__ , unsigned long eip
# define __EIP__ , eip
@@ -174,6 +189,8 @@
if (trace_on) { \
trace_on = 0; \
console_verbose(); \
+ if (spin_is_locked(&current->pi_lock)) \
+ __raw_spin_unlock(&current->pi_lock); \
spin_unlock(&trace_lock); \
} \
} while (0)
@@ -228,7 +245,6 @@
*/
void zap_rt_locks(void)
{
- spin_lock_init(&pi_lock);
#ifdef CONFIG_RT_DEADLOCK_DETECT
spin_lock_init(&trace_lock);
#endif
@@ -589,11 +605,11 @@
printk("exiting task is not even the owner??\n");
goto restart;
}
- __raw_spin_lock(&pi_lock);
+ __raw_spin_lock(&task->pi_lock);
plist_for_each(curr1, &task->pi_waiters) {
w = plist_entry(curr1, struct rt_mutex_waiter, pi_list);
TRACE_OFF();
- __raw_spin_unlock(&pi_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

printk("hm, PI interest held at exit time? Task:\n");
@@ -601,7 +617,7 @@
printk_waiter(w);
return;
}
- __raw_spin_unlock(&pi_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);
}

@@ -674,6 +690,7 @@
struct rt_mutex_waiter *w;
struct plist *curr1;

+ __raw_spin_lock(&old_owner->task->pi_lock);
plist_for_each(curr1, &old_owner->task->pi_waiters) {
w = plist_entry(curr1, struct rt_mutex_waiter, pi_list);
if (w->lock == lock) {
@@ -682,9 +699,10 @@
printk_waiter(w);
printk("\n");
TRACE_WARN_ON(1);
- return;
+ break;
}
}
+ __raw_spin_unlock(&old_owner->task->pi_lock);
}

#else
@@ -732,13 +750,24 @@

int pi_walk, pi_null, pi_prio, pi_initialized;

-static void pi_setprio(struct rt_mutex *lock, struct task_struct *p, int prio)
+/*
+ * The lock->wait_lock and task->pi_lock must be held by the caller.
+ */
+static void pi_setprio(struct rt_mutex *lock, struct task_struct *task, int prio)
{
+ struct rt_mutex *l = lock;
+ struct task_struct *p = task;
+ /*
+ * Work on copies so that we never release the caller's (parameter) locks.
+ */
+
if (unlikely(!p->pid)) {
pi_null++;
return;
}

+ TRACE_BUG_ON_LOCKED(!spin_is_locked(&lock->wait_lock));
+ TRACE_BUG_ON_LOCKED(!spin_is_locked(&p->pi_lock));
#ifdef CONFIG_RT_DEADLOCK_DETECT
pi_prio++;
if (p->policy != SCHED_NORMAL && prio > normal_prio(p)) {
@@ -759,7 +788,6 @@
* If the task is blocked on some other task then boost that
* other task (or tasks) too:
*/
- __raw_spin_lock(&p->pi_lock);
for (;;) {
struct rt_mutex_waiter *w = p->blocked_on;
#ifdef CONFIG_RT_DEADLOCK_DETECT
@@ -774,16 +802,27 @@
* it RT, then register the task in the PI list and
* requeue it to the wait list:
*/
- lock = w->lock;
+
+ /*
+ * Don't unlock the original lock->wait_lock
+ */
+ if (l != lock)
+ __raw_spin_unlock(&l->wait_lock);
+ l = w->lock;
TRACE_BUG_ON_LOCKED(!lock);
- TRACE_BUG_ON_LOCKED(!lock_owner(lock));
+
+ if (l != lock)
+ __raw_spin_lock(&l->wait_lock);
+
+ TRACE_BUG_ON_LOCKED(!lock_owner(l));
if (rt_task(p) && plist_empty(&w->pi_list)) {
+ TRACE_BUG_ON_LOCKED(was_rt);
plist_init(&w->pi_list, prio);
- plist_add(&w->pi_list, &lock_owner(lock)->task->pi_waiters);
+ plist_add(&w->pi_list, &lock_owner(l)->task->pi_waiters);

- plist_del(&w->list, &lock->wait_list);
+ plist_del(&w->list, &l->wait_list);
plist_init(&w->list, prio);
- plist_add(&w->list, &lock->wait_list);
+ plist_add(&w->list, &l->wait_list);
}
/*
* If the task is blocked on a lock, and we just restored
@@ -795,16 +834,18 @@
*/
if (!rt_task(p) && !plist_empty(&w->pi_list)) {
TRACE_BUG_ON_LOCKED(!was_rt);
- plist_del(&w->pi_list, &lock_owner(lock)->task->pi_waiters);
- plist_del(&w->list, &lock->wait_list);
+ plist_del(&w->pi_list, &lock_owner(l)->task->pi_waiters);
+ plist_del(&w->list, &l->wait_list);
plist_init(&w->list, prio);
- plist_add(&w->list, &lock->wait_list);
+ plist_add(&w->list, &l->wait_list);
}

pi_walk++;
+
+ if (p != task)
+ __raw_spin_unlock(&p->pi_lock);

- __raw_spin_unlock(&p->pi_lock);
- p = lock_owner(lock)->task;
+ p = lock_owner(l)->task;
TRACE_BUG_ON_LOCKED(!p);
__raw_spin_lock(&p->pi_lock);
/*
@@ -815,7 +856,10 @@
if (p->prio <= prio)
break;
}
- __raw_spin_unlock(&p->pi_lock);
+ if (l != lock)
+ __raw_spin_unlock(&l->wait_lock);
+ if (p != task)
+ __raw_spin_unlock(&p->pi_lock);
}

/*
@@ -831,7 +875,9 @@
unsigned long flags;
int oldprio;

- spin_lock_irqsave(&pi_lock, flags);
+ spin_lock_irqsave(&p->pi_lock, flags);
+ if (p->blocked_on)
+ spin_lock(&p->blocked_on->lock->wait_lock);

oldprio = p->normal_prio;
if (oldprio == prio)
@@ -858,10 +904,17 @@
else
mutex_setprio(p, prio);
out:
- spin_unlock_irqrestore(&pi_lock, flags);
+ if (p->blocked_on)
+ spin_unlock(&p->blocked_on->lock->wait_lock);

+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
}

+/*
+ * This is called with both the blocking task's pi_lock (waiter->ti->task)
+ * and the lock->wait_lock held.
+ */
static void
task_blocks_on_lock(struct rt_mutex_waiter *waiter, struct thread_info *ti,
struct rt_mutex *lock __EIP_DECL__)
@@ -872,7 +925,6 @@
/* mark the current thread as blocked on the lock */
waiter->eip = eip;
#endif
- __raw_spin_lock(&task->pi_lock);
task->blocked_on = waiter;
waiter->lock = lock;
waiter->ti = ti;
@@ -884,31 +936,24 @@
if (!rt_task(task)) {
plist_add(&waiter->list, &lock->wait_list);
set_lock_owner_pending(lock);
- __raw_spin_unlock(&task->pi_lock);
return;
}
#endif
- __raw_spin_unlock(&task->pi_lock);
- __raw_spin_lock(&pi_lock);
+ plist_add(&waiter->pi_list, &lock_owner(lock)->task->pi_waiters);
/*
- * We could have been added to the pi list from another task
- * doing a pi_setprio.
+ * Add RT tasks to the head:
*/
- if (likely(plist_empty(&waiter->pi_list))) {
- plist_add(&waiter->pi_list, &lock_owner(lock)->task->pi_waiters);
- /*
- * Add RT tasks to the head:
- */
- plist_add(&waiter->list, &lock->wait_list);
- }
+ plist_add(&waiter->list, &lock->wait_list);
set_lock_owner_pending(lock);
/*
* If the waiter has higher priority than the owner
* then temporarily boost the owner:
*/
- if (task->prio < lock_owner(lock)->task->prio)
+ if (task->prio < lock_owner(lock)->task->prio) {
+ __raw_spin_lock(&lock_owner(lock)->task->pi_lock);
pi_setprio(lock, lock_owner(lock)->task, task->prio);
- __raw_spin_unlock(&pi_lock);
+ __raw_spin_unlock(&lock_owner(lock)->task->pi_lock);
+ }
}

/*
@@ -942,6 +987,10 @@
}
EXPORT_SYMBOL(__init_rwsem);

+/*
+ * This must be called with both the old_owner and new_owner pi_locks held,
+ * as well as the lock->wait_lock.
+ */
static inline
void set_new_owner(struct rt_mutex *lock, struct thread_info *old_owner,
struct thread_info *new_owner __EIP_DECL__)
@@ -975,6 +1024,8 @@
/*
* handle the lock release when processes blocked on it that can now run
* - the spinlock must be held by the caller
+ *
+ * The lock->wait_lock must be held, and the lock's owner->pi_lock must not.
*/
static struct thread_info *
pick_new_owner(struct rt_mutex *lock, struct thread_info *old_owner,
@@ -990,14 +1041,34 @@
*/
waiter = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list);

+ try_again:
trace_special_pid(waiter->ti->task->pid, waiter->ti->task->prio, 0);

#ifdef ALL_TASKS_PI
check_pi_list_present(lock, waiter, old_owner);
#endif
new_owner = waiter->ti;
+ /*
+ * The new owner is still blocked on this lock, so we
+ * must release the lock->wait_lock before grabbing
+ * the new_owner lock.
+ */
+ __raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_lock(&new_owner->task->pi_lock);
+ __raw_spin_lock(&lock->wait_lock);
+ /*
+ * In the split second that the lock was released, a higher-priority
+ * task could have come along and blocked on it as well.
+ */
+ waiter = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list);
+ if (unlikely(waiter->ti != new_owner)) {
+ __raw_spin_unlock(&new_owner->task->pi_lock);
+ goto try_again;
+ }
plist_del_init(&waiter->list, &lock->wait_list);

+ __raw_spin_lock(&old_owner->task->pi_lock);
+
plist_del(&waiter->pi_list, &old_owner->task->pi_waiters);
plist_init(&waiter->pi_list, waiter->ti->task->prio);

@@ -1008,6 +1079,9 @@
new_owner->task->blocked_on = NULL;
TRACE_WARN_ON(save_state != lock->save_state);

+ __raw_spin_unlock(&old_owner->task->pi_lock);
+ __raw_spin_unlock(&new_owner->task->pi_lock);
+
return new_owner;
}

@@ -1060,9 +1134,7 @@
* is still sleeping and hasn't woken up to get the lock.
*/

- /* Test the simple case first, is it already running? */
- if (!TASK_PENDING(owner))
- return 0;
+ TRACE_BUG_ON_LOCKED(!owner);

/* The owner is pending on a lock, but is it this lock? */
if (owner->pending_owner != lock)
@@ -1134,17 +1206,35 @@
return 0;

trace_lock_irqsave(&trace_lock, flags, ti);
+ /*
+ * We are no longer blocked on the lock, so we are considered an
+ * owner and must grab the lock->wait_lock first.
+ */
__raw_spin_lock(&lock->wait_lock);
+ __raw_spin_lock(&task->pi_lock);

if (!(task->rt_flags & RT_PENDOWNER)) {
- /* someone else stole it */
+ /*
+ * Someone else stole it.
+ * We are grabbing a lock now, so we must release the
+ * locks again and retake them in the opposite order.
+ */
+ __raw_spin_unlock(&task->pi_lock);
+ __raw_spin_unlock(&lock->wait_lock);
+
+ __raw_spin_lock(&task->pi_lock);
+ __raw_spin_lock(&lock->wait_lock);
+
old_owner = lock_owner(lock);
TRACE_BUG_ON_LOCKED(old_owner == ti);
if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
/* we got it back! */
- __raw_spin_lock(&pi_lock);
- set_new_owner(lock, old_owner, ti __W_EIP__(waiter));
- __raw_spin_unlock(&pi_lock);
+ if (old_owner) {
+ __raw_spin_lock(&old_owner->task->pi_lock);
+ set_new_owner(lock, old_owner, ti __W_EIP__(waiter));
+ __raw_spin_unlock(&old_owner->task->pi_lock);
+ } else
+ set_new_owner(lock, old_owner, ti __W_EIP__(waiter));
ret = 0;
} else {
/* Add ourselves back to the list */
@@ -1159,6 +1249,7 @@
}

__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

return ret;
@@ -1196,6 +1287,7 @@

trace_lock_irqsave(&trace_lock, flags, ti);
TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+ __raw_spin_lock(&task->pi_lock);
__raw_spin_lock(&lock->wait_lock);
INIT_WAITER(&waiter);

@@ -1205,10 +1297,14 @@
if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
/* granted */
TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
- __raw_spin_lock(&pi_lock);
- set_new_owner(lock, old_owner, ti __EIP__);
- __raw_spin_unlock(&pi_lock);
+ if (old_owner) {
+ __raw_spin_lock(&old_owner->task->pi_lock);
+ set_new_owner(lock, old_owner, ti __EIP__);
+ __raw_spin_unlock(&old_owner->task->pi_lock);
+ } else
+ set_new_owner(lock, old_owner, ti __EIP__);
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

FREE_WAITER(&waiter);
@@ -1223,6 +1319,7 @@
TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
/* we don't need to touch the lock struct anymore */
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

might_sleep();
@@ -1273,6 +1370,7 @@

trace_lock_irqsave(&trace_lock, flags, ti);
TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+ __raw_spin_lock(&task->pi_lock);
__raw_spin_lock(&lock->wait_lock);
INIT_WAITER(&waiter);

@@ -1282,10 +1380,14 @@
if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
/* granted */
TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
- __raw_spin_lock(&pi_lock);
- set_new_owner(lock, old_owner, ti __EIP__);
- __raw_spin_unlock(&pi_lock);
+ if (old_owner) {
+ __raw_spin_lock(&old_owner->task->pi_lock);
+ set_new_owner(lock, old_owner, ti __EIP__);
+ __raw_spin_unlock(&old_owner->task->pi_lock);
+ } else
+ set_new_owner(lock, old_owner, ti __EIP__);
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

FREE_WAITER(&waiter);
@@ -1306,6 +1408,7 @@

/* we don't need to touch the lock struct anymore */
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock(&trace_lock, ti);

/*
@@ -1396,7 +1499,6 @@
TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list));
list_del_init(&lock->held_list);
#endif
- __raw_spin_lock(&pi_lock);

#ifdef ALL_TASKS_PI
if (plist_empty(&lock->wait_list))
@@ -1409,7 +1511,6 @@
__up_mutex_waiter_nosavestate(lock __EIP__);
} else
lock->owner = NULL;
- __raw_spin_unlock(&pi_lock);
__raw_spin_unlock(&lock->wait_lock);
#ifdef CONFIG_DEBUG_PREEMPT
if (current->lock_count < 0 || current->lock_count >= MAX_LOCK_STACK) {
@@ -1640,6 +1741,7 @@

trace_lock_irqsave(&trace_lock, flags, ti);
TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+ __raw_spin_lock(&task->pi_lock);
__raw_spin_lock(&lock->wait_lock);
INIT_WAITER(&waiter);

@@ -1649,10 +1751,14 @@
if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
/* granted */
TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
- __raw_spin_lock(&pi_lock);
- set_new_owner(lock, old_owner, ti __EIP__);
- __raw_spin_unlock(&pi_lock);
+ if (old_owner) {
+ __raw_spin_lock(&old_owner->task->pi_lock);
+ set_new_owner(lock, old_owner, ti __EIP__);
+ __raw_spin_unlock(&old_owner->task->pi_lock);
+ } else
+ set_new_owner(lock, old_owner, ti __EIP__);
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

FREE_WAITER(&waiter);
@@ -1667,6 +1773,7 @@
TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
/* we don't need to touch the lock struct anymore */
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

might_sleep();
@@ -1684,6 +1791,7 @@
* didnt get the lock - else return success:
*/
trace_lock_irq(&trace_lock, ti);
+ __raw_spin_lock(&task->pi_lock);
__raw_spin_lock(&lock->wait_lock);
if (waiter.ti) {
plist_del_init(&waiter.list, &lock->wait_list);
@@ -1692,17 +1800,16 @@
* (No big problem if our PI effect lingers
* a bit - owner will restore prio.)
*/
- __raw_spin_lock(&pi_lock);
TRACE_WARN_ON_LOCKED(waiter.ti != ti);
TRACE_WARN_ON_LOCKED(current->blocked_on != &waiter);
plist_del(&waiter.pi_list, &task->pi_waiters);
plist_init(&waiter.pi_list, task->prio);
waiter.ti = NULL;
current->blocked_on = NULL;
- __raw_spin_unlock(&pi_lock);
ret = -EINTR;
}
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irq(&trace_lock, ti);
break;
}
@@ -1741,6 +1848,7 @@

trace_lock_irqsave(&trace_lock, flags, ti);
TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+ __raw_spin_lock(&task->pi_lock);
__raw_spin_lock(&lock->wait_lock);

old_owner = lock_owner(lock);
@@ -1749,13 +1857,18 @@
if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
/* granted */
TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
- __raw_spin_lock(&pi_lock);
- set_new_owner(lock, old_owner, ti __EIP__);
- __raw_spin_unlock(&pi_lock);
+ if (old_owner) {
+ __raw_spin_lock(&old_owner->task->pi_lock);
+ set_new_owner(lock, old_owner, ti __EIP__);
+ __raw_spin_unlock(&old_owner->task->pi_lock);
+ } else
+ set_new_owner(lock, old_owner, ti __EIP__);
ret = 1;
}
-
__raw_spin_unlock(&lock->wait_lock);
+ __raw_spin_unlock(&task->pi_lock);
trace_unlock_irqrestore(&trace_lock, flags, ti);

return ret;
@@ -1817,6 +1930,7 @@
* to the previous priority (or to the next highest prio
* waiter's priority):
*/
+ __raw_spin_lock(&old_owner->pi_lock);
prio = old_owner->normal_prio;
if (unlikely(!plist_empty(&old_owner->pi_waiters))) {
w = plist_first_entry(&old_owner->pi_waiters, struct rt_mutex_waiter, pi_list);
@@ -1825,6 +1939,7 @@
}
if (unlikely(prio != old_owner->prio))
pi_setprio(lock, old_owner, prio);
+ __raw_spin_unlock(&old_owner->pi_lock);
#ifdef CAPTURE_LOCK
if (lock != &kernel_sem.lock) {
new_owner->rt_flags |= RT_PENDOWNER;
@@ -1851,6 +1966,7 @@
* to the previous priority (or to the next highest prio
* waiter's priority):
*/
+ __raw_spin_lock(&old_owner->pi_lock);
prio = old_owner->normal_prio;
if (unlikely(!plist_empty(&old_owner->pi_waiters))) {
w = plist_first_entry(&old_owner->pi_waiters, struct rt_mutex_waiter, pi_list);
@@ -1859,6 +1975,7 @@
}
if (unlikely(prio != old_owner->prio))
pi_setprio(lock, old_owner, prio);
+ __raw_spin_unlock(&old_owner->pi_lock);
#ifdef CAPTURE_LOCK
if (lock != &kernel_sem.lock) {
new_owner->rt_flags |= RT_PENDOWNER;



