From: Ma Ling<ling.ml@xxxxxxxxxxxxxxx>
All load instructions can run speculatively but they have to follow
memory order rule in multiple cores as below:
_x = _y = 0
Processor 0 Processor 1
mov r1, [ _y] //M1 mov [ _x], 1 //M3
mov r2, [ _x] //M2 mov [ _y], 1 //M4
If r1 = 1, r2 must be 1
In order to guarantee above rule, although Processor 0 execute
M1 and M2 instruction out of order, they are kept in ROB,
when load buffer for _x in Processor 0 received the update
message from Processor 1, Processor 0 need to roll back
from M2 instruction, which will flush the whole pipeline,
the latency is over the penalty from branch prediction miss.
In this patch we use lock cmpxchg instruction to force load
instructions to be serialization, the destination operand
receives a write cycle without regard to the result of
the comparison, which can help us to reduce the penalty
from load instruction roll back.
Our experiment indicates the performance can be improved by 10%~15%
for 2 and 3 threads cases, the conflicts from lock cache line
spend them most of the time.
Thanks
Ling
Signed-off-by: Ma Ling<ling.ml@xxxxxxxxxxxxxxx>
---
kernel/locking/qspinlock.c | 43 ++++++++++++++++++-------------------------
1 files changed, 18 insertions(+), 25 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6..16421f2 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -332,25 +332,14 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (new == _Q_LOCKED_VAL)
return;
- /*
- * we're pending, wait for the owner to go away.
- *
- * *,1,1 -> *,1,0
+ /* we're waiting, and get lock owner
*
- * this wait loop must be a load-acquire such that we match the
- * store-release that clears the locked bit and create lock
- * sequentiality; this is because not all clear_pending_set_locked()
- * implementations imply full barriers.
+ * *,1,* -> *,0,1
*/
- while ((val = smp_load_acquire(&lock->val.counter))& _Q_LOCKED_MASK)
+ while (cmpxchg(&((struct __qspinlock *)lock)->locked_pending,
+ _Q_PENDING_VAL, _Q_LOCKED_VAL) != _Q_PENDING_VAL)
cpu_relax();
-
- /*
- * take ownership and clear the pending bit.
- *
- * *,1,0 -> *,0,1
- */
- clear_pending_set_locked(lock);
+
return;
/*
@@ -399,17 +388,21 @@ queue:
* we're at the head of the waitqueue, wait for the owner& pending to
* go away.
*
- * *,x,y -> *,0,0
- *
- * this wait loop must use a load-acquire such that we match the
- * store-release that clears the locked bit and create lock
- * sequentiality; this is because the set_locked() function below
- * does not imply a full barrier.
- *
+ * *,x,y -> *,0,1
*/
pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter))& _Q_LOCKED_PENDING_MASK)
+ next = READ_ONCE(node->next);
+ while (cmpxchg(&((struct __qspinlock *)lock)->locked_pending, 0,
+ _Q_LOCKED_VAL) != 0) {
+ next = READ_ONCE(node->next);
cpu_relax();
+ }
+
+ if (next)
+ goto next_node;
+
+ val = smp_load_acquire(&lock->val.counter);
+ tail = tail | _Q_LOCKED_VAL;
/*
* claim the lock:
@@ -423,7 +416,6 @@ queue:
*/
for (;;) {
if (val != tail) {
- set_locked(lock);
break;
}
old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
@@ -439,6 +431,7 @@ queue:
while (!(next = READ_ONCE(node->next)))
cpu_relax();
+next_node:
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);