From 28af255824b92a79fc055e15f5dd2251bc055c10 Mon Sep 17 00:00:00 2001
From: Alex Kogan
Date: Thu, 27 Aug 2020 19:06:48 -0400
Subject: [PATCH v10 7/7] locking/qspinlock: Implement incremental culling in CNA

Replace the scan of the main queue in cna_order_queue() with incremental
culling. This decreases the worst-case complexity of cna_order_queue()
from O(n) down to O(1). The idea is to move waiters running on other
NUMA nodes gradually, one by one, into the secondary queue. This is
achieved by probing the NUMA node of the next waiter only, hence the
O(1) complexity. To keep the lock priority on the same node even if
there is a chain of waiters from different nodes, we overwrite the NUMA
nodes of those waiters. Those 'fake' NUMA nodes are reset in subsequent
lock acquisitions.

Signed-off-by: Alex Kogan
---
 kernel/locking/qspinlock_cna.h | 135 +++++++++++++++------------------
 1 file changed, 63 insertions(+), 72 deletions(-)

diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index 911c96279494..e5faf16ebe29 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -59,14 +59,14 @@ struct cna_node {
 	u16			numa_node;
 	u16			real_numa_node;
 	u32			encoded_tail;	/* self */
-	u32			partial_order;	/* encoded tail or enum val */
+	u32			partial_order;	/* enum val */
 	u32			intra_count;
 };
 
 enum {
-	LOCAL_WAITER_FOUND = 2,	/* 0 and 1 are reserved for @locked */
-	FLUSH_SECONDARY_QUEUE = 3,
-	MIN_ENCODED_TAIL
+	LOCAL_WAITER_FOUND,
+	LOCAL_WAITER_NOT_FOUND,
+	FLUSH_SECONDARY_QUEUE,
 };
 
 /*
@@ -90,10 +90,9 @@ static void __init cna_init_nodes_per_cpu(unsigned int cpu)
 		cn->encoded_tail = encode_tail(cpu, i);
 		/*
 		 * make sure @encoded_tail is not confused with other valid
-		 * values for @locked (0 or 1) or with designated values for
-		 * @pre_scan_result
+		 * values for @locked (0 or 1)
 		 */
-		WARN_ON(cn->encoded_tail < MIN_ENCODED_TAIL);
+		WARN_ON(cn->encoded_tail <= 1);
 	}
 }
 
@@ -117,10 +116,6 @@ static int __init cna_init_nodes(void)
 
 static __always_inline void cna_init_node(struct mcs_spinlock *node)
 {
-	/*
-	 * Set the priority bit in @numa_node for threads that should not
-	 * be moved to the secondary queue.
-	 */
 	bool priority = !in_task() || irqs_disabled() || rt_task(current);
 	struct cna_node *cn = (struct cna_node *)node;
 
@@ -213,79 +208,64 @@ static inline bool cna_try_clear_tail(struct qspinlock *lock, u32 val,
 }
 
 /*
- * cna_splice_tail -- splice nodes in the primary queue between [first, last]
+ * cna_splice_next -- splice the next node from the primary queue
  * onto the secondary queue.
  */
-static void cna_splice_tail(struct mcs_spinlock *node,
-			    struct mcs_spinlock *first,
-			    struct mcs_spinlock *last)
+static void cna_splice_next(struct mcs_spinlock *node,
+			    struct mcs_spinlock *next,
+			    struct mcs_spinlock *nnext)
 {
-	/* remove [first,last] */
-	node->next = last->next;
+	/* remove 'next' from the main queue */
+	node->next = nnext;
 
-	/* stick [first,last] on the secondary queue tail */
+	/* stick 'next' on the secondary queue tail */
 	if (node->locked <= 1) { /* if secondary queue is empty */
 		/* create secondary queue */
-		last->next = first;
+		next->next = next;
 	} else {
 		/* add to the tail of the secondary queue */
 		struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
 		struct mcs_spinlock *head_2nd = tail_2nd->next;
 
-		tail_2nd->next = first;
-		last->next = head_2nd;
+		tail_2nd->next = next;
+		next->next = head_2nd;
 	}
 
-	node->locked = ((struct cna_node *)last)->encoded_tail;
+	node->locked = ((struct cna_node *)next)->encoded_tail;
 }
 
 /*
- * cna_order_queue - scan the primary queue looking for the first lock node on
- * the same NUMA node as the lock holder and move any skipped nodes onto the
- * secondary queue.
- *
- * Returns LOCAL_WAITER_FOUND if a matching node is found; otherwise return
- * the encoded pointer to the last element inspected (such that a subsequent
- * scan can continue there).
- *
- * The worst case complexity of the scan is O(n), where n is the number
- * of current waiters. However, the amortized complexity is close to O(1),
- * as the immediate successor is likely to be running on the same node once
- * threads from other nodes are moved to the secondary queue.
+ * cna_order_queue - check whether the next lock waiter
+ * is on the same NUMA node as the lock holder; if not,
+ * and it is not a prioritized waiter, move it onto
+ * the secondary queue.
  */
-static u32 cna_order_queue(struct mcs_spinlock *node,
-			   struct mcs_spinlock *iter)
+static u32 cna_order_queue(struct mcs_spinlock *node)
 {
-	struct cna_node *cni = (struct cna_node *)READ_ONCE(iter->next);
-	struct cna_node *cn = (struct cna_node *)node;
-	int nid = cn->numa_node;
-	struct cna_node *last;
-
-	/* find any next waiter on 'our' NUMA node */
-	for (last = cn;
-	     /*
-	      * iterate as long as the current node is not priorizied and
-	      * does not run on 'our' NUMA node
-	      */
-	     cni && cni->numa_node != CNA_PRIORITY_NODE && cni->numa_node != nid;
-	     last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next))
-		;
+	struct mcs_spinlock *next = READ_ONCE(node->next);
+	int numa_node, next_numa_node;
 
-	if (!cni)
-		return last->encoded_tail; /* continue from here */
+	if (!next)
+		return LOCAL_WAITER_NOT_FOUND;
 
-	if (cni->numa_node == CNA_PRIORITY_NODE)
-		cni->numa_node = nid;	/* Inherit node id of primary queue */
+	numa_node = ((struct cna_node *)node)->numa_node;
+	next_numa_node = ((struct cna_node *)next)->numa_node;
 
-	if (last != cn)	/* did we skip any waiters? */
-		cna_splice_tail(node, node->next, (struct mcs_spinlock *)last);
+	if (next_numa_node != numa_node) {
+		if (next_numa_node != CNA_PRIORITY_NODE) {
+			struct mcs_spinlock *nnext = READ_ONCE(next->next);
 
-	/*
-	 * We return LOCAL_WAITER_FOUND here even if we stopped the scan because
-	 * of a prioritized waiter. That waiter will get the lock next even if
-	 * it runs on a different NUMA node, but this is what we wanted when we
-	 * prioritized it.
-	 */
+			if (nnext) {
+				cna_splice_next(node, next, nnext);
+				next = nnext;
+			}
+		}
+		/*
+		 * Inherit numa node id of primary queue, to maintain the
+		 * preference even if the next waiter is on a different node.
		 */
+		((struct cna_node *)next)->numa_node = numa_node;
+	}
 
 	return LOCAL_WAITER_FOUND;
 }
@@ -307,7 +287,7 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
 		 * Try and put the time otherwise spent spin waiting on
 		 * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
 		 */
-		cn->partial_order = cna_order_queue(node, node);
+		cn->partial_order = cna_order_queue(node);
 	} else {
 		cn->partial_order = FLUSH_SECONDARY_QUEUE;
 	}
@@ -323,18 +303,23 @@ static inline void cna_lock_handoff(struct mcs_spinlock *node,
 	u32 partial_order = cn->partial_order;
 
 	/*
-	 * check if a successor from the same numa node has not been found in
-	 * pre-scan, and if so, try to find it in post-scan starting from the
-	 * node where pre-scan stopped (stored in @pre_scan_result)
+	 * Check if a successor from the same numa node has not been found in
+	 * pre-scan, and if so, try to find it in post-scan.
 	 */
-	if (partial_order >= MIN_ENCODED_TAIL)
-		partial_order =
-			cna_order_queue(node, decode_tail(partial_order));
+	if (partial_order == LOCAL_WAITER_NOT_FOUND)
+		partial_order = cna_order_queue(node);
+
+	/*
+	 * We have a successor in the main queue ('next'), so if we
+	 * call cna_order_queue() above, we will find a local waiter,
+	 * either a real or a fake one.
+	 */
+	WARN_ON(partial_order == LOCAL_WAITER_NOT_FOUND);
 
 	if (partial_order == LOCAL_WAITER_FOUND) {
 		/*
-		 * We found a local waiter; reload @next in case we called
-		 * cna_order_queue() above.
+		 * We found a local waiter; reload @next in case it
+		 * was changed by cna_order_queue().
 		 */
 		next = node->next;
 		if (node->locked > 1) {
@@ -342,7 +327,13 @@ static inline void cna_lock_handoff(struct mcs_spinlock *node,
 			((struct cna_node *)next)->intra_count =
 				cn->intra_count + 1;
 		}
-	} else if (node->locked > 1) {
+	} else {
+		WARN_ON(partial_order != FLUSH_SECONDARY_QUEUE);
+		/*
+		 * We decided to flush the secondary queue;
+		 * this can only happen if that queue is not empty.
+		 */
+		WARN_ON(node->locked <= 1);
 		/*
 		 * When there are no local waiters on the primary queue, splice
 		 * the secondary queue onto the primary queue and pass the lock
-- 
2.21.1 (Apple Git-122.3)
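
To illustrate the incremental-culling idea described in the commit message, here is a minimal, self-contained userspace sketch. It is not the kernel CNA code: struct waiter, order_queue() and the plain tail-pointer secondary list are simplified stand-ins (CNA encodes a circular secondary queue in node->locked), and the CNA_PRIORITY_NODE handling is omitted. On each handoff the holder probes only its immediate successor; a remote successor is spliced onto the secondary list in O(1), and the NUMA-node overwrite keeps the preference on the holder's node.

#include <stdio.h>

struct waiter {
	struct waiter	*next;
	int		numa_node;	/* may be overwritten with a 'fake' id */
	const char	*name;
};

/* secondary list of culled remote waiters (simplified: head + tail pointer) */
static struct waiter *sec_head;
static struct waiter **sec_tail = &sec_head;

/* Probe only the successor of @node; cull it if it runs on a remote node. */
static void order_queue(struct waiter *node)
{
	struct waiter *next = node->next;

	if (!next)
		return;			/* no successor: nothing to do */

	if (next->numa_node != node->numa_node) {
		struct waiter *nnext = next->next;

		if (nnext) {
			/* O(1): splice the single remote waiter out */
			node->next = nnext;
			next->next = NULL;
			*sec_tail = next;
			sec_tail = &next->next;
			next = nnext;
		}
		/* keep the lock preference on the holder's node ('fake' id) */
		next->numa_node = node->numa_node;
	}
}

int main(void)
{
	struct waiter w[4] = {
		{ &w[1], 0, "A(node0)" },	/* lock holder */
		{ &w[2], 1, "B(node1)" },
		{ &w[3], 0, "C(node0)" },
		{ NULL,  1, "D(node1)" },
	};

	order_queue(&w[0]);	/* one acquisition culls at most one waiter */

	printf("next in line: %s\n", w[0].next->name);			/* C(node0) */
	printf("culled:       %s\n", sec_head ? sec_head->name : "none");	/* B(node1) */
	return 0;
}

Running the sketch prints the node-0 waiter C as next in line and the node-1 waiter B as culled: each acquisition moves at most one remote waiter, which is how the patch replaces the old O(n) scan with O(1) work per lock handoff.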