[PATCH net-next v3 4/4] net: dqs: make struct dql more cache efficient

From: Breno Leitao
Date: Thu Apr 11 2024 - 15:26:47 EST


With the previous change, struct dqs->stall_thrs will be in the hot path
(at queue side), even if DQS is disabled.

The other fields accessed in this function (last_obj_cnt and num_queued)
are in the first cache line, let's move this field (stall_thrs) to the
very first cache line, since there is a hole there.

This does not change the structure size, since it moves an short (2
bytes) to 4-bytes whole in the first cache line.

This is the new structure format now:

struct dql {
unsigned int num_queued;
unsigned int last_obj_cnt;
..
short unsigned int stall_thrs;
/* XXX 2 bytes hole, try to pack */
..
/* --- cacheline 1 boundary (64 bytes) --- */
..
/* Longest stall detected, reported to user */
short unsigned int stall_max;
/* XXX 2 bytes hole, try to pack */
};

Also, read the stall_thrs (now in the very first cache line) earlier,
together with dql->num_queued (also in the first cache line).

Suggested-by: Jakub Kicinski <kuba@xxxxxxxxxx>
Suggested-by: Eric Dumazet <edumazet@xxxxxxxxxx>
Signed-off-by: Breno Leitao <leitao@xxxxxxxxxx>
---
include/linux/dynamic_queue_limits.h | 5 +++--
lib/dynamic_queue_limits.c | 13 +++++++++----
2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index 869afb800ea1..281298e77a15 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -50,6 +50,9 @@ struct dql {
unsigned int adj_limit; /* limit + num_completed */
unsigned int last_obj_cnt; /* Count at last queuing */

+ /* Stall threshold (in jiffies), defined by user */
+ unsigned short stall_thrs;
+
unsigned long history_head; /* top 58 bits of jiffies */
/* stall entries, a bit per entry */
unsigned long history[DQL_HIST_LEN];
@@ -71,8 +74,6 @@ struct dql {
unsigned int min_limit; /* Minimum limit */
unsigned int slack_hold_time; /* Time to measure slack */

- /* Stall threshold (in jiffies), defined by user */
- unsigned short stall_thrs;
/* Longest stall detected, reported to user */
unsigned short stall_max;
unsigned long last_reap; /* Last reap (in jiffies) */
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index a1389db1c30a..e49deddd3de9 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -15,12 +15,10 @@
#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)

-static void dql_check_stall(struct dql *dql)
+static void dql_check_stall(struct dql *dql, unsigned short stall_thrs)
{
- unsigned short stall_thrs;
unsigned long now;

- stall_thrs = READ_ONCE(dql->stall_thrs);
if (!stall_thrs)
return;

@@ -86,9 +84,16 @@ void dql_completed(struct dql *dql, unsigned int count)
{
unsigned int inprogress, prev_inprogress, limit;
unsigned int ovlimit, completed, num_queued;
+ unsigned short stall_thrs;
bool all_prev_completed;

num_queued = READ_ONCE(dql->num_queued);
+ /* Read stall_thrs in advance since it belongs to the same (first)
+ * cache line as ->num_queued. This way, dql_check_stall() does not
+ * need to touch the first cache line again later, reducing the window
+ * of possible false sharing.
+ */
+ stall_thrs = READ_ONCE(dql->stall_thrs);

/* Can't complete more than what's in queue */
BUG_ON(count > num_queued - dql->num_completed);
@@ -178,7 +183,7 @@ void dql_completed(struct dql *dql, unsigned int count)
dql->num_completed = completed;
dql->prev_num_queued = num_queued;

- dql_check_stall(dql);
+ dql_check_stall(dql, stall_thrs);
}
EXPORT_SYMBOL(dql_completed);

--
2.43.0