[PATCH 5/4] sunrpc: protect the svc_pool_map pool_to[] array with RCU
From: Jeff Layton
Date: Tue Jun 30 2026 - 08:55:18 EST
svc_pool_map_get_node() reads the global svc_pool_map without holding
svc_pool_map_mutex and dereferences m->pool_to[]. The array was both
published and torn down without any synchronisation against that
lockless reader:
- svc_pool_map_get() incremented m->count to one before
svc_pool_map_init_pernode() allocated and filled the arrays, so a
reader observing the map as "in use" could see a NULL (or partially
built) pool_to[] and oops.
- svc_pool_map_put() freed the arrays as soon as the last reference
went away, so a reader that had already started dereferencing
pool_to[] could use it after free.
svc_new_thread() takes this lockless path for every service, including
unpooled ones that hold no map reference, so the reader genuinely can
run concurrently with another service's startup or shutdown.
Publish pool_to[] with rcu_assign_pointer() only after it is fully
built in a private allocation, and have svc_pool_map_get_node()
dereference it under rcu_read_lock(). On teardown, clear the pointer
and defer the free past a grace period with kfree_rcu_mightsleep().
svc_pool_map_set_cpumask() also reads pool_to[], but its caller holds a
map reference (it checks sv_nrpools > 1) so the array is stable; it uses
rcu_dereference_protected() rather than taking the read lock.
to_pool[] needs no such treatment: it is only read by services that
hold a map reference, so it cannot be freed under a reader.
Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
Assisted-by: Claude:claude-opus-4-8
---
net/sunrpc/svc.c | 91 +++++++++++++++++++++++++-----------------------
1 file changed, 48 insertions(+), 43 deletions(-)
Sashiko pointed out this (preexisting) problem during review of the
other 4 in the series. I figure we might as well fix it too while we're
in here.
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4fff0725ef8f..ebd953248e34 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -46,13 +46,13 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net);
struct svc_pool_map {
int count; /* How many svc_servs use us */
unsigned int npools;
- unsigned int *pool_to; /* maps pool id to node */
+ unsigned int __rcu *pool_to; /* maps pool id to node */
unsigned int *to_pool; /* maps node to pool id */
};
static struct svc_pool_map svc_pool_map;
-static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
+static DEFINE_MUTEX(svc_pool_map_mutex);/* serialises svc_pool_map updates */
/*
* Pool modes that were historically accepted. They no longer select
@@ -102,32 +102,12 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp)
module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
NULL, 0644);
-/*
- * Allocate the to_pool[] and pool_to[] arrays.
- * Returns 0 on success or an errno.
- */
-static int
-svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
-{
- m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
- if (!m->to_pool)
- goto fail;
- m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
- if (!m->pool_to)
- goto fail_free;
-
- return 0;
-
-fail_free:
- kfree(m->to_pool);
- m->to_pool = NULL;
-fail:
- return -ENOMEM;
-}
-
/*
* Initialise the pool map for one pool per NUMA node.
* Returns number of pools or <0 on error.
+ *
+ * pool_to[] is published with rcu_assign_pointer() only once fully built,
+ * so a lockless reader sees either NULL or a complete map.
*/
static int
svc_pool_map_init_pernode(struct svc_pool_map *m)
@@ -135,21 +115,30 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
unsigned int maxpools = nr_node_ids;
unsigned int pidx = 0;
unsigned int node;
- int err;
+ unsigned int *to_pool, *pool_to;
- err = svc_pool_map_alloc_arrays(m, maxpools);
- if (err)
- return err;
+ to_pool = kcalloc(maxpools, sizeof(*to_pool), GFP_KERNEL);
+ if (!to_pool)
+ return -ENOMEM;
+ pool_to = kcalloc(maxpools, sizeof(*pool_to), GFP_KERNEL);
+ if (!pool_to) {
+ kfree(to_pool);
+ return -ENOMEM;
+ }
for_each_node_with_cpus(node) {
/* some architectures (e.g. SN2) have cpuless nodes */
BUG_ON(pidx > maxpools);
- m->to_pool[node] = pidx;
- m->pool_to[pidx] = node;
+ to_pool[node] = pidx;
+ pool_to[pidx] = node;
pidx++;
}
/* nodes brought online later all get mapped to pool0, sorry */
+ m->npools = pidx;
+ m->to_pool = to_pool;
+ rcu_assign_pointer(m->pool_to, pool_to);
+
return pidx;
}
@@ -178,7 +167,6 @@ svc_pool_map_get(void)
mutex_unlock(&svc_pool_map_mutex);
return 0;
}
- m->npools = npools;
mutex_unlock(&svc_pool_map_mutex);
return npools;
}
@@ -195,11 +183,21 @@ svc_pool_map_put(void)
mutex_lock(&svc_pool_map_mutex);
if (!--m->count) {
+ unsigned int *pool_to;
+
+ /* Protected by svc_pool_map_mutex */
+ pool_to = rcu_dereference_protected(m->pool_to, 1);
+
+ /*
+ * Defer the pool_to[] free past a grace period; a lockless
+ * reader may still hold it. to_pool[] has no such readers, so
+ * free it directly.
+ */
+ rcu_assign_pointer(m->pool_to, NULL);
kfree(m->to_pool);
m->to_pool = NULL;
- kfree(m->pool_to);
- m->pool_to = NULL;
m->npools = 0;
+ kfree_rcu_mightsleep(pool_to);
}
mutex_unlock(&svc_pool_map_mutex);
}
@@ -207,10 +205,15 @@ svc_pool_map_put(void)
static int svc_pool_map_get_node(unsigned int pidx)
{
const struct svc_pool_map *m = &svc_pool_map;
+ const unsigned int *pool_to;
+ int node = numa_mem_id();
- if (m->count)
- return m->pool_to[pidx];
- return numa_mem_id();
+ rcu_read_lock();
+ pool_to = rcu_dereference(m->pool_to);
+ if (pool_to)
+ node = pool_to[pidx];
+ rcu_read_unlock();
+ return node;
}
/*
@@ -221,17 +224,19 @@ static inline void
svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
{
struct svc_pool_map *m = &svc_pool_map;
- unsigned int node = m->pool_to[pidx];
+ const unsigned int *pool_to;
/*
- * The caller checks for sv_nrpools > 1, which
- * implies that we've been initialized.
+ * The caller checks for sv_nrpools > 1, so its service holds a
+ * reference to the map: pool_to[] is allocated and cannot be freed
+ * under us. No RCU read lock is needed; the held reference keeps the
+ * array stable.
*/
- WARN_ON_ONCE(m->count == 0);
- if (m->count == 0)
+ pool_to = rcu_dereference_protected(m->pool_to, 1);
+ if (WARN_ON_ONCE(!pool_to))
return;
- set_cpus_allowed_ptr(task, cpumask_of_node(node));
+ set_cpus_allowed_ptr(task, cpumask_of_node(pool_to[pidx]));
}
/**
--
2.54.0