Re: [PATCH v2] sunrpc: hardcode pool_mode to pernode, remove other modes
From: NeilBrown
Date: Thu Jun 25 2026 - 22:16:34 EST
On Fri, 26 Jun 2026, Jeff Layton wrote:
> The SVC_POOL_AUTO/GLOBAL/PERCPU/PERNODE pool mode selection machinery
> was added when NUMA was new and the right default was unclear. Today,
> pernode is the right choice everywhere:
>
> - On multi-NUMA hosts, it gives one pool per node with proper thread
> affinity and NUMA-local memory allocation.
> - On single-node hosts, pernode degenerates to exactly one pool,
> identical to the old "global" mode -- svc_pool_for_cpu() short-
> circuits when sv_nrpools <= 1, no CPU affinity is set, and memory
> is allocated from the single node.
>
> The percpu mode (one pool per CPU) created excessive pools relative to
> the number of threads most deployments run, and was only auto-selected
> in a narrow case (single node, >2 CPUs).
>
> Remove the SVC_POOL_* enum, mode selection heuristic,
> svc_pool_map_init_percpu(), and all mode-based switch statements.
> Simplify pool map functions to always use the pernode path. If pool
> map allocation fails, svc_pool_map_get() now returns 0 and service
> creation fails, rather than silently falling back to a single global
> pool.
>
> The module parameter and netlink interfaces are preserved for backward
> compatibility:
> - Writing any previously-accepted value succeeds silently
> - Reading always returns "pernode"
> - Writing to the module parameter emits a deprecation notice
>
> Assisted-by: Claude:claude-opus-4-8
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
> This version is essentially the same as v1, but allows the kernel to
> accept any previously-accepted setting for pool-mode, which should
> alleviate concerns about breakage.
> ---
> Changes in v2:
> - Accept any previously-accepted setting for pool_mode
> - Link to v1: https://lore.kernel.org/r/20260423-sunrpc-pool-mode-v1-1-b7f20e35749b@xxxxxxxxxx
> ---
> net/sunrpc/svc.c | 238 +++++++++----------------------------------------------
> 1 file changed, 37 insertions(+), 201 deletions(-)
>
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index dd80a2eaaa74..6e3d509bf95a 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -38,19 +38,6 @@
>
> static void svc_unregister(const struct svc_serv *serv, struct net *net);
>
> -#define SVC_POOL_DEFAULT SVC_POOL_GLOBAL
> -
> -/*
> - * Mode for mapping cpus to pools.
> - */
> -enum {
> - SVC_POOL_AUTO = -1, /* choose one of the others */
> - SVC_POOL_GLOBAL, /* no mapping, just a single global pool
> - * (legacy & UP mode) */
> - SVC_POOL_PERCPU, /* one pool per cpu */
> - SVC_POOL_PERNODE /* one pool per numa node */
> -};
> -
> /*
> * Structure for mapping cpus to pools and vice versa.
> * Setup once during sunrpc initialisation.
> @@ -58,62 +45,23 @@ enum {
>
> struct svc_pool_map {
> int count; /* How many svc_servs use us */
> - int mode; /* Note: int not enum to avoid
> - * warnings about "enumeration value
> - * not handled in switch" */
> unsigned int npools;
> - unsigned int *pool_to; /* maps pool id to cpu or node */
> - unsigned int *to_pool; /* maps cpu or node to pool id */
> + unsigned int *pool_to; /* maps pool id to node */
> + unsigned int *to_pool; /* maps node to pool id */
> };
>
> -static struct svc_pool_map svc_pool_map = {
> - .mode = SVC_POOL_DEFAULT
> -};
> +static struct svc_pool_map svc_pool_map;
>
> static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
>
> -static int
> -__param_set_pool_mode(const char *val, struct svc_pool_map *m)
> -{
> - int err, mode;
> -
> - mutex_lock(&svc_pool_map_mutex);
> -
> - err = 0;
> - if (!strncmp(val, "auto", 4))
> - mode = SVC_POOL_AUTO;
> - else if (!strncmp(val, "global", 6))
> - mode = SVC_POOL_GLOBAL;
> - else if (!strncmp(val, "percpu", 6))
> - mode = SVC_POOL_PERCPU;
> - else if (!strncmp(val, "pernode", 7))
> - mode = SVC_POOL_PERNODE;
> - else
> - err = -EINVAL;
> -
> - if (err)
> - goto out;
> -
> - if (m->count == 0)
> - m->mode = mode;
> - else if (mode != m->mode)
> - err = -EBUSY;
> -out:
> - mutex_unlock(&svc_pool_map_mutex);
> - return err;
> -}
> -
> -static int
> -param_set_pool_mode(const char *val, const struct kernel_param *kp)
> -{
> - struct svc_pool_map *m = kp->arg;
> -
> - return __param_set_pool_mode(val, m);
> -}
> -
> int sunrpc_set_pool_mode(const char *val)
> {
> - return __param_set_pool_mode(val, &svc_pool_map);
> + if (!strncmp(val, "auto", 4) ||
> + !strncmp(val, "global", 6) ||
> + !strncmp(val, "percpu", 6) ||
> + !strncmp(val, "pernode", 7))
> + return 0;
"!strncmp" is one most disliked constructs....
What would you think of using match_string() or even
sysfs_match_string() ??
> + return -EINVAL;
> }
> EXPORT_SYMBOL(sunrpc_set_pool_mode);
>
> @@ -122,84 +70,32 @@ EXPORT_SYMBOL(sunrpc_set_pool_mode);
> * @buf: where to write the current pool_mode
> * @size: size of @buf
> *
> - * Grab the current pool_mode from the svc_pool_map and write
> - * the resulting string to @buf. Returns the number of characters
> + * Write the pool_mode string to @buf. Returns the number of characters
> * written to @buf (a'la snprintf()).
> */
> int
> sunrpc_get_pool_mode(char *buf, size_t size)
> {
> - struct svc_pool_map *m = &svc_pool_map;
> -
> - switch (m->mode)
> - {
> - case SVC_POOL_AUTO:
> - return snprintf(buf, size, "auto");
> - case SVC_POOL_GLOBAL:
> - return snprintf(buf, size, "global");
> - case SVC_POOL_PERCPU:
> - return snprintf(buf, size, "percpu");
> - case SVC_POOL_PERNODE:
> - return snprintf(buf, size, "pernode");
> - default:
> - return snprintf(buf, size, "%d", m->mode);
> - }
> + return snprintf(buf, size, "pernode");
> }
> EXPORT_SYMBOL(sunrpc_get_pool_mode);
>
> static int
> -param_get_pool_mode(char *buf, const struct kernel_param *kp)
> +param_set_pool_mode(const char *val, const struct kernel_param *kp)
> {
> - char str[16];
> - int len;
> -
> - len = sunrpc_get_pool_mode(str, ARRAY_SIZE(str));
> -
> - /* Ensure we have room for newline and NUL */
> - len = min_t(int, len, ARRAY_SIZE(str) - 2);
> -
> - /* tack on the newline */
> - str[len] = '\n';
> - str[len + 1] = '\0';
> -
> - return sysfs_emit(buf, "%s", str);
> + pr_notice("sunrpc: the pool_mode parameter is deprecated and will be removed in a future release.\n");
Which future release? How will we remember?
> + return sunrpc_set_pool_mode(val);
> }
>
> -module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
> - &svc_pool_map, 0644);
> -
> -/*
> - * Detect best pool mapping mode heuristically,
> - * according to the machine's topology.
> - */
> static int
> -svc_pool_map_choose_mode(void)
> +param_get_pool_mode(char *buf, const struct kernel_param *kp)
> {
> - unsigned int node;
> -
> - if (nr_online_nodes > 1) {
> - /*
> - * Actually have multiple NUMA nodes,
> - * so split pools on NUMA node boundaries
> - */
> - return SVC_POOL_PERNODE;
> - }
> -
> - node = first_online_node;
> - if (nr_cpus_node(node) > 2) {
> - /*
> - * Non-trivial SMP, or CONFIG_NUMA on
> - * non-NUMA hardware, e.g. with a generic
> - * x86_64 kernel on Xeons. In this case we
> - * want to divide the pools on cpu boundaries.
> - */
> - return SVC_POOL_PERCPU;
> - }
> -
> - /* default: one global pool */
> - return SVC_POOL_GLOBAL;
> + return sysfs_emit(buf, "pernode\n");
> }
>
> +module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
> + NULL, 0644);
> +
> /*
> * Allocate the to_pool[] and pool_to[] arrays.
> * Returns 0 on success or an errno.
> @@ -224,35 +120,7 @@ svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
> }
>
> /*
> - * Initialise the pool map for SVC_POOL_PERCPU mode.
> - * Returns number of pools or <0 on error.
> - */
> -static int
> -svc_pool_map_init_percpu(struct svc_pool_map *m)
> -{
> - unsigned int maxpools = nr_cpu_ids;
> - unsigned int pidx = 0;
> - unsigned int cpu;
> - int err;
> -
> - err = svc_pool_map_alloc_arrays(m, maxpools);
> - if (err)
> - return err;
> -
> - for_each_online_cpu(cpu) {
> - BUG_ON(pidx >= maxpools);
> - m->to_pool[cpu] = pidx;
> - m->pool_to[pidx] = cpu;
> - pidx++;
> - }
> - /* cpus brought online later all get mapped to pool0, sorry */
> -
> - return pidx;
> -};
> -
> -
> -/*
> - * Initialise the pool map for SVC_POOL_PERNODE mode.
> + * Initialise the pool map for one pool per NUMA node.
> * Returns number of pools or <0 on error.
> */
> static int
> @@ -284,14 +152,13 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
> * Add a reference to the global map of cpus to pools (and
> * vice versa) if pools are in use.
> * Initialise the map if we're the first user.
> - * Returns the number of pools. If this is '1', no reference
> - * was taken.
> + * Returns the number of pools, or 0 on failure.
> */
> static unsigned int
> svc_pool_map_get(void)
> {
> struct svc_pool_map *m = &svc_pool_map;
> - int npools = -1;
> + int npools;
>
> mutex_lock(&svc_pool_map_mutex);
> if (m->count++) {
> @@ -299,22 +166,11 @@ svc_pool_map_get(void)
> return m->npools;
> }
>
> - if (m->mode == SVC_POOL_AUTO)
> - m->mode = svc_pool_map_choose_mode();
> -
> - switch (m->mode) {
> - case SVC_POOL_PERCPU:
> - npools = svc_pool_map_init_percpu(m);
> - break;
> - case SVC_POOL_PERNODE:
> - npools = svc_pool_map_init_pernode(m);
> - break;
> - }
> -
> + npools = svc_pool_map_init_pernode(m);
> if (npools <= 0) {
> - /* default, or memory allocation failure */
> - npools = 1;
> - m->mode = SVC_POOL_GLOBAL;
> + m->count--;
Could we make this
m->count = 0
to make it crystal clear that there are now no references?
It would be nice to rename ->count to ->refcount or similar
so it is obvious what is being counted, but that doesn't need to be part
of this change.
> + mutex_unlock(&svc_pool_map_mutex);
> + return 0;
> }
> m->npools = npools;
> mutex_unlock(&svc_pool_map_mutex);
> @@ -346,14 +202,11 @@ static int svc_pool_map_get_node(unsigned int pidx)
> {
> const struct svc_pool_map *m = &svc_pool_map;
>
> - if (m->count) {
> - if (m->mode == SVC_POOL_PERCPU)
> - return cpu_to_node(m->pool_to[pidx]);
> - if (m->mode == SVC_POOL_PERNODE)
> - return m->pool_to[pidx];
> - }
> + if (m->count)
> + return m->pool_to[pidx];
> return numa_mem_id();
Can this code run if m->count is zero?
> }
> +
> /*
> * Set the given thread's cpus_allowed mask so that it
> * will only run on cpus in the given pool.
> @@ -372,27 +225,15 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
> if (m->count == 0)
> return;
>
> - switch (m->mode) {
> - case SVC_POOL_PERCPU:
> - {
> - set_cpus_allowed_ptr(task, cpumask_of(node));
> - break;
> - }
> - case SVC_POOL_PERNODE:
> - {
> - set_cpus_allowed_ptr(task, cpumask_of_node(node));
> - break;
> - }
> - }
> + set_cpus_allowed_ptr(task, cpumask_of_node(node));
> }
>
> /**
> * svc_pool_for_cpu - Select pool to run a thread on this cpu
> * @serv: An RPC service
> *
> - * Use the active CPU and the svc_pool_map's mode setting to
> - * select the svc thread pool to use. Once initialized, the
> - * svc_pool_map does not change.
> + * Use the active CPU and the svc_pool_map to select the svc thread
> + * pool to use. Once initialized, the svc_pool_map does not change.
> *
> * Return value:
> * A pointer to an svc_pool
> @@ -400,20 +241,12 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
> struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv)
> {
> struct svc_pool_map *m = &svc_pool_map;
> - int cpu = raw_smp_processor_id();
> - unsigned int pidx = 0;
> + unsigned int pidx;
>
> if (serv->sv_nrpools <= 1)
> return serv->sv_pools;
>
> - switch (m->mode) {
> - case SVC_POOL_PERCPU:
> - pidx = m->to_pool[cpu];
> - break;
> - case SVC_POOL_PERNODE:
> - pidx = m->to_pool[cpu_to_node(cpu)];
> - break;
> - }
> + pidx = m->to_pool[cpu_to_node(raw_smp_processor_id())];
>
> return &serv->sv_pools[pidx % serv->sv_nrpools];
> }
> @@ -617,6 +450,9 @@ struct svc_serv *svc_create_pooled(struct svc_program *prog,
> struct svc_serv *serv;
> unsigned int npools = svc_pool_map_get();
>
> + if (!npools)
> + return NULL;
> +
> serv = __svc_create(prog, nprogs, stats, bufsize, npools, threadfn);
> if (!serv)
> goto out_err;
>
I think this is a good a worthwhile change.
Regarding Chucks comments, I think a minimum of one thread per node,
rather than a minimum of one thread total, is the sensible approach.
Of course if there exist memory-only nodes, they wouldn't need a thread.
Thanks,
NeilBrown