[PATCH] smp: add a best_effort version of smp_call_function_many()

From: Luigi Rizzo
Date: Mon Apr 19 2021 - 14:45:09 EST


Regardless of the 'wait' argument, smp_call_function_many() must spin
if any of the target CPUs have their csd busy waiting to be processed
for a previous call. This may cause high tail latencies e.g. when some
of the target CPUs are running functions that disable interrupts for a
long time; getrusage() is one possible culprit.

Here we introduce a variant, __smp_call_function_many(), that adds
a third 'best_effort' mode to the two existing ones (nowait, wait).
In best effort mode, the call will skip CPUs whose csd is busy, and if
any CPU is skipped it returns -EBUSY and the set of busy CPUs in the mask.
This allows the caller to decide how to proceed, e.g. it might retry at
a later time, or use a private csd, etc.

The new function is a compromise to avoid touching existing callers of
smp_call_function_many(). If the feature is considered interesting, we
could even replace the 'wait' argument with a ternary 'mode' in all
smp_call_function_*() and derived methods.

Signed-off-by: Luigi Rizzo <lrizzo@xxxxxxxxxx>
---
include/linux/smp.h | 10 ++++++
kernel/smp.c | 75 +++++++++++++++++++++++++++++++++++++--------
2 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 70c6f6284dcf..5c6c7d3e1f19 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -75,6 +75,11 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,

int smp_call_function_single_async(int cpu, call_single_data_t *csd);

+/* Modes for __smp_call_function_many() */
+#define SMP_CFM_NOWAIT 0
+#define SMP_CFM_WAIT 1
+#define SMP_CFM_BEST_EFFORT 2
+
#ifdef CONFIG_SMP

#include <linux/preempt.h>
@@ -120,6 +125,8 @@ extern void smp_cpus_done(unsigned int max_cpus);
void smp_call_function(smp_call_func_t func, void *info, int wait);
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait);
+int __smp_call_function_many(struct cpumask *mask, smp_call_func_t func,
+ void *info, int mode);

int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait);
@@ -170,6 +177,9 @@ static inline void smp_send_reschedule(int cpu) { }
#define smp_prepare_boot_cpu() do {} while (0)
#define smp_call_function_many(mask, func, info, wait) \
(up_smp_call_function(func, info))
+#define __smp_call_function_many(mask, func, info, mode) \
+ (up_smp_call_function(func, info), 0)
+
static inline void call_function_init(void) { }

static inline int
diff --git a/kernel/smp.c b/kernel/smp.c
index aeb0adfa0606..75155875fadc 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -242,6 +242,18 @@ static __always_inline void csd_lock(call_single_data_t *csd)
smp_wmb();
}

+static __always_inline bool csd_trylock(call_single_data_t *csd)
+{
+ unsigned int flags = READ_ONCE(csd->node.u_flags);
+
+ if (flags & CSD_FLAG_LOCK)
+ return false;
+ csd->node.u_flags |= CSD_FLAG_LOCK;
+ /* Same write barrier as at the end of csd_lock(); see csd_lock() */
+ smp_wmb();
+ return true;
+}
+
static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
@@ -608,12 +620,14 @@ int smp_call_function_any(const struct cpumask *mask,
}
EXPORT_SYMBOL_GPL(smp_call_function_any);

-static void smp_call_function_many_cond(const struct cpumask *mask,
- smp_call_func_t func, void *info,
- bool wait, smp_cond_func_t cond_func)
+static struct cpumask *smp_call_function_many_cond(const struct cpumask *mask,
+ smp_call_func_t func,
+ void *info, int mode,
+ smp_cond_func_t cond_func)
{
struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
+ bool busy = false, wait = (mode == SMP_CFM_WAIT);

/*
* Can deadlock when called with interrupts disabled.
@@ -639,18 +653,18 @@ static void smp_call_function_many_cond(const struct cpumask *mask,

/* No online cpus? We're done. */
if (cpu >= nr_cpu_ids)
- return;
+ return NULL;

/* Do we have another CPU which isn't us? */
next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
if (next_cpu == this_cpu)
next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);

- /* Fastpath: do that cpu by itself. */
- if (next_cpu >= nr_cpu_ids) {
+ /* Fastpath: if not best-effort do that cpu by itself. */
+ if (next_cpu >= nr_cpu_ids && mode != SMP_CFM_BEST_EFFORT) {
if (!cond_func || cond_func(cpu, info))
smp_call_function_single(cpu, func, info, wait);
- return;
+ return NULL;
}

cfd = this_cpu_ptr(&cfd_data);
@@ -660,7 +674,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,

/* Some callers race with other cpus changing the passed mask */
if (unlikely(!cpumask_weight(cfd->cpumask)))
- return;
+ return NULL;

cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
@@ -669,9 +683,17 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
if (cond_func && !cond_func(cpu, info))
continue;

- csd_lock(csd);
- if (wait)
- csd->node.u_flags |= CSD_TYPE_SYNC;
+ if (mode == SMP_CFM_BEST_EFFORT) {
+ if (!csd_trylock(csd)) {
+ cpumask_clear_cpu(cpu, cfd->cpumask);
+ busy = true;
+ continue;
+ }
+ } else {
+ csd_lock(csd);
+ if (wait)
+ csd->node.u_flags |= CSD_TYPE_SYNC;
+ }
csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
@@ -693,8 +715,32 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock_wait(csd);
}
}
+ return busy ? cfd->cpumask : NULL;
}

+/**
+ * __smp_call_function_many(): Extended smp_call_function_many(), same constraints.
+ * @mode == SMP_CFM_NOWAIT (0): same as wait = false, returns 0;
+ * @mode == SMP_CFM_WAIT (1): same as wait = true, returns 0;
+ * @mode == SMP_CFM_BEST_EFFORT: skips CPUs whose csd is still locked; returns
+ * 0 with *mask unmodified if no CPUs are skipped, or -EBUSY with *mask
+ * overwritten by the set of skipped CPUs.
+ */
+int __smp_call_function_many(struct cpumask *mask, smp_call_func_t func,
+ void *info, int mode)
+{
+ struct cpumask *ret = smp_call_function_many_cond(mask, func, info,
+ mode, NULL);
+
+ if (!ret)
+ return 0;
+ cpumask_andnot(mask, mask, ret);
+ cpumask_and(mask, mask, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), mask);
+ return -EBUSY;
+}
+EXPORT_SYMBOL(__smp_call_function_many);
+
/**
* smp_call_function_many(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
@@ -712,7 +758,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- smp_call_function_many_cond(mask, func, info, wait, NULL);
+ const int mode = wait ? SMP_CFM_WAIT : SMP_CFM_NOWAIT;
+
+ smp_call_function_many_cond(mask, func, info, mode, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);

@@ -898,9 +946,10 @@ EXPORT_SYMBOL(on_each_cpu_mask);
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask)
{
+ const int mode = wait ? SMP_CFM_WAIT : SMP_CFM_NOWAIT;
int cpu = get_cpu();

- smp_call_function_many_cond(mask, func, info, wait, cond_func);
+ smp_call_function_many_cond(mask, func, info, mode, cond_func);
if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
unsigned long flags;

--
2.31.1.368.gbe11c130af-goog