[PATCH] smp: do not send IPI if call_single_queue not empty

From: Aaron Lu
Date: Fri Apr 28 2017 - 09:19:56 EST


For smp_call_function_many(), whenever a CPU queues a CSD to a target
CPU, it sends an IPI to let the target CPU handle the work. This isn't
necessary: we only need to send an IPI when queueing a CSD to an empty
call_single_queue.

The reason:
flush_smp_call_function_queue(), which is called when a CPU receives an
IPI, empties the queue and then handles all of the CSDs found there. So
if the target CPU's call_single_queue is not empty, we know that:
i. an IPI for the target CPU has already been sent by a previous queuer*;
ii. flush_smp_call_function_queue() hasn't emptied that CPU's queue yet.
Thus, it's safe for us to just queue our CSD there without sending an
additional IPI.

*More precisely, it was sent by the first queuer, i.e. the CPU that
queued to the then-empty list.
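
To illustrate the first-queuer rule, here is a minimal stand-alone
user-space sketch (not kernel code; it only mimics the relevant property
of llist_add(), namely that it reports whether the list was empty before
the add):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *next;
};

static _Atomic(struct node *) queue_head;

/* Like llist_add(): returns true iff the list was empty before the add. */
static bool push(struct node *n)
{
	struct node *old = atomic_load(&queue_head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&queue_head, &old, n));

	return old == NULL;
}

int main(void)
{
	struct node a, b;

	if (push(&a))
		puts("first queuer: send IPI");
	if (push(&b))
		puts("second queuer: send IPI");	/* not printed */
	return 0;
}

Only the queuer that turns the list from empty to non-empty needs to
notify the target CPU; later queuers can piggyback on that notification.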

The workload used to evaluate the effectiveness of this change is
vm-scalability's case-swap-w-seq-mt:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-anon-w-seq-mt

It spawns 88 threads that continuously consume anonymous memory. The
test machine is a 2-node Broadwell EP with 88 threads and 32G of memory.
The test size is 100G, so a lot of swap-out happens once the available
memory is used up. Because:
i. shrink_page_list() has to do
try_to_unmap_flush() -> flush_tlb_others();
ii. this process's mm_cpumask() is almost full,
the number of IPIs sent during the workload is huge.
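
As a rough back-of-the-envelope illustration of the fan-out (plain
user-space C; the flush count is a made-up placeholder, only the 88-CPU
figure comes from the test setup above):

#include <stdio.h>

int main(void)
{
	long cpus = 88;			/* 2-node Broadwell EP, 88 threads */
	long remote = cpus - 1;		/* mm_cpumask() is almost full */
	long flushes = 10000000;	/* hypothetical number of TLB shootdowns */

	/* Before this patch: one IPI per remote CPU per flush_tlb_others(). */
	printf("IPIs sent: %ld\n", flushes * remote);
	return 0;
}

Each TLB shootdown costs roughly (nr_cpus - 1) function-call IPIs before
this patch, which is why the interrupt counts below reach the hundreds of
millions.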

Base:
"interrupts.CAL:Function_call_interrupts": 819388170.0,
"vm-scalability.throughput": 5051472.0,
This patch:
"interrupts.CAL:Function_call_interrupts": 92214434.66666667, â88.7%
"vm-scalability.throughput": 5991764.333333333 â18.6%

Function-call interrupts dropped by 88.7% and throughput increased by 18.6%.

Signed-off-by: Aaron Lu <aaron.lu@xxxxxxxxx>
---
kernel/smp.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..76d16fe3c427 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
 struct call_function_data {
 	struct call_single_data	__percpu *csd;
 	cpumask_var_t		cpumask;
+	cpumask_var_t		cpumask_ipi;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
 	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				     cpu_to_node(cpu)))
 		return -ENOMEM;
+	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+				     cpu_to_node(cpu))) {
+		free_cpumask_var(cfd->cpumask);
+		return -ENOMEM;
+	}
 	cfd->csd = alloc_percpu(struct call_single_data);
 	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
+		free_cpumask_var(cfd->cpumask_ipi);
 		return -ENOMEM;
 	}
 
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
 	free_cpumask_var(cfd->cpumask);
+	free_cpumask_var(cfd->cpumask_ipi);
 	free_percpu(cfd->csd);
 	return 0;
 }
@@ -434,6 +442,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	if (unlikely(!cpumask_weight(cfd->cpumask)))
 		return;
 
+	cpumask_clear(cfd->cpumask_ipi);
 	for_each_cpu(cpu, cfd->cpumask) {
 		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
 
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
 			csd->flags |= CSD_FLAG_SYNCHRONOUS;
 		csd->func = func;
 		csd->info = info;
-		llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+			cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(cfd->cpumask);
+	arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
 	if (wait) {
 		for_each_cpu(cpu, cfd->cpumask) {
--
2.9.3