[RFC PATCH v2 6/7] lib/persubnode: Introducing a simple per-subnode APIs

From: Waiman Long
Date: Mon Jul 11 2016 - 13:32:41 EST


The percpu APIs are extensively used in the Linux kernel to reduce
cacheline contention and improve performance. For some use cases, the
percpu APIs may be too fine-grained for distributed resources whereas
a per-node based allocation may be too coarse as we can have dozens
of CPUs in a NUMA node in some high-end systems.

This patch introduces a simple set of per-subnode APIs where each of
the distributed resources will be shared by only a handful of CPUs
within a NUMA node. The per-subnode APIs are built on top of the percpu
APIs and hence require the same amount of memory as if the percpu APIs
were used. However, they help to reduce the total number of separate
resources that need to be managed. As a result, they can speed up code
that needs to iterate over all the resources compared with using the
percpu APIs. Cacheline contention, however, will increase slightly as
each resource is shared by more than one CPU. As long as the number of
CPUs in each subnode is small, the performance impact won't be
significant.

In this patch, at most 2 sibling groups can be put into a subnode. For
an x86-64 CPU, at most 4 CPUs will be in a subnode when HT is enabled
and 2 when it is not.

Signed-off-by: Waiman Long <Waiman.Long@xxxxxxx>
---
include/linux/persubnode.h | 80 +++++++++++++++++++++++++++++
init/main.c | 2 +
lib/Makefile | 2 +
lib/persubnode.c | 119 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 203 insertions(+), 0 deletions(-)
create mode 100644 include/linux/persubnode.h
create mode 100644 lib/persubnode.c

diff --git a/include/linux/persubnode.h b/include/linux/persubnode.h
new file mode 100644
index 0000000..b777daa
--- /dev/null
+++ b/include/linux/persubnode.h
@@ -0,0 +1,80 @@
+/*
+ * Per-subnode definitions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@xxxxxxx>
+ */
+#ifndef __LINUX_PERSUBNODE_H
+#define __LINUX_PERSUBNODE_H
+
+#include <linux/percpu.h>
+#include <linux/topology.h>
+
+/*
+ * Per-subnode APIs
+ *
+ * Every name below is a direct alias of the corresponding percpu
+ * primitive, so per-subnode data is allocated, freed and addressed exactly
+ * like percpu data and nr_subnode_ids equals nr_cpu_ids. A subnode ID is
+ * used as the CPU index handed to per_cpu_ptr()/per_cpu(), and
+ * for_each_subnode() walks only the IDs that are set in subnode_mask.
+ */
+#define __persubnode __percpu
+#define nr_subnode_ids nr_cpu_ids
+#define alloc_persubnode(type) alloc_percpu(type)
+#define free_persubnode(var) free_percpu(var)
+#define for_each_subnode(snode) for_each_cpu(snode, subnode_mask)
+#define per_subnode_ptr(ptr, subnode) per_cpu_ptr(ptr, subnode)
+#define per_subnode(var, subnode) per_cpu(var, subnode)
+
+#ifdef CONFIG_SMP
+
+/* Backing storage for subnode_mask; defined in lib/persubnode.c. */
+extern struct cpumask __subnode_mask __read_mostly;
+/* Per-CPU subnode ID; this is the value this_cpu_to_subnode() returns. */
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_subnode_id);
+
+/* Pointer form of the mask, as expected by for_each_cpu() and friends. */
+#define subnode_mask (&__subnode_mask)
+
+/*
+ * Return the subnode ID recorded for the CPU this code is running on.
+ *
+ * this_cpu_read() is used rather than dereferencing this_cpu_ptr(): it is
+ * the accessor intended for reading a percpu scalar and is safe against
+ * preemption between locating the variable and loading it. The returned
+ * ID can still be stale by the time it is used if the caller is
+ * preemptible, so callers that need a stable answer must disable
+ * preemption themselves.
+ */
+static inline int this_cpu_to_subnode(void)
+{
+	return this_cpu_read(cpu_subnode_id);
+}
+
+/*
+ * Evaluate to the address of the percpu slot of @ptr that belongs to the
+ * current CPU's subnode: the subnode ID is looked up with
+ * this_cpu_to_subnode() and then used as the index for per_cpu_ptr().
+ *
+ * For safety, preemption should be disabled before using this_subnode_ptr().
+ */
+#define this_subnode_ptr(ptr) \
+({ \
+ int _snid = this_cpu_to_subnode(); \
+ per_cpu_ptr(ptr, _snid); \
+})
+
+/*
+ * Disable preemption and evaluate to this_subnode_ptr(@ptr). Preemption
+ * stays disabled afterwards; put_subnode_ptr() re-enables it.
+ */
+#define get_subnode_ptr(ptr) \
+({ \
+ preempt_disable(); \
+ this_subnode_ptr(ptr); \
+})
+
+/*
+ * Counterpart of get_subnode_ptr(): re-enable preemption. @ptr is only
+ * evaluated and discarded; it is not otherwise used.
+ */
+#define put_subnode_ptr(ptr) \
+do { \
+ (void)(ptr); \
+ preempt_enable(); \
+} while (0)
+
+/* Defined in lib/persubnode.c; called once from start_kernel(). */
+extern void __init subnode_early_init(void);
+
+#else /* CONFIG_SMP */
+
+/*
+ * !CONFIG_SMP: the subnode accessors map straight onto their percpu
+ * equivalents and subnode_mask is simply cpu_possible_mask.
+ */
+#define subnode_mask cpu_possible_mask
+#define this_subnode_ptr(ptr) this_cpu_ptr(ptr)
+#define get_subnode_ptr(ptr) get_cpu_ptr(ptr)
+#define put_subnode_ptr(ptr) put_cpu_ptr(ptr)
+
+/* lib/persubnode.o is only built for CONFIG_SMP, so this is a no-op stub. */
+static inline void subnode_early_init(void) { }
+
+#endif /* CONFIG_SMP */
+#endif /* __LINUX_PERSUBNODE_H */
diff --git a/init/main.c b/init/main.c
index 4c17fda..28e4425 100644
--- a/init/main.c
+++ b/init/main.c
@@ -81,6 +81,7 @@
#include <linux/integrity.h>
#include <linux/proc_ns.h>
#include <linux/io.h>
+#include <linux/persubnode.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -524,6 +525,7 @@ asmlinkage __visible void __init start_kernel(void)
NULL, set_init_arg);

jump_label_init();
+ subnode_early_init();

/*
* These use large bootmem allocations and must precede
diff --git a/lib/Makefile b/lib/Makefile
index 92e8c38..440152c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -232,3 +232,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
obj-$(CONFIG_UBSAN) += ubsan.o

UBSAN_SANITIZE_ubsan.o := n
+
+obj-$(CONFIG_SMP) += persubnode.o
diff --git a/lib/persubnode.c b/lib/persubnode.c
new file mode 100644
index 0000000..9febe7c
--- /dev/null
+++ b/lib/persubnode.c
@@ -0,0 +1,119 @@
+/*
+ * Per-subnode APIs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@xxxxxxx>
+ */
+
+/*
+ * The per-subnode APIs work on top of the per-cpu APIs. Instead of
+ * having to manage n separate resources on an n-cpu system, users can
+ * now manage only n/m separate resources where m is the number of CPUs
+ * in each subnode. This cuts down the time needed to traverse all the
+ * resources but at the expense of some wasted per-cpu memory as part of
+ * the per-cpu memory will not be used.
+ *
+ * All the CPUs in the same subnode must come from the same NUMA node.
+ * However, there can be more than one subnode in each NUMA node.
+ *
+ * As the per-subnode APIs can be used early in the bootup process while
+ * not all the information needed for initialization may be available
+ * at that time, the initialization is separated into two steps:
+ * 1) an early init that is called directly from start_kernel(); and
+ * 2) a postcore init.
+ *
+ * Before initialization, all the subnode IDs of the CPUs will be zero. So
+ * they will all use the same resource at subnode 0. The early init copies
+ * the cpu_possible_mask to subnode_mask causing resource initialization
+ * to be done for all the per-cpu resources allocated. At the postcore
+ * init, some bits of the subnode_mask will be cleared and the corresponding
+ * cpu to subnode ID mapping will be set accordingly.
+ */
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/persubnode.h>
+#include <linux/printk.h>
+
+/*
+ * Subnode ID of each CPU. No initializer is given, so every CPU reports
+ * subnode 0 until subnode_init() fills in the real mapping.
+ */
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_subnode_id);
+EXPORT_PER_CPU_SYMBOL(cpu_subnode_id);
+
+/*
+ * Storage behind the subnode_mask macro: one bit per subnode ID that
+ * for_each_subnode() will visit.
+ */
+struct cpumask __subnode_mask __read_mostly;
+EXPORT_SYMBOL(__subnode_mask);
+
+/*
+ * Iterates all the CPUs from the given starting CPU
+ *
+ * @from itself is always visited first, without testing whether its bit is
+ * set in @mask. After that @cpu takes the value of each set bit in @mask
+ * above the previous one, until cpumask_next() yields a value that is not
+ * below nr_cpu_ids.
+ */
+#define for_each_cpu_from(from, cpu, mask) \
+ for ((cpu) = (from); (cpu) < nr_cpu_ids; \
+ (cpu) = cpumask_next((cpu), (mask)))
+
+/**
+ * subnode_early_init - seed subnode_mask early in the boot process
+ *
+ * Copies cpu_possible_mask into subnode_mask. Until subnode_init() trims
+ * the mask, for_each_subnode() therefore visits every possible CPU.
+ */
+void __init subnode_early_init(void)
+{
+	cpumask_copy(subnode_mask, cpu_possible_mask);
+}
+
+/*
+ * Initialize the subnodes
+ *
+ * All the sibling CPUs will be in the same subnode. On top of that, we will
+ * put at most 2 sibling groups into the same subnode. The percpu
+ * topology_sibling_cpumask() and topology_core_cpumask() are used for
+ * grouping CPUs into subnodes. The subnode ID is the CPU number of the
+ * first CPU in the subnode.
+ *
+ * subnode_mask is walked in ascending order. A CPU whose bit is still set
+ * when it is reached becomes the head of a new subnode; every other CPU
+ * placed in that subnode has its bit cleared and has the head's number
+ * stored in its cpu_subnode_id.
+ *
+ * NOTE(review): nothing here checks NUMA node membership; this relies on
+ * topology_core_cpumask() not spanning NUMA nodes - confirm for the
+ * architectures of interest.
+ *
+ * Always returns 0.
+ */
+static int __init subnode_init(void)
+{
+	const int groups_per_subnode = 2;
+	int nr_subnodes = 0;
+	int cpu;
+
+	/*
+	 * Some of the bits in the subnode_mask will be cleared as we proceed.
+	 */
+	for_each_cpu(cpu, subnode_mask) {
+		/*
+		 * The topology helpers return pointers to masks we must not
+		 * modify. cpumask_var_t is an array type when
+		 * CONFIG_CPUMASK_OFFSTACK is off and cannot hold them, so
+		 * plain const pointers are used.
+		 */
+		const struct cpumask *core_mask = topology_core_cpumask(cpu);
+		int nr_groups = 0;
+		int ccpu;
+
+		/*
+		 * Put groups_per_subnode sibling groups into each subnode.
+		 */
+		for_each_cpu_from(cpu, ccpu, core_mask) {
+			const struct cpumask *sibling_mask;
+			int scpu;
+
+			/*
+			 * Any CPU above the head that already belongs to a
+			 * subnode (this one or an earlier one) has had its
+			 * bit cleared. Skip it so that it is neither
+			 * counted as a new sibling group nor reassigned.
+			 */
+			if (ccpu != cpu &&
+			    !cpumask_test_cpu(ccpu, subnode_mask))
+				continue;
+
+			sibling_mask = topology_sibling_cpumask(ccpu);
+			for_each_cpu_from(ccpu, scpu, sibling_mask) {
+				/*
+				 * Clear the bits of the higher CPUs.
+				 */
+				if (scpu > cpu)
+					cpumask_clear_cpu(scpu, subnode_mask);
+				per_cpu(cpu_subnode_id, scpu) = cpu;
+			}
+			if (++nr_groups == groups_per_subnode)
+				break;	/* One subnode formed */
+		}
+		nr_subnodes++;
+	}
+	pr_info("Number of subnodes initialized = %d\n", nr_subnodes);
+
+	/*
+	 * CPU 0 must be mapped to subnode 0
+	 */
+	BUG_ON(per_cpu(cpu_subnode_id, 0) != 0);
+	return 0;
+}
+postcore_initcall(subnode_init);
--
1.7.1