Re: [RFC PATCH] riscv: Add numa support for riscv64 platform

From: Greentime Hu
Date: Tue Dec 24 2019 - 06:33:30 EST


Hi Anup,

On Tue, Dec 24, 2019 at 6:28 PM Anup Patel <anup@xxxxxxxxxxxxxx> wrote:
>
> +Christoph, +Mike, +Ganpatro
>
> On Tue, Dec 24, 2019 at 2:25 PM Greentime Hu <greentime.hu@xxxxxxxxxx> wrote:
> >
> > This implementation is based on arm64 porting. It is tested with
> > qemu-system-riscv64, unleashed board and OmniXtend FPGA platform.
> >
> > There will be 2 nodes in /sys/devices/system/node if it is described in dts and
> > CONFIG_NUMA is enabled. We can use numastat/numactl/numademo to see its status.
>
> This patch can be broken down into separate (more granular) patches.
> For example:
> 1. asm/pgtable.h change can be separate patch
> 2. Movement of unflatten_device_tree() from setup_arch() to paging_init()
> 3. changes in kernel/smpboot.c can also be separate patch
>
> Also, since this is ported from arm64 implementation, I strongly
> suggest having a generic NUMA support which can be shared
> between arm64 and riscv. I think Ganpat (CC'ed) here could be
> the best person to maintain the generic NUMA support since he
> originally added it for arm64.
>
> >
> > Signed-off-by: Greentime Hu <greentime.hu@xxxxxxxxxx>
> > ---
> > arch/riscv/Kconfig | 30 ++-
> > arch/riscv/include/asm/mmzone.h | 13 ++
> > arch/riscv/include/asm/numa.h | 46 ++++
> > arch/riscv/include/asm/pci.h | 10 +
> > arch/riscv/include/asm/pgtable.h | 20 ++
> > arch/riscv/kernel/setup.c | 26 ++-
> > arch/riscv/kernel/smpboot.c | 20 +-
> > arch/riscv/mm/Makefile | 1 +
> > arch/riscv/mm/init.c | 3 +
> > arch/riscv/mm/numa.c | 372 +++++++++++++++++++++++++++++++
> > 10 files changed, 536 insertions(+), 5 deletions(-)
> > create mode 100644 arch/riscv/include/asm/mmzone.h
> > create mode 100644 arch/riscv/include/asm/numa.h
> > create mode 100644 arch/riscv/mm/numa.c
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index bc7598fc5f00..53ae1816df50 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -22,7 +22,6 @@ config RISCV
> > select CLONE_BACKWARDS
> > select COMMON_CLK
> > select GENERIC_CLOCKEVENTS
> > - select GENERIC_CPU_DEVICES
> > select GENERIC_IRQ_SHOW
> > select GENERIC_PCI_IOMAP
> > select GENERIC_SCHED_CLOCK
> > @@ -234,6 +233,35 @@ config TUNE_GENERIC
> > bool "generic"
> >
> > endchoice
> > +# Common NUMA Features
> > +config NUMA
> > + bool "Numa Memory Allocation and Scheduler Support"
> > + select OF_NUMA
> > + select ARCH_SUPPORTS_NUMA_BALANCING
> > + depends on SPARSEMEM
> > + help
> > + Enable NUMA (Non Uniform Memory Access) support.
> > +
> > + The kernel will try to allocate memory used by a CPU on the
> > + local memory of the CPU and add some more
> > + NUMA awareness to the kernel.
> > +
> > +config NODES_SHIFT
> > + int "Maximum NUMA Nodes (as a power of 2)"
> > + range 1 10
> > + default "2"
> > + depends on NEED_MULTIPLE_NODES
> > + help
> > + Specify the maximum number of NUMA Nodes available on the target
> > + system. Increases memory reserved to accommodate various tables.
> > +
> > +config USE_PERCPU_NUMA_NODE_ID
> > + def_bool y
> > + depends on NUMA
> > +
> > +config NEED_PER_CPU_EMBED_FIRST_CHUNK
> > + def_bool y
> > + depends on NUMA
> >
> > config RISCV_ISA_C
> > bool "Emit compressed instructions when building Linux"
> > diff --git a/arch/riscv/include/asm/mmzone.h b/arch/riscv/include/asm/mmzone.h
> > new file mode 100644
> > index 000000000000..fa17e01d9ab2
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/mmzone.h
> > @@ -0,0 +1,13 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __ASM_MMZONE_H
> > +#define __ASM_MMZONE_H
> > +
> > +#ifdef CONFIG_NUMA
> > +
> > +#include <asm/numa.h>
> > +
> > +extern struct pglist_data *node_data[];
> > +#define NODE_DATA(nid) (node_data[(nid)])
> > +
> > +#endif /* CONFIG_NUMA */
> > +#endif /* __ASM_MMZONE_H */
> > diff --git a/arch/riscv/include/asm/numa.h b/arch/riscv/include/asm/numa.h
> > new file mode 100644
> > index 000000000000..10a4513d078b
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/numa.h
> > @@ -0,0 +1,46 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __ASM_NUMA_H
> > +#define __ASM_NUMA_H
> > +
> > +#include <asm/topology.h>
> > +
> > +#ifdef CONFIG_NUMA
> > +
> > +extern nodemask_t numa_nodes_parsed __initdata;
> > +
> > +extern bool numa_off;
> > +
> > +/* Mappings between node number and cpus on that node. */
> > +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
> > +void numa_clear_node(unsigned int cpu);
> > +
> > +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> > +const struct cpumask *cpumask_of_node(int node);
> > +#else
> > +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
> > +static inline const struct cpumask *cpumask_of_node(int node)
> > +{
> > + return node_to_cpumask_map[node];
> > +}
> > +#endif
> > +
> > +void __init riscv_numa_init(void);
> > +int __init numa_add_memblk(int nodeid, u64 start, u64 end);
> > +void __init numa_set_distance(int from, int to, int distance);
> > +void __init numa_free_distance(void);
> > +void __init early_map_cpu_to_node(unsigned int cpu, int nid);
> > +void numa_store_cpu_info(unsigned int cpu);
> > +void numa_add_cpu(unsigned int cpu);
> > +void numa_remove_cpu(unsigned int cpu);
> > +
> > +#else /* CONFIG_NUMA */
> > +
> > +static inline void numa_store_cpu_info(unsigned int cpu) { }
> > +static inline void numa_add_cpu(unsigned int cpu) { }
> > +static inline void numa_remove_cpu(unsigned int cpu) { }
> > +static inline void riscv_numa_init(void) { }
> > +static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { }
> > +
> > +#endif /* CONFIG_NUMA */
> > +
> > +#endif /* __ASM_NUMA_H */
> > diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
> > index 5ac8daa1cc36..781aa8b6dcd3 100644
> > --- a/arch/riscv/include/asm/pci.h
> > +++ b/arch/riscv/include/asm/pci.h
> > @@ -32,6 +32,16 @@ static inline int pci_proc_domain(struct pci_bus *bus)
> > /* always show the domain in /proc */
> > return 1;
> > }
> > +
> > +#ifdef CONFIG_NUMA
> > +int pcibus_to_node(struct pci_bus *bus);
> > +#ifndef cpumask_of_pcibus
> > +#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \
> > + cpu_all_mask : \
> > + cpumask_of_node(pcibus_to_node(bus)))
> > +#endif
> > +#endif /* CONFIG_NUMA */
> > +
> > #endif /* CONFIG_PCI */
> >
> > #endif /* __ASM_PCI_H */
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index d3221017194d..04b7c38870f7 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -175,6 +175,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
> > return (unsigned long)pfn_to_virt(pmd_val(pmd) >> _PAGE_PFN_SHIFT);
> > }
> >
> > +static inline pte_t pmd_pte(pmd_t pmd)
> > +{
> > + return __pte(pmd_val(pmd));
> > +}
> > +
> > /* Yields the page frame number (PFN) of a page table entry */
> > static inline unsigned long pte_pfn(pte_t pte)
> > {
> > @@ -288,6 +293,21 @@ static inline pte_t pte_mkhuge(pte_t pte)
> > return pte;
> > }
> >
> > +#ifdef CONFIG_NUMA_BALANCING
> > +/*
> > + * See the comment in include/asm-generic/pgtable.h
> > + */
> > +static inline int pte_protnone(pte_t pte)
> > +{
> > + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) == _PAGE_PROT_NONE;
> > +}
> > +
> > +static inline int pmd_protnone(pmd_t pmd)
> > +{
> > + return pte_protnone(pmd_pte(pmd));
> > +}
> > +#endif
> > +
> > /* Modify page protection bits */
> > static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
> > {
> > diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
> > index 845ae0e12115..f6f2354036a0 100644
> > --- a/arch/riscv/kernel/setup.c
> > +++ b/arch/riscv/kernel/setup.c
> > @@ -53,6 +53,31 @@ void __init parse_dtb(void)
> > #endif
> > }
> >
> > +static DEFINE_PER_CPU(struct cpu, cpu_devices);
> > +
> > +static int __init topology_init(void)
> > +{
> > + int i, ret;
> > +
> > +#ifdef CONFIG_NEED_MULTIPLE_NODES
> > + for_each_online_node(i)
> > + register_one_node(i);
> > +#endif
> > +
> > + for_each_possible_cpu(i) {
> > + struct cpu *cpu = &per_cpu(cpu_devices, i);
> > +
> > + cpu->hotpluggable = 1;
>
> Strange !!!
>
> We cannot claim CPUs are hotpluggable until Atish's
> Linux SBI v0.2 HSM patches are available.

Thanks. It should be set to 0 for now.
cpu->hotpluggable = 0;

> If required then Linux RISC-V NUMA patches should
> be based upon Atish's Linux SBI v0.2 HSM support.
>
> > + ret = register_cpu(cpu, i);
> > + if (unlikely(ret))
> > + pr_warn("Warning: %s: register_cpu %d failed (%d)\n",
> > + __func__, i, ret);
> > + }
> > +
> > + return 0;
> > +}
> > +subsys_initcall(topology_init);
> > +
> > void __init setup_arch(char **cmdline_p)
> > {
> > init_mm.start_code = (unsigned long) _stext;
> > @@ -66,7 +91,6 @@ void __init setup_arch(char **cmdline_p)
> >
> > setup_bootmem();
> > paging_init();
> > - unflatten_device_tree();
>
> Movement of unflatten_device_tree() call from here to
> paging_init() needs explanation.
>

It is moved to paging_init() is because that of_numa_init() will use
of_numa_parse_cpu_nodes() and of_numa_parse_memory_nodes().
We have to unflatten_device_tree() first then we can call
riscv_numa_init(), but riscv_numa_init() shall be called before
memblocks_present() because the node information will be used in
memblocks_present().
So the order will be like this.

unflatten_device_tree(); //To get dt information for memory and nodes
riscv_numa_init(); //It can use of_numa_parse_* and set the nodes information
memblocks_present(); //The node information can be used now