[PATCH v2 1/1] memory tier: consolidate the initialization of memory tiers
From: Ho-Ren (Jack) Chuang
Date: Fri Jun 28 2024 - 02:10:06 EST
If we simply move the set_node_memory_tier() call from memory_tier_init()
to late_initcall(), HMAT will not register the
mt_adistance_algorithm callback function. This is because
set_node_memory_tier() is then no longer performed during the memory
tiering initialization phase, so the correct default_dram information
is missing.
Therefore, we introduce a nodemask to pass along the information about
the default DRAM nodes. We chose not to reuse default_dram_type->nodes
for this because it is not clean enough. Instead, we use a __initdata
variable, which is released once initialization is complete. It records
the nodes that have both CPUs and memory, for HMAT to iterate through.
Besides, since default_dram_type may be checked/used during the
initialization process of HMAT and drivers, it is better to keep the
allocation of default_dram_type in memory_tier_init().
Signed-off-by: Ho-Ren (Jack) Chuang <horenchuang@xxxxxxxxxxxxx>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
---
drivers/acpi/numa/hmat.c | 5 +--
include/linux/memory-tiers.h | 2 ++
mm/memory-tiers.c | 59 +++++++++++++++---------------------
3 files changed, 28 insertions(+), 38 deletions(-)
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 2c8ccc91ebe6..a2f9e7a4b479 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -940,10 +940,7 @@ static int hmat_set_default_dram_perf(void)
struct memory_target *target;
struct access_coordinate *attrs;
- if (!default_dram_type)
- return -EIO;
-
- for_each_node_mask(nid, default_dram_type->nodes) {
+ for_each_node_mask(nid, default_dram_nodes) {
pxm = node_to_pxm(nid);
target = find_mem_target(pxm);
if (!target)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 0d70788558f4..fa61ad9c4d75 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -38,6 +38,7 @@ struct access_coordinate;
#ifdef CONFIG_NUMA
extern bool numa_demotion_enabled;
extern struct memory_dev_type *default_dram_type;
+extern nodemask_t default_dram_nodes __initdata;
struct memory_dev_type *alloc_memory_type(int adistance);
void put_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
@@ -76,6 +77,7 @@ static inline bool node_is_toptier(int node)
#define numa_demotion_enabled false
#define default_dram_type NULL
+#define default_dram_nodes NODE_MASK_NONE
/*
* CONFIG_NUMA implementation returns non NULL error.
*/
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 6632102bd5c9..a19a90c3ad36 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -43,6 +43,7 @@ static LIST_HEAD(memory_tiers);
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
+nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;
static const struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
@@ -671,28 +672,38 @@ EXPORT_SYMBOL_GPL(mt_put_memory_types);
/*
* This is invoked via `late_initcall()` to initialize memory tiers for
- * CPU-less memory nodes after driver initialization, which is
- * expected to provide `adistance` algorithms.
+ * memory nodes, both with and without CPUs. After the initialization of
+ * firmware and devices, adistance algorithms are expected to be provided.
*/
static int __init memory_tier_late_init(void)
{
int nid;
+ struct memory_tier *memtier;
+ get_online_mems();
guard(mutex)(&memory_tier_lock);
+ /*
+ * Look at all the existing and uninitialized N_MEMORY nodes and
+ * add them to default memory tier or to a tier if we already have
+ * memory types assigned.
+ */
for_each_node_state(nid, N_MEMORY) {
/*
- * Some device drivers may have initialized memory tiers
- * between `memory_tier_init()` and `memory_tier_late_init()`,
- * potentially bringing online memory nodes and
- * configuring memory tiers. Exclude them here.
+ * Some device drivers may have initialized
+ * memory tiers, potentially bringing memory nodes
+ * online and configuring memory tiers.
+ * Exclude them here.
*/
if (node_memory_types[nid].memtype)
continue;
- set_node_memory_tier(nid);
+ memtier = set_node_memory_tier(nid);
+ if (IS_ERR(memtier))
+ /* Continue with memtiers we are able to setup. */
+ break;
}
-
establish_demotion_targets();
+ put_online_mems();
return 0;
}
@@ -875,8 +886,7 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
static int __init memory_tier_init(void)
{
- int ret, node;
- struct memory_tier *memtier;
+ int ret;
ret = subsys_virtual_register(&memory_tier_subsys, NULL);
if (ret)
@@ -887,7 +897,8 @@ static int __init memory_tier_init(void)
GFP_KERNEL);
WARN_ON(!node_demotion);
#endif
- mutex_lock(&memory_tier_lock);
+
+ guard(mutex)(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
* than default DRAM tier.
@@ -897,29 +908,9 @@ static int __init memory_tier_init(void)
if (IS_ERR(default_dram_type))
panic("%s() failed to allocate default DRAM tier\n", __func__);
- /*
- * Look at all the existing N_MEMORY nodes and add them to
- * default memory tier or to a tier if we already have memory
- * types assigned.
- */
- for_each_node_state(node, N_MEMORY) {
- if (!node_state(node, N_CPU))
- /*
- * Defer memory tier initialization on
- * CPUless numa nodes. These will be initialized
- * after firmware and devices are initialized.
- */
- continue;
-
- memtier = set_node_memory_tier(node);
- if (IS_ERR(memtier))
- /*
- * Continue with memtiers we are able to setup
- */
- break;
- }
- establish_demotion_targets();
- mutex_unlock(&memory_tier_lock);
+ /* Record nodes with memory and CPU to set default DRAM performance. */
+ nodes_and(default_dram_nodes, node_states[N_MEMORY],
+ node_states[N_CPU]);
hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
--
Ho-Ren (Jack) Chuang