Re: [PATCH v9 6/8] mm/demotion: Add pg_data_t member to track node memory tier details

From: Aneesh Kumar K.V
Date: Fri Jul 15 2022 - 03:19:47 EST


Alistair Popple <apopple@xxxxxxxxxx> writes:

> "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes:
>
>> Also update different helpes to use NODE_DATA()->memtier. Since
>> node specific memtier can change based on the reassignment of
>> NUMA node to a different memory tiers, accessing NODE_DATA()->memtier
>> needs to happen under an rcu read lock or memory_tier_lock.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
>> ---
>> include/linux/mmzone.h | 3 ++
>> mm/memory-tiers.c | 64 +++++++++++++++++++++++++++++++-----------
>> 2 files changed, 50 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index aab70355d64f..353812495a70 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -928,6 +928,9 @@ typedef struct pglist_data {
>> /* Per-node vmstats */
>> struct per_cpu_nodestat __percpu *per_cpu_nodestats;
>> atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
>> +#ifdef CONFIG_NUMA
>> + struct memory_tier __rcu *memtier;
>> +#endif
>> } pg_data_t;
>>
>> #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index e951f54ce56c..bab4700bf58d 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -7,6 +7,7 @@
>> #include <linux/moduleparam.h>
>> #include <linux/memory.h>
>> #include <linux/random.h>
>> +#include <linux/rcupdate.h>
>> #include <linux/memory-tiers.h>
>>
>> #include "internal.h"
>> @@ -124,18 +125,23 @@ static struct memory_tier *register_memory_tier(unsigned int tier)
>> static void unregister_memory_tier(struct memory_tier *memtier)
>> {
>> list_del(&memtier->list);
>> - kfree(memtier);
>> + kfree_rcu(memtier);
>> }
>>
>> static struct memory_tier *__node_get_memory_tier(int node)
>> {
>> - struct memory_tier *memtier;
>> + pg_data_t *pgdat;
>>
>> - list_for_each_entry(memtier, &memory_tiers, list) {
>> - if (node_isset(node, memtier->nodelist))
>> - return memtier;
>> - }
>> - return NULL;
>> + pgdat = NODE_DATA(node);
>> + if (!pgdat)
>> + return NULL;
>> + /*
>> + * Since we hold memory_tier_lock, we can avoid
>> + * RCU read locks when accessing the details. No
>> + * parallel updates are possible here.
>> + */
>> + return rcu_dereference_check(pgdat->memtier,
>> + lockdep_is_held(&memory_tier_lock));
>> }
>>
>> static struct memory_tier *__get_memory_tier_from_id(int id)
>> @@ -149,6 +155,33 @@ static struct memory_tier *__get_memory_tier_from_id(int id)
>> return NULL;
>> }
>>
>> +/*
>> + * Called with memory_tier_lock. Hence the device references cannot
>> + * be dropped during this function.
>> + */
>> +static void memtier_node_set(int node, struct memory_tier *memtier)
>> +{
>> + pg_data_t *pgdat;
>> + struct memory_tier *current_memtier;
>> +
>> + pgdat = NODE_DATA(node);
>> + if (!pgdat)
>> + return;
>> + /*
>> + * Make sure we mark the memtier NULL before we assign the new memory tier
>> + * to the NUMA node. This make sure that anybody looking at NODE_DATA
>> + * finds a NULL memtier or the one which is still valid.
>> + */
>> + current_memtier = rcu_dereference_check(pgdat->memtier,
>> + lockdep_is_held(&memory_tier_lock));
>> + rcu_assign_pointer(pgdat->memtier, NULL);
>> + if (current_memtier)
>> + node_clear(node, current_memtier->nodelist);
>
> It seems odd to me that you would update the current memtier prior to
> the synchronize_rcu(). I suppose it's really memory_tier_lock that
> protects the details like ->nodelist, but is there any reason not do the
> update after anyway?

The synchronize_rcu ensures that the lockless read of pgdat->memtier
either see value NULL or a stable memtier which got current numa node in
its nodelist. IIUC what you are suggesting is we should move the
node_clear after synchronize_rcu?. I am also wondering whether I need
a smp_wmb()?

pgdat->memtier = NULL;
synchronize_rcu
remove node from memtier;
set node in new memtier
smp_wmb();
pgdat->memtier = new memtier;


>
>> + synchronize_rcu();
>> + node_set(node, memtier->nodelist);
>> + rcu_assign_pointer(pgdat->memtier, memtier);
>> +}
>> +
>> static int __node_create_and_set_memory_tier(int node, int tier)
>> {
>> int ret = 0;
>> @@ -162,7 +195,7 @@ static int __node_create_and_set_memory_tier(int node, int tier)
>> goto out;
>> }
>> }
>> - node_set(node, memtier->nodelist);
>> + memtier_node_set(node, memtier);
>> out:
>> return ret;
>> }
>> @@ -184,14 +217,7 @@ int node_create_and_set_memory_tier(int node, int tier)
>> if (current_tier->id == tier)
>> goto out;
>>
>> - node_clear(node, current_tier->nodelist);
>> -
>> ret = __node_create_and_set_memory_tier(node, tier);
>> - if (ret) {
>> - /* reset it back to older tier */
>> - node_set(node, current_tier->nodelist);
>> - goto out;
>> - }
>> if (nodes_empty(current_tier->nodelist))
>> unregister_memory_tier(current_tier);
>>
>> @@ -213,7 +239,7 @@ static int __node_set_memory_tier(int node, int tier)
>> ret = -EINVAL;
>> goto out;
>> }
>> - node_set(node, memtier->nodelist);
>> + memtier_node_set(node, memtier);
>> out:
>> return ret;
>> }
>> @@ -428,6 +454,7 @@ static void __init migrate_on_reclaim_init(void)
>>
>> static int __init memory_tier_init(void)
>> {
>> + int node;
>> struct memory_tier *memtier;
>>
>> /*
>> @@ -444,7 +471,10 @@ static int __init memory_tier_init(void)
>> __func__, PTR_ERR(memtier));
>>
>> /* CPU only nodes are not part of memory tiers. */
>> - memtier->nodelist = node_states[N_MEMORY];
>> + for_each_node_state(node, N_MEMORY) {
>> + rcu_assign_pointer(NODE_DATA(node)->memtier, memtier);
>> + node_set(node, memtier->nodelist);
>
> Similar comment here - the order seems opposite to what I'd expect.
> Shouldn't memtier->nodelist be fully initialised prior to making it
> visible with rcu_assign_pointer()?

Will fix this. This is early during boot. So the ordering won't impact
correctness. Hence i can skip the smp_wmb()?

>
>> + }
>> mutex_unlock(&memory_tier_lock);
>>
>> migrate_on_reclaim_init();