Re: [PATCH -V3 3/4] acpi, hmat: calculate abstract distance with HMAT

From: Dave Jiang
Date: Thu Sep 14 2023 - 13:31:29 EST




On 9/12/23 01:21, Huang Ying wrote:
> A memory tiering abstract distance calculation algorithm based on ACPI
> HMAT is implemented. The basic idea is as follows.
>
> The performance attributes of system default DRAM nodes are recorded
> as the baseline, whose abstract distance is MEMTIER_ADISTANCE_DRAM.
> Then, the ratio of the abstract distance of a memory node (target) to
> MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance
> attributes of the node to those of the default DRAM nodes.
>
> The functions to record the read/write latency/bandwidth of the
> default DRAM nodes and calculate abstract distance according to
> read/write latency/bandwidth ratio will be used by CXL CDAT (Coherent
> Device Attribute Table) and other memory device drivers. So, they are
> put in memory-tiers.c.
>
> Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
> Tested-by: Bharata B Rao <bharata@xxxxxxx>
> Cc: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
> Cc: Wei Xu <weixugc@xxxxxxxxxx>
> Cc: Alistair Popple <apopple@xxxxxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
> Cc: Davidlohr Bueso <dave@xxxxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxxxx>
> Cc: Yang Shi <shy828301@xxxxxxxxx>
> Cc: Dave Jiang <dave.jiang@xxxxxxxxx>
> Cc: Rafael J Wysocki <rafael.j.wysocki@xxxxxxxxx>

Reviewed-by: Dave Jiang <dave.jiang@xxxxxxxxx>

> ---
> drivers/acpi/numa/hmat.c | 62 ++++++++++++++++++++-
> include/linux/memory-tiers.h | 18 ++++++
> mm/memory-tiers.c | 103 ++++++++++++++++++++++++++++++++++-
> 3 files changed, 181 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
> index 2dee0098f1a9..64c0810d324b 100644
> --- a/drivers/acpi/numa/hmat.c
> +++ b/drivers/acpi/numa/hmat.c
> @@ -24,6 +24,7 @@
> #include <linux/node.h>
> #include <linux/sysfs.h>
> #include <linux/dax.h>
> +#include <linux/memory-tiers.h>
>
> static u8 hmat_revision;
> static int hmat_disable __initdata;
> @@ -759,6 +760,61 @@ static int hmat_callback(struct notifier_block *self,
> return NOTIFY_OK;
> }
>
> +static int hmat_set_default_dram_perf(void)
> +{
> + int rc;
> + int nid, pxm;
> + struct memory_target *target;
> + struct node_hmem_attrs *attrs;
> +
> + if (!default_dram_type)
> + return -EIO;
> +
> + for_each_node_mask(nid, default_dram_type->nodes) {
> + pxm = node_to_pxm(nid);
> + target = find_mem_target(pxm);
> + if (!target)
> + continue;
> + attrs = &target->hmem_attrs[1];
> + rc = mt_set_default_dram_perf(nid, attrs, "ACPI HMAT");
> + if (rc)
> + return rc;
> + }
> +
> + return 0;
> +}
> +
> +static int hmat_calculate_adistance(struct notifier_block *self,
> + unsigned long nid, void *data)
> +{
> + static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
> + struct memory_target *target;
> + struct node_hmem_attrs *perf;
> + int *adist = data;
> + int pxm;
> +
> + pxm = node_to_pxm(nid);
> + target = find_mem_target(pxm);
> + if (!target)
> + return NOTIFY_OK;
> +
> + mutex_lock(&target_lock);
> + hmat_update_target_attrs(target, p_nodes, 1);
> + mutex_unlock(&target_lock);
> +
> + perf = &target->hmem_attrs[1];
> +
> + if (mt_perf_to_adistance(perf, adist))
> + return NOTIFY_OK;
> +
> + return NOTIFY_STOP;
> +}
> +
> +static struct notifier_block hmat_adist_nb __meminitdata = {
> + .notifier_call = hmat_calculate_adistance,
> + .priority = 100,
> +};
> +
> static __init void hmat_free_structures(void)
> {
> struct memory_target *target, *tnext;
> @@ -801,6 +857,7 @@ static __init int hmat_init(void)
> struct acpi_table_header *tbl;
> enum acpi_hmat_type i;
> acpi_status status;
> + int usage;
>
> if (srat_disabled() || hmat_disable)
> return 0;
> @@ -841,7 +898,10 @@ static __init int hmat_init(void)
> hmat_register_targets();
>
> /* Keep the table and structures if the notifier may use them */
> - if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
> + usage = !hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI);
> + if (!hmat_set_default_dram_perf())
> + usage += !register_mt_adistance_algorithm(&hmat_adist_nb);
> + if (usage)
> return 0;
> out_put:
> hmat_free_structures();
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index c8382220cced..9d27ca3b143e 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -31,8 +31,11 @@ struct memory_dev_type {
> struct kref kref;
> };
>
> +struct node_hmem_attrs;
> +
> #ifdef CONFIG_NUMA
> extern bool numa_demotion_enabled;
> +extern struct memory_dev_type *default_dram_type;
> struct memory_dev_type *alloc_memory_type(int adistance);
> void put_memory_type(struct memory_dev_type *memtype);
> void init_node_memory_type(int node, struct memory_dev_type *default_type);
> @@ -40,6 +43,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype);
> int register_mt_adistance_algorithm(struct notifier_block *nb);
> int unregister_mt_adistance_algorithm(struct notifier_block *nb);
> int mt_calc_adistance(int node, int *adist);
> +int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
> + const char *source);
> +int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
> #ifdef CONFIG_MIGRATION
> int next_demotion_node(int node);
> void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> @@ -64,6 +70,7 @@ static inline bool node_is_toptier(int node)
> #else
>
> #define numa_demotion_enabled false
> +#define default_dram_type NULL
> /*
> * CONFIG_NUMA implementation returns non NULL error.
> */
> @@ -116,5 +123,16 @@ static inline int mt_calc_adistance(int node, int *adist)
> {
> return NOTIFY_DONE;
> }
> +
> +static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
> + const char *source)
> +{
> + return -EIO;
> +}
> +
> +static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
> +{
> + return -EIO;
> +}
> #endif /* CONFIG_NUMA */
> #endif /* _LINUX_MEMORY_TIERS_H */
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 76c0ad47a5ad..fa1a8b418f9a 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -37,7 +37,7 @@ struct node_memory_type_map {
> static DEFINE_MUTEX(memory_tier_lock);
> static LIST_HEAD(memory_tiers);
> static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
> -static struct memory_dev_type *default_dram_type;
> +struct memory_dev_type *default_dram_type;
>
> static struct bus_type memory_tier_subsys = {
> .name = "memory_tiering",
> @@ -108,6 +108,11 @@ static struct demotion_nodes *node_demotion __read_mostly;
>
> static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
>
> +static bool default_dram_perf_error;
> +static struct node_hmem_attrs default_dram_perf;
> +static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> +static const char *default_dram_perf_ref_source;
> +
> static inline struct memory_tier *to_memory_tier(struct device *device)
> {
> return container_of(device, struct memory_tier, dev);
> @@ -595,6 +600,102 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
> }
> EXPORT_SYMBOL_GPL(clear_node_memory_type);
>
> +static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
> +{
> + pr_info(
> +"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
> + prefix, attrs->read_latency, attrs->write_latency,
> + attrs->read_bandwidth, attrs->write_bandwidth);
> +}
> +
> +int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
> + const char *source)
> +{
> + int rc = 0;
> +
> + mutex_lock(&memory_tier_lock);
> + if (default_dram_perf_error) {
> + rc = -EIO;
> + goto out;
> + }
> +
> + if (perf->read_latency + perf->write_latency == 0 ||
> + perf->read_bandwidth + perf->write_bandwidth == 0) {
> + rc = -EINVAL;
> + goto out;
> + }
> +
> + if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
> + default_dram_perf = *perf;
> + default_dram_perf_ref_nid = nid;
> + default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
> + goto out;
> + }
> +
> + /*
> + * The performance of all default DRAM nodes is expected to be
> + * same (that is, the variation is less than 10%). And it
> + * will be used as base to calculate the abstract distance of
> + * other memory nodes.
> + */
> + if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
> + default_dram_perf.read_latency ||
> + abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
> + default_dram_perf.write_latency ||
> + abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
> + default_dram_perf.read_bandwidth ||
> + abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
> + default_dram_perf.write_bandwidth) {
> + pr_info(
> +"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
> +"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
> + pr_info(" performance of reference DRAM node %d:\n",
> + default_dram_perf_ref_nid);
> + dump_hmem_attrs(&default_dram_perf, " ");
> + pr_info(" performance of DRAM node %d:\n", nid);
> + dump_hmem_attrs(perf, " ");
> + pr_info(
> +" disable default DRAM node performance based abstract distance algorithm.\n");
> + default_dram_perf_error = true;
> + rc = -EINVAL;
> + }
> +
> +out:
> + mutex_unlock(&memory_tier_lock);
> + return rc;
> +}
> +
> +int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
> +{
> + if (default_dram_perf_error)
> + return -EIO;
> +
> + if (default_dram_perf_ref_nid == NUMA_NO_NODE)
> + return -ENOENT;
> +
> + if (perf->read_latency + perf->write_latency == 0 ||
> + perf->read_bandwidth + perf->write_bandwidth == 0)
> + return -EINVAL;
> +
> + mutex_lock(&memory_tier_lock);
> + /*
> + * The abstract distance of a memory node is in direct proportion to
> + * its memory latency (read + write) and inversely proportional to its
> + * memory bandwidth (read + write). The abstract distance, memory
> + * latency, and memory bandwidth of the default DRAM nodes are used as
> + * the base.
> + */
> + *adist = MEMTIER_ADISTANCE_DRAM *
> + (perf->read_latency + perf->write_latency) /
> + (default_dram_perf.read_latency + default_dram_perf.write_latency) *
> + (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
> + (perf->read_bandwidth + perf->write_bandwidth);
> + mutex_unlock(&memory_tier_lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
> +
> /**
> * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
> * @nb: The notifier block which describe the algorithm