[RFC v2 5/5] hmem: add performance attributes

From: Ross Zwisler
Date: Thu Jul 06 2017 - 17:52:59 EST


Add performance information found in the HMAT to the sysfs representation.
This information lives as an attribute group named "local_init" in the
memory target:

# tree mem_tgt2/
mem_tgt2/
├── firmware_id
├── is_cached
├── is_enabled
├── is_isolated
├── local_init
│   ├── mem_init0 -> ../../mem_init0
│   ├── mem_init1 -> ../../mem_init1
│   ├── mem_tgt2 -> ../../mem_tgt2
│   ├── read_bw_MBps
│   ├── read_lat_nsec
│   ├── write_bw_MBps
│   └── write_lat_nsec
├── node2 -> ../../node/node2
├── phys_addr_base
├── phys_length_bytes
├── power
│   ├── async
│   ...
├── subsystem -> ../../../../bus/hmem
└── uevent

This attribute group surfaces latency and bandwidth performance for a memory
target and its local initiators. For example:

# grep . mem_tgt2/local_init/* 2>/dev/null
mem_tgt2/local_init/read_bw_MBps:30720
mem_tgt2/local_init/read_lat_nsec:100
mem_tgt2/local_init/write_bw_MBps:30720
mem_tgt2/local_init/write_lat_nsec:100

The local initiators have a symlink to the performance information which lives
in the target's attribute group:

# ls -l mem_init0/mem_tgt2
lrwxrwxrwx. 1 root root 0 Jul 5 14:38 mem_init0/mem_tgt2 ->
../mem_tgt2/local_init

We create performance attribute groups only for local (initiator,target)
pairings, where the first local initiator for a given target is defined
by the "Processor Proximity Domain" field in the HMAT's Memory Subsystem
Address Range Structure table. After we have one local initiator we scan
the performance data to link to any other "local" initiators with the same
local performance to a given memory target.

A given target only has one set of local performance values, so each target
will have at most one "local_init" attribute group, though that group can
contain links to multiple initiators that all have local performance. A
given memory initiator may have multiple local memory targets, so multiple
"mem_tgtX" links may exist for a given initiator.

If a given memory target is cached we give performance numbers only for the
media itself, and rely on the "is_cached" attribute to represent the
fact that there is a caching layer.

We only expose a subset of the performance information presented in the
HMAT via sysfs. This is a compromise, driven by the fact that those usages
will be the highest performing and because representing all possible paths
could cause an unmanageable explosion of sysfs entries.

If we dump everything from the HMAT into sysfs we end up with
O(num_targets * num_initiators * num_caching_levels) attributes. Each of
these attributes only takes up 2 bytes in a System Locality Latency and
Bandwidth Information Structure, but if we have to create a directory entry
for each it becomes much more expensive.

For example, very large systems today can have on the order of thousands of
NUMA nodes. Say we have a system which used to have 1,000 NUMA nodes that
each had both a CPU and local memory. The HMAT allows us to separate the
CPUs and memory into separate NUMA nodes, so we can end up with 1,000 CPU
initiator NUMA nodes and 1,000 memory target NUMA nodes. If we represented
the performance information for each possible CPU/memory pair in sysfs we
would end up with 1,000,000 attribute groups.

This is a lot to pass in a set of packed data tables, but I think we'll
break sysfs if we try to create millions of attributes, regardless of how
we nest them in a directory hierarchy.

By only representing performance information for local (initiator,target)
pairings, we reduce the number of sysfs entries to O(num_targets).

Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
---
drivers/acpi/hmem/Makefile | 2 +-
drivers/acpi/hmem/core.c | 268 +++++++++++++++++++++++++++++++++++-
drivers/acpi/hmem/hmem.h | 17 +++
drivers/acpi/hmem/perf_attributes.c | 56 ++++++++
4 files changed, 341 insertions(+), 2 deletions(-)
create mode 100644 drivers/acpi/hmem/perf_attributes.c

diff --git a/drivers/acpi/hmem/Makefile b/drivers/acpi/hmem/Makefile
index d2aa546..44e8304 100644
--- a/drivers/acpi/hmem/Makefile
+++ b/drivers/acpi/hmem/Makefile
@@ -1,2 +1,2 @@
obj-$(CONFIG_ACPI_HMEM) := hmem.o
-hmem-y := core.o initiator.o target.o
+hmem-y := core.o initiator.o target.o perf_attributes.o
diff --git a/drivers/acpi/hmem/core.c b/drivers/acpi/hmem/core.c
index f7638db..8f3ab35 100644
--- a/drivers/acpi/hmem/core.c
+++ b/drivers/acpi/hmem/core.c
@@ -23,11 +23,230 @@
#include <linux/slab.h>
#include "hmem.h"

+#define NO_VALUE -1 /* sentinel returned by the lookups in this file when no usable value is found */
+#define LOCAL_INIT "local_init" /* name of the sysfs attribute group that holds the performance attributes */
+
static LIST_HEAD(target_list);
static LIST_HEAD(initiator_list);
+LIST_HEAD(locality_list); /* struct memory_locality entries, appended by hmat_parse_locality() */

static bool bad_hmem;

+/*
+ * Look up the performance value for an initiator/target pair in a single
+ * HMAT locality structure.
+ *
+ * init_pxm and tgt_pxm are matched against the initiator and target arrays
+ * that follow the fixed struct acpi_hmat_locality fields; the two matching
+ * indices select one u16 from the entry matrix stored after those arrays.
+ *
+ * Returns (entry * hmat_loc->entry_base_unit) / 10, or NO_VALUE when either
+ * pxm is not listed or the entry is below 10 or equal to 0xFFFF.
+ */
+static int get_performance_data(u32 init_pxm, u32 tgt_pxm,
+ struct acpi_hmat_locality *hmat_loc)
+{
+ int num_init = hmat_loc->number_of_initiator_Pds;
+ int num_tgt = hmat_loc->number_of_target_Pds;
+ int init_idx = NO_VALUE;
+ int tgt_idx = NO_VALUE;
+ u32 *initiators, *targets;
+ u16 *entries, val;
+ int i;
+
+ /*
+ * the initiator array is after the struct acpi_hmat_locality fields,
+ * followed by the target array and then the u16 entry matrix
+ */
+ initiators = (u32 *)(hmat_loc + 1);
+ targets = &initiators[num_init];
+ entries = (u16 *)&targets[num_tgt];
+
+ for (i = 0; i < num_init; i++) {
+ if (initiators[i] == init_pxm) {
+ init_idx = i;
+ break;
+ }
+ }
+
+ if (init_idx == NO_VALUE)
+ return NO_VALUE;
+
+ for (i = 0; i < num_tgt; i++) {
+ if (targets[i] == tgt_pxm) {
+ tgt_idx = i;
+ break;
+ }
+ }
+
+ if (tgt_idx == NO_VALUE)
+ return NO_VALUE;
+
+ val = entries[init_idx*num_tgt + tgt_idx]; /* one row of num_tgt entries per initiator */
+ if (val < 10 || val == 0xFFFF) /* NOTE(review): treated as "no data"; confirm against the HMAT entry encoding */
+ return NO_VALUE;
+
+ return (val * hmat_loc->entry_base_unit) / 10; /* NOTE(review): result is narrowed to int; confirm entry_base_unit's width */
+}
+
+/*
+ * Search locality_list for a value of the requested kind for the
+ * (init_pxm, tgt_pxm) pair.
+ *
+ * 'direction' is either READ or WRITE
+ * Latency is reported in nanoseconds and bandwidth is reported in MB/s.
+ *
+ * A locality structure is consulted when its data_type is either the
+ * direction-specific type (for example ACPI_HMAT_READ_LATENCY) or the
+ * combined ACPI_HMAT_ACCESS_* type for 'type'. Structures are tried in list
+ * order and the first lookup that does not yield NO_VALUE is returned.
+ *
+ * Returns the value from get_performance_data(), or NO_VALUE if no listed
+ * structure provides one.
+ */
+static int hmem_get_attribute(int init_pxm, int tgt_pxm, int direction,
+ enum hmem_attr_type type)
+{
+ struct memory_locality *loc;
+ int value;
+
+ list_for_each_entry(loc, &locality_list, list) {
+ struct acpi_hmat_locality *hmat_loc = loc->hmat_loc;
+
+ if (direction == READ && type == LATENCY &&
+ (hmat_loc->data_type == ACPI_HMAT_ACCESS_LATENCY ||
+ hmat_loc->data_type == ACPI_HMAT_READ_LATENCY)) {
+ value = get_performance_data(init_pxm, tgt_pxm,
+ hmat_loc);
+ if (value != NO_VALUE)
+ return value;
+ }
+
+ if (direction == WRITE && type == LATENCY &&
+ (hmat_loc->data_type == ACPI_HMAT_ACCESS_LATENCY ||
+ hmat_loc->data_type == ACPI_HMAT_WRITE_LATENCY)) {
+ value = get_performance_data(init_pxm, tgt_pxm,
+ hmat_loc);
+ if (value != NO_VALUE)
+ return value;
+ }
+
+ if (direction == READ && type == BANDWIDTH &&
+ (hmat_loc->data_type == ACPI_HMAT_ACCESS_BANDWIDTH ||
+ hmat_loc->data_type == ACPI_HMAT_READ_BANDWIDTH)) {
+ value = get_performance_data(init_pxm, tgt_pxm,
+ hmat_loc);
+ if (value != NO_VALUE)
+ return value;
+ }
+
+ if (direction == WRITE && type == BANDWIDTH &&
+ (hmat_loc->data_type == ACPI_HMAT_ACCESS_BANDWIDTH ||
+ hmat_loc->data_type == ACPI_HMAT_WRITE_BANDWIDTH)) {
+ value = get_performance_data(init_pxm, tgt_pxm,
+ hmat_loc);
+ if (value != NO_VALUE)
+ return value;
+ }
+ }
+
+ return NO_VALUE; /* no listed structure had usable data for this pair */
+}
+
+/*
+ * Return one performance attribute between a memory target and its local
+ * initiator.
+ *
+ * 'direction' is either READ or WRITE
+ * Latency is reported in nanoseconds and bandwidth is reported in MB/s.
+ *
+ * tgt_dev must be the struct device embedded in a struct memory_target. The
+ * target's proximity domain is read from tgt->ma and the initiator's from
+ * tgt->local_init. Returns NO_VALUE when the target has no local initiator
+ * or when hmem_get_attribute() finds no value.
+ */
+int hmem_local_attribute(struct device *tgt_dev, int direction,
+ enum hmem_attr_type type)
+{
+ struct memory_target *tgt = to_memory_target(tgt_dev);
+ int tgt_pxm = tgt->ma->proximity_domain;
+ int init_pxm;
+
+ if (!tgt->local_init)
+ return NO_VALUE;
+
+ init_pxm = tgt->local_init->pxm;
+ return hmem_get_attribute(init_pxm, tgt_pxm, direction, type);
+}
+
+static bool is_local_init(int init_pxm, int tgt_pxm, int read_lat,
+ int write_lat, int read_bw, int write_bw)
+{ /* true only if all four attributes looked up for (init_pxm, tgt_pxm) equal the values passed in */
+ if (read_lat != hmem_get_attribute(init_pxm, tgt_pxm, READ, LATENCY)) /* NO_VALUE on both sides also compares equal */
+ return false;
+
+ if (write_lat != hmem_get_attribute(init_pxm, tgt_pxm, WRITE, LATENCY))
+ return false;
+
+ if (read_bw != hmem_get_attribute(init_pxm, tgt_pxm, READ, BANDWIDTH))
+ return false;
+
+ if (write_bw != hmem_get_attribute(init_pxm, tgt_pxm, WRITE, BANDWIDTH))
+ return false;
+
+ return true; /* every attribute matched */
+}
+
+static const struct attribute_group performance_attribute_group = { /* created and removed on a memory target's kobject below */
+ .attrs = performance_attributes, /* array defined in perf_attributes.c */
+ .name = LOCAL_INIT, /* the group is named "local_init" */
+};
+
+static void remove_performance_attributes(struct memory_target *tgt) /* removes tgt's "local_init" group */
+{
+ if (!tgt->local_init) /* add_performance_attributes() creates nothing for such targets */
+ return;
+
+ /*
+ * FIXME: Need to enhance the core sysfs code to remove all the links
+ * in both the attribute group and in the device itself when those are
+ * removed.
+ */
+ sysfs_remove_group(&tgt->dev.kobj, &performance_attribute_group);
+}
+
+static int add_performance_attributes(struct memory_target *tgt)
+{ /* returns 0 on success or when tgt has no local initiator, otherwise the negative result of the failing sysfs call */
+ int read_lat, write_lat, read_bw, write_bw;
+ int tgt_pxm = tgt->ma->proximity_domain;
+ struct kobject *init_kobj, *tgt_kobj;
+ struct device *init_dev, *tgt_dev;
+ struct memory_initiator *init;
+ int ret;
+
+ if (!tgt->local_init)
+ return 0;
+
+ tgt_dev = &tgt->dev;
+ tgt_kobj = &tgt_dev->kobj;
+
+ /*
+ * Create entries for initiator/target pair in the target. Nothing
+ * has been created yet if this fails, so return without cleanup.
+ */
+ ret = sysfs_create_group(tgt_kobj, &performance_attribute_group);
+ if (ret < 0)
+ return ret;
+
+ ret = sysfs_add_link_to_group(tgt_kobj, LOCAL_INIT, tgt_kobj,
+ dev_name(tgt_dev)); /* link inside the group that points back at the target itself */
+ if (ret < 0)
+ goto err;
+
+ /*
+ * Iterate through initiators and find all the ones that have the same
+ * performance as the local initiator. The local initiator's four
+ * values are read once here and compared against each candidate.
+ */
+ read_lat = hmem_local_attribute(tgt_dev, READ, LATENCY);
+ write_lat = hmem_local_attribute(tgt_dev, WRITE, LATENCY);
+ read_bw = hmem_local_attribute(tgt_dev, READ, BANDWIDTH);
+ write_bw = hmem_local_attribute(tgt_dev, WRITE, BANDWIDTH);
+
+ list_for_each_entry(init, &initiator_list, list) {
+ init_dev = &init->dev;
+ init_kobj = &init_dev->kobj;
+
+ if (init == tgt->local_init ||
+ is_local_init(init->pxm, tgt_pxm, read_lat,
+ write_lat, read_bw, write_bw)) {
+ ret = sysfs_add_link_to_group(tgt_kobj, LOCAL_INIT,
+ init_kobj, dev_name(init_dev)); /* link to this initiator inside the target's group */
+ if (ret < 0)
+ goto err;
+
+ /*
+ * Create a link in the initiator to the performance
+ * attributes. The link is named after the target.
+ */
+ ret = sysfs_add_group_link(init_kobj, tgt_kobj,
+ LOCAL_INIT, dev_name(tgt_dev));
+ if (ret < 0)
+ goto err;
+ }
+
+ }
+ tgt->has_perf_attributes = true; /* checked by remove_memory_target() before tearing the group down */
+ return 0;
+err:
+ remove_performance_attributes(tgt); /* removes the group; see the FIXME in that function about links */
+ return ret;
+}
+
static int link_node_for_kobj(unsigned int node, struct kobject *kobj)
{
if (node_devices[node])
@@ -178,6 +397,9 @@ static void release_memory_target(struct device *dev)

static void __init remove_memory_target(struct memory_target *tgt)
{
+ if (tgt->has_perf_attributes)
+ remove_performance_attributes(tgt);
+
if (tgt->is_registered) {
remove_node_for_kobj(pxm_to_node(tgt->ma->proximity_domain),
&tgt->dev.kobj);
@@ -309,6 +531,38 @@ hmat_parse_address_range(struct acpi_subtable_header *header,
return -EINVAL;
}

+static int __init hmat_parse_locality(struct acpi_subtable_header *header,
+ const unsigned long end) /* 'end' is not used by this handler */
+{ /* HMAT subtable handler for ACPI_HMAT_TYPE_LOCALITY: records the structure on locality_list */
+ struct acpi_hmat_locality *hmat_loc;
+ struct memory_locality *loc;
+
+ if (bad_hmem) /* a failure was already recorded; skip this entry without another error */
+ return 0;
+
+ hmat_loc = (struct acpi_hmat_locality *)header;
+ if (!hmat_loc) {
+ pr_err("HMEM: NULL table entry\n");
+ bad_hmem = true;
+ return -EINVAL;
+ }
+
+ /*
+ * We don't report cached performance information in sysfs. Only
+ * structures whose flags equal ACPI_HMAT_MEMORY or
+ * ACPI_HMAT_LAST_LEVEL_CACHE are recorded; all others are skipped.
+ */
+ if (hmat_loc->flags == ACPI_HMAT_MEMORY ||
+ hmat_loc->flags == ACPI_HMAT_LAST_LEVEL_CACHE) {
+ loc = kzalloc(sizeof(*loc), GFP_KERNEL); /* freed in hmem_cleanup() */
+ if (!loc) {
+ bad_hmem = true;
+ return -ENOMEM;
+ }
+
+ loc->hmat_loc = hmat_loc; /* stores the pointer to the table entry, not a copy */
+ list_add_tail(&loc->list, &locality_list);
+ }
+
+ return 0;
+}
+
static int __init hmat_parse_cache(struct acpi_subtable_header *header,
const unsigned long end)
{
@@ -464,6 +718,7 @@ srat_parse_memory_affinity(struct acpi_subtable_header *header,
static void hmem_cleanup(void)
{
struct memory_initiator *init, *init_iter;
+ struct memory_locality *loc, *loc_iter;
struct memory_target *tgt, *tgt_iter;

list_for_each_entry_safe(tgt, tgt_iter, &target_list, list)
@@ -471,6 +726,11 @@ static void hmem_cleanup(void)

list_for_each_entry_safe(init, init_iter, &initiator_list, list)
remove_memory_initiator(init);
+
+ list_for_each_entry_safe(loc, loc_iter, &locality_list, list) {
+ list_del(&loc->list);
+ kfree(loc);
+ }
}

static int __init hmem_init(void)
@@ -521,13 +781,15 @@ static int __init hmem_init(void)
}

if (!acpi_table_parse(ACPI_SIG_HMAT, hmem_noop_parse)) {
- struct acpi_subtable_proc hmat_proc[2];
+ struct acpi_subtable_proc hmat_proc[3];

memset(hmat_proc, 0, sizeof(hmat_proc));
hmat_proc[0].id = ACPI_HMAT_TYPE_ADDRESS_RANGE;
hmat_proc[0].handler = hmat_parse_address_range;
hmat_proc[1].id = ACPI_HMAT_TYPE_CACHE;
hmat_proc[1].handler = hmat_parse_cache;
+ hmat_proc[2].id = ACPI_HMAT_TYPE_LOCALITY;
+ hmat_proc[2].handler = hmat_parse_locality;

acpi_table_parse_entries_array(ACPI_SIG_HMAT,
sizeof(struct acpi_table_hmat),
@@ -549,6 +811,10 @@ static int __init hmem_init(void)
ret = register_memory_target(tgt);
if (ret)
goto err;
+
+ ret = add_performance_attributes(tgt);
+ if (ret)
+ goto err;
}

return 0;
diff --git a/drivers/acpi/hmem/hmem.h b/drivers/acpi/hmem/hmem.h
index 38ff540..71b10f0 100644
--- a/drivers/acpi/hmem/hmem.h
+++ b/drivers/acpi/hmem/hmem.h
@@ -16,6 +16,11 @@
#ifndef _ACPI_HMEM_H_
#define _ACPI_HMEM_H_

+enum hmem_attr_type { /* which performance metric a lookup asks for */
+ LATENCY, /* reported in nanoseconds by the lookups in core.c */
+ BANDWIDTH, /* reported in MB/s by the lookups in core.c */
+};
+
struct memory_initiator {
struct list_head list;
struct device dev;
@@ -39,9 +44,21 @@ struct memory_target {

bool is_cached;
bool is_registered;
+ bool has_perf_attributes;
};
#define to_memory_target(d) container_of((d), struct memory_target, dev)

+struct memory_locality { /* one recorded HMAT locality structure */
+ struct list_head list; /* linkage on locality_list */
+ struct acpi_hmat_locality *hmat_loc; /* the HMAT locality structure this entry refers to */
+};
+
extern const struct attribute_group *memory_initiator_attribute_groups[];
extern const struct attribute_group *memory_target_attribute_groups[];
+extern struct attribute *performance_attributes[]; /* defined in perf_attributes.c */
+
+extern struct list_head locality_list; /* defined in core.c */
+
+int hmem_local_attribute(struct device *tgt_dev, int direction,
+ enum hmem_attr_type type); /* defined in core.c; 'direction' is READ or WRITE */
#endif /* _ACPI_HMEM_H_ */
diff --git a/drivers/acpi/hmem/perf_attributes.c b/drivers/acpi/hmem/perf_attributes.c
new file mode 100644
index 0000000..72a710a
--- /dev/null
+++ b/drivers/acpi/hmem/perf_attributes.c
@@ -0,0 +1,56 @@
+/*
+ * Heterogeneous memory performance attributes
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include "hmem.h"
+
+static ssize_t read_lat_nsec_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{ /* sysfs show: writes the target's local read latency to buf as a decimal line */
+ return sprintf(buf, "%d\n", hmem_local_attribute(dev, READ, LATENCY)); /* prints -1 when hmem_local_attribute() has no value */
+}
+static DEVICE_ATTR_RO(read_lat_nsec); /* declares dev_attr_read_lat_nsec, listed in performance_attributes[] */
+
+static ssize_t write_lat_nsec_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{ /* sysfs show: writes the target's local write latency to buf as a decimal line */
+ return sprintf(buf, "%d\n", hmem_local_attribute(dev, WRITE, LATENCY)); /* prints -1 when hmem_local_attribute() has no value */
+}
+static DEVICE_ATTR_RO(write_lat_nsec); /* declares dev_attr_write_lat_nsec, listed in performance_attributes[] */
+
+static ssize_t read_bw_MBps_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{ /* sysfs show: writes the target's local read bandwidth to buf as a decimal line */
+ return sprintf(buf, "%d\n", hmem_local_attribute(dev, READ, BANDWIDTH)); /* prints -1 when hmem_local_attribute() has no value */
+}
+static DEVICE_ATTR_RO(read_bw_MBps); /* declares dev_attr_read_bw_MBps, listed in performance_attributes[] */
+
+static ssize_t write_bw_MBps_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{ /* sysfs show: writes the target's local write bandwidth to buf as a decimal line */
+ return sprintf(buf, "%d\n",
+ hmem_local_attribute(dev, WRITE, BANDWIDTH)); /* prints -1 when hmem_local_attribute() has no value */
+}
+static DEVICE_ATTR_RO(write_bw_MBps); /* declares dev_attr_write_bw_MBps, listed in performance_attributes[] */
+
+struct attribute *performance_attributes[] = { /* attributes of the "local_init" group set up in core.c */
+ &dev_attr_read_lat_nsec.attr,
+ &dev_attr_write_lat_nsec.attr,
+ &dev_attr_read_bw_MBps.attr,
+ &dev_attr_write_bw_MBps.attr,
+ NULL /* terminates the array */
+};
--
2.9.4