[PATCH 10/21] nd: regions (block-data-window, persistent memory, volatile memory)

From: Dan Williams
Date: Fri Apr 17 2015 - 21:38:56 EST


A "region" device represents the maximum capacity of a
block-data-window, or an interleaved spa range (direct-access persistent
memory or volatile memory), without regard for aliasing. Aliasing is
resolved by the label data on the dimm to designate which exclusive
interface will access the aliased data. Enabling for the
label-designated sub-device is in a subsequent patch.

The "region" types are defined in the NFIT System Physical Address (spa)
table. In the case of persistent memory the spa-range describes the
direct memory address range of the storage (NFIT_SPA_PM). A block
"region" (NFIT_SPA_DCR) points to a DIMM Control Region (DCR) or
an interleaved group of DCRs. Those DCRs are (optionally) referenced by
a block-data-window (BDW) set to describe the access mechanism and
capacity of the BLK-accessible storage. If the related BDW is not
published then the dimm is only available for control/configuration
commands. Finally, a volatile "region" (NFIT_SPA_VOLATILE) indicates
the portions of NVDIMMs that have been re-assigned as normal volatile
system memory by platform firmware.

The name format of "region" devices is "regionN" where, like dimms, N is
a global ida index assigned at discovery time. This id is not reliable
across reboots nor in the presence of hotplug. Look to attributes of
the region or static id-data of the sub-namespace to generate a
persistent name.

"region"s have 2 generic attributes "size", and "mapping"s where:
- size: the block-data-window accessible capacity or the span of the
spa-range in the case of pm.

- mappingN: a tuple describing a dimm's contribution to the region's
capacity in the format (<nfit-dimm-handle>,<dpa>,<size>). For a
pm-region there will be at least one mapping per dimm in the interleave
set. For a block-region there is only "mapping0" listing the starting dimm
offset of the block-data-window and the available capacity of that
window (matches "size" above).

The max number of mappings per "region" is hard coded per the constraints of
sysfs attribute groups. That said the number of mappings per region should
never exceed the maximum number of possible dimms in the system. If the
current number turns out to not be enough then the "mappings" attribute
clarifies how many there are supposed to be. "32 should be enough for
anybody...".

Cc: Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Neil Brown <neilb@xxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
drivers/block/nd/Makefile | 1
drivers/block/nd/core.c | 8 +
drivers/block/nd/nd-private.h | 5
drivers/block/nd/nd.h | 17 ++
drivers/block/nd/region_devs.c | 426 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 455 insertions(+), 2 deletions(-)
create mode 100644 drivers/block/nd/region_devs.c

diff --git a/drivers/block/nd/Makefile b/drivers/block/nd/Makefile
index 9f1b69c86fba..6698acbe7b44 100644
--- a/drivers/block/nd/Makefile
+++ b/drivers/block/nd/Makefile
@@ -23,3 +23,4 @@ nd-y := core.o
nd-y += bus.o
nd-y += dimm_devs.o
nd-y += dimm.o
+nd-y += region_devs.o
diff --git a/drivers/block/nd/core.c b/drivers/block/nd/core.c
index 426f96b02594..32ecd6f05c90 100644
--- a/drivers/block/nd/core.c
+++ b/drivers/block/nd/core.c
@@ -230,7 +230,7 @@ struct nfit_table_header {
__le16 length;
};

-static const char *spa_type_name(u16 type)
+const char *spa_type_name(u16 type)
{
switch (type) {
case NFIT_SPA_VOLATILE: return "volatile";
@@ -241,7 +241,7 @@ static const char *spa_type_name(u16 type)
}
}

-static int nfit_spa_type(struct nfit_spa __iomem *nfit_spa)
+int nfit_spa_type(struct nfit_spa __iomem *nfit_spa)
{
__u8 uuid[16];

@@ -577,6 +577,10 @@ static struct nd_bus *nd_bus_probe(struct nd_bus *nd_bus)
if (rc)
goto err_child;

+ rc = nd_bus_register_regions(nd_bus);
+ if (rc)
+ goto err_child;
+
mutex_lock(&nd_bus_list_mutex);
list_add_tail(&nd_bus->list, &nd_bus_list);
mutex_unlock(&nd_bus_list_mutex);
diff --git a/drivers/block/nd/nd-private.h b/drivers/block/nd/nd-private.h
index 72197992e386..d254ff688ad6 100644
--- a/drivers/block/nd/nd-private.h
+++ b/drivers/block/nd/nd-private.h
@@ -85,6 +85,8 @@ struct nd_mem {
struct list_head list;
};

+const char *spa_type_name(u16 type);
+int nfit_spa_type(struct nfit_spa __iomem *nfit_spa);
struct nd_dimm *nd_dimm_by_handle(struct nd_bus *nd_bus, u32 nfit_handle);
bool is_nd_dimm(struct device *dev);
struct nd_bus *to_nd_bus(struct device *dev);
@@ -99,4 +101,7 @@ void __exit nd_dimm_exit(void);
int nd_bus_create_ndctl(struct nd_bus *nd_bus);
void nd_bus_destroy_ndctl(struct nd_bus *nd_bus);
int nd_bus_register_dimms(struct nd_bus *nd_bus);
+int nd_bus_register_regions(struct nd_bus *nd_bus);
+int nd_match_dimm(struct device *dev, void *data);
+bool is_nd_dimm(struct device *dev);
#endif /* __ND_PRIVATE_H__ */
diff --git a/drivers/block/nd/nd.h b/drivers/block/nd/nd.h
index f277440c72b4..13eba9bd74c7 100644
--- a/drivers/block/nd/nd.h
+++ b/drivers/block/nd/nd.h
@@ -22,6 +22,22 @@ struct nd_dimm_drvdata {
void *data;
};

+struct nd_mapping { /* one dimm's contribution to a region's capacity */
+ struct nd_dimm *nd_dimm; /* dimm backing this mapping */
+ u64 start; /* dpa of the mapping (blk_offset or region_dpa) */
+ u64 size; /* length of the mapping (blk_capacity or region_len) */
+};
+
+struct nd_region { /* block-data-window, persistent or volatile memory range */
+ struct device dev; /* embedded device, named "region%d" from id */
+ struct nd_spa *nd_spa; /* spa entry this region was created from */
+ u16 ndr_mappings; /* valid entries in mapping[]; 0 marks the region invalid */
+ u64 ndr_size; /* spa_length read from the spa table */
+ u64 ndr_start; /* spa_base read from the spa table */
+ int id; /* index allocated from region_ida */
+ struct nd_mapping mapping[0]; /* trailing array, allocated with the region */
+};
+
enum nd_async_mode {
ND_SYNC,
ND_ASYNC,
@@ -39,4 +55,5 @@ void nd_dimm_set_dsm_mask(struct nd_dimm *nd_dimm, unsigned long dsm_mask);
int nd_dimm_init_nsarea(struct nd_dimm_drvdata *ndd);
int nd_dimm_init_config_data(struct nd_dimm_drvdata *ndd);
int nd_dimm_firmware_status(struct device *dev);
+struct nd_region *to_nd_region(struct device *dev);
#endif /* __ND_H__ */
diff --git a/drivers/block/nd/region_devs.c b/drivers/block/nd/region_devs.c
new file mode 100644
index 000000000000..f474c32d6dad
--- /dev/null
+++ b/drivers/block/nd/region_devs.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-private.h"
+#include "nfit.h"
+#include "nd.h"
+
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
+
+static DEFINE_IDA(region_ida); /* source of the N in "regionN" device names */
+
+/*
+ * nd_region_release - release callback shared by every region device type
+ * @dev: device embedded in the nd_region being destroyed
+ *
+ * Drops the dimm reference held by each valid mapping, hands the region id
+ * back to region_ida and frees the region.
+ */
+static void nd_region_release(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	u16 i;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_dimm *nd_dimm = nd_region->mapping[i].nd_dimm;
+
+		/*
+		 * A mapping whose dimm lookup failed may carry a NULL
+		 * nd_dimm; skip it rather than forming &nd_dimm->dev from
+		 * a NULL pointer.
+		 */
+		if (nd_dimm)
+			put_device(&nd_dimm->dev);
+	}
+	ida_simple_remove(&region_ida, nd_region->id);
+	kfree(nd_region);
+}
+
+static struct device_type nd_block_device_type = { /* assigned by nd_blk_init() */
+ .name = "nd_blk",
+ .release = nd_region_release,
+};
+
+static struct device_type nd_pmem_device_type = { /* used for NFIT_SPA_PM regions */
+ .name = "nd_pmem",
+ .release = nd_region_release,
+};
+
+static struct device_type nd_volatile_device_type = { /* used for NFIT_SPA_VOLATILE regions */
+ .name = "nd_volatile",
+ .release = nd_region_release,
+};
+
+/* true when @dev is non-NULL and typed as a persistent-memory region */
+static bool is_nd_pmem(struct device *dev)
+{
+	if (!dev)
+		return false;
+	return dev->type == &nd_pmem_device_type;
+}
+
+/*
+ * Map a region's embedded struct device back to its nd_region.  Warns when
+ * the device's type does not use the region release callback.
+ */
+struct nd_region *to_nd_region(struct device *dev)
+{
+	WARN_ON(dev->type->release != nd_region_release);
+
+	return container_of(dev, struct nd_region, dev);
+}
+
+/*
+ * sysfs "size": the spa range length for pmem regions, the size of the
+ * sole mapping for any other region with exactly one mapping, otherwise 0.
+ */
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	unsigned long long size;
+
+	if (is_nd_pmem(dev))
+		size = nd_region->ndr_size;
+	else if (nd_region->ndr_mappings == 1)
+		size = nd_region->mapping[0].size;
+	else
+		size = 0;
+
+	return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+/* sysfs "mappings": the number of valid mappings recorded for the region */
+static ssize_t mappings_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u16 count = to_nd_region(dev)->ndr_mappings;
+
+	return sprintf(buf, "%d\n", count);
+}
+static DEVICE_ATTR_RO(mappings);
+
+static struct attribute *nd_region_attributes[] = { /* attributes every region exposes */
+ &dev_attr_size.attr,
+ &dev_attr_mappings.attr,
+ NULL,
+};
+
+static struct attribute_group nd_region_attribute_group = { /* no is_visible: always shown */
+ .attrs = nd_region_attributes,
+};
+
+/*
+ * Return the nth (zero-based) memdev entry on the bus that references
+ * @spa_index, or NULL when there is no such entry.  For pm a spa may be
+ * referenced by several dimms in the interleave and more than once per
+ * dimm, once for each region of the dimm that maps into the interleave.
+ */
+static struct nd_memdev *nd_memdev_from_spa(struct nd_bus *nd_bus,
+		u16 spa_index, int n)
+{
+	struct nd_memdev *nd_memdev;
+	int seen = 0;
+
+	list_for_each_entry(nd_memdev, &nd_bus->memdevs, list) {
+		if (readw(&nd_memdev->nfit_mem->spa_index) != spa_index)
+			continue;
+		if (seen++ == n)
+			return nd_memdev;
+	}
+	return NULL;
+}
+
+/* count the memdev entries on the bus that reference @spa_index */
+static int num_nd_mem(struct nd_bus *nd_bus, u16 spa_index)
+{
+	struct nd_memdev *nd_memdev;
+	int matches = 0;
+
+	list_for_each_entry(nd_memdev, &nd_bus->memdevs, list) {
+		if (readw(&nd_memdev->nfit_mem->spa_index) != spa_index)
+			continue;
+		matches++;
+	}
+	return matches;
+}
+
+/*
+ * Convert an anonymous MEMDEV to its set of associated tables: the nd_mem
+ * on the bus whose DCR memdev carries the same nfit_handle.  Returns NULL
+ * when no entry on the dimm list matches.
+ */
+static struct nd_mem *nd_memdev_to_mem(struct nd_bus *nd_bus,
+		struct nd_memdev *nd_memdev)
+{
+	u32 wanted = readl(&nd_memdev->nfit_mem->nfit_handle);
+	struct nd_mem *nd_mem;
+
+	list_for_each_entry(nd_mem, &nd_bus->dimms, list) {
+		if (readl(&nd_mem->nfit_mem_dcr->nfit_handle) != wanted)
+			continue;
+		return nd_mem;
+	}
+	return NULL;
+}
+
+/*
+ * Format mapping @n of the region into @buf as
+ * "<nfit-handle>,<start>,<size>\n".  Returns -ENXIO when @n is not below
+ * the region's mapping count.
+ */
+static ssize_t mappingN(struct device *dev, char *buf, int n)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nfit_mem __iomem *nfit_mem;
+	struct nd_mapping *nd_mapping;
+
+	if (n >= nd_region->ndr_mappings)
+		return -ENXIO;
+
+	nd_mapping = &nd_region->mapping[n];
+	nfit_mem = nd_mapping->nd_dimm->nd_mem->nfit_mem_dcr;
+
+	return sprintf(buf, "%#x,%llu,%llu\n", readl(&nfit_mem->nfit_handle),
+			nd_mapping->start, nd_mapping->size);
+}
+
+#define REGION_MAPPING(idx) /* define mapping<idx>_show() and its read-only attribute */ \
+static ssize_t mapping##idx##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return mappingN(dev, buf, idx); \
+} \
+static DEVICE_ATTR_RO(mapping##idx) /* the user supplies the trailing ';' */
+
+/*
+ * 32 should be enough for a while; even in the presence of socket
+ * interleave, a 32-way interleave set is a degenerate case.
+ */
+REGION_MAPPING(0);
+REGION_MAPPING(1);
+REGION_MAPPING(2);
+REGION_MAPPING(3);
+REGION_MAPPING(4);
+REGION_MAPPING(5);
+REGION_MAPPING(6);
+REGION_MAPPING(7);
+REGION_MAPPING(8);
+REGION_MAPPING(9);
+REGION_MAPPING(10);
+REGION_MAPPING(11);
+REGION_MAPPING(12);
+REGION_MAPPING(13);
+REGION_MAPPING(14);
+REGION_MAPPING(15);
+REGION_MAPPING(16);
+REGION_MAPPING(17);
+REGION_MAPPING(18);
+REGION_MAPPING(19);
+REGION_MAPPING(20);
+REGION_MAPPING(21);
+REGION_MAPPING(22);
+REGION_MAPPING(23);
+REGION_MAPPING(24);
+REGION_MAPPING(25);
+REGION_MAPPING(26);
+REGION_MAPPING(27);
+REGION_MAPPING(28);
+REGION_MAPPING(29);
+REGION_MAPPING(30);
+REGION_MAPPING(31);
+
+/* show attribute number @n only when it is below the region's mapping count */
+static umode_t nd_mapping_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+
+	return n < to_nd_region(dev)->ndr_mappings ? a->mode : 0;
+}
+
+static struct attribute *nd_mapping_attributes[] = { /* slot n holds mappingN, matching the index nd_mapping_visible() tests */
+ &dev_attr_mapping0.attr,
+ &dev_attr_mapping1.attr,
+ &dev_attr_mapping2.attr,
+ &dev_attr_mapping3.attr,
+ &dev_attr_mapping4.attr,
+ &dev_attr_mapping5.attr,
+ &dev_attr_mapping6.attr,
+ &dev_attr_mapping7.attr,
+ &dev_attr_mapping8.attr,
+ &dev_attr_mapping9.attr,
+ &dev_attr_mapping10.attr,
+ &dev_attr_mapping11.attr,
+ &dev_attr_mapping12.attr,
+ &dev_attr_mapping13.attr,
+ &dev_attr_mapping14.attr,
+ &dev_attr_mapping15.attr,
+ &dev_attr_mapping16.attr,
+ &dev_attr_mapping17.attr,
+ &dev_attr_mapping18.attr,
+ &dev_attr_mapping19.attr,
+ &dev_attr_mapping20.attr,
+ &dev_attr_mapping21.attr,
+ &dev_attr_mapping22.attr,
+ &dev_attr_mapping23.attr,
+ &dev_attr_mapping24.attr,
+ &dev_attr_mapping25.attr,
+ &dev_attr_mapping26.attr,
+ &dev_attr_mapping27.attr,
+ &dev_attr_mapping28.attr,
+ &dev_attr_mapping29.attr,
+ &dev_attr_mapping30.attr,
+ &dev_attr_mapping31.attr,
+ NULL,
+};
+
+static struct attribute_group nd_mapping_attribute_group = { /* mappingN files, filtered per region */
+ .is_visible = nd_mapping_visible,
+ .attrs = nd_mapping_attributes,
+};
+
+static const struct attribute_group *nd_region_attribute_groups[] = { /* installed as dev->groups by nd_region_create() */
+ &nd_region_attribute_group,
+ &nd_mapping_attribute_group,
+ NULL,
+};
+
+/*
+ * nd_blk_init - set up a block-data-window region for one dimm
+ * @nd_bus: bus on which the dimm is looked up
+ * @nd_region: region being initialized
+ * @nd_mem: table set of the dimm whose control region matched the spa
+ *
+ * On success the region has exactly one mapping giving the offset and
+ * capacity of the dimm's block-data-window, and that mapping keeps the
+ * dimm obtained from nd_dimm_by_handle().  When the dimm cannot be found,
+ * it publishes no block-data-window, or the window offset is not 4K
+ * aligned, the dimm (if any) is put again and the region is left with zero
+ * mappings to mark it invalid.
+ */
+static void nd_blk_init(struct nd_bus *nd_bus, struct nd_region *nd_region,
+		struct nd_mem *nd_mem)
+{
+	struct nd_mapping *nd_mapping;
+	struct nd_dimm *nd_dimm;
+	u32 nfit_handle;
+
+	nd_region->dev.type = &nd_block_device_type;
+
+	/* mark this region invalid unless we find a BDW */
+	nd_region->ndr_mappings = 0;
+
+	nfit_handle = readl(&nd_mem->nfit_mem_dcr->nfit_handle);
+	nd_dimm = nd_dimm_by_handle(nd_bus, nfit_handle);
+	if (!nd_dimm) {
+		/* no dimm to name in the message and nothing to put */
+		dev_dbg(&nd_region->dev, "%s: no dimm for handle %#x\n",
+				__func__, nfit_handle);
+		return;
+	}
+
+	if (!nd_mem->nfit_bdw) {
+		dev_dbg(&nd_region->dev,
+				"%s: %s no block-data-window descriptor\n",
+				__func__, dev_name(&nd_dimm->dev));
+		put_device(&nd_dimm->dev);
+		return;
+	}
+
+	/*
+	 * NOTE(review): only blk_offset is tested although the message
+	 * names block-capacity; confirm whether blk_capacity should be
+	 * checked as well.
+	 */
+	if (readq(&nd_mem->nfit_bdw->blk_offset) % SZ_4K) {
+		dev_err(&nd_region->dev, "%s: %s block-capacity is not 4K aligned\n",
+				__func__, dev_name(&nd_dimm->dev));
+		put_device(&nd_dimm->dev);
+		return;
+	}
+
+	nd_region->ndr_mappings = 1;
+	nd_mapping = &nd_region->mapping[0];
+	nd_mapping->nd_dimm = nd_dimm;
+	nd_mapping->size = readq(&nd_mem->nfit_bdw->blk_capacity);
+	nd_mapping->start = readq(&nd_mem->nfit_bdw->blk_offset);
+}
+
+/*
+ * nd_spa_range_init - set up the mappings of a pm or volatile region
+ * @nd_bus: bus whose memdev and dimm lists are searched
+ * @nd_region: region being initialized; on entry ndr_mappings is the
+ *             number of mapping slots to fill
+ * @type: device type to assign to the region
+ *
+ * Fills mapping[i] from the i-th memdev entry that references the region's
+ * spa.  If the tables or the dimm for an entry cannot be found, or a
+ * mapping is not 4K aligned, every dimm already stored in a mapping is put
+ * again and the region is left with zero mappings to mark it invalid.
+ */
+static void nd_spa_range_init(struct nd_bus *nd_bus, struct nd_region *nd_region,
+		struct device_type *type)
+{
+	struct nd_spa *nd_spa = nd_region->nd_spa;
+	u16 spa_index = readw(&nd_spa->nfit_spa->spa_index);
+	u16 i;
+
+	nd_region->dev.type = type;
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nd_memdev *nd_memdev;
+		struct nd_dimm *nd_dimm;
+		struct nd_mem *nd_mem;
+		u32 nfit_handle;
+
+		nd_memdev = nd_memdev_from_spa(nd_bus, spa_index, i);
+		nd_mem = nd_memdev ? nd_memdev_to_mem(nd_bus, nd_memdev) : NULL;
+		if (!nd_mem) {
+			dev_err(&nd_region->dev, "%s: mapping%d has no dimm tables\n",
+					__func__, i);
+			goto err;
+		}
+
+		nfit_handle = readl(&nd_mem->nfit_mem_dcr->nfit_handle);
+		nd_dimm = nd_dimm_by_handle(nd_bus, nfit_handle);
+		if (!nd_dimm) {
+			dev_err(&nd_region->dev, "%s: no dimm for handle %#x\n",
+					__func__, nfit_handle);
+			goto err;
+		}
+
+		nd_mapping->nd_dimm = nd_dimm;
+		nd_mapping->start = readq(&nd_memdev->nfit_mem->region_dpa);
+		nd_mapping->size = readq(&nd_memdev->nfit_mem->region_len);
+
+		if ((nd_mapping->start | nd_mapping->size) % SZ_4K) {
+			dev_err(&nd_region->dev, "%s: %s mapping is not 4K aligned\n",
+					__func__, dev_name(&nd_dimm->dev));
+			i++;	/* mapping[i] already holds its dimm too */
+			goto err;
+		}
+	}
+	return;
+
+err:
+	/*
+	 * With ndr_mappings reset to 0 the release callback no longer
+	 * visits these slots, so put the dimms stored in mapping[0..i) here.
+	 */
+	while (i--) {
+		put_device(&nd_region->mapping[i].nd_dimm->dev);
+		nd_region->mapping[i].nd_dimm = NULL;
+	}
+	nd_region->ndr_mappings = 0;
+}
+
+/*
+ * Allocate, initialize and register one region device for @nd_spa.
+ *
+ * With a non-NULL @nd_mem the region gets a single mapping slot; otherwise
+ * it gets one slot per memdev entry referencing the spa.  Type-specific
+ * setup is chosen from the spa type.  Returns the new region, or NULL when
+ * the region or its id cannot be allocated.
+ */
+static struct nd_region *nd_region_create(struct nd_bus *nd_bus,
+		struct nd_spa *nd_spa, struct nd_mem *nd_mem)
+{
+	u16 spa_index = readw(&nd_spa->nfit_spa->spa_index);
+	int spa_type = nfit_spa_type(nd_spa->nfit_spa);
+	u16 num_mappings = nd_mem ? 1 : num_nd_mem(nd_bus, spa_index);
+	struct nd_region *nd_region;
+	struct device *dev;
+
+	nd_region = kzalloc(sizeof(struct nd_region)
+			+ sizeof(struct nd_mapping) * num_mappings, GFP_KERNEL);
+	if (!nd_region)
+		return NULL;
+
+	nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
+	if (nd_region->id < 0) {
+		kfree(nd_region);
+		return NULL;
+	}
+
+	nd_region->nd_spa = nd_spa;
+	nd_region->ndr_mappings = num_mappings;
+
+	dev = &nd_region->dev;
+	dev_set_name(dev, "region%d", nd_region->id);
+	dev->parent = &nd_bus->dev;
+	dev->groups = nd_region_attribute_groups;
+
+	nd_region->ndr_size = readq(&nd_spa->nfit_spa->spa_length);
+	nd_region->ndr_start = readq(&nd_spa->nfit_spa->spa_base);
+
+	/* any other spa type gets no type-specific setup */
+	if (spa_type == NFIT_SPA_PM)
+		nd_spa_range_init(nd_bus, nd_region, &nd_pmem_device_type);
+	else if (spa_type == NFIT_SPA_VOLATILE)
+		nd_spa_range_init(nd_bus, nd_region, &nd_volatile_device_type);
+	else if (spa_type == NFIT_SPA_DCR)
+		nd_blk_init(nd_bus, nd_region, nd_mem);
+
+	nd_device_register(dev);
+
+	return nd_region;
+}
+
+/*
+ * Walk the bus's spa list under nd_bus_list_mutex and create region
+ * devices: one per pm or volatile spa, and one per dimm whose control
+ * region spa index matches a DCR spa.  Entries with spa index 0 are
+ * skipped as invalid.  A failed region creation is recorded but does not
+ * stop the walk.
+ *
+ * Returns 0, or -ENOMEM if any region could not be created.
+ */
+int nd_bus_register_regions(struct nd_bus *nd_bus)
+{
+	struct nd_spa *nd_spa;
+	int rc = 0;
+
+	mutex_lock(&nd_bus_list_mutex);
+	list_for_each_entry(nd_spa, &nd_bus->spas, list) {
+		int spa_type = nfit_spa_type(nd_spa->nfit_spa);
+		u16 spa_index = readw(&nd_spa->nfit_spa->spa_index);
+		struct nd_mem *nd_mem;
+
+		if (!spa_index) {
+			dev_dbg(&nd_bus->dev, "detected invalid spa index\n");
+			continue;
+		}
+
+		switch (spa_type) {
+		case NFIT_SPA_PM:
+		case NFIT_SPA_VOLATILE:
+			if (!nd_region_create(nd_bus, nd_spa, NULL))
+				rc = -ENOMEM;
+			break;
+		case NFIT_SPA_DCR:
+			list_for_each_entry(nd_mem, &nd_bus->dimms, list) {
+				u16 dcr_index = readw(&nd_mem->nfit_spa_dcr->spa_index);
+
+				if (dcr_index == spa_index &&
+				    !nd_region_create(nd_bus, nd_spa, nd_mem))
+					rc = -ENOMEM;
+			}
+			break;
+		case NFIT_SPA_BDW:
+			/* we'll consume this in nd_blk_register for the DCR */
+			break;
+		default:
+			dev_info(&nd_bus->dev, "spa[%d] unhandled type: %s\n",
+					spa_index, spa_type_name(spa_type));
+			break;
+		}
+	}
+	mutex_unlock(&nd_bus_list_mutex);
+
+	nd_synchronize();
+
+	return rc;
+}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/