Re: [PATCH 2/4] s390/sclp: Add support for dynamic (de)configuration of memory
From: David Hildenbrand
Date: Tue Oct 07 2025 - 16:07:50 EST
[...]
---
drivers/s390/char/sclp_mem.c | 291 +++++++++++++++++++++++++++++------
1 file changed, 241 insertions(+), 50 deletions(-)
diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c
index 27f49f5fd358..802439230294 100644
--- a/drivers/s390/char/sclp_mem.c
+++ b/drivers/s390/char/sclp_mem.c
@@ -9,9 +9,12 @@
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/cpufeature.h>
+#include <linux/container_of.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/kstrtox.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
@@ -27,7 +30,6 @@
#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001
#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001
-static DEFINE_MUTEX(sclp_mem_mutex);
static LIST_HEAD(sclp_mem_list);
static u8 sclp_max_storage_id;
static DECLARE_BITMAP(sclp_storage_ids, 256);
@@ -38,6 +40,18 @@ struct memory_increment {
int standby;
};
+struct mblock {
+ struct kobject kobj;
+ unsigned int id;
+ unsigned int memmap_on_memory;
+ unsigned int config;
+};
+
+struct memory_block_arg {
+ struct mblock *mblocks;
+ struct kset *kset;
+};
I would avoid using "memory_block_arg" as it is reminiscent of the core-mm "struct memory_block".
Similarly, I'd not call this "mblock".
What about incorporating the "sclp" side of things?
"struct sclp_mem" / "struct sclp_mem_arg"
Nicely fits "sclp_mem.c" ;)
Something like that might be better.
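E.g. (just to illustrate the naming; fields unchanged from your structs):

struct sclp_mem {
	struct kobject kobj;
	unsigned int id;
	unsigned int memmap_on_memory;
	unsigned int config;
};

struct sclp_mem_arg {
	struct sclp_mem *sclp_mem;
	struct kset *kset;
};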
+
struct assign_storage_sccb {
struct sccb_header header;
u16 rn;
@@ -185,15 +199,11 @@ static int sclp_mem_notifier(struct notifier_block *nb,
{
unsigned long start, size;
struct memory_notify *arg;
- unsigned char id;
int rc = 0;
arg = data;
start = arg->start_pfn << PAGE_SHIFT;
size = arg->nr_pages << PAGE_SHIFT;
- mutex_lock(&sclp_mem_mutex);
- for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
- sclp_attach_storage(id);
switch (action) {
case MEM_GOING_OFFLINE:
/*
@@ -204,45 +214,201 @@ static int sclp_mem_notifier(struct notifier_block *nb,
if (contains_standby_increment(start, start + size))
rc = -EPERM;
break;
Is there any reason this notifier is still needed? I'd assume we can just allow
for offlining + re-onlining as we please now.
In fact, I'd assume we can get rid of the notifier entirely now?
- case MEM_PREPARE_ONLINE:
- /*
- * Access the altmap_start_pfn and altmap_nr_pages fields
- * within the struct memory_notify specifically when dealing
- * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
- *
- * When altmap is in use, take the specified memory range
- * online, which includes the altmap.
- */
- if (arg->altmap_nr_pages) {
- start = PFN_PHYS(arg->altmap_start_pfn);
- size += PFN_PHYS(arg->altmap_nr_pages);
- }
- rc = sclp_mem_change_state(start, size, 1);
- if (rc || !arg->altmap_nr_pages)
- break;
- /*
- * Set CMMA state to nodat here, since the struct page memory
- * at the beginning of the memory block will not go through the
- * buddy allocator later.
- */
- __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
+ default:
break;
- case MEM_FINISH_OFFLINE:
+ }
+ return rc ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static ssize_t config_mblock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct mblock *mblock = container_of(kobj, struct mblock, kobj);
+
+ return sysfs_emit(buf, "%u\n", READ_ONCE(mblock->config));
+}
+
+static ssize_t config_mblock_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long long addr, block_size;
"unsigned long" should be sufficient I'm sure :)
+ struct memory_block *mem;
+ struct mblock *mblock;
+ unsigned char id;
+ bool value;
+ int rc;
+
+ rc = kstrtobool(buf, &value);
+ if (rc)
+ return rc;
+ mblock = container_of(kobj, struct mblock, kobj);
+ block_size = memory_block_size_bytes();
+ addr = mblock->id * block_size;
+ /*
+ * Hold device_hotplug_lock when adding/removing memory blocks.
+ * Additionally, also protect calls to find_memory_block() and
+ * sclp_attach_storage().
+ */
+ rc = lock_device_hotplug_sysfs();
+ if (rc)
+ goto out;
+ for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
+ sclp_attach_storage(id);
+ if (value) {
+ if (mblock->config)
+ goto out_unlock;
+ rc = sclp_mem_change_state(addr, block_size, 1);
+ if (rc)
+ goto out_unlock;
/*
- * When altmap is in use, take the specified memory range
- * offline, which includes the altmap.
+ * Set entire memory block CMMA state to nodat. Later, when
+ * page tables pages are allocated via __add_memory(), those
+ * regions are marked __arch_set_page_dat().
*/
- if (arg->altmap_nr_pages) {
- start = PFN_PHYS(arg->altmap_start_pfn);
- size += PFN_PHYS(arg->altmap_nr_pages);
+ __arch_set_page_nodat((void *)__va(addr), block_size >> PAGE_SHIFT);
+ rc = __add_memory(0, addr, block_size,
+ mblock->memmap_on_memory ?
+ MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
+ if (rc)
+ goto out_unlock;
Do we have to undo the state change?
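I'd assume we have to, so the increments don't stay configured if adding the memory
fails. Something like (completely untested):

	rc = __add_memory(0, addr, block_size,
			  mblock->memmap_on_memory ?
			  MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
	if (rc) {
		/* Deconfigure the increments again before bailing out. */
		sclp_mem_change_state(addr, block_size, 0);
		goto out_unlock;
	}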
+ mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
+ put_device(&mem->dev);
+ WRITE_ONCE(mblock->config, 1);
+ } else {
+ if (!mblock->config)
+ goto out_unlock;
+ mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
+ if (mem->state != MEM_OFFLINE) {
+ put_device(&mem->dev);
+ rc = -EBUSY;
+ goto out_unlock;
}
- sclp_mem_change_state(start, size, 0);
- break;
- default:
- break;
+ /* drop the ref just got via find_memory_block() */
+ put_device(&mem->dev);
+ sclp_mem_change_state(addr, block_size, 0);
+ __remove_memory(addr, block_size);
+ WRITE_ONCE(mblock->config, 0);
}
- mutex_unlock(&sclp_mem_mutex);
- return rc ? NOTIFY_BAD : NOTIFY_OK;
+out_unlock:
+ unlock_device_hotplug();
+out:
+ return rc ? rc : count;
+}
+
+static ssize_t memmap_on_memory_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct mblock *mblock = container_of(kobj, struct mblock, kobj);
+
+ return sysfs_emit(buf, "%u\n", READ_ONCE(mblock->memmap_on_memory));
+}
+
+static ssize_t memmap_on_memory_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long block_size;
+ struct memory_block *mem;
+ struct mblock *mblock;
+ bool value;
+ int rc;
+
+ rc = kstrtobool(buf, &value);
+ if (rc)
+ return rc;
+ rc = lock_device_hotplug_sysfs();
+ if (rc)
+ return rc;
+ block_size = memory_block_size_bytes();
+ mblock = container_of(kobj, struct mblock, kobj);
+ mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(mblock->id * block_size)));
+ if (!mem) {
+ WRITE_ONCE(mblock->memmap_on_memory, value);
+ } else {
+ put_device(&mem->dev);
+ rc = -EBUSY;
+ }
+ unlock_device_hotplug();
+ return rc ? rc : count;
+}
+
+static void mblock_sysfs_release(struct kobject *kobj)
+{
+ struct mblock *mblock = container_of(kobj, struct mblock, kobj);
+
+ kfree(mblock);
+}
+
+static const struct kobj_type ktype = {
+ .release = mblock_sysfs_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+static struct kobj_attribute memmap_attr =
+ __ATTR(memmap_on_memory, 0644, memmap_on_memory_show, memmap_on_memory_store);
+static struct kobj_attribute config_attr =
+ __ATTR(config, 0644, config_mblock_show, config_mblock_store);
+
+static struct attribute *mblock_attrs[] = {
+ &memmap_attr.attr,
+ &config_attr.attr,
+ NULL,
+};
+
+static struct attribute_group mblock_attr_group = {
+ .attrs = mblock_attrs,
+};
+
+static int create_mblock(struct mblock *mblock, struct kset *kset,
+ unsigned int id, bool config, bool memmap_on_memory)
+{
+ int rc;
+
+ mblock->memmap_on_memory = memmap_on_memory;
+ mblock->config = config;
+ mblock->id = id;
+ kobject_init(&mblock->kobj, &ktype);
+ rc = kobject_add(&mblock->kobj, &kset->kobj, "memory%d", id);
+ if (rc)
+ return rc;
+ rc = sysfs_create_group(&mblock->kobj, &mblock_attr_group);
+ if (rc)
+ kobject_put(&mblock->kobj);
+ return rc;
+}
+
+/*
+ * Create /sys/firmware/memory/memoryX for boottime configured online memory
+ * blocks
+ */
+static int create_online_mblock(struct memory_block *mem, void *argument)
"online" is conusing. It's "initial" / "configured". Same applies to the other functions
that mention "online".
+{
+ struct memory_block_arg *arg;
+ struct mblock *mblocks;
+ struct kset *kset;
+ unsigned int id;
+
+ id = mem->dev.id;
+ arg = (struct memory_block_arg *)argument;
+ mblocks = arg->mblocks;
+ kset = arg->kset;
+ return create_mblock(&mblocks[id], kset, id, true, false);
+}
+
+static int __init create_initial_online_mblocks(struct mblock *mblocks, struct kset *kset)
+{
+ struct memory_block_arg arg;
+
+ arg.mblocks = mblocks;
+ arg.kset = kset;
+ return for_each_memory_block(&arg, create_online_mblock);
+}
+
+static struct mblock * __init allocate_mblocks(void)
+{
+ u64 max_mblocks;
Nit: why a u64? The block ids are "unsigned int id;"
+ u64 block_size;
+
+ block_size = memory_block_size_bytes();
+ max_mblocks = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size;
+ return kcalloc(max_mblocks, sizeof(struct mblock), GFP_KERNEL);
I think you should structure the code a bit differently, rather than splitting
the function up into tiny helpers:
static int __init init_sclp_mem(void)
{
	const u64 block_size = memory_block_size_bytes();
	const u64 max_mblocks = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size;
	struct sclp_mem_arg arg;
	struct kset *kset;
	int rc;

	/* We'll allocate memory for all blocks ahead of time. */
	sclp_mem = kcalloc(max_mblocks, sizeof(struct mblock), GFP_KERNEL);
	if (!sclp_mem)
		return -ENOMEM;

	kset = kset_create_and_add("memory", NULL, firmware_kobj);
	if (!kset)
		return -ENOMEM;

	/* Initial memory is in the "configured" state already. */
	arg.sclp_mem = sclp_mem;
	arg.kset = kset;
	rc = for_each_memory_block(&arg, create_configured_sclp_mem);
	if (rc)
		return rc;

	/* Standby memory is "deconfigured". */
	return create_standby_sclp_mem(sclp_mem, kset);
}
Should still be quite readable.
+}
static struct notifier_block sclp_mem_nb = {
@@ -264,14 +430,17 @@ static void __init align_to_block_size(unsigned long *start,
*size = size_align;
}
-static void __init add_memory_merged(u16 rn)
+static int __init create_standby_mblocks_merged(struct mblock *mblocks,
+ struct kset *kset, u16 rn)
{
unsigned long start, size, addr, block_size;
static u16 first_rn, num;
+ unsigned int id;
+ int rc = 0;
if (rn && first_rn && (first_rn + num == rn)) {
num++;
- return;
+ return rc;
}
if (!first_rn)
goto skip_add;
@@ -286,24 +455,31 @@ static void __init add_memory_merged(u16 rn)
if (!size)
goto skip_add;
for (addr = start; addr < start + size; addr += block_size) {
- add_memory(0, addr, block_size,
- cpu_has_edat1() ?
- MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
+ id = addr / block_size;
+ rc = create_mblock(&mblocks[id], kset, id, false, mhp_supports_memmap_on_memory());
+ if (rc)
+ break;
}
skip_add:
first_rn = rn;
num = 1;
+ return rc;
}
-static void __init sclp_add_standby_memory(void)
+static int __init create_standby_mblocks(struct mblock *mblocks, struct kset *kset)
{
struct memory_increment *incr;
+ int rc = 0;
list_for_each_entry(incr, &sclp_mem_list, list) {
if (incr->standby)
- add_memory_merged(incr->rn);
+ rc = create_standby_mblocks_merged(mblocks, kset, incr->rn);
+ if (rc)
+ goto out;
}
- add_memory_merged(0);
+ rc = create_standby_mblocks_merged(mblocks, kset, 0);
+out:
+ return rc;
}
static void __init insert_increment(u16 rn, int standby, int assigned)
@@ -336,10 +512,12 @@ static void __init insert_increment(u16 rn, int standby, int assigned)
list_add(&new_incr->list, prev);
}
-static int __init sclp_detect_standby_memory(void)
+static int __init sclp_setup_memory(void)
{
struct read_storage_sccb *sccb;
int i, id, assigned, rc;
+ struct mblock *mblocks;
+ struct kset *kset;
/* No standby memory in kdump mode */
if (oldmem_data.start)
Wouldn't we still want to create the ones for initial memory at least?
[...]
--
Cheers
David / dhildenb