Re: [PATCH v7 2/3] mm/mempolicy: Prepare weighted interleave sysfs for memory hotplug

From: Dan Williams
Date: Fri Apr 11 2025 - 18:24:55 EST


Rakie Kim wrote:
> On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@xxxxxxxxx> wrote:
> > Dan Williams wrote:
> > > >
> > > > +struct sysfs_wi_group {
> > > > + struct kobject wi_kobj;
> > > > + struct iw_node_attr *nattrs[];
> > > > +};
> > > > +
> > > > +static struct sysfs_wi_group *wi_group;
> > > > +
> > > > static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > > > char *buf)
> > > > {
> > > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > > > return count;
> > > > }
> > > >
> > > > -static struct iw_node_attr **node_attrs;
> > > > -
> > > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > > - struct kobject *parent)
> > > > +static void sysfs_wi_node_delete(int nid)
> > > > {
> > > > - if (!node_attr)
> > > > + if (!wi_group->nattrs[nid])
> > > > return;
> > > > - sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > > - kfree(node_attr->kobj_attr.attr.name);
> > > > - kfree(node_attr);
> > > > +
> > > > + sysfs_remove_file(&wi_group->wi_kobj,
> > > > + &wi_group->nattrs[nid]->kobj_attr.attr);
> > >
> > > This still looks broken to me, but I think this is more a problem that
> > > was present in the original code.
> > >
> > > At this point @wi_group's reference count is zero because
> > > sysfs_wi_release() has been called. However, it can only be zero if it has
> > > properly transitioned through kobject_del() and final kobject_put(). It
> > > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > > for the !parent case.
> > >
> > > So, either you are not triggering that path, or testing that path, but
> > > sysfs_remove_file() of the child attributes should be happening *before*
> > > sysfs_wi_release().
> > >
> > > Did I miss something?
> >
> > I think the missing change is that sysfs_wi_node_add() failures need to
> > be done with a sysfs_wi_node_delete() of the added attrs *before* the
> > kobject_del() of @wi_group.
>
> Hi Dan,
>
> Thank you for pointing out this issue.
>
> As you suggested, I believe the most appropriate way to handle this is
> to incorporate your feedback into Patch 1
> (mm/mempolicy: Fix memory leaks in weighted interleave sysfs).
>
> To ensure that sysfs_remove_file() is called before kobject_del(), I
> have restructured the code as follows:
>
> <Previously>
> static void sysfs_wi_release(struct kobject *wi_kobj)
> {
> int nid;
>
> for (nid = 0; nid < nr_node_ids; nid++)
> sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
> -> ERROR: sysfs_remove_file called here
> kfree(node_attrs);
> kfree(wi_kobj);
> }
>
> <Now>
> static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
> {
> int nid;
>
> for (nid = 0; nid < nr_node_ids; nid++)
> sysfs_wi_node_delete(node_attrs[nid], wi_kobj);

At this point the nodes were live, which means userspace could have
triggered an iw_table update. So I would expect this function to free
the iw_table once all of the node files have been deleted.

> -> sysfs_remove_file called here

Call iw_table_free() after the loop, where that is something like below
(untested!):

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..88538f23c7d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3430,6 +3430,28 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
return sysfs_emit(buf, "%d\n", weight);
}

+static void iw_table_install(u8 *new, struct iw_node_attr *node_attr, u8 weight)
+{
+ u8 *old;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ if (old && new)
+ memcpy(new, old, nr_node_ids);
+ if (new)
+ new[node_attr->nid] = weight;
+ rcu_assign_pointer(iw_table, new);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+}
+
+static void iw_table_free(void)
+{
+ iw_table_install(NULL, NULL, 0);
+}
+
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
@@ -3447,17 +3469,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
new = kzalloc(nr_node_ids, GFP_KERNEL);
if (!new)
return -ENOMEM;
+ iw_table_install(new, node_attr, weight);

- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
return count;
}

@@ -3550,15 +3563,6 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)

static void mempolicy_kobj_release(struct kobject *kobj)
{
- u8 *old;
-
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
kfree(node_attrs);
kfree(kobj);
}

> }
>
> static void sysfs_wi_release(struct kobject *wi_kobj)
> {
> kfree(node_attrs);
> kfree(wi_kobj);
> }
>
> In addition, I call sysfs_wi_node_delete_all() before kobject_del()
> during error handling:
>
> +err_cleanup_kobj:
> + sysfs_wi_node_delete_all(wi_kobj);
> kobject_del(wi_kobj);
>
> I believe this resolves the issue you raised.

Yes, along with the iw_table_free() change, because while it is not a
leak, it is awkward that mempolicy_kobj_release() arranges to keep
iw_table allocated long past the time the node attributes have been
deleted and shut down in sysfs.

> That said, I have a follow-up question. With this structure, when the
> system is shutting down, sysfs_remove_file() will not be called. Based
> on my review of other kernel subsystems, it seems that sysfs_remove_file()
> is only called during module_exit() in driver code, and not in other
> built-in subsystems.

Correct.

> Is this an acceptable practice? If you happen to know the expected
> behavior in such cases, I would appreciate your insights.

Yes, there are plenty of examples of sysfs infrastructure that gets set
up, but never torn down for the life of the kernel. The goal here is to
make the error unwind path correct and make the code clean for potentially
deleting mempolicy_kobj infrastructure in the future, but it is
otherwise ok if the only path that calls kobject_del() for an object is
the error unwind path.

>
> Below is the full content of the updated Patch 1.
> @@ -3463,8 +3463,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>
> static struct iw_node_attr **node_attrs;
>
> -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> - struct kobject *parent)
> +static void sysfs_wi_node_delete(struct iw_node_attr *node_attr,
> + struct kobject *parent)
> {
> if (!node_attr)
> return;
> @@ -3473,13 +3473,16 @@ static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> kfree(node_attr);
> }
>
> -static void sysfs_wi_release(struct kobject *wi_kobj)
> +static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
> {
> - int i;
> + int nid;
>
> - for (i = 0; i < nr_node_ids; i++)
> - sysfs_wi_node_release(node_attrs[i], wi_kobj);
> + for (nid = 0; nid < nr_node_ids; nid++)
> + sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
> +}
>
> +static void sysfs_wi_release(struct kobject *wi_kobj)
> +{
> kfree(node_attrs);
> kfree(wi_kobj);
> }
> @@ -3547,13 +3550,14 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
> err = add_weight_node(nid, wi_kobj);
> if (err) {
> pr_err("failed to add sysfs [node%d]\n", nid);
> - goto err_del_kobj;
> + goto err_cleanup_kobj;
> }
> }
>
> return 0;
>
> -err_del_kobj:
> +err_cleanup_kobj:
> + sysfs_wi_node_delete_all(wi_kobj);
> kobject_del(wi_kobj);
> err_put_kobj:
> kobject_put(wi_kobj);
>
> Thank you again for your helpful feedback.

Hey, thanks for the patience to get this all fixed up properly.