Re: [RFC PATCH 4/5] pgo: Add module notifier machinery

From: jarmo . tiitto
Date: Mon Jun 14 2021 - 14:31:21 EST


Kees Cook wrote maanantaina 14. kesäkuuta 2021 19.00.34 EEST:
> On Sat, Jun 12, 2021 at 06:24:25AM +0300, Jarmo Tiitto wrote:
> > Add module notifier callback and register modules
> > into prf_list.
> >
> > For each module that has __llvm_prf_{data,cnts,names,vnds} sections
> > The callback creates debugfs <module>.profraw entry and
> > links new prf_object into prf_list.
> >
> > This enables profiling for all loaded modules.
> >
> > * Moved rcu_read_lock() outside of allocate_node() in order
> >
> > to protect __llvm_profile_instrument_target() from module unload(s)
> >
> > Signed-off-by: Jarmo Tiitto <jarmo.tiitto@xxxxxxxxx>
> > ---
> > v2: Passed QEMU SMP boot test into minimal Arch Linux distro,
> > module loading and unloading work without warnings.
> > Module profile data looks ok. :-)
> > ---
> >
> > kernel/pgo/fs.c | 111 ++++++++++++++++++++++++++++++++++++++++
> > kernel/pgo/instrument.c | 19 ++++---
> > 2 files changed, 122 insertions(+), 8 deletions(-)
> >
> > diff --git a/kernel/pgo/fs.c b/kernel/pgo/fs.c
> > index 84b36e61758b..98b982245b58 100644
> > --- a/kernel/pgo/fs.c
> > +++ b/kernel/pgo/fs.c
> > @@ -419,6 +419,110 @@ static const struct file_operations prf_reset_fops =
{
> >
> > .llseek = noop_llseek,
> >
> > };
> >
> > +static void pgo_module_init(struct module *mod)
> > +{
> > + struct prf_object *po;
> > + char fname[MODULE_NAME_LEN + 9]; /* +strlen(".profraw") */
> > + unsigned long flags;
> > +
> > + /* add new prf_object entry for the module */
> > + po = kzalloc(sizeof(*po), GFP_KERNEL);
> > + if (!po)
> > + goto out;
> > +
> > + po->mod_name = mod->name;
> > +
> > + fname[0] = 0;
> > + snprintf(fname, sizeof(fname), "%s.profraw", po->mod_name);
> > +
> > + /* setup prf_object sections */
> > + po->data = mod->prf_data;
> > + po->data_num = prf_get_count(mod->prf_data,
> > + (char *)mod->prf_data + mod->prf_data_size,
sizeof(po->data[0]));
> > +
> > + po->cnts = mod->prf_cnts;
> > + po->cnts_num = prf_get_count(mod->prf_cnts,
> > + (char *)mod->prf_cnts + mod->prf_cnts_size,
sizeof(po->cnts[0]));
> > +
> > + po->names = mod->prf_names;
> > + po->names_num = prf_get_count(mod->prf_names,
> > + (char *)mod->prf_names + mod->prf_names_size,
sizeof(po->names[0]));
> > +
> > + po->vnds = mod->prf_vnds;
> > + po->vnds_num = prf_get_count(mod->prf_vnds,
> > + (char *)mod->prf_vnds + mod->prf_vnds_size,
sizeof(po->vnds[0]));
> > +
> > + /* create debugfs entry */
> > + po->file = debugfs_create_file(fname, 0600, directory, po,
&prf_fops);
> > + if (!po->file) {
> > + pr_err("Failed to setup module pgo: %s", fname);
> > + kfree(po);
> > + goto out;
> > + }
> > +
> > + /* finally enable profiling for the module */
> > + flags = prf_list_lock();
> > + list_add_tail_rcu(&po->link, &prf_list);
> > + prf_list_unlock(flags);
> > +
> > +out:
> > + return;
> > +}
> > +
> > +static int pgo_module_notifier(struct notifier_block *nb, unsigned long
> > event, + void *pdata)
> > +{
> > + struct module *mod = pdata;
> > + struct prf_object *po;
> > + unsigned long flags;
> > +
> > + if (event == MODULE_STATE_LIVE) {
> > + /* does the module have profiling info? */
> > + if (mod->prf_data
> > + && mod->prf_cnts
> > + && mod->prf_names
> > + && mod->prf_vnds) {
> > +
> > + /* setup module profiling */
> > + pgo_module_init(mod);
> > + }
> > + }
> > +
> > + if (event == MODULE_STATE_GOING) {
> > + /* find the prf_object from the list */
> > + rcu_read_lock();
> > +
> > + list_for_each_entry_rcu(po, &prf_list, link) {
> > + if (strcmp(po->mod_name, mod->name) ==
0)
> > + goto out_unlock;
> > + }
> > + /* no such module */
> > + po = NULL;
> > +
> > +out_unlock:
> > + rcu_read_unlock();
>
> Is it correct to do the unlock before the possible list_del_rcu()?
>

Oh, list_del_rcu() should then propably go before unlocking. I'll fix that.

> > +
> > + if (po) {
> > + /* remove from profiled modules */
> > + flags = prf_list_lock();
> > + list_del_rcu(&po->link);
> > + prf_list_unlock(flags);
> > +
> > + debugfs_remove(po->file);
> > + po->file = NULL;
> > +
> > + /* cleanup memory */
> > + kfree_rcu(po, rcu);
> > + }
>
> Though I thought module load/unload was already under a global lock, so
> maybe a race isn't possible here.
>

I searched a bit and found out that module.c/module_mutex is not held during
the module notifier MODULE_STATE_GOING callbacks.
But the callback it only invoked only once per module un/load so I think it is
okay.
If I remember correctly, the main reason I moved the tear down code after
rcu_read_unlock() was that debugfs_remove() may sleep.


> For the next version of this series, please Cc the module subsystem
> maintainer as well:
> Jessica Yu <jeyu@xxxxxxxxxx>
>

OK, after all there is a lot of pointers to the kernel's module subsys.

> > + }
> > +
> > + return NOTIFY_OK;
> > +}
> > +
> > +static struct notifier_block pgo_module_nb = {
> > + .notifier_call = pgo_module_notifier
> > +};
> > +
> >
> > /* Create debugfs entries. */
> > static int __init pgo_init(void)
> > {
> >
> > @@ -444,6 +548,7 @@ static int __init pgo_init(void)
> >
> > /* enable profiling */
> > flags = prf_list_lock();
> >
> > + prf_vmlinux.mod_name = "vmlinux";
> >
> > list_add_tail_rcu(&prf_vmlinux.link, &prf_list);
> > prf_list_unlock(flags);
> >
> > @@ -460,6 +565,9 @@ static int __init pgo_init(void)
> >
> > &prf_reset_fops))
> >
> > goto err_remove;
> >
> > + /* register module notifer */
> > + register_module_notifier(&pgo_module_nb);
> > +
> >
> > /* show notice why the system slower: */
> > pr_notice("Clang PGO instrumentation is active.");
> >
> > @@ -473,6 +581,9 @@ static int __init pgo_init(void)
> >
> > /* Remove debugfs entries. */
> > static void __exit pgo_exit(void)
> > {
> >
> > + /* unsubscribe the notifier and do cleanup. */
> > + unregister_module_notifier(&pgo_module_nb);
> > +
> >
> > debugfs_remove_recursive(directory);
> >
> > }
> >
> > diff --git a/kernel/pgo/instrument.c b/kernel/pgo/instrument.c
> > index e214c9d7a113..70bab7e4c153 100644
> > --- a/kernel/pgo/instrument.c
> > +++ b/kernel/pgo/instrument.c
> > @@ -33,7 +33,6 @@
> >
> > * ensures that we don't try to serialize data that's only partially
> > updated.
> > */
> >
> > static DEFINE_SPINLOCK(pgo_lock);
> >
> > -static int current_node;
> >
> > unsigned long prf_lock(void)
> > {
> >
> > @@ -62,8 +61,6 @@ static struct llvm_prf_value_node *allocate_node(struct
> > llvm_prf_data *p,>
> > struct llvm_prf_data *data_end;
> > int max_vnds;
> >
> > - rcu_read_lock();
> > -
>
> Please move these rcu locks change into the patch that introduces them
> to avoid adding those changes here.
>
> > list_for_each_entry_rcu(po, &prf_list, link) {
> >
> > /* get section limits */
> > max_vnds = prf_vnds_count(po);
> >
> > @@ -76,7 +73,6 @@ static struct llvm_prf_value_node *allocate_node(struct
> > llvm_prf_data *p,>
> > */
> >
> > if (memory_contains(po->data, data_end, p,
sizeof(*p))) {
> >
> > -
> >
> > if (WARN_ON_ONCE(po->current_node >=
max_vnds))
> >
> > return NULL; /* Out of
nodes */
> >
> > @@ -87,7 +83,6 @@ static struct llvm_prf_value_node *allocate_node(struct
> > llvm_prf_data *p,>
> > }
> >
> > out:
> > - rcu_read_unlock();
> >
> > return vnode;
> >
> > }
> >
> > @@ -108,8 +103,14 @@ void __llvm_profile_instrument_target(u64
target_value,
> > void *data, u32 index)>
> > u8 values = 0;
> > unsigned long flags;
> >
> > + /*
> > + * lock the underlying prf_object(s) in place,
> > + * so they won't vanish while we are operating on it.
> > + */
> > + rcu_read_lock();
> > +
> >
> > if (!p || !p->values)
> >
> > - return;
> > + goto rcu_unlock;
> >
> > counters = (struct llvm_prf_value_node **)p->values;
> > curr = counters[index];
> >
> > @@ -117,7 +118,7 @@ void __llvm_profile_instrument_target(u64 target_value,
> > void *data, u32 index)>
> > while (curr) {
> >
> > if (target_value == curr->value) {
> >
> > curr->count++;
> >
> > - return;
> > + goto rcu_unlock;
> >
> > }
> >
> > if (curr->count < min_count) {
> >
> > @@ -136,7 +137,7 @@ void __llvm_profile_instrument_target(u64 target_value,
> > void *data, u32 index)>
> > curr->value = target_value;
> > curr->count++;
> >
> > }
> >
> > - return;
> > + goto rcu_unlock;
> >
> > }
> >
> > /* Lock when updating the value node structure. */
> >
> > @@ -156,6 +157,8 @@ void __llvm_profile_instrument_target(u64 target_value,
> > void *data, u32 index)>
> > out:
> > prf_unlock(flags);
> >
> > +rcu_unlock:
> > + rcu_read_unlock();
> >
> > }
> > EXPORT_SYMBOL(__llvm_profile_instrument_target);
> >
> > --
> > 2.32.0
>
> --
> Kees Cook