Re: [RFC PATCH 4/5] pgo: Add module notifier machinery
From: Kees Cook
Date: Mon Jun 14 2021 - 12:00:59 EST
On Sat, Jun 12, 2021 at 06:24:25AM +0300, Jarmo Tiitto wrote:
> Add module notifier callback and register modules
> into prf_list.
>
> For each module that has __llvm_prf_{data,cnts,names,vnds} sections
> The callback creates debugfs <module>.profraw entry and
> links new prf_object into prf_list.
>
> This enables profiling for all loaded modules.
>
> * Moved rcu_read_lock() outside of allocate_node() in order
> to protect __llvm_profile_instrument_target() from module unload(s)
>
> Signed-off-by: Jarmo Tiitto <jarmo.tiitto@xxxxxxxxx>
> ---
> v2: Passed QEMU SMP boot test into minimal Arch Linux distro,
> module loading and unloading work without warnings.
> Module profile data looks ok. :-)
> ---
> kernel/pgo/fs.c | 111 ++++++++++++++++++++++++++++++++++++++++
> kernel/pgo/instrument.c | 19 ++++---
> 2 files changed, 122 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/pgo/fs.c b/kernel/pgo/fs.c
> index 84b36e61758b..98b982245b58 100644
> --- a/kernel/pgo/fs.c
> +++ b/kernel/pgo/fs.c
> @@ -419,6 +419,110 @@ static const struct file_operations prf_reset_fops = {
> .llseek = noop_llseek,
> };
>
> +static void pgo_module_init(struct module *mod)
> +{
> + struct prf_object *po;
> + char fname[MODULE_NAME_LEN + 9]; /* +strlen(".profraw") */
> + unsigned long flags;
> +
> + /* add new prf_object entry for the module */
> + po = kzalloc(sizeof(*po), GFP_KERNEL);
> + if (!po)
> + goto out;
> +
> + po->mod_name = mod->name;
> +
> + fname[0] = 0;
> + snprintf(fname, sizeof(fname), "%s.profraw", po->mod_name);
> +
> + /* setup prf_object sections */
> + po->data = mod->prf_data;
> + po->data_num = prf_get_count(mod->prf_data,
> + (char *)mod->prf_data + mod->prf_data_size, sizeof(po->data[0]));
> +
> + po->cnts = mod->prf_cnts;
> + po->cnts_num = prf_get_count(mod->prf_cnts,
> + (char *)mod->prf_cnts + mod->prf_cnts_size, sizeof(po->cnts[0]));
> +
> + po->names = mod->prf_names;
> + po->names_num = prf_get_count(mod->prf_names,
> + (char *)mod->prf_names + mod->prf_names_size, sizeof(po->names[0]));
> +
> + po->vnds = mod->prf_vnds;
> + po->vnds_num = prf_get_count(mod->prf_vnds,
> + (char *)mod->prf_vnds + mod->prf_vnds_size, sizeof(po->vnds[0]));
> +
> + /* create debugfs entry */
> + po->file = debugfs_create_file(fname, 0600, directory, po, &prf_fops);
> + if (!po->file) {
> + pr_err("Failed to setup module pgo: %s", fname);
> + kfree(po);
> + goto out;
> + }
> +
> + /* finally enable profiling for the module */
> + flags = prf_list_lock();
> + list_add_tail_rcu(&po->link, &prf_list);
> + prf_list_unlock(flags);
> +
> +out:
> + return;
> +}
> +
> +static int pgo_module_notifier(struct notifier_block *nb, unsigned long event,
> + void *pdata)
> +{
> + struct module *mod = pdata;
> + struct prf_object *po;
> + unsigned long flags;
> +
> + if (event == MODULE_STATE_LIVE) {
> + /* does the module have profiling info? */
> + if (mod->prf_data
> + && mod->prf_cnts
> + && mod->prf_names
> + && mod->prf_vnds) {
> +
> + /* setup module profiling */
> + pgo_module_init(mod);
> + }
> + }
> +
> + if (event == MODULE_STATE_GOING) {
> + /* find the prf_object from the list */
> + rcu_read_lock();
> +
> + list_for_each_entry_rcu(po, &prf_list, link) {
> + if (strcmp(po->mod_name, mod->name) == 0)
> + goto out_unlock;
> + }
> + /* no such module */
> + po = NULL;
> +
> +out_unlock:
> + rcu_read_unlock();
Is it correct to do the unlock before the possible list_del_rcu()?
> +
> + if (po) {
> + /* remove from profiled modules */
> + flags = prf_list_lock();
> + list_del_rcu(&po->link);
> + prf_list_unlock(flags);
> +
> + debugfs_remove(po->file);
> + po->file = NULL;
> +
> + /* cleanup memory */
> + kfree_rcu(po, rcu);
> + }
Though I thought module load/unload was already under a global lock, so
maybe a race isn't possible here.
For the next version of this series, please Cc the module subsystem
maintainer as well:
Jessica Yu <jeyu@xxxxxxxxxx>
> + }
> +
> + return NOTIFY_OK;
> +}
> +
> +static struct notifier_block pgo_module_nb = {
> + .notifier_call = pgo_module_notifier
> +};
> +
> /* Create debugfs entries. */
> static int __init pgo_init(void)
> {
> @@ -444,6 +548,7 @@ static int __init pgo_init(void)
>
> /* enable profiling */
> flags = prf_list_lock();
> + prf_vmlinux.mod_name = "vmlinux";
> list_add_tail_rcu(&prf_vmlinux.link, &prf_list);
> prf_list_unlock(flags);
>
> @@ -460,6 +565,9 @@ static int __init pgo_init(void)
> &prf_reset_fops))
> goto err_remove;
>
> + /* register module notifer */
> + register_module_notifier(&pgo_module_nb);
> +
> /* show notice why the system slower: */
> pr_notice("Clang PGO instrumentation is active.");
>
> @@ -473,6 +581,9 @@ static int __init pgo_init(void)
> /* Remove debugfs entries. */
> static void __exit pgo_exit(void)
> {
> + /* unsubscribe the notifier and do cleanup. */
> + unregister_module_notifier(&pgo_module_nb);
> +
> debugfs_remove_recursive(directory);
> }
>
> diff --git a/kernel/pgo/instrument.c b/kernel/pgo/instrument.c
> index e214c9d7a113..70bab7e4c153 100644
> --- a/kernel/pgo/instrument.c
> +++ b/kernel/pgo/instrument.c
> @@ -33,7 +33,6 @@
> * ensures that we don't try to serialize data that's only partially updated.
> */
> static DEFINE_SPINLOCK(pgo_lock);
> -static int current_node;
>
> unsigned long prf_lock(void)
> {
> @@ -62,8 +61,6 @@ static struct llvm_prf_value_node *allocate_node(struct llvm_prf_data *p,
> struct llvm_prf_data *data_end;
> int max_vnds;
>
> - rcu_read_lock();
> -
Please move these rcu locks change into the patch that introduces them
to avoid adding those changes here.
> list_for_each_entry_rcu(po, &prf_list, link) {
> /* get section limits */
> max_vnds = prf_vnds_count(po);
> @@ -76,7 +73,6 @@ static struct llvm_prf_value_node *allocate_node(struct llvm_prf_data *p,
> */
> if (memory_contains(po->data, data_end, p, sizeof(*p))) {
>
> -
> if (WARN_ON_ONCE(po->current_node >= max_vnds))
> return NULL; /* Out of nodes */
>
> @@ -87,7 +83,6 @@ static struct llvm_prf_value_node *allocate_node(struct llvm_prf_data *p,
> }
>
> out:
> - rcu_read_unlock();
> return vnode;
> }
>
> @@ -108,8 +103,14 @@ void __llvm_profile_instrument_target(u64 target_value, void *data, u32 index)
> u8 values = 0;
> unsigned long flags;
>
> + /*
> + * lock the underlying prf_object(s) in place,
> + * so they won't vanish while we are operating on it.
> + */
> + rcu_read_lock();
> +
> if (!p || !p->values)
> - return;
> + goto rcu_unlock;
>
> counters = (struct llvm_prf_value_node **)p->values;
> curr = counters[index];
> @@ -117,7 +118,7 @@ void __llvm_profile_instrument_target(u64 target_value, void *data, u32 index)
> while (curr) {
> if (target_value == curr->value) {
> curr->count++;
> - return;
> + goto rcu_unlock;
> }
>
> if (curr->count < min_count) {
> @@ -136,7 +137,7 @@ void __llvm_profile_instrument_target(u64 target_value, void *data, u32 index)
> curr->value = target_value;
> curr->count++;
> }
> - return;
> + goto rcu_unlock;
> }
>
> /* Lock when updating the value node structure. */
> @@ -156,6 +157,8 @@ void __llvm_profile_instrument_target(u64 target_value, void *data, u32 index)
>
> out:
> prf_unlock(flags);
> +rcu_unlock:
> + rcu_read_unlock();
> }
> EXPORT_SYMBOL(__llvm_profile_instrument_target);
>
> --
> 2.32.0
>
--
Kees Cook