Re: [PATCH v3 bpf-next 1/3] bpf: Add kmem_cache iterator

From: Vlastimil Babka
Date: Wed Oct 02 2024 - 06:54:33 EST


On 10/2/24 08:54, Namhyung Kim wrote:
> The new "kmem_cache" iterator will traverse the list of slab caches
> and call attached BPF programs for each entry. The programs should
> check whether the argument (ctx.s) is NULL before using it.
>
> The iteration now grabs the slab_mutex only while it traverses the list
> and releases it before running the BPF program. The kmem_cache entry
> is protected by a refcount during the execution.
>
> It includes the internal "mm/slab.h" header to access kmem_cache,
> slab_caches and slab_mutex. Hope that's ok with the mm folks.
>
> Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
> ---
> I've removed the Acked-by's from Roman and Vlastimil since the code
> changed to not hold the slab_mutex while running the BPF program and to
> manage the refcount instead. Please review this change again!
>
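
(For context, a rough sketch of how a BPF program could consume this
iterator; the section name follows from DEFINE_BPF_ITER_FUNC(kmem_cache, ...)
below, and the program name and printed fields are purely illustrative. Note
the NULL check on ctx->s that the commit message asks for.)

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: print name and object size of every slab cache. */
#include "vmlinux.h"	/* assumes BTF generated from a kernel with this patch */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/kmem_cache")
int dump_kmem_cache(struct bpf_iter__kmem_cache *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct kmem_cache *s = ctx->s;

	/* ctx->s is PTR_TO_BTF_ID_OR_NULL, so it must be NULL-checked */
	if (!s)
		return 0;

	BPF_SEQ_PRINTF(seq, "%s: object size %u\n", s->name, s->size);
	return 0;
}

Once loaded, such a program can be pinned with "bpftool iter pin" and the
output read back through the pinned file.
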
> include/linux/btf_ids.h | 1 +
> kernel/bpf/Makefile | 1 +
> kernel/bpf/kmem_cache_iter.c | 165 +++++++++++++++++++++++++++++++++++
> 3 files changed, 167 insertions(+)
> create mode 100644 kernel/bpf/kmem_cache_iter.c
>
> diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
> index c0e3e1426a82f5c4..139bdececdcfaefb 100644
> --- a/include/linux/btf_ids.h
> +++ b/include/linux/btf_ids.h
> @@ -283,5 +283,6 @@ extern u32 btf_tracing_ids[];
> extern u32 bpf_cgroup_btf_id[];
> extern u32 bpf_local_storage_map_btf_id[];
> extern u32 btf_bpf_map_id[];
> +extern u32 bpf_kmem_cache_btf_id[];
>
> #endif
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 9b9c151b5c826b31..105328f0b9c04e37 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
> obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
> obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
> obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
> +obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
> diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
> new file mode 100644
> index 0000000000000000..a77c08b82c6bc965
> --- /dev/null
> +++ b/kernel/bpf/kmem_cache_iter.c
> @@ -0,0 +1,165 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024 Google */
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/slab.h>
> +#include <linux/kernel.h>
> +#include <linux/seq_file.h>
> +
> +#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
> +
> +struct bpf_iter__kmem_cache {
> + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> + __bpf_md_ptr(struct kmem_cache *, s);
> +};
> +
> +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + loff_t cnt = 0;
> + struct kmem_cache *s = NULL;
> +
> + mutex_lock(&slab_mutex);
> +
> + /*
> + * Find an entry at the given position in the slab_caches list instead
> + * of keeping a reference (of the last visited entry, if any) out of
> + * slab_mutex. It might miss something if one is deleted in the middle
> + * while it releases the lock. But it should be rare and there's not
> + * much we can do about it.
> + */
> + list_for_each_entry(s, &slab_caches, list) {
> + if (cnt == *pos) {
> + /*
> + * Make sure this entry remains in the list by getting
> + * a new reference count. Note that boot_cache entries
> + * have a negative refcount, so don't touch them.
> + */
> + if (s->refcount > 0)
> + s->refcount++;
> + break;
> + }
> +
> + cnt++;
> + }
> + mutex_unlock(&slab_mutex);
> +
> + if (cnt != *pos)
> + return NULL;
> +
> + ++*pos;
> + return s;
> +}
> +
> +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> + struct bpf_iter_meta meta;
> + struct bpf_iter__kmem_cache ctx = {
> + .meta = &meta,
> + .s = v,
> + };
> + struct bpf_prog *prog;
> + bool destroy = false;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, true);
> + if (prog)
> + bpf_iter_run_prog(prog, &ctx);
> +
> + mutex_lock(&slab_mutex);
> + if (ctx.s && ctx.s->refcount > 0)
> + destroy = true;

I'd do the same optimization as in kmem_cache_iter_seq_next(), otherwise this
will always result in taking the mutex twice and performing
kvfree_rcu_barrier() needlessly?
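
I.e. something like this (untested sketch, just mirroring the refcount
handling already in kmem_cache_iter_seq_next()):

	mutex_lock(&slab_mutex);
	if (ctx.s) {
		/* Drop our reference; destroy only when it was the last one */
		if (ctx.s->refcount > 1)
			ctx.s->refcount--;
		else if (ctx.s->refcount == 1)
			destroy = true;
	}
	mutex_unlock(&slab_mutex);

	if (destroy)
		kmem_cache_destroy(ctx.s);

That way kmem_cache_destroy() (and the barrier inside it) only runs when the
iterator held the last reference.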

> + mutex_unlock(&slab_mutex);
> +
> + if (destroy)
> + kmem_cache_destroy(ctx.s);
> +}
> +
> +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct kmem_cache *s = v;
> + struct kmem_cache *next = NULL;
> + bool destroy = false;
> +
> + ++*pos;
> +
> + mutex_lock(&slab_mutex);
> +
> + if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> + next = list_next_entry(s, list);
> + if (next->refcount > 0)
> + next->refcount++;
> + }
> +
> + /* Skip kmem_cache_destroy() for active entries */
> + if (s->refcount > 1)
> + s->refcount--;
> + else if (s->refcount == 1)
> + destroy = true;
> +
> + mutex_unlock(&slab_mutex);
> +
> + if (destroy)
> + kmem_cache_destroy(s);
> +
> + return next;
> +}
> +
> +static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
> +{
> + struct bpf_iter_meta meta;
> + struct bpf_iter__kmem_cache ctx = {
> + .meta = &meta,
> + .s = v,
> + };
> + struct bpf_prog *prog;
> + int ret = 0;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, false);
> + if (prog)
> + ret = bpf_iter_run_prog(prog, &ctx);
> +
> + return ret;
> +}
> +
> +static const struct seq_operations kmem_cache_iter_seq_ops = {
> + .start = kmem_cache_iter_seq_start,
> + .next = kmem_cache_iter_seq_next,
> + .stop = kmem_cache_iter_seq_stop,
> + .show = kmem_cache_iter_seq_show,
> +};
> +
> +BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
> +
> +static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
> + .seq_ops = &kmem_cache_iter_seq_ops,
> +};
> +
> +static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
> + struct seq_file *seq)
> +{
> + seq_puts(seq, "kmem_cache iter\n");
> +}
> +
> +DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
> + struct kmem_cache *s)
> +
> +static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
> + .target = "kmem_cache",
> + .feature = BPF_ITER_RESCHED,
> + .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
> + .ctx_arg_info_size = 1,
> + .ctx_arg_info = {
> + { offsetof(struct bpf_iter__kmem_cache, s),
> + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
> + },
> + .seq_info = &kmem_cache_iter_seq_info,
> +};
> +
> +static int __init bpf_kmem_cache_iter_init(void)
> +{
> + bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
> + return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
> +}
> +
> +late_initcall(bpf_kmem_cache_iter_init);
>
> base-commit: 9502a7de5a61bec3bda841a830560c5d6d40ecac