Re: [PATCH] mm/kmemleak: Add cond_resched() to kmemleak_free_percpu()

From: Catalin Marinas
Date: Tue Nov 28 2023 - 11:04:49 EST


On Mon, Nov 27, 2023 at 02:41:53PM -0500, Waiman Long wrote:
> /**
> * kmemleak_free_percpu - unregister a previously registered __percpu object
> * @ptr: __percpu pointer to beginning of the object
> *
> * This function is called from the kernel percpu allocator when an object
> - * (memory block) is freed (free_percpu).
> + * (memory block) is freed (free_percpu). Since this function is inherently
> + * slow especially on systems with a large number of CPUs, defer the actual
> + * removal of kmemleak objects associated with the percpu pointer to a
> + * workqueue if it is not in a task context.
> */
> void __ref kmemleak_free_percpu(const void __percpu *ptr)
> {
> - unsigned int cpu;
> -
> pr_debug("%s(0x%px)\n", __func__, ptr);
>
> - if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
> - for_each_possible_cpu(cpu)
> - delete_object_full((unsigned long)per_cpu_ptr(ptr,
> - cpu));
> + if (!kmemleak_free_enabled || !ptr || IS_ERR(ptr))
> + return;
> +
> + if (!in_task()) {
> + struct kmemleak_percpu_addr *addr;
> +
> + addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
> + if (addr) {
> + INIT_WORK(&addr->work, kmemleak_free_percpu_workfn);
> + addr->ptr = ptr;
> + queue_work(system_long_wq, &addr->work);
> + return;
> + }

We can't defer this freeing. It can mess up the kmemleak metadata if the
per-cpu pointer is re-allocated before kmemleak removed it from its
object tree.

The problem is looking up the object tree for each per-cpu offset. We
can make the percpu pointer handling O(1) since freeing is only done by
the main __percpu pointer, so that's the only one needing a look-up. So
far the per-cpu pointers are not tracked for leaking, only scanned.

We could just add the per_cpu_ptr(ptr, 0) to the kmemleak
object_tree_root but when scanning we don't have an inverse function to
get the __percpu pointer back and calculate the pointers for the other
CPUs (well, we could with some hacks but they are probably fragile).

What I came up with is a separate object_percpu_tree_root similar to the
object_phys_tree_root. The only reason for these additional trees is to
look up the kmemleak metadata when needed (usually freeing). They don't
contain objects that are tracked for actual leaking, only scanned. A
briefly tested patch below. I need to go through it again, update some
comments and write a commit log:

---------------------8<---------------------------------
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 1eacca03bedd..7446c9e0b8c8 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -178,6 +178,8 @@ struct kmemleak_object {
#define OBJECT_FULL_SCAN (1 << 3)
/* flag set for object allocated with physical address */
#define OBJECT_PHYS (1 << 4)
+/* flag set for per-CPU pointers */
+#define OBJECT_PERCPU (1 << 5)

/* set when __remove_object() called */
#define DELSTATE_REMOVED (1 << 0)
@@ -206,6 +208,8 @@ static LIST_HEAD(mem_pool_free_list);
static struct rb_root object_tree_root = RB_ROOT;
/* search tree for object (with OBJECT_PHYS flag) boundaries */
static struct rb_root object_phys_tree_root = RB_ROOT;
+/* search tree for object (with OBJECT_PERCPU flag) boundaries */
+static struct rb_root object_percpu_tree_root = RB_ROOT;
/* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */
static DEFINE_RAW_SPINLOCK(kmemleak_lock);

@@ -298,7 +302,7 @@ static void hex_dump_object(struct seq_file *seq,
const u8 *ptr = (const u8 *)object->pointer;
size_t len;

- if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU)))
return;

/* limit the number of lines to HEX_MAX_LINES */
@@ -392,6 +396,15 @@ static void dump_object_info(struct kmemleak_object *object)
stack_depot_print(object->trace_handle);
}

+static struct rb_root *object_tree(unsigned long objflags)
+{
+ if (objflags & OBJECT_PHYS)
+ return &object_phys_tree_root;
+ if (objflags & OBJECT_PERCPU)
+ return &object_percpu_tree_root;
+ return &object_tree_root;
+}
+
/*
* Look-up a memory block metadata (kmemleak_object) in the object search
* tree based on a pointer value. If alias is 0, only values pointing to the
@@ -399,10 +412,9 @@ static void dump_object_info(struct kmemleak_object *object)
* when calling this function.
*/
static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
- bool is_phys)
+ unsigned int objflags)
{
- struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node :
- object_tree_root.rb_node;
+ struct rb_node *rb = object_tree(objflags)->rb_node;
unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);

while (rb) {
@@ -431,7 +443,7 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
/* Look-up a kmemleak object which allocated with virtual address. */
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
- return __lookup_object(ptr, alias, false);
+ return __lookup_object(ptr, alias, 0);
}

/*
@@ -544,14 +556,14 @@ static void put_object(struct kmemleak_object *object)
* Look up an object in the object search tree and increase its use_count.
*/
static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias,
- bool is_phys)
+ unsigned int objflags)
{
unsigned long flags;
struct kmemleak_object *object;

rcu_read_lock();
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = __lookup_object(ptr, alias, is_phys);
+ object = __lookup_object(ptr, alias, objflags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);

/* check whether the object is still available */
@@ -565,7 +577,7 @@ static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alia
/* Look up and get an object which allocated with virtual address. */
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
{
- return __find_and_get_object(ptr, alias, false);
+ return __find_and_get_object(ptr, alias, 0);
}

/*
@@ -575,9 +587,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
*/
static void __remove_object(struct kmemleak_object *object)
{
- rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ?
- &object_phys_tree_root :
- &object_tree_root);
+ rb_erase(&object->rb_node, object_tree(object->flags));
if (!(object->del_state & DELSTATE_NO_DELETE))
list_del_rcu(&object->object_list);
object->del_state |= DELSTATE_REMOVED;
@@ -585,11 +595,11 @@ static void __remove_object(struct kmemleak_object *object)

static struct kmemleak_object *__find_and_remove_object(unsigned long ptr,
int alias,
- bool is_phys)
+ unsigned int objflags)
{
struct kmemleak_object *object;

- object = __lookup_object(ptr, alias, is_phys);
+ object = __lookup_object(ptr, alias, objflags);
if (object)
__remove_object(object);

@@ -603,13 +613,13 @@ static struct kmemleak_object *__find_and_remove_object(unsigned long ptr,
* by create_object().
*/
static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias,
- bool is_phys)
+ unsigned int objflags)
{
unsigned long flags;
struct kmemleak_object *object;

raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = __find_and_remove_object(ptr, alias, is_phys);
+ object = __find_and_remove_object(ptr, alias, objflags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);

return object;
@@ -648,7 +658,7 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp)
}

static int __link_object(struct kmemleak_object *object, unsigned long ptr,
- size_t size, int min_count, bool is_phys)
+ size_t size, int min_count, unsigned int objflags)
{

struct kmemleak_object *parent;
@@ -661,7 +671,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
INIT_HLIST_HEAD(&object->area_list);
raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
- object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0);
+ object->flags = OBJECT_ALLOCATED | objflags;
object->pointer = ptr;
object->size = kfence_ksize((void *)ptr) ?: size;
object->excess_ref = 0;
@@ -697,12 +707,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* Only update min_addr and max_addr with object
* storing virtual address.
*/
- if (!is_phys) {
+ if (!(objflags & (OBJECT_PHYS | OBJECT_PERCPU))) {
min_addr = min(min_addr, untagged_ptr);
max_addr = max(max_addr, untagged_ptr + size);
}
- link = is_phys ? &object_phys_tree_root.rb_node :
- &object_tree_root.rb_node;
+ link = &object_tree(objflags)->rb_node;
rb_parent = NULL;
while (*link) {
rb_parent = *link;
@@ -724,8 +733,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
}
}
rb_link_node(&object->rb_node, rb_parent, link);
- rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
- &object_tree_root);
+ rb_insert_color(&object->rb_node, object_tree(objflags));
list_add_tail_rcu(&object->object_list, &object_list);

return 0;
@@ -737,7 +745,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* object_phys_tree_root).
*/
static void __create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp, bool is_phys)
+ int min_count, gfp_t gfp, unsigned int objflags)
{
struct kmemleak_object *object;
unsigned long flags;
@@ -748,7 +756,7 @@ static void __create_object(unsigned long ptr, size_t size,
return;

raw_spin_lock_irqsave(&kmemleak_lock, flags);
- ret = __link_object(object, ptr, size, min_count, is_phys);
+ ret = __link_object(object, ptr, size, min_count, objflags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
if (ret)
mem_pool_free(object);
@@ -758,14 +766,21 @@ static void __create_object(unsigned long ptr, size_t size,
static void create_object(unsigned long ptr, size_t size,
int min_count, gfp_t gfp)
{
- __create_object(ptr, size, min_count, gfp, false);
+ __create_object(ptr, size, min_count, gfp, 0);
}

/* Create kmemleak object which allocated with physical address. */
static void create_object_phys(unsigned long ptr, size_t size,
int min_count, gfp_t gfp)
{
- __create_object(ptr, size, min_count, gfp, true);
+ __create_object(ptr, size, min_count, gfp, OBJECT_PHYS);
+}
+
+/* Create kmemleak object corresponding to a per-CPU allocation. */
+static void create_object_percpu(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ __create_object(ptr, size, min_count, gfp, OBJECT_PERCPU);
}

/*
@@ -792,11 +807,11 @@ static void __delete_object(struct kmemleak_object *object)
* Look up the metadata (struct kmemleak_object) corresponding to ptr and
* delete it.
*/
-static void delete_object_full(unsigned long ptr)
+static void delete_object_full(unsigned long ptr, unsigned int objflags)
{
struct kmemleak_object *object;

- object = find_and_remove_object(ptr, 0, false);
+ object = find_and_remove_object(ptr, 0, objflags);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -812,7 +827,8 @@ static void delete_object_full(unsigned long ptr)
* delete it. If the memory block is partially freed, the function may create
* additional metadata for the remaining parts of the block.
*/
-static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
+static void delete_object_part(unsigned long ptr, size_t size,
+ unsigned int objflags)
{
struct kmemleak_object *object, *object_l, *object_r;
unsigned long start, end, flags;
@@ -826,7 +842,7 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
goto out;

raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = __find_and_remove_object(ptr, 1, is_phys);
+ object = __find_and_remove_object(ptr, 1, objflags);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
@@ -844,11 +860,11 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
end = object->pointer + object->size;
if ((ptr > start) &&
!__link_object(object_l, start, ptr - start,
- object->min_count, is_phys))
+ object->min_count, objflags))
object_l = NULL;
if ((ptr + size < end) &&
!__link_object(object_r, ptr + size, end - ptr - size,
- object->min_count, is_phys))
+ object->min_count, objflags))
object_r = NULL;

unlock:
@@ -879,11 +895,11 @@ static void paint_it(struct kmemleak_object *object, int color)
raw_spin_unlock_irqrestore(&object->lock, flags);
}

-static void paint_ptr(unsigned long ptr, int color, bool is_phys)
+static void paint_ptr(unsigned long ptr, int color, unsigned int objflags)
{
struct kmemleak_object *object;

- object = __find_and_get_object(ptr, 0, is_phys);
+ object = __find_and_get_object(ptr, 0, objflags);
if (!object) {
kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
ptr,
@@ -901,16 +917,16 @@ static void paint_ptr(unsigned long ptr, int color, bool is_phys)
*/
static void make_gray_object(unsigned long ptr)
{
- paint_ptr(ptr, KMEMLEAK_GREY, false);
+ paint_ptr(ptr, KMEMLEAK_GREY, 0);
}

/*
* Mark the object as black-colored so that it is ignored from scans and
* reporting.
*/
-static void make_black_object(unsigned long ptr, bool is_phys)
+static void make_black_object(unsigned long ptr, unsigned int objflags)
{
- paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
+ paint_ptr(ptr, KMEMLEAK_BLACK, objflags);
}

/*
@@ -1046,8 +1062,6 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
gfp_t gfp)
{
- unsigned int cpu;
-
pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);

/*
@@ -1055,9 +1069,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
* (min_count is set to 0).
*/
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- for_each_possible_cpu(cpu)
- create_object((unsigned long)per_cpu_ptr(ptr, cpu),
- size, 0, gfp);
+ create_object_percpu((unsigned long)ptr, size, 0, gfp);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);

@@ -1098,7 +1110,7 @@ void __ref kmemleak_free(const void *ptr)
pr_debug("%s(0x%px)\n", __func__, ptr);

if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
- delete_object_full((unsigned long)ptr);
+ delete_object_full((unsigned long)ptr, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);

@@ -1116,7 +1128,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
pr_debug("%s(0x%px)\n", __func__, ptr);

if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- delete_object_part((unsigned long)ptr, size, false);
+ delete_object_part((unsigned long)ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);

@@ -1129,14 +1141,10 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part);
*/
void __ref kmemleak_free_percpu(const void __percpu *ptr)
{
- unsigned int cpu;
-
pr_debug("%s(0x%px)\n", __func__, ptr);

if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
- for_each_possible_cpu(cpu)
- delete_object_full((unsigned long)per_cpu_ptr(ptr,
- cpu));
+ delete_object_full((unsigned long)ptr, OBJECT_PERCPU);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);

@@ -1204,7 +1212,7 @@ void __ref kmemleak_ignore(const void *ptr)
pr_debug("%s(0x%px)\n", __func__, ptr);

if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- make_black_object((unsigned long)ptr, false);
+ make_black_object((unsigned long)ptr, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);

@@ -1278,7 +1286,7 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
pr_debug("%s(0x%px)\n", __func__, &phys);

if (kmemleak_enabled)
- delete_object_part((unsigned long)phys, size, true);
+ delete_object_part((unsigned long)phys, size, OBJECT_PHYS);
}
EXPORT_SYMBOL(kmemleak_free_part_phys);

@@ -1292,7 +1300,7 @@ void __ref kmemleak_ignore_phys(phys_addr_t phys)
pr_debug("%s(0x%px)\n", __func__, &phys);

if (kmemleak_enabled)
- make_black_object((unsigned long)phys, true);
+ make_black_object((unsigned long)phys, OBJECT_PHYS);
}
EXPORT_SYMBOL(kmemleak_ignore_phys);

@@ -1303,7 +1311,7 @@ static bool update_checksum(struct kmemleak_object *object)
{
u32 old_csum = object->checksum;

- if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU)))
return false;

kasan_disable_current();
@@ -1459,7 +1467,6 @@ static void scan_object(struct kmemleak_object *object)
{
struct kmemleak_scan_area *area;
unsigned long flags;
- void *obj_ptr;

/*
* Once the object->lock is acquired, the corresponding memory block
@@ -1472,14 +1479,27 @@ static void scan_object(struct kmemleak_object *object)
/* already freed object */
goto out;

- obj_ptr = object->flags & OBJECT_PHYS ?
- __va((phys_addr_t)object->pointer) :
- (void *)object->pointer;
+ if (object->flags & OBJECT_PERCPU) {
+ unsigned int cpu;

- if (hlist_empty(&object->area_list) ||
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr((void __percpu *)object->pointer, cpu);
+ void *end = start + object->size;
+
+ scan_block(start, end, object);
+
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ cond_resched();
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if (!(object->flags & OBJECT_ALLOCATED))
+ break;
+ }
+ } else if (hlist_empty(&object->area_list) ||
object->flags & OBJECT_FULL_SCAN) {
- void *start = obj_ptr;
- void *end = obj_ptr + object->size;
+ void *start = object->flags & OBJECT_PHYS ?
+ __va((phys_addr_t)object->pointer) :
+ (void *)object->pointer;
+ void *end = start + object->size;
void *next;

do {
@@ -1494,11 +1514,12 @@ static void scan_object(struct kmemleak_object *object)
cond_resched();
raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
- } else
+ } else {
hlist_for_each_entry(area, &object->area_list, node)
scan_block((void *)area->start,
(void *)(area->start + area->size),
object);
+ }
out:
raw_spin_unlock_irqrestore(&object->lock, flags);
}

--
Catalin