Re: [PATCH] mm: kfence: Improve the performance of __kfence_alloc() and __kfence_free()

From: Marco Elver
Date: Mon Apr 03 2023 - 05:22:53 EST


On Mon, 3 Apr 2023 at 08:28, Peng Zhang <zhangpeng.00@xxxxxxxxxxxxx> wrote:
>
> In __kfence_alloc() and __kfence_free(), we will set and check canary.
> Assuming that the size of the object is close to 0, nearly 4k memory
> accesses are required because setting and checking canary is executed
> byte by byte.
>
> canary is now defined like this:
> KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
>
> Observe that canary is only related to the lower three bits of the
> address, so every 8 bytes of canary are the same. We can access 8-byte
> canary each time instead of byte-by-byte, thereby optimizing nearly 4k
> memory accesses to 4k/8 times.
>
> Use the bcc tool funclatency to measure the latency of __kfence_alloc()
> and __kfence_free(), the numbers (deleted the distribution of latency)
> is posted below. Though different object sizes will have an impact on the
> measurement, we ignore it for now and assume the average object size is
> roughly equal.
>
> Before playing patch:
> __kfence_alloc:
> avg = 5055 nsecs, total: 5515252 nsecs, count: 1091
> __kfence_free:
> avg = 5319 nsecs, total: 9735130 nsecs, count: 1830
>
> After playing patch:
> __kfence_alloc:
> avg = 3597 nsecs, total: 6428491 nsecs, count: 1787
> __kfence_free:
> avg = 3046 nsecs, total: 3415390 nsecs, count: 1121

Seems like a nice improvement!

> The numbers indicate that there is ~30% - ~40% performance improvement.
>
> Signed-off-by: Peng Zhang <zhangpeng.00@xxxxxxxxxxxxx>
> ---
> mm/kfence/core.c | 71 +++++++++++++++++++++++++++++++++-------------
> mm/kfence/kfence.h | 10 ++++++-
> mm/kfence/report.c | 2 +-
> 3 files changed, 62 insertions(+), 21 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 79c94ee55f97..0b1b1298c738 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -297,20 +297,13 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex
> WRITE_ONCE(meta->state, next);
> }
>
> -/* Write canary byte to @addr. */
> -static inline bool set_canary_byte(u8 *addr)
> -{
> - *addr = KFENCE_CANARY_PATTERN(addr);
> - return true;
> -}
> -
> /* Check canary byte at @addr. */
> static inline bool check_canary_byte(u8 *addr)
> {
> struct kfence_metadata *meta;
> unsigned long flags;
>
> - if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
> + if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr)))
> return true;
>
> atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
> @@ -323,11 +316,27 @@ static inline bool check_canary_byte(u8 *addr)
> return false;
> }
>
> -/* __always_inline this to ensure we won't do an indirect call to fn. */
> -static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
> +static inline void set_canary(const struct kfence_metadata *meta)
> {
> const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
> - unsigned long addr;
> + unsigned long addr = pageaddr;
> +
> + /*
> + * The canary may be written to part of the object memory, but it does
> + * not affect it. The user should initialize the object before using it.
> + */
> + for (; addr < meta->addr; addr += sizeof(u64))
> + *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
> +
> + addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64));
> + for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64))
> + *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
> +}
> +
> +static inline void check_canary(const struct kfence_metadata *meta)
> +{
> + const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
> + unsigned long addr = pageaddr;
>
> /*
> * We'll iterate over each canary byte per-side until fn() returns

This comment is now out-of-date ("fn" no longer exists).

> @@ -339,14 +348,38 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
> */
>
> /* Apply to left of object. */
> - for (addr = pageaddr; addr < meta->addr; addr++) {
> - if (!fn((u8 *)addr))
> + for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) {
> + if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
> break;
> }
>
> - /* Apply to right of object. */
> - for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
> - if (!fn((u8 *)addr))
> + /*
> + * If the canary is damaged in a certain 64 bytes, or the canay memory

"damaged" -> "corrupted"
"canay" -> "canary"

> + * cannot be completely covered by multiple consecutive 64 bytes, it
> + * needs to be checked one by one.
> + */
> + for (; addr < meta->addr; addr++) {
> + if (unlikely(!check_canary_byte((u8 *)addr)))
> + break;
> + }
> +
> + /*
> + * Apply to right of object.
> + * For easier implementation, check from high address to low address.
> + */
> + addr = pageaddr + PAGE_SIZE - sizeof(u64);
> + for (; addr >= meta->addr + meta->size ; addr -= sizeof(u64)) {
> + if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
> + break;
> + }
> +
> + /*
> + * Same as above, checking byte by byte, but here is the reverse of
> + * the above.
> + */
> + addr = addr + sizeof(u64) - 1;
> + for (; addr >= meta->addr + meta->size; addr--) {

The re-checking should forward-check i.e. not in reverse, otherwise
the report might not include some corrupted bytes that had in the
previous version been included. I think you need to check from low to
high address to start with above.

> + if (unlikely(!check_canary_byte((u8 *)addr)))
> break;
> }
> }
> @@ -434,7 +467,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> #endif
>
> /* Memory initialization. */
> - for_each_canary(meta, set_canary_byte);
> + set_canary(meta);
>
> /*
> * We check slab_want_init_on_alloc() ourselves, rather than letting
> @@ -495,7 +528,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
> alloc_covered_add(meta->alloc_stack_hash, -1);
>
> /* Check canary bytes for memory corruption. */
> - for_each_canary(meta, check_canary_byte);
> + check_canary(meta);
>
> /*
> * Clear memory if init-on-free is set. While we protect the page, the
> @@ -751,7 +784,7 @@ static void kfence_check_all_canary(void)
> struct kfence_metadata *meta = &kfence_metadata[i];
>
> if (meta->state == KFENCE_OBJECT_ALLOCATED)
> - for_each_canary(meta, check_canary_byte);
> + check_canary(meta);
> }
> }
>
> diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> index 600f2e2431d6..2aafc46a4aaf 100644
> --- a/mm/kfence/kfence.h
> +++ b/mm/kfence/kfence.h
> @@ -21,7 +21,15 @@
> * lower 3 bits of the address, to detect memory corruptions with higher
> * probability, where similar constants are used.
> */
> -#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
> +#define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
> +
> +/*
> + * Define a continuous 8-byte canary starting from a multiple of 8. The canary
> + * of each byte is only related to the lowest three bits of its address, so the
> + * canary of every 8 bytes is the same. 64-bit memory can be filled and checked
> + * at a time instead of byte by byte to improve performance.
> + */
> +#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(0x0706050403020100))
>
> /* Maximum stack depth for reports. */
> #define KFENCE_STACK_DEPTH 64
> diff --git a/mm/kfence/report.c b/mm/kfence/report.c
> index 60205f1257ef..197430a5be4a 100644
> --- a/mm/kfence/report.c
> +++ b/mm/kfence/report.c
> @@ -168,7 +168,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
>
> pr_cont("[");
> for (cur = (const u8 *)address; cur < end; cur++) {
> - if (*cur == KFENCE_CANARY_PATTERN(cur))
> + if (*cur == KFENCE_CANARY_PATTERN_U8(cur))
> pr_cont(" .");
> else if (no_hash_pointers)
> pr_cont(" 0x%02x", *cur);
> --
> 2.20.1
>