[PATCH] arm64: clear_user: align __arch_clear_user() to 64B for I-cache efficiency

From: Luke Yang

Date: Fri Nov 21 2025 - 00:05:12 EST


On aarch64 kernels, recent changes (specifically irqbypass patch
https://lore.kernel.org/all/20250516230734.2564775-6-seanjc@xxxxxxxxxx/)
shifted __arch_clear_user() such that the tight zeroing loop straddles
I-cache lines. This causes measurable read performance regression when
reading from /dev/zero.

Add `.p2align 6` (64-byte alignment) to guarantee the loop stays within a
single I-cache boundary, restoring the previous IPC and throughput.

Tested on bare-metal aarch64 systems:

Good kernel: pread_z100k ~ 6.9 s
Bad kernel: pread_z100k ~ 9.0 s
With patch: pread_z100k ~ 6.9 s

Reproducer:

// gcc -O2 -Wall -Wextra -o pread_z100k pread_z100k.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/time.h>

#define SIZE (100 * 1024)
#define COUNT 1000000

/*
 * Wall-clock timestamp in seconds (double, microsecond resolution).
 * Intended only for measuring the elapsed interval between two calls.
 */
static double now_sec(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec / 1e6;
}

/*
 * Benchmark driver: repeatedly pread() SIZE bytes from /dev/zero and
 * print the total elapsed wall-clock time in seconds.
 *
 * Returns 0 on success, 1 on any setup or read failure.
 */
int main(void)
{
    int fd = open("/dev/zero", O_RDONLY);
    if (fd < 0) {
        perror("open /dev/zero");
        return 1;
    }

    char *buf = malloc(SIZE);
    if (!buf) {
        perror("malloc");
        close(fd);
        return 1;
    }

    int rc = 0;
    double t1 = now_sec();
    for (int i = 0; i < COUNT; i++) {
        ssize_t r = pread(fd, buf, SIZE, 0);
        if (r < 0) {
            /* Genuine syscall failure: errno is valid here. */
            perror("pread");
            rc = 1;
            break;
        }
        if (r != SIZE) {
            /*
             * Short read: pread() succeeded, so errno is NOT set and
             * perror() would print a stale message. Report the counts.
             */
            fprintf(stderr, "pread: short read (%zd of %d bytes)\n", r, SIZE);
            rc = 1;
            break;
        }
    }
    double t2 = now_sec();

    printf("%.6f\n", t2 - t1);

    close(fd);
    free(buf);
    return rc;
}

Signed-off-by: Luke Yang <luyang@xxxxxxxxxx>
Signed-off-by: Jirka Hladky <jhladky@xxxxxxxxxx>
---
arch/arm64/lib/clear_user.S | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index de9a303b6..91eee4a7c 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,6 +17,11 @@
* Alignment fixed up by hardware.
*/

+/*
+ * Ensure __arch_clear_user() always starts on a clean I-cache boundary.
+ */
+ .p2align 6 // 2^6 = 64-byte alignment
+
SYM_FUNC_START(__arch_clear_user)
add x2, x0, x1

--
2.51.1