[PATCH 2/5] LoongArch: BPF: Add private stack support
From: George Guo
Date: Wed Jun 17 2026 - 23:39:34 EST
From: George Guo <guodongtai@xxxxxxxxxx>
Support per-program private stacks, advertised via
bpf_jit_supports_private_stack(). When the verifier marks a program with
jits_use_priv_stack (e.g. a sufficiently deep, potentially recursive
tracing program), its BPF stack is moved off the kernel stack into a
per-CPU allocation, reducing kernel stack pressure.
The private stack is allocated in bpf_int_jit_compile() as the
verifier-computed stack depth plus two 16-byte guard regions used to
detect overflow and underflow; the guards are initialised at allocation
time and validated in bpf_jit_free(). S5 (otherwise saved/restored but
unused by the JIT) is reused to hold the private stack pointer. The
prologue computes it from the per-CPU address just past the first guard
region, adding the current CPU's per-CPU offset ($r21) on SMP. When a
private stack is in use the BPF frame pointer points into this per-CPU
region and the BPF stack is no longer reserved on the kernel stack.
Signed-off-by: George Guo <guodongtai@xxxxxxxxxx>
---
arch/loongarch/net/bpf_jit.c | 111 ++++++++++++++++++++++++++++++++++-
arch/loongarch/net/bpf_jit.h | 1 +
2 files changed, 109 insertions(+), 3 deletions(-)
diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index 3f9ffdde2491..c410b02e64be 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -18,8 +18,13 @@
#define REG_TCC LOONGARCH_GPR_A6
#define REG_ARENA LOONGARCH_GPR_S6 /* For storing arena_vm_start */
+#define REG_PRIV_SP LOONGARCH_GPR_S5 /* For storing the private stack pointer */
#define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack) (round_up(stack, 16) - 80)
+/* Guard region size and fill pattern used to detect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ 16
+#define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL
+
static const int regmap[] = {
/* return value from in-kernel function, and exit value for eBPF program */
[BPF_REG_0] = LOONGARCH_GPR_A5,
@@ -40,6 +45,15 @@ static const int regmap[] = {
[BPF_REG_AX] = LOONGARCH_GPR_T0,
};
+/*
+ * Emit code that loads the address of per-CPU object @ptr into @dst.
+ * On SMP builds the current CPU's per-CPU offset is added to the base.
+ */
+static void emit_percpu_ptr(struct jit_ctx *ctx, u8 dst, void __percpu *ptr)
+{
+	long percpu_base = (__force long)ptr;
+
+	/* Materialise the per-CPU base address as an immediate */
+	move_imm(ctx, dst, percpu_base, false);
+#ifdef CONFIG_SMP
+	/* $r21 holds __my_cpu_offset: dst = dst + $r21 */
+	emit_insn(ctx, addd, dst, dst, LOONGARCH_GPR_U0);
+#endif
+}
+
static void prepare_bpf_tail_call_cnt(struct jit_ctx *ctx, int *store_offset)
{
const struct bpf_prog *prog = ctx->prog;
@@ -141,7 +155,14 @@ static void build_prologue(struct jit_ctx *ctx)
stack_adjust += 8;
stack_adjust = round_up(stack_adjust, 16);
- stack_adjust += bpf_stack_adjust;
+
+ /*
+ * When a private stack is used the BPF stack lives in a per-CPU
+ * allocation rather than on the kernel stack, so only the non-BPF
+ * part is reserved here.
+ */
+ if (!ctx->priv_sp_used)
+ stack_adjust += bpf_stack_adjust;
move_reg(ctx, LOONGARCH_GPR_T0, LOONGARCH_GPR_RA);
/* Reserve space for the move_imm + jirl instruction */
@@ -191,8 +212,16 @@ static void build_prologue(struct jit_ctx *ctx)
emit_insn(ctx, addid, LOONGARCH_GPR_FP, LOONGARCH_GPR_SP, stack_adjust);
- if (bpf_stack_adjust)
+ if (ctx->priv_sp_used) {
+ /* Set up the private stack pointer and the BPF frame pointer */
+ void __percpu *priv_stack_ptr;
+
+ priv_stack_ptr = prog->aux->priv_stack_ptr + PRIV_STACK_GUARD_SZ;
+ emit_percpu_ptr(ctx, REG_PRIV_SP, priv_stack_ptr);
+ emit_insn(ctx, addid, regmap[BPF_REG_FP], REG_PRIV_SP, bpf_stack_adjust);
+ } else if (bpf_stack_adjust) {
emit_insn(ctx, addid, regmap[BPF_REG_FP], LOONGARCH_GPR_SP, bpf_stack_adjust);
+ }
ctx->stack_size = stack_adjust;
@@ -2166,6 +2195,39 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
return ret < 0 ? ret : ret * LOONGARCH_INSN_SIZE;
}
+/*
+ * Write PRIV_STACK_GUARD_VAL into both guard regions of the private stack
+ * on every possible CPU: the two u64 words at the start of the allocation
+ * and the two u64 words starting at byte offset
+ * @alloc_size - PRIV_STACK_GUARD_SZ.
+ */
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+	/* u64 index of the first word of the trailing guard region */
+	int tail_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		u64 *head = per_cpu_ptr(priv_stack_ptr, cpu);
+		u64 *tail = head + tail_idx;
+
+		head[0] = PRIV_STACK_GUARD_VAL;
+		head[1] = PRIV_STACK_GUARD_VAL;
+		tail[0] = PRIV_STACK_GUARD_VAL;
+		tail[1] = PRIV_STACK_GUARD_VAL;
+	}
+}
+
+/*
+ * Check that the guard words at both ends of the private stack still hold
+ * PRIV_STACK_GUARD_VAL on every possible CPU.
+ *
+ * @priv_stack_ptr: per-CPU base of the private stack allocation
+ * @alloc_size:     size of the allocation in bytes, guard regions included
+ * @prog:           program owning the stack, used only to name it in the log
+ *
+ * Logs a single error for the first CPU whose guards were overwritten and
+ * stops scanning; nothing is returned to the caller.
+ */
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+				   struct bpf_prog *prog)
+{
+	/* u64 index of the first word of the trailing guard region */
+	int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+	u64 *stack_ptr;
+
+	for_each_possible_cpu(cpu) {
+		stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+		if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+		    stack_ptr[1] != PRIV_STACK_GUARD_VAL ||
+		    stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL ||
+		    stack_ptr[underflow_idx + 1] != PRIV_STACK_GUARD_VAL) {
+			/* "%s" only: a trailing 'x' would be printed after the name */
+			pr_err("BPF private stack overflow/underflow detected for prog %s\n",
+			       bpf_jit_get_prog_name(prog));
+			break;
+		}
+	}
+}
+
struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog)
{
bool extra_pass = false;
@@ -2174,7 +2236,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
struct jit_ctx ctx;
struct jit_data *jit_data;
struct bpf_binary_header *header;
- struct bpf_binary_header *ro_header;
+ struct bpf_binary_header *ro_header = NULL;
+ void __percpu *priv_stack_ptr = NULL;
+ int priv_stack_alloc_sz;
/*
* If BPF JIT was not enabled then we must fall back to
@@ -2190,6 +2254,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
return prog;
prog->aux->jit_data = jit_data;
}
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+ /*
+ * Allocate the actual private stack: the verifier-calculated
+ * stack size plus two guard regions to detect overflow and
+ * underflow.
+ */
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, GFP_KERNEL);
+ if (!priv_stack_ptr)
+ goto out_priv_stack;
+
+ priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+ prog->aux->priv_stack_ptr = priv_stack_ptr;
+ }
if (jit_data->ctx.offset) {
ctx = jit_data->ctx;
ro_header = jit_data->ro_header;
@@ -2205,6 +2285,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
ctx.prog = prog;
ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+ ctx.priv_sp_used = priv_stack_ptr ? true : false;
ctx.offset = kvcalloc(prog->len + 1, sizeof(u32), GFP_KERNEL);
if (ctx.offset == NULL)
@@ -2298,7 +2379,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
out_offset:
+ /*
+ * A NULL ro_header here means the JIT failed, so release the
+ * private stack that was allocated above; on success the
+ * program keeps it until bpf_jit_free().
+ */
+ if (!ro_header && priv_stack_ptr) {
+ free_percpu(priv_stack_ptr);
+ prog->aux->priv_stack_ptr = NULL;
+ }
kvfree(ctx.offset);
+out_priv_stack:
kfree(jit_data);
prog->aux->jit_data = NULL;
}
@@ -2324,6 +2415,8 @@ void bpf_jit_free(struct bpf_prog *prog)
if (prog->jited) {
struct jit_data *jit_data = prog->aux->jit_data;
struct bpf_binary_header *hdr;
+ void __percpu *priv_stack_ptr;
+ int priv_stack_alloc_sz;
/*
* If we fail the final pass of JIT (from jit_subprogs), the
@@ -2336,6 +2429,13 @@ void bpf_jit_free(struct bpf_prog *prog)
}
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
+ priv_stack_ptr = prog->aux->priv_stack_ptr;
+ if (priv_stack_ptr) {
+ priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+ 2 * PRIV_STACK_GUARD_SZ;
+ priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+ free_percpu(prog->aux->priv_stack_ptr);
+ }
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
@@ -2382,6 +2482,11 @@ bool bpf_jit_supports_fsession(void)
return true;
}
+/* Indicate the JIT backend supports per-program private stacks. */
+bool bpf_jit_supports_private_stack(void)
+{
+ return true;
+}
+
/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
bool bpf_jit_supports_subprog_tailcalls(void)
{
diff --git a/arch/loongarch/net/bpf_jit.h b/arch/loongarch/net/bpf_jit.h
index a8e29be35fa8..01a7ea47e79b 100644
--- a/arch/loongarch/net/bpf_jit.h
+++ b/arch/loongarch/net/bpf_jit.h
@@ -22,6 +22,7 @@ struct jit_ctx {
u32 stack_size;
u64 arena_vm_start;
u64 user_vm_start;
+ bool priv_sp_used;
};
struct jit_data {
--
2.25.1