Re: [PATCH v2 8/8] powerpc/bpf: Reallocate BPF registers to volatile registers when possible on PPC32
From: Christophe Leroy
Date: Mon Jan 10 2022 - 07:13:43 EST
Le 07/01/2022 à 12:51, Naveen N. Rao a écrit :
> Christophe Leroy wrote:
> >> When the BPF routine doesn't call any function, the non-volatile
> >> registers can be reallocated to volatile registers in order to
> >> avoid having to save and restore them on the stack.
>>
>> Before this patch, the test #359 ADD default X is:
>>
>> 0: 7c 64 1b 78 mr r4,r3
>> 4: 38 60 00 00 li r3,0
>> 8: 94 21 ff b0 stwu r1,-80(r1)
>> c: 60 00 00 00 nop
>> 10: 92 e1 00 2c stw r23,44(r1)
>> 14: 93 01 00 30 stw r24,48(r1)
>> 18: 93 21 00 34 stw r25,52(r1)
>> 1c: 93 41 00 38 stw r26,56(r1)
>> 20: 39 80 00 00 li r12,0
>> 24: 39 60 00 00 li r11,0
>> 28: 3b 40 00 00 li r26,0
>> 2c: 3b 20 00 00 li r25,0
>> 30: 7c 98 23 78 mr r24,r4
>> 34: 7c 77 1b 78 mr r23,r3
>> 38: 39 80 00 42 li r12,66
>> 3c: 39 60 00 00 li r11,0
>> 40: 7d 8c d2 14 add r12,r12,r26
>> 44: 39 60 00 00 li r11,0
>> 48: 7d 83 63 78 mr r3,r12
>> 4c: 82 e1 00 2c lwz r23,44(r1)
>> 50: 83 01 00 30 lwz r24,48(r1)
>> 54: 83 21 00 34 lwz r25,52(r1)
>> 58: 83 41 00 38 lwz r26,56(r1)
>> 5c: 38 21 00 50 addi r1,r1,80
>> 60: 4e 80 00 20 blr
>>
>> After this patch, the same test has become:
>>
>> 0: 7c 64 1b 78 mr r4,r3
>> 4: 38 60 00 00 li r3,0
>> 8: 94 21 ff b0 stwu r1,-80(r1)
>> c: 60 00 00 00 nop
>> 10: 39 80 00 00 li r12,0
>> 14: 39 60 00 00 li r11,0
>> 18: 39 00 00 00 li r8,0
>> 1c: 38 e0 00 00 li r7,0
>> 20: 7c 86 23 78 mr r6,r4
>> 24: 7c 65 1b 78 mr r5,r3
>> 28: 39 80 00 42 li r12,66
>> 2c: 39 60 00 00 li r11,0
>> 30: 7d 8c 42 14 add r12,r12,r8
>> 34: 39 60 00 00 li r11,0
>> 38: 7d 83 63 78 mr r3,r12
>> 3c: 38 21 00 50 addi r1,r1,80
>> 40: 4e 80 00 20 blr
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
>> ---
>> arch/powerpc/net/bpf_jit.h | 16 ++++++++++++++++
>> arch/powerpc/net/bpf_jit64.h | 2 +-
>> arch/powerpc/net/bpf_jit_comp.c | 2 ++
>> arch/powerpc/net/bpf_jit_comp32.c | 30 ++++++++++++++++++++++++++++--
>> arch/powerpc/net/bpf_jit_comp64.c | 4 ++++
>> 5 files changed, 51 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
>> index a45b8266355d..776abef4d2a0 100644
>> --- a/arch/powerpc/net/bpf_jit.h
>> +++ b/arch/powerpc/net/bpf_jit.h
>> @@ -116,6 +116,15 @@ static inline bool is_nearbranch(int offset)
>> #define SEEN_STACK 0x40000000 /* uses BPF stack */
>> #define SEEN_TAILCALL 0x80000000 /* uses tail calls */
>>
>> +#define SEEN_VREG_MASK 0x1ff80000 /* Volatile registers r3-r12 */
> >> +#define SEEN_NVREG_MASK 0x0003ffff /* Non volatile registers r14-r31 */
>> +
>> +#ifdef CONFIG_PPC64
>> +extern const int b2p[MAX_BPF_JIT_REG + 2];
>> +#else
>> +extern const int b2p[MAX_BPF_JIT_REG + 1];
>> +#endif
>> +
>> struct codegen_context {
>> /*
>> * This is used to track register usage as well
>> @@ -129,6 +138,7 @@ struct codegen_context {
>> unsigned int seen;
>> unsigned int idx;
>> unsigned int stack_size;
>> + int b2p[ARRAY_SIZE(b2p)];
>> };
>>
>> static inline void bpf_flush_icache(void *start, void *end)
>> @@ -147,11 +157,17 @@ static inline void bpf_set_seen_register(struct
>> codegen_context *ctx, int i)
>> ctx->seen |= 1 << (31 - i);
>> }
>>
> >> +static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i)
>> +{
>> + ctx->seen &= ~(1 << (31 - i));
>> +}
>> +
>> void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context
>> *ctx, u64 func);
>> int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct
>> codegen_context *ctx,
>> u32 *addrs, bool extra_pass);
>> void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
>> void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
>> +void bpf_jit_realloc_regs(struct codegen_context *ctx);
>>
>> #endif
>>
>> diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
>> index b05f2e67bba1..7b713edfa7e2 100644
>> --- a/arch/powerpc/net/bpf_jit64.h
>> +++ b/arch/powerpc/net/bpf_jit64.h
>> @@ -39,7 +39,7 @@
>> #define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
>>
>> /* BPF to ppc register mappings */
>> -static const int b2p[] = {
>> +const int b2p[MAX_BPF_JIT_REG + 2] = {
>> /* function return value */
>> [BPF_REG_0] = 8,
>> /* function arguments */
>> diff --git a/arch/powerpc/net/bpf_jit_comp.c
>> b/arch/powerpc/net/bpf_jit_comp.c
>> index efac89964873..798ac4350a82 100644
>> --- a/arch/powerpc/net/bpf_jit_comp.c
>> +++ b/arch/powerpc/net/bpf_jit_comp.c
>> @@ -143,6 +143,7 @@ struct bpf_prog *bpf_int_jit_compile(struct
>> bpf_prog *fp)
>> }
>>
>> memset(&cgctx, 0, sizeof(struct codegen_context));
>> + memcpy(cgctx.b2p, b2p, sizeof(cgctx.b2p));
>>
>> /* Make sure that the stack is quadword aligned. */
>> cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
>> @@ -167,6 +168,7 @@ struct bpf_prog *bpf_int_jit_compile(struct
>> bpf_prog *fp)
>> }
>> }
>>
>> + bpf_jit_realloc_regs(&cgctx);
>> /*
>> * Pretend to build prologue, given the features we've seen.
>> This will
>> * update ctgtx.idx as it pretends to output instructions, then
>> we can
>> diff --git a/arch/powerpc/net/bpf_jit_comp32.c
>> b/arch/powerpc/net/bpf_jit_comp32.c
>> index 29ce802d7534..003843273b43 100644
>> --- a/arch/powerpc/net/bpf_jit_comp32.c
>> +++ b/arch/powerpc/net/bpf_jit_comp32.c
>> @@ -37,7 +37,7 @@
>> #define TMP_REG (MAX_BPF_JIT_REG + 0)
>>
>> /* BPF to ppc register mappings */
>> -static const int b2p[] = {
>> +const int b2p[MAX_BPF_JIT_REG + 1] = {
>> /* function return value */
>> [BPF_REG_0] = 12,
>> /* function arguments */
>> @@ -60,7 +60,7 @@ static const int b2p[] = {
>>
>> static int bpf_to_ppc(struct codegen_context *ctx, int reg)
>> {
>> - return b2p[reg];
>> + return ctx->b2p[reg];
>> }
>>
>> /* PPC NVR range -- update this if we ever use NVRs below r17 */
>> @@ -77,6 +77,32 @@ static int bpf_jit_stack_offsetof(struct
>> codegen_context *ctx, int reg)
>> return BPF_PPC_STACKFRAME(ctx) - 4;
>> }
>>
>> +void bpf_jit_realloc_regs(struct codegen_context *ctx)
>> +{
>> + if (ctx->seen & SEEN_FUNC)
>> + return;
>
> Can't you remap BPF_REG_5, BPF_REG_AX and TMP_REG regardless of SEEN_FUNC?
>
Oh yes, we can do that.
BPF_REG_5 is unlikely to be used unless BPF_REG_0 to 4 are also used, so
in that case I guess we won't have any volatile register available for it.
For BPF_REG_AX I wasn't sure, but it is mapped to a volatile register on
PPC64, so I guess it is OK.
TMP_REG can certainly be reallocated to a volatile register when one is available.
I'll send a patch for that.
Thanks
Christophe