Re: [PATCH v2] ARC: ARCv2: jump label: implement jump label patching

From: Vineet Gupta
Date: Tue Aug 27 2019 - 13:00:20 EST


On 7/18/19 6:20 AM, Eugeniy Paltsev wrote:
> Implement jump label patching for ARC. Jump labels provide
> an interface to generate dynamic branches using
> self-modifying code.
>
> This allows us to implement conditional branches where
> changing branch direction is expensive but branch selection
> is basically 'free'
>
> This implementation uses 32-bit NOP and BRANCH instructions
> which forced to be aligned by 4 to guarantee that they don't
> cross L1 I$ cache fetch block and can be update atomically.

I've discussed this with hardware folks, and it seems fetch block size is
irrelevant. We need to make sure this doesn't straddle cache line.

Please respin !

Thx,
-Vineet

>
> Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@xxxxxxxxxxxx>
> ---
> Changes v1->v2:
> * Patched instruction should not cross L1 I$ fetch block boundary and
> not only L1 I$ line. Fix comments and asserts in code.
> * Other small comments fix and code cleanup.
>
> arch/arc/Kconfig | 8 ++
> arch/arc/include/asm/cache.h | 11 ++
> arch/arc/include/asm/jump_label.h | 71 +++++++++++++
> arch/arc/kernel/Makefile | 1 +
> arch/arc/kernel/jump_label.c | 167 ++++++++++++++++++++++++++++++
> 5 files changed, 258 insertions(+)
> create mode 100644 arch/arc/include/asm/jump_label.h
> create mode 100644 arch/arc/kernel/jump_label.c
>
> diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
> index 1c8137e7247b..f9657c86c3e1 100644
> --- a/arch/arc/Kconfig
> +++ b/arch/arc/Kconfig
> @@ -44,6 +44,7 @@ config ARC
> select OF_EARLY_FLATTREE
> select PCI_SYSCALL if PCI
> select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
> + select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
>
> config ARCH_HAS_CACHE_LINE_SIZE
> def_bool y
> @@ -523,6 +524,13 @@ config ARC_DW2_UNWIND
> config ARC_DBG_TLB_PARANOIA
> bool "Paranoia Checks in Low Level TLB Handlers"
>
> +config ARC_DBG_JUMP_LABEL
> + bool "Paranoid checks in Static Keys (jump labels) code"
> + depends on JUMP_LABEL
> + select STATIC_KEYS_SELFTEST
> + help
> + Enable paranoid checks and self-test of both ARC-specific and generic
> + part of static keys (jump labels) related code.
> endif
>
> config ARC_BUILTIN_DTB_NAME
> diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
> index 918804c7c1a4..46d4f673609a 100644
> --- a/arch/arc/include/asm/cache.h
> +++ b/arch/arc/include/asm/cache.h
> @@ -16,6 +16,11 @@
> #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
> #define CACHE_LINE_MASK (~(L1_CACHE_BYTES - 1))
>
> +#ifdef CONFIG_ISA_ARCV2
> +/* instruction_fetch_block_width is same for all ARCv2 */
> +#define I_CACHE_FETCH_BLOCK_WIDTH 16
> +#endif
> +
> /*
> * ARC700 doesn't cache any access in top 1G (0xc000_0000 to 0xFFFF_FFFF)
> * Ideal for wiring memory mapped peripherals as we don't need to do
> @@ -25,6 +30,12 @@
>
> #ifndef __ASSEMBLY__
>
> +#include <linux/build_bug.h>
> +
> +#ifdef CONFIG_ISA_ARCV2
> +static_assert(I_CACHE_FETCH_BLOCK_WIDTH <= L1_CACHE_BYTES);
> +#endif
> +
> /* Uncached access macros */
> #define arc_read_uncached_32(ptr) \
> ({ \
> diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h
> new file mode 100644
> index 000000000000..0de7833b2071
> --- /dev/null
> +++ b/arch/arc/include/asm/jump_label.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ARC_JUMP_LABEL_H
> +#define _ASM_ARC_JUMP_LABEL_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <linux/types.h>
> +
> +#define JUMP_LABEL_NOP_SIZE 4
> +
> +/*
> + * NOTE about '.balign 4':
> + *
> + * To make atomic update of patched instruction available we need to guarantee
> + * that this instruction doesn't cross L1 I$ fetch block boundary (it's
> + * smaller than L1 I$ line size).
> + *
> + * As of today we simply align instruction which can be patched by 4 byte using
> + * ".balign 4" directive. In that case patched instruction is aligned with one
> + * 16-bit NOP_S if this is required.
> + * However 'align by 4' directive is much stricter than it actually required.
> + * It's enough that our 32-bit instruction don't cross L1 I$ fetch block
> + * boundary which can be achieved by using ".bundle_align_mode" directive.
> + * That will save us from adding useless NOP_S padding in most of the cases.
> + *
> + * TODO: switch to ".bundle_align_mode" directive using whin it will be
> + * supported by ARC toolchain.
> + */
> +
> +static __always_inline bool arch_static_branch(struct static_key *key,
> + bool branch)
> +{
> + asm_volatile_goto(".balign 4 \n"
> + "1: \n"
> + "nop \n"
> + ".pushsection __jump_table, \"aw\" \n"
> + ".word 1b, %l[l_yes], %c0 \n"
> + ".popsection \n"
> + : : "i" (&((char *)key)[branch]) : : l_yes);
> +
> + return false;
> +l_yes:
> + return true;
> +}
> +
> +static __always_inline bool arch_static_branch_jump(struct static_key *key,
> + bool branch)
> +{
> + asm_volatile_goto(".balign 4 \n"
> + "1: \n"
> + "b %l[l_yes] \n"
> + ".pushsection __jump_table, \"aw\" \n"
> + ".word 1b, %l[l_yes], %c0 \n"
> + ".popsection \n"
> + : : "i" (&((char *)key)[branch]) : : l_yes);
> +
> + return false;
> +l_yes:
> + return true;
> +}
> +
> +typedef u32 jump_label_t;
> +
> +struct jump_entry {
> + jump_label_t code;
> + jump_label_t target;
> + jump_label_t key;
> +};
> +
> +#endif /* __ASSEMBLY__ */
> +#endif
> diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile
> index de6251132310..e784f5396dda 100644
> --- a/arch/arc/kernel/Makefile
> +++ b/arch/arc/kernel/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_ARC_EMUL_UNALIGNED) += unaligned.o
> obj-$(CONFIG_KGDB) += kgdb.o
> obj-$(CONFIG_ARC_METAWARE_HLINK) += arc_hostlink.o
> obj-$(CONFIG_PERF_EVENTS) += perf_event.o
> +obj-$(CONFIG_JUMP_LABEL) += jump_label.o
>
> obj-$(CONFIG_ARC_FPU_SAVE_RESTORE) += fpu.o
> CFLAGS_fpu.o += -mdpfp
> diff --git a/arch/arc/kernel/jump_label.c b/arch/arc/kernel/jump_label.c
> new file mode 100644
> index 000000000000..9755895c312f
> --- /dev/null
> +++ b/arch/arc/kernel/jump_label.c
> @@ -0,0 +1,167 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/kernel.h>
> +#include <linux/jump_label.h>
> +
> +#include "asm/cacheflush.h"
> +
> +#define JUMPLABEL_ERR "ARC: jump_label: ERROR: "
> +
> +/* Halt system on fatal error to make debug easier */
> +#define arc_jl_fatal(format...) \
> +({ \
> + pr_err(JUMPLABEL_ERR format); \
> + BUG(); \
> +})
> +
> +static inline u32 arc_gen_nop(void)
> +{
> + /* 1x 32bit NOP in middle endian */
> + return 0x7000264a;
> +}
> +
> +static inline bool cross_l1_ifetch_block(void *addr, int len)
> +{
> + unsigned long a = (unsigned long)addr;
> + unsigned long ifetch_blk_shift = __fls(I_CACHE_FETCH_BLOCK_WIDTH);
> +
> + return (a >> ifetch_blk_shift) != ((a + len - 1) >> ifetch_blk_shift);
> +}
> +
> +/*
> + * ARCv2 'Branch unconditionally' instruction:
> + * 00000ssssssssss1SSSSSSSSSSNRtttt
> + * s S[n:0] lower bits signed immediate (number is bitfield size)
> + * S S[m:n+1] upper bits signed immediate (number is bitfield size)
> + * t S[24:21] upper bits signed immediate (branch unconditionally far)
> + * N N <.d> delay slot mode
> + * R R Reserved
> + */
> +static inline u32 arc_gen_branch(jump_label_t pc, jump_label_t target)
> +{
> + u32 instruction_l, instruction_r;
> + u32 pcl = pc & GENMASK(31, 2);
> + u32 u_offset = target - pcl;
> + u32 s, S, t;
> +
> + /*
> + * Offset in 32-bit branch instruction must to fit into s25.
> + * Something is terribly broken if we get such huge offset within one
> + * function.
> + */
> + if ((s32)u_offset < -16777216 || (s32)u_offset > 16777214)
> + arc_jl_fatal("gen branch with offset (%d) not fit in s25",
> + (s32)u_offset);
> +
> + /*
> + * All instructions are aligned by 2 bytes so we should never get offset
> + * here which is not 2 bytes aligned.
> + */
> + if (u_offset & 0x1)
> + arc_jl_fatal("gen branch with offset (%d) unaligned to 2 bytes",
> + (s32)u_offset);
> +
> + s = (u_offset >> 1) & GENMASK(9, 0);
> + S = (u_offset >> 11) & GENMASK(9, 0);
> + t = (u_offset >> 21) & GENMASK(3, 0);
> +
> + /* 00000ssssssssss1 */
> + instruction_l = (s << 1) | 0x1;
> + /* SSSSSSSSSSNRtttt */
> + instruction_r = (S << 6) | t;
> +
> + return (instruction_r << 16) | (instruction_l & GENMASK(15, 0));
> +}
> +
> +void arch_jump_label_transform(struct jump_entry *entry,
> + enum jump_label_type type)
> +{
> + jump_label_t *instr_addr = (jump_label_t *)entry->code;
> + u32 instr;
> +
> + /*
> + * Atomic update of patched instruction is only available if this
> + * instruction doesn't cross L1 I$ fetch block boundary (it's smaller
> + * than L1 I$ line size). You can read about the way we achieve this in
> + * arc/include/asm/jump_label.h
> + */
> + if (cross_l1_ifetch_block(instr_addr, JUMP_LABEL_NOP_SIZE))
> + arc_jl_fatal("instruction (addr %px) cross L1 cache fetch block",
> + instr_addr);
> +
> + if (type == JUMP_LABEL_JMP)
> + instr = arc_gen_branch(entry->code, entry->target);
> + else
> + instr = arc_gen_nop();
> +
> + WRITE_ONCE(*instr_addr, instr);
> + flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
> +}
> +
> +void arch_jump_label_transform_static(struct jump_entry *entry,
> + enum jump_label_type type)
> +{
> + /*
> + * We use only one NOP type (1x, 4 byte) in arch_static_branch, so
> + * there's no need to patch an identical NOP over the top of it here.
> + * The generic code calls 'arch_jump_label_transform' if the NOP needs
> + * to be replaced by a branch, so 'arch_jump_label_transform_static' is
> + * never called with type other than JUMP_LABEL_NOP.
> + */
> + BUG_ON(type != JUMP_LABEL_NOP);
> +}
> +
> +#ifdef CONFIG_ARC_DBG_JUMP_LABEL
> +#define SELFTEST_MSG "ARC: instruction generation self-test: "
> +
> +struct arc_gen_branch_testdata {
> + jump_label_t pc;
> + jump_label_t target_address;
> + u32 expected_instr;
> +};
> +
> +static __init int branch_gen_test(struct arc_gen_branch_testdata *test_data)
> +{
> + u32 instr_got;
> +
> + instr_got = arc_gen_branch(test_data->pc, test_data->target_address);
> + if (instr_got == test_data->expected_instr)
> + return 0;
> +
> + pr_err(SELFTEST_MSG "FAIL:\n arc_gen_branch(0x%08x, 0x%08x) != 0x%08x, got 0x%08x\n",
> + test_data->pc, test_data->target_address,
> + test_data->expected_instr, instr_got);
> +
> + return -EFAULT;
> +}
> +
> +static __init int instr_gen_test(void)
> +{
> + int i;
> +
> + struct arc_gen_branch_testdata test_data[] = {
> + {0x90007548, 0x90007514, 0xffcf07cd}, /* tiny (-52) offs */
> + {0x9000c9c0, 0x9000c782, 0xffcf05c3}, /* tiny (-574) offs */
> + {0x9000cc1c, 0x9000c782, 0xffcf0367}, /* tiny (-1178) offs */
> + {0x9009dce0, 0x9009d106, 0xff8f0427}, /* small (-3034) offs */
> + {0x9000f5de, 0x90007d30, 0xfc0f0755}, /* big (-30892) offs */
> + {0x900a2444, 0x90035f64, 0xc9cf0321}, /* huge (-443616) offs */
> + {0x90007514, 0x9000752c, 0x00000019}, /* tiny (+24) offs */
> + {0x9001a578, 0x9001a77a, 0x00000203}, /* tiny (+514) offs */
> + {0x90031ed8, 0x90032634, 0x0000075d}, /* tiny (+1884) offs */
> + {0x9008c7f2, 0x9008d3f0, 0x00400401}, /* small (+3072) offs */
> + {0x9000bb38, 0x9003b340, 0x17c00009}, /* big (+194568) offs */
> + {0x90008f44, 0x90578d80, 0xb7c2063d} /* huge (+5701180) offs */
> + };
> +
> + for (i = 0; i < ARRAY_SIZE(test_data); i++)
> + if (branch_gen_test(&test_data[i]))
> + return -EFAULT;
> +
> + pr_info(SELFTEST_MSG "OK\n");
> +
> + return 0;
> +}
> +early_initcall(instr_gen_test);
> +
> +#endif /* CONFIG_ARC_DBG_JUMP_LABEL */
>