Re: [RFC PATCH v0] RISCV: Report vector unaligned accesses hwprobe
From: Evan Green
Date: Wed Jun 05 2024 - 11:55:07 EST
On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@xxxxxxxxxxxx> wrote:
>
> Detected if a system traps into the kernel on an vector unaligned access.
> Add the result to a new key in hwprobe.
>
> Signed-off-by: Jesse Taube <jesse@xxxxxxxxxxxx>
For the scalar world, we wanted to know whether misaligned accesses
were faster or slower than equivalent byte accesses, so usermode could
know, for something like memcpy, which option had better bandwidth. Is
the motivation here the same, where we're going to use vector
registers for memcpy and we want to know which size load to use? Or
will usermode be consuming this info for a different purpose as well?
I know this is a basic question, but having the motivation helps me
get the right lens for reviewing it. Perhaps that should be added to
the commit message as well.
> ---
> arch/riscv/include/asm/cpufeature.h | 3 ++
> arch/riscv/include/asm/hwprobe.h | 2 +-
> arch/riscv/include/uapi/asm/hwprobe.h | 6 +++
> arch/riscv/kernel/sys_hwprobe.c | 34 ++++++++++++
> arch/riscv/kernel/traps_misaligned.c | 60 ++++++++++++++++++++++
> arch/riscv/kernel/unaligned_access_speed.c | 4 ++
> 6 files changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index 347805446151..5ad69cf25b25 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>
> #if defined(CONFIG_RISCV_MISALIGNED)
> bool check_unaligned_access_emulated_all_cpus(void);
> +bool check_vector_unaligned_access_all_cpus(void);
> +
> void unaligned_emulation_finish(void);
> bool unaligned_ctl_available(void);
> DECLARE_PER_CPU(long, misaligned_access_speed);
> +DECLARE_PER_CPU(long, vector_misaligned_access);
> #else
> static inline bool unaligned_ctl_available(void)
> {
> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> index 630507dff5ea..150a9877b0af 100644
> --- a/arch/riscv/include/asm/hwprobe.h
> +++ b/arch/riscv/include/asm/hwprobe.h
> @@ -8,7 +8,7 @@
>
> #include <uapi/asm/hwprobe.h>
>
> -#define RISCV_HWPROBE_MAX_KEY 6
> +#define RISCV_HWPROBE_MAX_KEY 7
>
> static inline bool riscv_hwprobe_key_is_valid(__s64 key)
> {
> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> index 060212331a03..4474e98d17bd 100644
> --- a/arch/riscv/include/uapi/asm/hwprobe.h
> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
> #define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0)
> #define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0)
> #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6
> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF 7
> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN 0
> +#define RISCV_HWPROBE_VEC_MISALIGNED_EMULATED 1
> +#define RISCV_HWPROBE_VEC_MISALIGNED_SLOW 2
> +#define RISCV_HWPROBE_VEC_MISALIGNED_FAST 3
> +#define RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED 4
This needs to be added to the documentation as well.
What value should be returned when V is not enabled in the kernel, or
V is not supported in the hardware? Currently in the code it would be
UNKNOWN, right? Is that what we want, or is it worth differentiating
"no support for V" from "I don't know the speed of misaligned loads"?
Maybe UNKNOWN is the right value, as there are other values to tell
you V is not enabled.
> /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>
> /* Flags */
> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> index b286b73e763e..ce641cc6e47a 100644
> --- a/arch/riscv/kernel/sys_hwprobe.c
> +++ b/arch/riscv/kernel/sys_hwprobe.c
> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> }
> #endif
>
> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> + int cpu;
> + u64 perf = -1ULL;
> +
> + for_each_cpu(cpu, cpus) {
> + int this_perf = per_cpu(vector_misaligned_access, cpu);
> +
> + if (perf == -1ULL)
> + perf = this_perf;
> +
> + if (perf != this_perf) {
> + perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> + break;
> + }
> + }
> +
> + if (perf == -1ULL)
> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> + return perf;
> +}
> +#else
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> + return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +}
> +#endif
> +
> static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> const struct cpumask *cpus)
> {
> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> pair->value = hwprobe_misaligned(cpus);
> break;
>
> + case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
> + pair->value = hwprobe_vec_misaligned(cpus);
> + break;
> +
> case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
> pair->value = 0;
> if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> index 2adb7c3e4dd5..0c07e990e9c5 100644
> --- a/arch/riscv/kernel/traps_misaligned.c
> +++ b/arch/riscv/kernel/traps_misaligned.c
> @@ -16,6 +16,7 @@
> #include <asm/entry-common.h>
> #include <asm/hwprobe.h>
> #include <asm/cpufeature.h>
> +#include <asm/vector.h>
>
> #define INSN_MATCH_LB 0x3
> #define INSN_MASK_LB 0x707f
> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
> if (get_insn(regs, epc, &insn))
> return -1;
>
> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
Shouldn't this be EMULATED, given we were just delegated this trap? I
guess it depends on whether you're going to add support for actually
handling the misaligned vector trap, as Clément mentioned.
Scalar misaligned loads had a history to lean on, since the specs were
always explicit that misaligned loads/stores had to be supported one
way or another, so UNSUPPORTED was a theoretical future value. I
haven't dug through the specs yet; do you know what the story is for V
and misaligned loads? My sub-question is what the rules are for
detecting the difference between EMULATED and UNSUPPORTED.
> + regs->epc = epc + INSN_LEN(insn);
> + return 0;
> + }
> +#endif
> +
> regs->epc = 0;
>
> if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
> return misaligned_emu_detected;
> }
>
> +#ifdef CONFIG_RISCV_ISA_V
> +static bool check_vector_unaligned_access(int cpu)
> +{
> + long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
> + struct riscv_isainfo *isainfo = &hart_isa[cpu];
> + unsigned long tmp_var;
> + bool misaligned_vec_suported;
> +
> + if (!riscv_isa_extension_available(isainfo->isa, v))
> + return false;
> +
> + /* This case will only happen if a unaligned vector load
> + * was called by the kernel before this check
> + */
> + if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
> + return false;
> +
> + kernel_vector_begin();
> + __asm__ __volatile__ (
> + ".option push\n\t"
> + ".option arch, +v\n\t"
> + " li t1, 0x1\n" //size
> + " vsetvli t0, t1, e16, m2, ta, ma\n\t" // Vectors of 16b
> + " addi t0, %[ptr], 1\n\t" // Misalign address
> + " vle16.v v0, (t0)\n\t" // Load bytes
> + ".option pop\n\t"
> + : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
> + kernel_vector_end();
> +
> + misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
> +
> + return misaligned_vec_suported;
> +}
> +#else
> +static bool check_vector_unaligned_access(int cpu)
> +{
> + return false;
> +}
> +#endif
> +
> +bool check_vector_unaligned_access_all_cpus(void)
> +{
> + int cpu;
> +
> + for_each_online_cpu(cpu)
> + if (!check_vector_unaligned_access(cpu))
> + return false;
> +
> + return true;
> +}
These functions return a bool, but the bool is never checked. I'm
guessing that's because you're going to check it in a future patch
where you decide whether or not to probe for fast/slow?
> +
> bool check_unaligned_access_emulated_all_cpus(void)
> {
> int cpu;
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index a9a6bcb02acf..92a84239beaa 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -20,6 +20,7 @@
> #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>
> DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>
> #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> static cpumask_t fast_misaligned_access;
> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
> {
> bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>
> + check_vector_unaligned_access_all_cpus();
> +
> if (!all_cpus_emulated)
> return check_unaligned_access_speed_all_cpus();
>
> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
> static int check_unaligned_access_all_cpus(void)
> {
> check_unaligned_access_emulated_all_cpus();
> + check_vector_unaligned_access_all_cpus();
>
> return 0;
> }
> --
> 2.43.0
>