Re: [RFC PATCH v0] RISCV: Report vector unaligned accesses hwprobe

From: Clément Léger
Date: Wed Jun 05 2024 - 03:55:54 EST




On 04/06/2024 18:42, Jesse Taube wrote:
>
>
> On 6/4/24 12:24, Jesse Taube wrote:
>> Detect if a system traps into the kernel on a vector unaligned access.
>> Add the result to a new key in hwprobe.
>>
>> Signed-off-by: Jesse Taube <jesse@xxxxxxxxxxxx>
>> ---
>>   arch/riscv/include/asm/cpufeature.h        |  3 ++
>>   arch/riscv/include/asm/hwprobe.h           |  2 +-
>>   arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
>>   arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
>>   arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
>>   arch/riscv/kernel/unaligned_access_speed.c |  4 ++
>>   6 files changed, 108 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/riscv/include/asm/cpufeature.h
>> b/arch/riscv/include/asm/cpufeature.h
>> index 347805446151..5ad69cf25b25 100644
>> --- a/arch/riscv/include/asm/cpufeature.h
>> +++ b/arch/riscv/include/asm/cpufeature.h
>> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>>     #if defined(CONFIG_RISCV_MISALIGNED)
>>   bool check_unaligned_access_emulated_all_cpus(void);
>> +bool check_vector_unaligned_access_all_cpus(void);
>> +
>>   void unaligned_emulation_finish(void);
>>   bool unaligned_ctl_available(void);
>>   DECLARE_PER_CPU(long, misaligned_access_speed);
>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>   #else
>>   static inline bool unaligned_ctl_available(void)
>>   {
>> diff --git a/arch/riscv/include/asm/hwprobe.h
>> b/arch/riscv/include/asm/hwprobe.h
>> index 630507dff5ea..150a9877b0af 100644
>> --- a/arch/riscv/include/asm/hwprobe.h
>> +++ b/arch/riscv/include/asm/hwprobe.h
>> @@ -8,7 +8,7 @@
>>     #include <uapi/asm/hwprobe.h>
>>   -#define RISCV_HWPROBE_MAX_KEY 6
>> +#define RISCV_HWPROBE_MAX_KEY 7
>>     static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>   {
>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h
>> b/arch/riscv/include/uapi/asm/hwprobe.h
>> index 060212331a03..4474e98d17bd 100644
>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>>   #define        RISCV_HWPROBE_MISALIGNED_UNSUPPORTED    (4 << 0)
>>   #define        RISCV_HWPROBE_MISALIGNED_MASK        (7 << 0)
>>   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
>> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF    7
>
> There were talks about combining vector and scalar speed for the
> user-facing API into RISCV_HWPROBE_KEY_CPUPERF_0, but adding another key
> seems easier.
>
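
For what it's worth, with such a separate key, userspace would query it
roughly like this (untested sketch; RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF
only exists with this patch applied, and error handling is minimal):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>

int main(void)
{
	struct riscv_hwprobe pair = {
		.key = RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF,
	};

	/* cpusetsize == 0 / cpus == NULL asks about all online harts */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 1;

	printf("vector misaligned perf: %llu\n",
	       (unsigned long long)pair.value);
	return 0;
}
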
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN        0
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_EMULATED        1
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_SLOW        2
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_FAST        3
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED    4
>>   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>     /* Flags */
>> diff --git a/arch/riscv/kernel/sys_hwprobe.c
>> b/arch/riscv/kernel/sys_hwprobe.c
>> index b286b73e763e..ce641cc6e47a 100644
>> --- a/arch/riscv/kernel/sys_hwprobe.c
>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct
>> cpumask *cpus)
>>   }
>>   #endif
>>   +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +    int cpu;
>> +    u64 perf = -1ULL;
>> +
>> +    for_each_cpu(cpu, cpus) {
>> +        int this_perf = per_cpu(vector_misaligned_access, cpu);
>> +
>> +        if (perf == -1ULL)
>> +            perf = this_perf;
>> +
>> +        if (perf != this_perf) {
>> +            perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +            break;
>> +        }
>> +    }
>> +
>> +    if (perf == -1ULL)
>> +        return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +
>> +    return perf;
>> +}
>> +#else
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +    return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +}
>> +#endif
>> +
>>   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>                    const struct cpumask *cpus)
>>   {
>> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe
>> *pair,
>>           pair->value = hwprobe_misaligned(cpus);
>>           break;
>>   +    case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
>> +        pair->value = hwprobe_vec_misaligned(cpus);
>> +        break;
>> +
>>       case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>           pair->value = 0;
>>           if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>> diff --git a/arch/riscv/kernel/traps_misaligned.c
>> b/arch/riscv/kernel/traps_misaligned.c
>> index 2adb7c3e4dd5..0c07e990e9c5 100644
>> --- a/arch/riscv/kernel/traps_misaligned.c
>> +++ b/arch/riscv/kernel/traps_misaligned.c
>> @@ -16,6 +16,7 @@
>>   #include <asm/entry-common.h>
>>   #include <asm/hwprobe.h>
>>   #include <asm/cpufeature.h>
>> +#include <asm/vector.h>
>>     #define INSN_MATCH_LB            0x3
>>   #define INSN_MASK_LB            0x707f
>> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
>>       if (get_insn(regs, epc, &insn))
>>           return -1;
>>  
>
> Is this an appropriate way to check if there is a vector misaligned
> access? What if an unaligned vector load was called by the kernel before
> this check?
>
>> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>> +    if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
>> +        *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>> +        regs->epc = epc + INSN_LEN(insn);
>> +        return 0;
>> +    }
>> +#endif

Since this code will be called for standard load/store misaligned
accesses, I guess this needs to check whether the faulty instruction
itself is a vector-related one before setting the status to unsupported.
I don't know what the spec says about that, but it should be checked
whether vector load/store and standard load/store can have different
behaviors with respect to misaligned accesses.
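
On the decoding part, as far as I understand the V spec, vector
loads/stores reuse the LOAD-FP/STORE-FP major opcodes and are told apart
from the scalar FP accesses by the width field, so a rough, untested
helper (name made up here) to filter the trapping instruction could look
like:

#define INSN_OPCODE_MASK	0x7f
#define INSN_OPCODE_LOAD_FP	0x07
#define INSN_OPCODE_STORE_FP	0x27

/* Untested sketch: scalar FP loads/stores use width values 1-4
 * (FLH/FLW/FLD/FLQ), vector encodings use 0, 5, 6 and 7.
 */
static bool insn_is_vector_ldst(u32 insn)
{
	u32 opcode = insn & INSN_OPCODE_MASK;
	u32 width = (insn >> 12) & 0x7;

	if (opcode != INSN_OPCODE_LOAD_FP && opcode != INSN_OPCODE_STORE_FP)
		return false;

	return width == 0 || width >= 5;
}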

Regardless of that (unless I missed something), the emulation code does
not actually support vector load/store instructions. So, support for
misaligned vector load/store should be added in the first place before
reporting that it is actually "supported".

Thanks,

Clément

>> +
>>       regs->epc = 0;
>>         if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
>> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
>>       return misaligned_emu_detected;
>>   }
>>   +#ifdef CONFIG_RISCV_ISA_V
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +    long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
>> +    struct riscv_isainfo *isainfo = &hart_isa[cpu];
>> +    unsigned long tmp_var;
>> +    bool misaligned_vec_suported;
>> +
>> +    if (!riscv_isa_extension_available(isainfo->isa, v))
>> +        return false;
>> +
>> +    /* This case will only happen if a unaligned vector load
>> +     * was called by the kernel before this check
>> +     */
>> +    if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
>> +        return false;
>> +
>> +    kernel_vector_begin();
>> +    __asm__ __volatile__ (
>> +        ".option push\n\t"
>> +        ".option arch, +v\n\t"
>> +        "    li t1, 0x1\n"                //size
>> +        "       vsetvli t0, t1, e16, m2, ta, ma\n\t"    // Vectors of
>> 16b
>> +        "       addi t0, %[ptr], 1\n\t"            // Misalign address
>> +        "    vle16.v v0, (t0)\n\t"            // Load bytes
>> +        ".option pop\n\t"
>> +        : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
>> +    kernel_vector_end();
>> +
>> +    misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
>> +
>> +    return misaligned_vec_suported;
>> +}
>> +#else
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +    return false;
>> +}
>> +#endif
>> +
>> +bool check_vector_unaligned_access_all_cpus(void)
>> +{
>> +    int cpu;
>> +
>> +    for_each_online_cpu(cpu)
>> +        if (!check_vector_unaligned_access(cpu))
>> +            return false;
>> +
>> +    return true;
>> +}
>> +
>>   bool check_unaligned_access_emulated_all_cpus(void)
>>   {
>>       int cpu;
>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c
>> b/arch/riscv/kernel/unaligned_access_speed.c
>> index a9a6bcb02acf..92a84239beaa 100644
>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>> @@ -20,6 +20,7 @@
>>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>     DEFINE_PER_CPU(long, misaligned_access_speed);
>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>     #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>   static cpumask_t fast_misaligned_access;
>> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>>   {
>>       bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>>  
>
> There were talks about Zicclsm, but spike doesn't have support for
> Zicclsm afaik. I was wondering if I should add Zicclsm to cpufeature as
> well.
>
> If anyone wants to run this, I tested with
> spike --misaligned --isa=RV64IMAFDCV_zicntr_zihpm
> --kernel=arch/riscv/boot/Image
> opensbi/build/platform/generic/firmware/fw_jump.elf

AFAIK, --misaligned tells spike to actually handle misaligned load/store
in "hardware" and not generate a misaligned trap, so I guess you would
need to remove that flag and use a specific openSBI version that always
delegates the misaligned load/store traps if you want to be sure that the
kernel actually handles the vector misaligned load/store traps.
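
i.e. something like your command above, just without --misaligned and
with an openSBI build that delegates those traps:

spike --isa=RV64IMAFDCV_zicntr_zihpm --kernel=arch/riscv/boot/Image \
    opensbi/build/platform/generic/firmware/fw_jump.elf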

I previously made a small test utility to verify that the kernel
correctly gets the misaligned traps [1]. If it fails, then your kernel
probably does not get any traps :)

Link: https://github.com/clementleger/unaligned_test [1]
>
>
> Thanks,
> Jesse Taube
>
>
>> +    check_vector_unaligned_access_all_cpus();
>> +
>>       if (!all_cpus_emulated)
>>           return check_unaligned_access_speed_all_cpus();
>>   @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>>   static int check_unaligned_access_all_cpus(void)
>>   {
>>       check_unaligned_access_emulated_all_cpus();
>> +    check_vector_unaligned_access_all_cpus();
>>         return 0;
>>   }