Re: [PATCH] riscv: module: Optimize PLT/GOT entry counting
From: Samuel Holland
Date: Wed Apr 09 2025 - 09:54:18 EST
Hi Drew,
On 2025-04-09 5:48 AM, Andrew Jones wrote:
> On Tue, Apr 08, 2025 at 07:45:16PM -0700, Samuel Holland wrote:
>> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
>> inside module_frob_arch_sections(). This is because amdgpu.ko contains
>> about 300000 relocations in its .rela.text section, and the algorithm in
>> count_max_entries() takes quadratic time.
>>
>> Apply two optimizations from the arm64 code, which together reduce the
>> total execution time by 99.57%. First, sort the relocations so duplicate
>> entries are adjacent. Second, reduce the number of relocations that must
>> be sorted by filtering to only relocations that need PLT/GOT entries, as
>> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
>> optimizing PLT counting").
>>
>> Unlike the arm64 code, here the filtering and sorting is done in a
>> scratch buffer, because the HI20 relocation search optimization in
>> apply_relocate_add() depends on the original order of the relocations.
>>
>> Signed-off-by: Samuel Holland <samuel.holland@xxxxxxxxxx>
>> ---
>>
>> arch/riscv/kernel/module-sections.c | 66 +++++++++++++++++++++++++----
>> 1 file changed, 57 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
>> index e264e59e596e..91d4f0fbd0af 100644
>> --- a/arch/riscv/kernel/module-sections.c
>> +++ b/arch/riscv/kernel/module-sections.c
>> @@ -9,6 +9,7 @@
>> #include <linux/kernel.h>
>> #include <linux/module.h>
>> #include <linux/moduleloader.h>
>> +#include <linux/sort.h>
>>
>> unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
>> {
>> @@ -55,19 +56,27 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
>> return (unsigned long)&plt[i];
>> }
>>
>> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
>> +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))
>> +
>> +static int cmp_rela(const void *a, const void *b)
>> {
>> - return x->r_info == y->r_info && x->r_addend == y->r_addend;
>> + const Elf_Rela *x = a, *y = b;
>> + int i;
>> +
>> + /* sort by type, symbol index and addend */
>> + i = cmp_3way(x->r_info, y->r_info);
>> + if (i == 0)
>> + i = cmp_3way(x->r_addend, y->r_addend);
>> + return i;
>> }
>>
>> static bool duplicate_rela(const Elf_Rela *rela, int idx)
>> {
>> - int i;
>> - for (i = 0; i < idx; i++) {
>> - if (is_rela_equal(&rela[i], &rela[idx]))
>> - return true;
>> - }
>> - return false;
>> + /*
>> + * Entries are sorted by type, symbol index and addend. That means
>> + * that, if a duplicate entry exists, it must be in the preceding slot.
>> + */
>> + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
>> }
>>
>> static void count_max_entries(Elf_Rela *relas, int num,
>> @@ -87,11 +96,33 @@ static void count_max_entries(Elf_Rela *relas, int num,
>> }
>> }
>>
>> +static bool rela_needs_plt_got(const Elf_Rela *rela)
>> +{
>> + unsigned int type = ELF_R_TYPE(rela->r_info);
>> +
>> + return type == R_RISCV_CALL_PLT || type == R_RISCV_GOT_HI20;
>
> I see these two are sufficient to support count_max_entries(), but Charlie
> also added support for R_RISCV_PLT32. I don't know enough about relocation
> types to know if we need to consider that one here and in
> count_max_entries(), so I'm just pointing it out in case it was
> overlooked.
Yes, R_RISCV_PLT32 should be included. Today if a module were to include a
R_RISCV_PLT32 relocation against a unique symbol, it would hit a BUG_ON in
module_emit_plt_entry() because there would be no space in the PLT. If there was
also a R_RISCV_CALL_PLT relocation against the same symbol, the code would
silently succeed.
Regards,
Samuel
>> +}
>> +
>> +/* Copy PLT and GOT relas to the scratch array. */
>> +static unsigned int partition_plt_got_relas(const Elf_Rela *relas, Elf_Rela *scratch,
>> + unsigned int num_rela)
>> +{
>> + int j = 0;
>> +
>> + for (int i = 0; i < num_rela; i++)
>> + if (rela_needs_plt_got(&relas[i]))
>> + scratch[j++] = relas[i];
>> +
>> + return j;
>> +}
>> +
>> int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>> char *secstrings, struct module *mod)
>> {
>> unsigned int num_plts = 0;
>> unsigned int num_gots = 0;
>> + Elf_Rela *scratch = NULL;
>> + size_t scratch_size = 0;
>> int i;
>>
>> /*
>> @@ -132,9 +163,26 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
>> if (!(dst_sec->sh_flags & SHF_EXECINSTR))
>> continue;
>>
>> - count_max_entries(relas, num_rela, &num_plts, &num_gots);
>> + /*
>> + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
>> + * close together, so sort a copy of the section to avoid interfering.
>> + */
>> + if (sechdrs[i].sh_size > scratch_size) {
>> + scratch_size = sechdrs[i].sh_size;
>> + scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
>> + if (!scratch)
>> + return -ENOMEM;
>> + }
>> +
>> + /* sort relocations requiring a PLT or GOT entry so duplicates are adjacent */
>> + num_rela = partition_plt_got_relas(relas, scratch, num_rela);
>> + sort(scratch, num_rela, sizeof(Elf_Rela), cmp_rela, NULL);
>> + count_max_entries(scratch, num_rela, &num_plts, &num_gots);
>> }
>>
>> + if (scratch)
>> + kvfree(scratch);
>> +
>> mod->arch.plt.shdr->sh_type = SHT_NOBITS;
>> mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
>> mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES;
>> --
>> 2.47.0
>>
>
> Besides my question above,
>
> Reviewed-by: Andrew Jones <ajones@xxxxxxxxxxxxxxxx>
>
> Thanks,
> drew