Re: [PATCH v2 3/3] riscv: module: Optimize PLT/GOT entry counting
From: Andrew Jones
Date: Wed Apr 09 2025 - 15:24:29 EST
On Wed, Apr 09, 2025 at 02:18:29PM -0500, Samuel Holland wrote:
> Hi Drew,
>
> Thanks for the review again!
>
> On 2025-04-09 2:07 PM, Andrew Jones wrote:
> > On Wed, Apr 09, 2025 at 10:14:51AM -0700, Samuel Holland wrote:
> >> perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
> >> inside module_frob_arch_sections(). This is because amdgpu.ko contains
> >> about 300000 relocations in its .rela.text section, and the algorithm in
> >> count_max_entries() takes quadratic time.
> >>
> >> Apply two optimizations from the arm64 code, which together reduce the
> >> total execution time by 99.58%. First, sort the relocations so duplicate
> >> entries are adjacent. Second, reduce the number of relocations that must
> >> be sorted by filtering to only relocations that need PLT/GOT entries, as
> >> done in commit d4e0340919fb ("arm64/module: Optimize module load time by
> >> optimizing PLT counting").
> >>
> >> Unlike the arm64 code, here the filtering and sorting is done in a
> >> scratch buffer, because the HI20 relocation search optimization in
> >> apply_relocate_add() depends on the original order of the relocations.
> >> This allows accumulating PLT/GOT relocations across sections so sorting
> >> and counting is only done once per module.
> >>
> >> Signed-off-by: Samuel Holland <samuel.holland@xxxxxxxxxx>
> >> ---
> >>
> >> Changes in v2:
> >> - Include R_RISCV_PLT32 relocations when computing the PLT size
> >> - Accumulate PLT/GOT relocations across relocation sections
> >>
> >> arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++------
> >> 1 file changed, 65 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
> >> index 91d0b355ceef..75551ac6504c 100644
> >> --- a/arch/riscv/kernel/module-sections.c
> >> +++ b/arch/riscv/kernel/module-sections.c
> >> @@ -9,6 +9,7 @@
> >> #include <linux/kernel.h>
> >> #include <linux/module.h>
> >> #include <linux/moduleloader.h>
> >> +#include <linux/sort.h>
> >>
> >> unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
> >> {
> >> @@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
> >> return (unsigned long)&plt[i];
> >> }
> >>
> >> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
> >> +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))
> >> +
> >> +static int cmp_rela(const void *a, const void *b)
> >> {
> >> - return x->r_info == y->r_info && x->r_addend == y->r_addend;
> >> + const Elf_Rela *x = a, *y = b;
> >> + int i;
> >> +
> >> + /* sort by type, symbol index and addend */
> >> + i = cmp_3way(x->r_info, y->r_info);
> >> + if (i == 0)
> >> + i = cmp_3way(x->r_addend, y->r_addend);
> >> + return i;
> >> }
> >>
> >> static bool duplicate_rela(const Elf_Rela *rela, int idx)
> >> {
> >> - int i;
> >> - for (i = 0; i < idx; i++) {
> >> - if (is_rela_equal(&rela[i], &rela[idx]))
> >> - return true;
> >> - }
> >> - return false;
> >> + /*
> >> + * Entries are sorted by type, symbol index and addend. That means
> >> + * that, if a duplicate entry exists, it must be in the preceding slot.
> >> + */
> >> + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
> >> }
> >>
> >> -static void count_max_entries(Elf_Rela *relas, int num,
> >> +static void count_max_entries(const Elf_Rela *relas, size_t num,
> >> unsigned int *plts, unsigned int *gots)
> >> {
> >> - for (int i = 0; i < num; i++) {
> >> + for (size_t i = 0; i < num; i++) {
> >> + if (duplicate_rela(relas, i))
> >> + continue;
> >> +
> >> switch (ELF_R_TYPE(relas[i].r_info)) {
> >> case R_RISCV_CALL_PLT:
> >> case R_RISCV_PLT32:
> >> - if (!duplicate_rela(relas, i))
> >> - (*plts)++;
> >> + (*plts)++;
> >> break;
> >> case R_RISCV_GOT_HI20:
> >> - if (!duplicate_rela(relas, i))
> >> - (*gots)++;
> >> + (*gots)++;
> >> break;
> >> + default:
> >> + unreachable();
> >> }
> >> }
> >> }
> >>
> >> +static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
> >> +{
> >> + switch (ELF_R_TYPE(rela->r_info)) {
> >> + case R_RISCV_CALL_PLT:
> >> + case R_RISCV_GOT_HI20:
> >> + case R_RISCV_PLT32:
> >> + return true;
> >> + default:
> >> + return false;
> >> + }
> >> +}
> >> +
> >> int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> >> char *secstrings, struct module *mod)
> >> {
> >> + size_t num_scratch_relas = 0;
> >> unsigned int num_plts = 0;
> >> unsigned int num_gots = 0;
> >> + Elf_Rela *scratch = NULL;
> >> + size_t scratch_size = 0;
> >> int i;
> >>
> >> /*
> >> @@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> >>
> >> /* Calculate the maximum number of entries */
> >> for (i = 0; i < ehdr->e_shnum; i++) {
> >> + size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
> >> Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
> >> - int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
> >> Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
> >> + size_t scratch_size_needed;
> >>
> >> if (sechdrs[i].sh_type != SHT_RELA)
> >> continue;
> >> @@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> >> if (!(dst_sec->sh_flags & SHF_EXECINSTR))
> >> continue;
> >>
> >> - count_max_entries(relas, num_rela, &num_plts, &num_gots);
> >> + /*
> >> + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
> >> + * close together, so sort a copy of the section to avoid interfering.
> >> + */
> >> + scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
> >> + if (scratch_size_needed > scratch_size) {
> >> + scratch_size = scratch_size_needed;
> >
> > Maybe not worth it, but when i is less than ehdr->e_shnum - 1 (we still
> > have more sections to look at) we could use scratch_size_needed * 2, or
> > something, in order to hopefully reduce the number of times kvrealloc()
> > is called.
>
> I tried to keep the code as simple as possible, so I suppose it's not obvious
> what's going on here. The current code takes advantage of the fact that one of
> the relocation sections (.rela.text) is generally an order of magnitude or more
> larger than the others, is usually the first section processed, and much fewer
> than half of the relocations require a PLT/GOT entry. So in the common case:
>
> num_relas (.rela.text) > num_scratch_relas (all sections) + num_relas (any
> other section)
>
> and there will only be one allocation, for .rela.text.
Sounds good.
Thanks,
drew
>
> Regards,
> Samuel
>
> >> + scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
> >> + if (!scratch)
> >> + return -ENOMEM;
> >> + }
> >> +
> >> + for (size_t j = 0; j < num_relas; j++)
> >> + if (rela_needs_plt_got_entry(&relas[j]))
> >> + scratch[num_scratch_relas++] = relas[j];
> >> + }
> >> +
> >> + if (scratch) {
> >> + /* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
> >> + sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
> >> + count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
> >> + kvfree(scratch);
> >> }
> >>
> >> mod->arch.plt.shdr->sh_type = SHT_NOBITS;
> >> --
> >> 2.47.0
> >>
> >
> > Reviewed-by: Andrew Jones <ajones@xxxxxxxxxxxxxxxx>
> >
> > Thanks,
> > drew
>