Simplify load_unaligned_zeropad() (was Re: [GIT PULL] Ceph updates for 5.20-rc1)

From: Linus Torvalds
Date: Sun Aug 14 2022 - 17:14:33 EST


On Sun, Aug 14, 2022 at 1:29 PM Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> I dug into it some more, and it is really "load_unaligned_zeropad()"
> that makes clang really uncomfortable.
>
> The problem ends up being that clang sees that it's inside that inner
> loop, and tries very hard to optimize the shift-and-mask that happens
> if the exception happens.
>
> The fact that the exception *never* happens unless DEBUG_PAGEALLOC is
> enabled - and very very seldom even then - is not something we can
> really explain to clang.

Hmm.

The solution to that is actually to move the 'zeropad' part into the
exception handler.
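
Purely as an aside (not part of the patch): the "zeropad" semantics are
just an aligned load plus a shift. A minimal standalone C sketch of that
arithmetic - little-endian assumed, function name made up purely for
illustration - looks like this:

/*
 * Illustration only, not kernel code: what the zero-padding fallback
 * computes on a little-endian machine.  Read the aligned word that
 * contains 'addr' and shift away the bytes below it, so the bytes that
 * would have come from the unmapped page read back as zeroes.
 */
static unsigned long zeropad_by_hand(const void *addr)
{
	unsigned long offset = (unsigned long)addr & (sizeof(long) - 1);
	unsigned long aligned = (unsigned long)addr & ~(sizeof(long) - 1);
	unsigned long data = *(unsigned long *)aligned;

	return data >> (offset * 8);
}

That is exactly the computation the fixup below does, just directly on
the trapping registers.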

Moving that part into the exception handler makes both gcc and clang
generate quite nice code for all of this. But perhaps equally
importantly, it actually simplifies the code noticeably:

 arch/x86/include/asm/extable_fixup_types.h |  2 ++
 arch/x86/include/asm/word-at-a-time.h      | 50 +++---------------------------
 arch/x86/mm/extable.c                      | 30 ++++++++++++++++++
 3 files changed, 37 insertions(+), 45 deletions(-)

and that's with 11 of those 37 new lines being a new block comment.

That's mainly because there is now no need to worry about
CONFIG_CC_HAS_ASM_GOTO_OUTPUT in load_unaligned_zeropad(): the asm
becomes a simple "address in, data out" thing.

And to make the fixup code simple and straightforward, I just made
"load_unaligned_zeropad()" use fixed registers for the address and the
output. That doesn't bother the compilers at all; they'll happily adjust
their register allocation. The code generation ends up much better
than with the goto and the subtle address games that never trigger
anyway.
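
For context - illustrative only, not the actual fs/dcache.c / fs/namei.c
code - the hot caller is a word-at-a-time scan of roughly this shape
(the helpers and WORD_AT_A_TIME_CONSTANTS are the real
<asm/word-at-a-time.h> interfaces; the function itself is made up):

/*
 * Rough sketch of a word-at-a-time string scan in the style of the
 * dcache/namei and strnlen_user() callers, just to show where
 * load_unaligned_zeropad() sits in the inner loop.
 */
static inline long name_len(const char *name)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	unsigned long data, mask;
	long len = 0;

	for (;;) {
		/* may run past the end of the mapping; the fixup zero-pads */
		data = load_unaligned_zeropad(name + len);
		if (has_zero(data, &mask, &constants)) {
			mask = prep_zero_mask(data, mask, &constants);
			mask = create_zero_mask(mask);
			return len + find_zero(mask);
		}
		len += sizeof(unsigned long);
	}
}

With the zeropad handling done by the exception fixup, the compiler only
sees the plain load in that loop and no longer tries to schedule the
(practically never taken) recovery path around it.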

PeterZ - you were the last to touch both load_unaligned_zeropad() and
the exception code, so let's run this past you. This really does seem
to not only fix the code generation issue in fs/dcache.s, it also just
looks simpler. Comments?

Linus
 arch/x86/include/asm/extable_fixup_types.h |  2 ++
 arch/x86/include/asm/word-at-a-time.h      | 50 +++---------------------------
 arch/x86/mm/extable.c                      | 30 ++++++++++++++++++
 3 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 503622627400..b53f1919710b 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -64,4 +64,6 @@
 #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
 #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
 
+#define EX_TYPE_ZEROPAD 20 /* load ax from dx zero-padded */
+
 #endif
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index 8338b0432b50..4893f1b30dd6 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -77,58 +77,18 @@ static inline unsigned long find_zero(unsigned long mask)
  * and the next page not being mapped, take the exception and
  * return zeroes in the non-existing part.
  */
-#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-
 static inline unsigned long load_unaligned_zeropad(const void *addr)
 {
-	unsigned long offset, data;
 	unsigned long ret;
 
-	asm_volatile_goto(
-		"1: mov %[mem], %[ret]\n"
-
-		_ASM_EXTABLE(1b, %l[do_exception])
-
-		: [ret] "=r" (ret)
-		: [mem] "m" (*(unsigned long *)addr)
-		: : do_exception);
-
-	return ret;
-
-do_exception:
-	offset = (unsigned long)addr & (sizeof(long) - 1);
-	addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
-	data = *(unsigned long *)addr;
-	ret = data >> offset * 8;
-
-	return ret;
-}
-
-#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
-
-static inline unsigned long load_unaligned_zeropad(const void *addr)
-{
-	unsigned long offset, data;
-	unsigned long ret, err = 0;
-
-	asm(	"1: mov %[mem], %[ret]\n"
+	asm volatile(
+		"1: mov (%[addr]), %[ret]\n"
 		"2:\n"
-
-		_ASM_EXTABLE_FAULT(1b, 2b)
-
-		: [ret] "=&r" (ret), "+a" (err)
-		: [mem] "m" (*(unsigned long *)addr));
-
-	if (unlikely(err)) {
-		offset = (unsigned long)addr & (sizeof(long) - 1);
-		addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
-		data = *(unsigned long *)addr;
-		ret = data >> offset * 8;
-	}
+		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD)
+		: [ret] "=a" (ret)
+		: [addr] "d" (addr));
 
 	return ret;
 }
 
-#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
-
 #endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 331310c29349..58c79077a496 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -41,6 +41,34 @@ static bool ex_handler_default(const struct exception_table_entry *e,
 	return true;
 }
 
+/*
+ * This is the *very* rare case where we do a "load_unaligned_zeropad()"
+ * and it's a page crosser into a non-existent page.
+ *
+ * This happens when we optimistically load a pathname a word-at-a-time
+ * and the name is less than the full word and the next page is not
+ * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
+ *
+ * NOTE! The load is always of the form "mov (%edx),%eax" to make the
+ * fixup simple.
+ */
+static bool ex_handler_zeropad(const struct exception_table_entry *e,
+			       struct pt_regs *regs,
+			       unsigned long fault_addr)
+{
+	const unsigned long mask = sizeof(long) - 1;
+	unsigned long offset, addr;
+
+	offset = regs->dx & mask;
+	addr = regs->dx & ~mask;
+	if (fault_addr != addr + sizeof(long))
+		return false;
+
+	regs->ax = *(unsigned long *)addr >> (offset * 8);
+	regs->ip = ex_fixup_addr(e);
+	return true;
+}
+
 static bool ex_handler_fault(const struct exception_table_entry *fixup,
 			     struct pt_regs *regs, int trapnr)
 {
@@ -217,6 +245,8 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 		return ex_handler_sgx(e, regs, trapnr);
 	case EX_TYPE_UCOPY_LEN:
 		return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+	case EX_TYPE_ZEROPAD:
+		return ex_handler_zeropad(e, regs, fault_addr);
 	}
 	BUG();
 }