Re: Simplify load_unaligned_zeropad() (was Re: [GIT PULL] Ceph updates for 5.20-rc1)

From: Linus Torvalds
Date: Mon Aug 15 2022 - 22:33:58 EST


On Mon, Aug 15, 2022 at 1:09 PM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> I'm not at all suggesting we do this; but it might be
> insn_get_addr_ref() does what is needed.

.. you didn't suggest it at all, but I started doing it anyway.

It's a bit more involved, and the fixup certainly isn't that trivial
thing any more, but you were right: it's not *that* complicated, and
it does allow us to use arbitrary 'mov' instructions.
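
(That "trivial thing" is the usual alignment math the old inline fixup
did, and what the new handler still boils down to in the end: round
the address down, load the aligned word, and shift away the bytes
below the original pointer. A little-endian sketch of just that part:

	offset = (unsigned long)addr & (sizeof(long) - 1);
	data = *(unsigned long *)((unsigned long)addr & ~(sizeof(long) - 1));
	ret = data >> (offset * 8);	/* missing high bytes read back as zero */

All the new complication is in decoding the faulting instruction to
find 'addr' and the destination register in the first place.)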

And while it now has more added lines than deletions, with the diffstat saying

3 files changed, 60 insertions(+), 43 deletions(-)

most of the added lines are still that block comment, and some *very*
anal but trivial sanity checks of the instruction decode.

So I could have made it smaller than it used to be by just not doing
any of those verifications, and maybe I went a bit overboard, but I
think this is such a rare case that it's better to be ridiculously
careful than to try to minimize the number of lines.
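
To make "ridiculously careful" concrete: the only thing the handler
will accept is the plain word-sized load form, i.e. opcode 0x8b with a
full 8-byte operand on x86-64, something like

	48 8b 06	mov (%rsi),%rax

(encoding spelled out purely as an illustration). Anything that fails
the decode checks makes the fixup bail out, and we oops as if there
were no exception table entry at all.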

So it may be a few more lines, but I can argue that it is still at
least conceptually simpler than the conditional "asm goto with
outputs" code was.

And yeah, it makes the generated code marginally better in some places.

So since I was tricked into writing this patch, and it's even tested
(the second attachment has a truly stupid patch with my test-case), I
think it's worth doing.

Comments? I left your "Acked-by" from the previous version of this
thing, so holler now if you think this got too ugly in the meantime..

Linus
From 310edfc7b6b0c9a7921f11c051234c79385130ed Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Date: Sun, 14 Aug 2022 14:16:13 -0700
Subject: [PATCH] x86: simplify load_unaligned_zeropad() implementation

The exception for the "unaligned access at the end of the page, next
page not mapped" case basically never happens, but the fixup code ends
up making it hard for compilers to optimize the surrounding code well.

clang in particular ends up seeing it in the middle of a loop and
tries desperately to optimize the exception fixup code that is never
really reached.

The simple solution is to just move all the fixups into the exception
handler itself, which moves it all out of the hot-path code and means
that the compiler never sees it or needs to worry about it.

Acked-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
 arch/x86/include/asm/extable_fixup_types.h |  2 +
 arch/x86/include/asm/word-at-a-time.h      | 46 ++----------------
 arch/x86/mm/extable.c                      | 55 ++++++++++++++++++++++
 3 files changed, 60 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 503622627400..b53f1919710b 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -64,4 +64,6 @@
 #define EX_TYPE_UCOPY_LEN4		(EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
 #define EX_TYPE_UCOPY_LEN8		(EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
 
+#define EX_TYPE_ZEROPAD			20 /* longword load with zeropad on fault */
+
 #endif
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index 8338b0432b50..46b4f1f7f354 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -77,58 +77,18 @@ static inline unsigned long find_zero(unsigned long mask)
  * and the next page not being mapped, take the exception and
  * return zeroes in the non-existing part.
  */
-#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-
 static inline unsigned long load_unaligned_zeropad(const void *addr)
 {
-	unsigned long offset, data;
 	unsigned long ret;
 
-	asm_volatile_goto(
+	asm volatile(
 		"1:	mov %[mem], %[ret]\n"
-
-		_ASM_EXTABLE(1b, %l[do_exception])
-
-		: [ret] "=r" (ret)
-		: [mem] "m" (*(unsigned long *)addr)
-		: : do_exception);
-
-	return ret;
-
-do_exception:
-	offset = (unsigned long)addr & (sizeof(long) - 1);
-	addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
-	data = *(unsigned long *)addr;
-	ret = data >> offset * 8;
-
-	return ret;
-}
-
-#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
-
-static inline unsigned long load_unaligned_zeropad(const void *addr)
-{
-	unsigned long offset, data;
-	unsigned long ret, err = 0;
-
-	asm(	"1:	mov %[mem], %[ret]\n"
 		"2:\n"
-
-		_ASM_EXTABLE_FAULT(1b, 2b)
-
-		: [ret] "=&r" (ret), "+a" (err)
+		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD)
+		: [ret] "=r" (ret)
 		: [mem] "m" (*(unsigned long *)addr));
 
-	if (unlikely(err)) {
-		offset = (unsigned long)addr & (sizeof(long) - 1);
-		addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
-		data = *(unsigned long *)addr;
-		ret = data >> offset * 8;
-	}
-
 	return ret;
 }
 
-#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
-
 #endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 331310c29349..60814e110a54 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -41,6 +41,59 @@ static bool ex_handler_default(const struct exception_table_entry *e,
 	return true;
 }
 
+/*
+ * This is the *very* rare case where we do a "load_unaligned_zeropad()"
+ * and it's a page crosser into a non-existent page.
+ *
+ * This happens when we optimistically load a pathname a word-at-a-time
+ * and the name is less than the full word and the next page is not
+ * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
+ *
+ * NOTE! The fault is always on a 'mov mem,reg' type instruction of
+ * size 'long', and the exception fixup must always point to right
+ * after the instruction.
+ */
+static bool ex_handler_zeropad(const struct exception_table_entry *e,
+			       struct pt_regs *regs,
+			       unsigned long fault_addr)
+{
+	struct insn insn;
+	const unsigned long mask = sizeof(long) - 1;
+	unsigned long offset, addr, next_ip, len;
+	unsigned long *reg;
+
+	next_ip = ex_fixup_addr(e);
+	len = next_ip - regs->ip;
+	if (len > MAX_INSN_SIZE)
+		return false;
+
+	if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN))
+		return false;
+	if (insn.length != len)
+		return false;
+
+	if (insn.opcode.bytes[0] != 0x8b)
+		return false;
+	if (insn.opnd_bytes != sizeof(long))
+		return false;
+
+	addr = (unsigned long) insn_get_addr_ref(&insn, regs);
+	if (addr == ~0ul)
+		return false;
+
+	offset = addr & mask;
+	addr = addr & ~mask;
+	if (fault_addr != addr + sizeof(long))
+		return false;
+
+	reg = insn_get_modrm_reg_ptr(&insn, regs);
+	if (!reg)
+		return false;
+
+	*reg = *(unsigned long *)addr >> (offset * 8);
+	return ex_handler_default(e, regs);
+}
+
 static bool ex_handler_fault(const struct exception_table_entry *fixup,
 			     struct pt_regs *regs, int trapnr)
 {
@@ -217,6 +270,8 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 		return ex_handler_sgx(e, regs, trapnr);
 	case EX_TYPE_UCOPY_LEN:
 		return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+	case EX_TYPE_ZEROPAD:
+		return ex_handler_zeropad(e, regs, fault_addr);
 	}
 	BUG();
 }
--
2.37.1.289.g45aa1e5c72.dirty

 init/main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/init/main.c b/init/main.c
index 91642a4e69be..c1e632582419 100644
--- a/init/main.c
+++ b/init/main.c
@@ -108,6 +108,8 @@
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
 
+#include <asm/word-at-a-time.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/initcall.h>
 
@@ -1488,6 +1490,15 @@ void __weak free_initmem(void)
 	free_initmem_default(POISON_FREE_INITMEM);
 }
 
+static int test_unaligned(void *unused)
+{
+	void *buf = vmalloc(PAGE_SIZE);
+	memset(buf, 0xfe, PAGE_SIZE);
+	printk("load_unaligned_zeropad = %016lx\n", load_unaligned_zeropad(buf + PAGE_SIZE - 5));
+	kthread_exit(0);
+}
+
+
 static int __ref kernel_init(void *unused)
 {
 	int ret;
@@ -1518,6 +1529,8 @@ static int __ref kernel_init(void *unused)
 	system_state = SYSTEM_RUNNING;
 	numa_default_policy();
 
+	kernel_thread(test_unaligned, NULL, CLONE_FS | CLONE_FILES);
+
 	rcu_end_inkernel_boot();
 
 	do_sysctl_args();
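
(For the record, what the test is expected to show: vmalloc leaves an
unmapped guard page after the allocation, so the load at 'buf +
PAGE_SIZE - 5' is exactly the page-crossing case, and on x86-64 the
printk should say

	load_unaligned_zeropad = 000000fefefefefe

with the five mapped 0xfe bytes in the low bits and the three missing
bytes zero-padded. The value here is hand-computed as an illustration,
not copied from a boot log.)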