Re: [PATCH RFC 0/2] KVM: s390: avoid having to enable vm.alloc_pgste

From: Martin Schwidefsky
Date: Fri Jun 02 2017 - 05:47:02 EST


On Fri, 2 Jun 2017 09:02:10 +0200
Heiko Carstens <heiko.carstens@xxxxxxxxxx> wrote:

> On Thu, Jun 01, 2017 at 12:46:51PM +0200, Martin Schwidefsky wrote:
> > > Unfortunately, converting all page tables to 4k pgste page tables is
> > > not possible without provoking various race conditions.
> >
> > That is one approach we tried and was found to be buggy. The point is that
> > you are not allowed to reallocate a page table while a VMA exists that is
> > in the address range of that page table.
> >
> > Another approach we tried is to use an ELF flag on the qemu executable.
> > That does not work either because fs/exec.c allocates and populates the
> > new mm struct for the argument pages before fs/binfmt_elf.c comes into
> > play.
>
> How about if you would fail the system call within arch_check_elf() if you
> detect that the binary requires pgstes (as indicated by elf flags) and then
> restart the system call?
>
> That is: arch_check_elf() e.g. would set a thread flag that future mm's
> should be allocated with pgstes. Then do_execve() would cleanup everything
> and return to entry.S. Upon return to userspace we detect this condition
> and simply restart the system call, similar to signals vs -ERESTARTSYS.
>
> That would make do_execve() cleanup everything and upon reentering it would
> allocate an mm with the pgste flag set.
>
> Maybe this is a bit over-simplified, but might work.

This is not over-simplified at all, that does work:
--
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 69a77eecaec1..7bd182676ddd 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES

config S390
def_bool y
+ select ARCH_BINFMT_ELF_STATE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index e8f623041769..79911231f9e6 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -151,6 +151,28 @@ extern unsigned int vdso_enabled;
&& (x)->e_ident[EI_CLASS] == ELF_CLASS)
#define compat_start_thread start_thread31

+struct arch_elf_state {
+};
+
+#define INIT_ARCH_ELF_STATE { }
+
+#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0)
+#define arch_check_elf(ehdr, interp, interp_ehdr, state) \
+({ \
+ struct elf64_hdr *hdr = (void*) ehdr; \
+ int _rc = 0; \
+ if (hdr->e_ident[EI_CLASS] == ELFCLASS64 && \
+ (hdr->e_flags & 0x00000002) && \
+ !page_table_allocate_pgste && \
+ !current->mm->context.alloc_pgste) { \
+ current->mm->context.alloc_pgste = 1; \
+ set_pt_regs_flag(task_pt_regs(current), \
+ PIF_SYSCALL_RESTART); \
+ _rc = -EAGAIN; \
+ } \
+ _rc; \
+})
+
/* For SVR4/S390 the function pointer to be registered with `atexit` is
passed in R14. */
#define ELF_PLAT_INIT(_r, load_addr) \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index c119d564d8f2..268a5d22ce1b 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -25,7 +25,8 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
- mm->context.alloc_pgste = page_table_allocate_pgste;
+ mm->context.alloc_pgste = page_table_allocate_pgste ||
+ current->mm->context.alloc_pgste;
mm->context.has_pgste = 0;
mm->context.use_skey = 0;
mm->context.use_cmma = 0;
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 99bc456cc26a..24baa80f7af6 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -11,9 +11,11 @@

#define PIF_SYSCALL 0 /* inside a system call */
#define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */
+#define PIF_SYSCALL_RESTART 2 /* restart the current system call */

#define _PIF_SYSCALL _BITUL(PIF_SYSCALL)
#define _PIF_PER_TRAP _BITUL(PIF_PER_TRAP)
+#define _PIF_SYSCALL_RESTART _BITUL(PIF_SYSCALL_RESTART)

#ifndef __ASSEMBLY__

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 0c2c3b8bfc9a..8c824b32527a 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -52,7 +52,7 @@ _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
_TIF_SYSCALL_TRACEPOINT)
_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
_CIF_ASCE_SECONDARY | _CIF_FPU)
-_PIF_WORK = (_PIF_PER_TRAP)
+_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)

#define BASED(name) name-cleanup_critical(%r13)

@@ -342,6 +342,8 @@ ENTRY(system_call)
jo .Lsysc_guarded_storage
TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP
jo .Lsysc_singlestep
+ TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
+ jo .Lsysc_syscall_restart
TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
jo .Lsysc_sigpending
TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
@@ -434,6 +436,15 @@ ENTRY(system_call)
jg do_per_trap

#
+# _PIF_SYSCALL_RESTART is set, repeat the current system call
+#
+.Lsysc_syscall_restart:
+ ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART
+ lmg %r1,%r7,__PT_R1(%r11) # load svc arguments
+ lg %r2,__PT_ORIG_GPR2(%r11)
+ j .Lsysc_do_svc
+
+#
# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
# and after the system call
#
--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.