[PATCH 4.9 21/39] kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user

From: Greg Kroah-Hartman
Date: Wed Jan 03 2018 - 15:21:21 EST


4.9-stable review patch. If anyone has any objections, please let me know.

------------------

From: Hugh Dickins <hughd@xxxxxxxxxx>


We have many machines (Westmere, Sandybridge, Ivybridge) supporting
PCID but not INVPCID: on these load_new_mm_cr3() simply crashed.

Flushing user context inside load_new_mm_cr3() without the use of
invpcid is difficult: momentarily switch from kernel to user context
and back to do so? I'm not sure whether that can be safely done at
all, and would risk polluting user context with kernel internals,
and kernel context with stale user externals.

Instead, follow the hint in the comment that was there: change
X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3()
can leave a note in it, for SWITCH_USER_CR3 on return to userspace to
flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH.

Which works well enough that there's no need to do it this way only
when invpcid is unsupported: it's a good alternative to invpcid here.
But there's a couple of inlines in asm/tlbflush.h that need to do the
same trick, so it's best to localize all this per-cpu business in
mm/kaiser.c: moving that part of the initialization from setup_pcid()
to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the
function for noting an X86_CR3_PCID_USER_FLUSH. And let's keep a
KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit.

I did try to make the feature tests in asm/tlbflush.h more consistent
with each other: there seem to be far too many ways of performing such
tests, and I don't have a good grasp of their differences. At first
I converted them all to be static_cpu_has(): but that proved to be a
mistake, as the comment in __native_flush_tlb_single() hints; so then
I reversed and made them all this_cpu_has(). Probably all gratuitous
change, but that's the way it's working at present.

I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR
gets re-initialized by each cpu (before and after these changes):
no problem when (as usual) all cpus on a machine have the same
features, but in principle incorrect. However, my experiment
to per-cpu-ify that one did not end well...

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
arch/x86/include/asm/kaiser.h | 18 +++++++-----
arch/x86/include/asm/tlbflush.h | 56 +++++++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 22 ---------------
arch/x86/mm/kaiser.c | 50 +++++++++++++++++++++++++++++++----
arch/x86/mm/tlb.c | 46 ++++++++++++--------------------
5 files changed, 113 insertions(+), 79 deletions(-)

--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -32,13 +32,12 @@ movq \reg, %cr3
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
-/*
- * This can obviously be one instruction by putting the
- * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
- * But, just leave it now for simplicity.
- */
-orq X86_CR3_PCID_USER_VAR, \reg
-orq $(KAISER_SHADOW_PGD_OFFSET), \reg
+orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
+js 9f
+// FLUSH this time, reset to NOFLUSH for next time
+// But if nopcid? Consider using 0x80 for user pcid?
+movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
+9:
movq \reg, %cr3
.endm

@@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_b
*/
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

+extern unsigned long X86_CR3_PCID_KERN_VAR;
+DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
/**
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,6 +13,7 @@ static inline void __invpcid(unsigned lo
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
+
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
@@ -131,27 +132,42 @@ static inline void cr4_set_bits_and_upda
cr4_set_bits(mask);
}

+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
static inline void __native_flush_tlb(void)
{
- if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
- * If current->mm == NULL then we borrow a mm which may change during a
- * task switch and therefore we must not be preempted while we write CR3
- * back:
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
- preempt_disable();
- native_write_cr3(native_read_cr3());
- preempt_enable();
+ invpcid_flush_all_nonglobals();
return;
}
+
/*
- * We are no longer using globals with KAISER, so a
- * "nonglobals" flush would work too. But, this is more
- * conservative.
- *
- * Note, this works with CR4.PCIDE=0 or 1.
+ * If current->mm == NULL then we borrow a mm which may change during a
+ * task switch and therefore we must not be preempted while we write CR3
+ * back:
*/
- invpcid_flush_all();
+ preempt_disable();
+ if (this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ native_write_cr3(native_read_cr3());
+ preempt_enable();
}

static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -167,9 +183,13 @@ static inline void __native_flush_tlb_gl

static inline void __native_flush_tlb_global(void)
{
+#ifdef CONFIG_KAISER
+ /* Globals are not used at all */
+ __native_flush_tlb();
+#else
unsigned long flags;

- if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
@@ -186,10 +206,9 @@ static inline void __native_flush_tlb_gl
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
-
__native_flush_tlb_global_irq_disabled();
-
raw_local_irq_restore(flags);
+#endif
}

static inline void __native_flush_tlb_single(unsigned long addr)
@@ -200,9 +219,12 @@ static inline void __native_flush_tlb_si
*
* The ASIDs used below are hard-coded. But, we must not
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call
- * invpcid in the case we are called early.
+ * invlpg in the case we are called early.
*/
+
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ if (this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return;
}
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -324,33 +324,12 @@ static __always_inline void setup_smap(s
}
}

-/*
- * These can have bit 63 set, so we can not just use a plain "or"
- * instruction to get their value or'd into CR3. It would take
- * another register. So, we use a memory reference to these
- * instead.
- *
- * This is also handy because systems that do not support
- * PCIDs just end up or'ing a 0 into their CR3, which does
- * no harm.
- */
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
-
static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
cr4_set_bits(X86_CR4_PCIDE);
/*
- * These variables are used by the entry/exit
- * code to change PCIDs.
- */
-#ifdef CONFIG_KAISER
- X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
- X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
-#endif
- /*
* INVPCID has two "groups" of types:
* 1/2: Invalidate an individual address
* 3/4: Invalidate all contexts
@@ -375,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x8
clear_cpu_cap(c, X86_FEATURE_PCID);
}
}
+ kaiser_setup_pcid();
}

/*
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -11,12 +11,26 @@
#include <linux/uaccess.h>

#include <asm/kaiser.h>
+#include <asm/tlbflush.h> /* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
+
#ifdef CONFIG_KAISER
+__visible
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR;
+DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);

-__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
@@ -238,9 +252,6 @@ static void __init kaiser_init_all_pgds(
WARN_ON(__ret); \
} while (0)

-extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-extern unsigned long X86_CR3_PCID_KERN_VAR;
-extern unsigned long X86_CR3_PCID_USER_VAR;
/*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
@@ -294,8 +305,6 @@ void __init kaiser_init(void)

kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
__PAGE_KERNEL);
- kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
- __PAGE_KERNEL);
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
@@ -358,4 +367,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp,
}
return pgd;
}
+
+void kaiser_setup_pcid(void)
+{
+ unsigned long kern_cr3 = 0;
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+ if (this_cpu_has(X86_FEATURE_PCID)) {
+ kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+ }
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+ X86_CR3_PCID_KERN_VAR = kern_cr3;
+ this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if cpu does not, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+ this_cpu_write(X86_CR3_PCID_USER_VAR,
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
#endif /* CONFIG_KAISER */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,13 +6,14 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
+#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
-#include <linux/debugfs.h>
+#include <asm/kaiser.h>

/*
* TLB flushing, formerly SMP-only
@@ -38,34 +39,23 @@ static void load_new_mm_cr3(pgd_t *pgdir
{
unsigned long new_mm_cr3 = __pa(pgdir);

- /*
- * KAISER, plus PCIDs needs some extra work here. But,
- * if either of features is not present, we need no
- * PCIDs here and just do a normal, full TLB flush with
- * the write_cr3()
- */
- if (!IS_ENABLED(CONFIG_KAISER) ||
- !cpu_feature_enabled(X86_FEATURE_PCID))
- goto out_set_cr3;
- /*
- * We reuse the same PCID for different tasks, so we must
- * flush all the entires for the PCID out when we change
- * tasks.
- */
- new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
-
- /*
- * The flush from load_cr3() may leave old TLB entries
- * for userspace in place. We must flush that context
- * separately. We can theoretically delay doing this
- * until we actually load up the userspace CR3, but
- * that's a bit tricky. We have to have the "need to
- * flush userspace PCID" bit per-cpu and check it in the
- * exit-to-userspace paths.
- */
- invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
+#ifdef CONFIG_KAISER
+ if (this_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+ * Flush KERN below, flush USER when returning to userspace in
+ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+ *
+ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+ * do it here, but can only be used if X86_FEATURE_INVPCID is
+ * available - and many machines support pcid without invpcid.
+ */
+ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ kaiser_flush_tlb_on_return_to_user();
+ }
+#endif /* CONFIG_KAISER */

-out_set_cr3:
/*
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB