[GIT PULL] x86 fixes

From: Ingo Molnar
Date: Thu Jan 14 2016 - 05:16:36 EST


Linus,

Please pull the latest x86-urgent-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-urgent-for-linus

# HEAD: 7030a7e9321166eef44c811fe4af4d460360d424 x86/cpu/amd: Remove an unneeded condition in srat_detect_node()

Misc changes:

- fix lguest bug
- fix /proc/meminfo output on certain configs
- fix pvclock bug
- fix reboot on certain iMacs by adding new reboot quirk
- fix bootup crash
- fix FPU boot line option parsing
- add more x86 self-tests
- small cleanups, documentation improvements, etc.

out-of-topic modifications in x86-urgent-for-linus:
-----------------------------------------------------
drivers/lguest/core.c # e27d90e8be08: lguest: Map switcher text R/
tools/testing/selftests/x86/Makefile# 65cacec1ba90: selftests/x86: Test __kernel
# 0f672809f91a: selftests/x86: Disable the l
tools/testing/selftests/x86/vdso_restorer.c# 65cacec1ba90: selftests/x86: Test __kernel

Thanks,

Ingo

------------------>
Andy Lutomirski (5):
x86/mm: Add barriers and document switch_mm()-vs-flush synchronization
selftests/x86: Disable the ldt_gdt_64 test for now
selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn
x86/mm: Improve switch_mm() barrier comments
x86/vdso/pvclock: Protect STABLE check with the seqcount

Borislav Petkov (1):
x86/boot: Hide local labels in verify_cpu()

Dan Carpenter (1):
x86/cpu/amd: Remove an unneeded condition in srat_detect_node()

Dave Jones (1):
x86/mm/pat: Make split_page_count() check for empty levels to fix /proc/meminfo output

H.J. Lu (1):
x86/boot: Double BOOT_HEAP_SIZE to 64KB

Kefeng Wang (1):
x86/mm: Use PAGE_ALIGNED instead of IS_ALIGNED

Mario Kleiner (1):
x86/reboot/quirks: Add iMac10,1 to pci_reboot_dmi_table[]

Rusty Russell (1):
lguest: Map switcher text R/O

yu-cheng yu (4):
x86/fpu: Fix early FPU command-line parsing
x86/fpu: Disable XGETBV1 when no XSAVE
x86/fpu: Disable MPX when eagerfpu is off
x86/fpu: Disable AVX when eagerfpu is off


arch/x86/entry/vdso/vclock_gettime.c | 12 +--
arch/x86/include/asm/boot.h | 2 +-
arch/x86/include/asm/fpu/internal.h | 1 +
arch/x86/include/asm/fpu/xstate.h | 11 +-
arch/x86/include/asm/lguest.h | 4 +-
arch/x86/include/asm/mmu_context.h | 34 +++++-
arch/x86/kernel/cpu/amd.c | 3 +-
arch/x86/kernel/fpu/init.c | 161 ++++++++++++++--------------
arch/x86/kernel/fpu/xstate.c | 4 +-
arch/x86/kernel/reboot.c | 8 ++
arch/x86/kernel/verify_cpu.S | 50 ++++-----
arch/x86/mm/init_64.c | 3 +-
arch/x86/mm/pageattr.c | 3 +
arch/x86/mm/tlb.c | 29 ++++-
drivers/lguest/core.c | 74 +++++++++----
tools/testing/selftests/x86/Makefile | 6 +-
tools/testing/selftests/x86/vdso_restorer.c | 88 +++++++++++++++
17 files changed, 342 insertions(+), 151 deletions(-)
create mode 100644 tools/testing/selftests/x86/vdso_restorer.c

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 8602f06c759f..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
*
* On Xen, we don't appear to have that guarantee, but Xen still
* supplies a valid seqlock using the version field.
-
+ *
* We only do pvclock vdso timing at all if
* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
* mean that all vCPUs have matching pvti and that the TSC is
* synced, so we can just look at vCPU 0's pvti.
*/

- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
- *mode = VCLOCK_NONE;
- return 0;
- }
-
do {
version = pvti->version;

smp_rmb();

+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
tsc = rdtsc_ordered();
pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
pvti_tsc_shift = pvti->tsc_shift;
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..6b8d6e8cd449 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */

-#define BOOT_HEAP_SIZE 0x8000
+#define BOOT_HEAP_SIZE 0x10000

#endif /* !CONFIG_KERNEL_BZIP2 */

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index eadcdd5bb946..0fd440df63f1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);

/*
* Debugging facility:
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 3a6c89b70307..af30fdeb140d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,15 +20,16 @@

/* Supported features which support lazy state saving */
#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)

-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
/* All currently supported features */
#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)

diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a57a31..73d0c9b92087 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
#define GUEST_PL 1

/* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)

/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..bfd9b2a35a0b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
#endif
cpumask_set_cpu(cpu, mm_cpumask(next));

- /* Re-load page tables */
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
load_cr3(next->pgd);
+
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

/* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* schedule, protecting us from simultaneous changes.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
+
/*
* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e678ddeed030..a07956a08936 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
*/
int ht_nodeid = c->initial_apicid;

- if (ht_nodeid >= 0 &&
- __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7b2978ab30df..6d9f0a7ef4c8 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
*/
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>

#include <linux/sched.h>
+#include <linux/init.h>

/*
* Initialize the TS bit in CR0 according to the style of context-switches
@@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;

-static int __init eager_fpu_setup(char *s)
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
{
- if (!strcmp(s, "on"))
- eagerfpu = ENABLE;
- else if (!strcmp(s, "off"))
- eagerfpu = DISABLE;
- else if (!strcmp(s, "auto"))
- eagerfpu = AUTO;
- return 1;
+ /* Support all xfeatures known to us */
+ if (eagerfpu != DISABLE)
+ return XCNTXT_MASK;
+
+ /* Warning of xfeatures being disabled for no eagerfpu mode */
+ if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+ pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+ xfeatures_mask & XFEATURE_MASK_EAGER);
+ }
+
+ /* Return a mask that masks out all features requiring eagerfpu mode */
+ return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
}
-__setup("eagerfpu=", eager_fpu_setup);

/*
* Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ * FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ * switching. Should the kernel boot with noxsaveopt, we support MPX
+ * with eager FPU switching at a higher cost.
*/
static void __init fpu__init_system_ctx_switch(void)
{
@@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
WARN_ON_FPU(current->thread.fpu.fpstate_active);
current_thread_info()->status = 0;

- /* Auto enable eagerfpu for xsaveopt */
if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;

- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- if (eagerfpu == DISABLE) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- xfeatures_mask &= ~XFEATURE_MASK_EAGER;
- } else {
- eagerfpu = ENABLE;
- }
- }
+ if (xfeatures_mask & XFEATURE_MASK_EAGER)
+ eagerfpu = ENABLE;

if (eagerfpu == ENABLE)
setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -316,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void)
}

/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+ /*
+ * No need to check "eagerfpu=auto" again, since it is the
+ * initial default.
+ */
+ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+ eagerfpu = DISABLE;
+ fpu__clear_eager_fpu_features();
+ } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+ eagerfpu = ENABLE;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ fpu__xstate_clear_all_cpu_caps();
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpu__init_parse_early_param();
fpu__init_system_early_generic(c);

/*
@@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)

fpu__init_system_ctx_switch();
}
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FPU);
- return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
- if (strlen(s))
- return 0;
-
- fpu__xstate_clear_all_cpu_caps();
-
- return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
- return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
- return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
-
- return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 40f100285984..d425cda5ae6d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
}

/*
@@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
BUG();
}

- /* Support only the state known to the OS: */
- xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+ xfeatures_mask &= fpu__get_supported_xfeatures_mask();

/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d64889aa2d46..ab0adc0fa5db 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },

/* ASRock */
{ /* Handle problems with rebooting on ASRock Q1900DC-ITX */
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 4cf401f581e7..07efb35ee4bc 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -48,31 +48,31 @@
pushfl
popl %eax
cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
#endif

movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
+ jb .Lverify_cpu_no_longmode # no cpuid 1

xor %di,%di
cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
mov $1,%di # cpu is from AMD
- jmp verify_cpu_check
+ jmp .Lverify_cpu_check

-verify_cpu_noamd:
+.Lverify_cpu_noamd:
cmpl $0x756e6547,%ebx # GenuineIntel?
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x49656e69,%edx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x6c65746e,%ecx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check

# only call IA32_MISC_ENABLE when:
# family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@
andl $0x0ff00f00, %eax # mask family and extended family
shrl $8, %eax
cmpl $6, %eax
- ja verify_cpu_clear_xd # family > 6, ok
- jb verify_cpu_check # family < 6, skip
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip

andl $0x000f00f0, %ecx # mask model and extended model
shrl $4, %ecx
cmpl $0xd, %ecx
- jb verify_cpu_check # family == 6, model < 0xd, skip
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip

-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
movl $MSR_IA32_MISC_ENABLE, %ecx
rdmsr
btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
- jnc verify_cpu_check # only write MSR if bit was changed
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
wrmsr

-verify_cpu_check:
+.Lverify_cpu_check:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode

movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
+ jb .Lverify_cpu_no_longmode # no extended cpuid

movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode

-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
+ je .Lverify_cpu_sse_ok
test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
+ jmp .Lverify_cpu_sse_test # try again

-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..8829482d69ec 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;

- if (IS_ALIGNED(addr, PAGE_SIZE) &&
- IS_ALIGNED(next, PAGE_SIZE)) {
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
/*
* Do not free direct mapping pages since they were
* freed when offlining, or simplely not in use.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6000ad7f560c..fc6a4c8f6e2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)

static void split_page_count(int level)
{
+ if (direct_pages_count[level] == 0)
+ return;
+
direct_pages_count[level]--;
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
preempt_disable();

count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+ /* This is an implicit full barrier that synchronizes with switch_mm. */
local_flush_tlb();
+
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;

preempt_disable();
- if (current->active_mm != mm)
+ if (current->active_mm != mm) {
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
+ }

if (!current->mm) {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
}

if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;

+ /*
+ * Both branches below are implicit full barriers (MOV to CR or
+ * INVLPG) that synchronize with switch_mm.
+ */
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
preempt_disable();

if (current->active_mm == mm) {
- if (current->mm)
+ if (current->mm) {
+ /*
+ * Implicit full barrier (INVLPG) that synchronizes
+ * with switch_mm.
+ */
__flush_tlb_one(start);
- else
+ } else {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+ }
}

if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 312ffd3d0017..9e385b38debf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -22,7 +22,8 @@

unsigned long switcher_addr;
struct page **lg_switcher_pages;
-static struct vm_struct *switcher_vma;
+static struct vm_struct *switcher_text_vma;
+static struct vm_struct *switcher_stacks_vma;

/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
@@ -83,54 +84,80 @@ static __init int map_switcher(void)
}

/*
+ * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
+ * It goes in the first page, which we map in momentarily.
+ */
+ memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
+ end_switcher_text - start_switcher_text);
+ kunmap(lg_switcher_pages[0]);
+
+ /*
* We place the Switcher underneath the fixmap area, which is the
* highest virtual address we can get. This is important, since we
* tell the Guest it can't access this memory, so we want its ceiling
* as high as possible.
*/
- switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
+ switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;

/*
- * Now we reserve the "virtual memory area" we want. We might
- * not get it in theory, but in practice it's worked so far.
- * The end address needs +1 because __get_vm_area allocates an
- * extra guard page, so we need space for that.
+ * Now we reserve the "virtual memory area"s we want. We might
+ * not get them in theory, but in practice it's worked so far.
+ *
+ * We want the switcher text to be read-only and executable, and
+ * the stacks to be read-write and non-executable.
*/
- switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
- VM_ALLOC, switcher_addr, switcher_addr
- + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
- if (!switcher_vma) {
+ switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
+ switcher_addr,
+ switcher_addr + PAGE_SIZE);
+
+ if (!switcher_text_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_pages;
}

+ switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
+ VM_ALLOC|VM_NO_GUARD,
+ switcher_addr + PAGE_SIZE,
+ switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
+ if (!switcher_stacks_vma) {
+ err = -ENOMEM;
+ printk("lguest: could not map switcher pages high\n");
+ goto free_text_vma;
+ }
+
/*
* This code actually sets up the pages we've allocated to appear at
* switcher_addr. map_vm_area() takes the vma we allocated above, the
- * kind of pages we're mapping (kernel pages), and a pointer to our
- * array of struct pages.
+ * kind of pages we're mapping (kernel text pages and kernel writable
+ * pages respectively), and a pointer to our array of struct pages.
*/
- err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
+ err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
+ if (err) {
+ printk("lguest: text map_vm_area failed: %i\n", err);
+ goto free_vmas;
+ }
+
+ err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
+ lg_switcher_pages + SWITCHER_TEXT_PAGES);
if (err) {
- printk("lguest: map_vm_area failed: %i\n", err);
- goto free_vma;
+ printk("lguest: stacks map_vm_area failed: %i\n", err);
+ goto free_vmas;
}

/*
* Now the Switcher is mapped at the right address, we can't fail!
- * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
*/
- memcpy(switcher_vma->addr, start_switcher_text,
- end_switcher_text - start_switcher_text);
-
printk(KERN_INFO "lguest: mapped switcher at %p\n",
- switcher_vma->addr);
+ switcher_text_vma->addr);
/* And we succeeded... */
return 0;

-free_vma:
- vunmap(switcher_vma->addr);
+free_vmas:
+ /* Undoes map_vm_area and __get_vm_area */
+ vunmap(switcher_stacks_vma->addr);
+free_text_vma:
+ vunmap(switcher_text_vma->addr);
free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
@@ -148,7 +175,8 @@ static void unmap_switcher(void)
unsigned int i;

/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
- vunmap(switcher_vma->addr);
+ vunmap(switcher_text_vma->addr);
+ vunmap(switcher_stacks_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
__free_pages(lg_switcher_pages[i], 0);
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index eabcff411984..d0c473f65850 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,9 +4,11 @@ include ../lib.mk

.PHONY: all all_32 all_64 warn_32bit_failure clean

-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
- test_FCMOV test_FCOMI test_FISTTP
+ test_FCMOV test_FCOMI test_FISTTP \
+ ldt_gdt \
+ vdso_restorer

TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c
new file mode 100644
index 000000000000..cb038424a403
--- /dev/null
+++ b/tools/testing/selftests/x86/vdso_restorer.c
@@ -0,0 +1,88 @@
+/*
+ * vdso_restorer.c - tests vDSO-based signal restore
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This makes sure that sa_restorer == NULL keeps working on 32-bit
+ * configurations. Modern glibc doesn't use it under any circumstances,
+ * so it's easy to overlook breakage.
+ *
+ * 64-bit userspace has never supported sa_restorer == NULL, so this is
+ * 32-bit only.
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/syscall.h>
+
+/* Open-code this -- the headers are too messy to easily use them. */
+struct real_sigaction {
+ void *handler;
+ unsigned long flags;
+ void *restorer;
+ unsigned int mask[2];
+};
+
+static volatile sig_atomic_t handler_called;
+
+static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
+{
+ handler_called = 1;
+}
+
+static void handler_without_siginfo(int sig)
+{
+ handler_called = 1;
+}
+
+int main()
+{
+ int nerrs = 0;
+ struct real_sigaction sa;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.handler = handler_with_siginfo;
+ sa.flags = SA_SIGINFO;
+ sa.restorer = NULL; /* request kernel-provided restorer */
+
+ if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
+ err(1, "raw rt_sigaction syscall");
+
+ raise(SIGUSR1);
+
+ if (handler_called) {
+ printf("[OK]\tSA_SIGINFO handler returned successfully\n");
+ } else {
+ printf("[FAIL]\tSA_SIGINFO handler was not called\n");
+ nerrs++;
+ }
+
+ sa.flags = 0;
+ sa.handler = handler_without_siginfo;
+ if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
+ err(1, "raw sigaction syscall");
+ handler_called = 0;
+
+ raise(SIGUSR1);
+
+ if (handler_called) {
+ printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
+ } else {
+ printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
+ nerrs++;
+ }
+}