Re: [PATCH v2] Linux VM workaround for Knights Landing A/D leak

From: Borislav Petkov
Date: Tue Jun 14 2016 - 14:10:13 EST


On Tue, Jun 14, 2016 at 07:01:12PM +0200, Lukasz Anaczkowski wrote:
> From: Andi Kleen <ak@xxxxxxxxxxxxxxx>
>
> Knights Landing has a issue that a thread setting A or D bits
> may not do so atomically against checking the present bit.
> A thread which is going to page fault may still set those
> bits, even though the present bit was already atomically cleared.
>
> This implies that when the kernel clears present atomically,
> some time later the supposed to be zero entry could be corrupted
> with stray A or D bits.
>
> Since the PTE could be already used for storing a swap index,
> or a NUMA migration index, this cannot be tolerated. Most
> of the time the kernel detects the problem, but in some
> rare cases it may not.
>
> This patch enforces that the page unmap path in vmscan/direct reclaim
> always flushes other CPUs after clearing each page, and also
> clears the PTE again after the flush.
>
> For reclaim this brings the performance back to before Mel's
> flushing changes, but for unmap it disables batching.
>
> This makes sure any leaked A/D bits are immediately cleared before the entry
> is used for something else.
>
> Any parallel faults that check for entry is zero may loop,
> but they should eventually recover after the entry is written.
>
> Also other users may spin in the page table lock until we
> "fixed" the PTE. This is ensured by always taking the page table lock
> even for the swap cache case. Previously this was only done
> on architectures with non atomic PTE accesses (such as 32bit PTE),
> but now it is also done when this bug workaround is active.
>
> I audited apply_pte_range and other users of arch_enter_lazy...
> and they seem to all not clear the present bit.
>
> Right now the extra flush is done in the low level
> architecture code, while the higher level code still
> does batched TLB flush. This means there is always one extra
> unnecessary TLB flush now. As a followon optimization
> this could be avoided by telling the callers that
> the flush already happenend.
>
> v2 (Lukasz Anaczkowski):
> () added call to smp_mb__after_atomic() to synchornize with
> switch_mm, based on Nadav's comment
> () fixed compilation breakage
>
> Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> Signed-off-by: Lukasz Anaczkowski <lukasz.anaczkowski@xxxxxxxxx>
> ---
> arch/x86/include/asm/cpufeatures.h | 1 +
> arch/x86/include/asm/hugetlb.h | 9 ++++++++-
> arch/x86/include/asm/pgtable.h | 5 +++++
> arch/x86/include/asm/pgtable_64.h | 6 ++++++
> arch/x86/kernel/cpu/intel.c | 10 ++++++++++
> arch/x86/mm/tlb.c | 22 ++++++++++++++++++++++
> include/linux/mm.h | 4 ++++
> mm/memory.c | 3 ++-
> 8 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 4a41348..2c48011 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -303,6 +303,7 @@
> #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
> #define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */
> #define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */
> +#define X86_BUG_PTE_LEAK X86_BUG(11) /* PTE may leak A/D bits after clear */
>
>
> #ifdef CONFIG_X86_32
> diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> index 3a10616..58e1ca9 100644
> --- a/arch/x86/include/asm/hugetlb.h
> +++ b/arch/x86/include/asm/hugetlb.h
> @@ -41,10 +41,17 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> set_pte_at(mm, addr, ptep, pte);
> }
>
> +extern void fix_pte_leak(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep);
> +
> static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> unsigned long addr, pte_t *ptep)
> {
> - return ptep_get_and_clear(mm, addr, ptep);
> + pte_t pte = ptep_get_and_clear(mm, addr, ptep);
> +
> + if (boot_cpu_has_bug(X86_BUG_PTE_LEAK))

static_cpu_has_bug()

> + fix_pte_leak(mm, addr, ptep);
> + return pte;
> }
>
> static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 1a27396..9769355 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -794,11 +794,16 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
> extern int ptep_clear_flush_young(struct vm_area_struct *vma,
> unsigned long address, pte_t *ptep);
>
> +extern void fix_pte_leak(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep);
> +
> #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep)
> {
> pte_t pte = native_ptep_get_and_clear(ptep);
> + if (boot_cpu_has_bug(X86_BUG_PTE_LEAK))

static_cpu_has_bug()

> + fix_pte_leak(mm, addr, ptep);
> pte_update(mm, addr, ptep);
> return pte;
> }
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 2ee7811..6fa4079 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -178,6 +178,12 @@ extern void cleanup_highmap(void);
> extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
> extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
>
> +#define ARCH_HAS_NEEDS_SWAP_PTL 1
> +static inline bool arch_needs_swap_ptl(void)
> +{
> + return boot_cpu_has_bug(X86_BUG_PTE_LEAK);

static_cpu_has_bug()

> +}
> +
> #endif /* !__ASSEMBLY__ */
>
> #endif /* _ASM_X86_PGTABLE_64_H */
> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
> index 6e2ffbe..f499513 100644
> --- a/arch/x86/kernel/cpu/intel.c
> +++ b/arch/x86/kernel/cpu/intel.c
> @@ -181,6 +181,16 @@ static void early_init_intel(struct cpuinfo_x86 *c)
> }
> }
>
> + if (c->x86_model == 87) {

That should be INTEL_FAM6_XEON_PHI_KNL, AFAICT.

> + static bool printed;
> +
> + if (!printed) {
> + pr_info("Enabling PTE leaking workaround\n");
> + printed = true;
> + }

pr_info_once

> + set_cpu_bug(c, X86_BUG_PTE_LEAK);
> + }
> +
> /*
> * Intel Quark Core DevMan_001.pdf section 6.4.11
> * "The operating system also is required to invalidate (i.e., flush)
--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.