Re: [PATCH v2 08/26] userfaultfd: wp: add WP pagetable tracking to x86

From: Jerome Glisse
Date: Thu Feb 21 2019 - 12:21:02 EST


On Tue, Feb 12, 2019 at 10:56:14AM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@xxxxxxxxxx>
>
> Accurate userfaultfd WP tracking is possible by tracking exactly which
> virtual memory ranges were writeprotected by userland. We can't rely
> only on the RW bit of the mapped pagetable because that information is
> destroyed by fork() or KSM or swap. If we were to rely on that, we'd
> need to stay on the safe side and generate false positive wp faults
> for every swapped out page.
>
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>

So I thought about this some more, and the only alternative I see is
defining a new swap type to preserve the pte write bit when swapping,
and storing the original pte write bit within the ksm stable_node. This
would solve the false positives for swap and ksm.

But I do not see this as a better alternative to storing the wp status
as a bit in the pte. So:

Reviewed-by: Jérôme Glisse <jglisse@xxxxxxxxxx>

> ---
> arch/x86/Kconfig | 1 +
> arch/x86/include/asm/pgtable.h | 52 ++++++++++++++++++++++++++++
> arch/x86/include/asm/pgtable_64.h | 8 ++++-
> arch/x86/include/asm/pgtable_types.h | 9 +++++
> include/asm-generic/pgtable.h | 1 +
> include/asm-generic/pgtable_uffd.h | 51 +++++++++++++++++++++++++++
> init/Kconfig | 5 +++
> 7 files changed, 126 insertions(+), 1 deletion(-)
> create mode 100644 include/asm-generic/pgtable_uffd.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 68261430fe6e..cb43bc008675 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -209,6 +209,7 @@ config X86
> select USER_STACKTRACE_SUPPORT
> select VIRT_TO_BUS
> select X86_FEATURE_NAMES if PROC_FS
> + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD
>
> config INSTRUCTION_DECODER
> def_bool y
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 2779ace16d23..6863236e8484 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -23,6 +23,7 @@
>
> #ifndef __ASSEMBLY__
> #include <asm/x86_init.h>
> +#include <asm-generic/pgtable_uffd.h>
>
> extern pgd_t early_top_pgt[PTRS_PER_PGD];
> int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
> @@ -293,6 +294,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
> return native_make_pte(v & ~clear);
> }
>
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pte_uffd_wp(pte_t pte)
> +{
> + return pte_flags(pte) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> + return pte_set_flags(pte, _PAGE_UFFD_WP);
> +}
> +
> +static inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> + return pte_clear_flags(pte, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> static inline pte_t pte_mkclean(pte_t pte)
> {
> return pte_clear_flags(pte, _PAGE_DIRTY);
> @@ -372,6 +390,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
> return native_make_pmd(v & ~clear);
> }
>
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pmd_uffd_wp(pmd_t pmd)
> +{
> + return pmd_flags(pmd) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> + return pmd_set_flags(pmd, _PAGE_UFFD_WP);
> +}
> +
> +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> + return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> static inline pmd_t pmd_mkold(pmd_t pmd)
> {
> return pmd_clear_flags(pmd, _PAGE_ACCESSED);
> @@ -1351,6 +1386,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> #endif
> #endif
>
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> + return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +
> +static inline int pte_swp_uffd_wp(pte_t pte)
> +{
> + return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
> +}
> +
> +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> + return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> #define PKRU_AD_BIT 0x1
> #define PKRU_WD_BIT 0x2
> #define PKRU_BITS_PER_PKEY 2
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 9c85b54bf03c..e0c5d29b8685 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
> *
> * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
> * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
> - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
> + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry
> *
> * G (8) is aliased and used as a PROT_NONE indicator for
> * !present ptes. We need to start storing swap entries above
> @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
> * erratum where they can be incorrectly set by hardware on
> * non-present PTEs.
> *
> + * Bits 1-4 are not used in non-present format and are available for
> + * special use described below:
> + *
> * SD (1) in swp entry is used to store soft dirty bit, which helps us
> * remember soft dirty over page migration
> *
> + * F (2) in swp entry is used to record when a pagetable is
> + * writeprotected by userfaultfd WP support.
> + *
> * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
> * but also L and G.
> *
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index d6ff0bbdb394..8cebcff91e57 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -32,6 +32,7 @@
>
> #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
> #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
> +#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
> #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
> #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
>
> @@ -100,6 +101,14 @@
> #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
> #endif
>
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
> +#define _PAGE_SWP_UFFD_WP _PAGE_USER
> +#else
> +#define _PAGE_UFFD_WP (_AT(pteval_t, 0))
> +#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0))
> +#endif
> +
> #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
> #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
> #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index 05e61e6c843f..f49afe951711 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -10,6 +10,7 @@
> #include <linux/mm_types.h>
> #include <linux/bug.h>
> #include <linux/errno.h>
> +#include <asm-generic/pgtable_uffd.h>
>
> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
> defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
> diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
> new file mode 100644
> index 000000000000..643d1bf559c2
> --- /dev/null
> +++ b/include/asm-generic/pgtable_uffd.h
> @@ -0,0 +1,51 @@
> +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
> +#define _ASM_GENERIC_PGTABLE_UFFD_H
> +
> +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static __always_inline int pte_uffd_wp(pte_t pte)
> +{
> + return 0;
> +}
> +
> +static __always_inline int pmd_uffd_wp(pmd_t pmd)
> +{
> + return 0;
> +}
> +
> +static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> + return pte;
> +}
> +
> +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> + return pmd;
> +}
> +
> +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> + return pte;
> +}
> +
> +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> + return pmd;
> +}
> +
> +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> + return pte;
> +}
> +
> +static __always_inline int pte_swp_uffd_wp(pte_t pte)
> +{
> + return 0;
> +}
> +
> +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> + return pte;
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index c9386a365eea..892d61ddf2eb 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1424,6 +1424,11 @@ config ADVISE_SYSCALLS
> applications use these syscalls, you can disable this option to save
> space.
>
> +config HAVE_ARCH_USERFAULTFD_WP
> + bool
> + help
> + Arch has userfaultfd write protection support
> +
> config MEMBARRIER
> bool "Enable membarrier() system call" if EXPERT
> default y
> --
> 2.17.1
>