Re: [PATCH] um: Abandon the _PAGE_NEWPROT bit

From: Benjamin Berg
Date: Fri Oct 11 2024 - 03:39:44 EST


Hi,

On Fri, 2024-10-11 at 13:38 +0800, Tiwei Bie wrote:
> When a PTE is updated in the page table, the _PAGE_NEWPAGE bit will
> always be set. And the corresponding page will always be mapped or
> unmapped depending on whether the PTE is present or not. The check
> on the _PAGE_NEWPROT bit is not really reachable. Abandoning it will
> allow us to simplify the code and remove the unreachable code.

Oh, nice cleanup!

And I like that mprotect is gone as I don't want it in SECCOMP mode :-)

Maybe we should rename _PAGE_NEWPAGE to something like _PAGE_NEEDSYNC?
I think that might make it more clear how everything ties together.

Anyway, the change looks good to me.

Benjamin

Reviewed-by: Benjamin Berg <benjamin.berg@xxxxxxxxx>

> Signed-off-by: Tiwei Bie <tiwei.btw@xxxxxxxxxxxx>
> ---
>  arch/um/include/asm/pgtable.h           | 40 ++++-----------
>  arch/um/include/shared/os.h             |  2 -
>  arch/um/include/shared/skas/stub-data.h |  1 -
>  arch/um/kernel/skas/stub.c              | 10 ----
>  arch/um/kernel/tlb.c                    | 66 +++++++++++--------------
>  arch/um/os-Linux/skas/mem.c             | 21 --------
>  6 files changed, 37 insertions(+), 103 deletions(-)
>
> diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
> index bd7a9593705f..a32424cfe792 100644
> --- a/arch/um/include/asm/pgtable.h
> +++ b/arch/um/include/asm/pgtable.h
> @@ -12,7 +12,6 @@
>  
>  #define _PAGE_PRESENT 0x001
>  #define _PAGE_NEWPAGE 0x002
> -#define _PAGE_NEWPROT 0x004
>  #define _PAGE_RW 0x020
>  #define _PAGE_USER 0x040
>  #define _PAGE_ACCESSED 0x080
> @@ -151,23 +150,12 @@ static inline int pte_newpage(pte_t pte)
>   return pte_get_bits(pte, _PAGE_NEWPAGE);
>  }
>  
> -static inline int pte_newprot(pte_t pte)
> -{
> - return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT)));
> -}
> -
>  /*
>   * =================================
>   * Flags setting section.
>   * =================================
>   */
>  
> -static inline pte_t pte_mknewprot(pte_t pte)
> -{
> - pte_set_bits(pte, _PAGE_NEWPROT);
> - return(pte);
> -}
> -
>  static inline pte_t pte_mkclean(pte_t pte)
>  {
>   pte_clear_bits(pte, _PAGE_DIRTY);
> @@ -184,17 +172,14 @@ static inline pte_t pte_wrprotect(pte_t pte)
>  {
>   if (likely(pte_get_bits(pte, _PAGE_RW)))
>   pte_clear_bits(pte, _PAGE_RW);
> - else
> - return pte;
> - return(pte_mknewprot(pte));
> + return pte;
>  }
>  
>  static inline pte_t pte_mkread(pte_t pte)
>  {
> - if (unlikely(pte_get_bits(pte, _PAGE_USER)))
> - return pte;
> - pte_set_bits(pte, _PAGE_USER);
> - return(pte_mknewprot(pte));
> + if (likely(!pte_get_bits(pte, _PAGE_USER)))
> + pte_set_bits(pte, _PAGE_USER);
> + return pte;
>  }
>  
>  static inline pte_t pte_mkdirty(pte_t pte)
> @@ -211,18 +196,15 @@ static inline pte_t pte_mkyoung(pte_t pte)
>  
>  static inline pte_t pte_mkwrite_novma(pte_t pte)
>  {
> - if (unlikely(pte_get_bits(pte,  _PAGE_RW)))
> - return pte;
> - pte_set_bits(pte, _PAGE_RW);
> - return(pte_mknewprot(pte));
> + if (likely(!pte_get_bits(pte, _PAGE_RW)))
> + pte_set_bits(pte, _PAGE_RW);
> + return pte;
>  }
>  
>  static inline pte_t pte_mkuptodate(pte_t pte)
>  {
>   pte_clear_bits(pte, _PAGE_NEWPAGE);
> - if(pte_present(pte))
> - pte_clear_bits(pte, _PAGE_NEWPROT);
> - return(pte);
> + return pte;
>  }
>  
>  static inline pte_t pte_mknewpage(pte_t pte)
> @@ -236,12 +218,10 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval)
>   pte_copy(*pteptr, pteval);
>  
>   /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so
> - * fix_range knows to unmap it.  _PAGE_NEWPROT is specific to
> - * mapped pages.
> + * update_pte_range knows to unmap it.
>   */
>  
>   *pteptr = pte_mknewpage(*pteptr);
> - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr);
>  }
>  
>  #define PFN_PTE_SHIFT PAGE_SHIFT
> @@ -298,8 +278,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
>   ({ pte_t pte; \
>   \
>   pte_set_val(pte, page_to_phys(page), (pgprot)); \
> - if (pte_present(pte)) \
> - pte_mknewprot(pte_mknewpage(pte)); \
>   pte;})
>  
>  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
> diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
> index bf539fee7831..09f8201de5db 100644
> --- a/arch/um/include/shared/os.h
> +++ b/arch/um/include/shared/os.h
> @@ -279,8 +279,6 @@ int map(struct mm_id *mm_idp, unsigned long virt,
>   unsigned long len, int prot, int phys_fd,
>   unsigned long long offset);
>  int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len);
> -int protect(struct mm_id *mm_idp, unsigned long addr,
> -     unsigned long len, unsigned int prot);
>  
>  /* skas/process.c */
>  extern int is_skas_winch(int pid, int fd, void *data);
> diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h
> index 3fbdda727373..81a4cace032c 100644
> --- a/arch/um/include/shared/skas/stub-data.h
> +++ b/arch/um/include/shared/skas/stub-data.h
> @@ -30,7 +30,6 @@ enum stub_syscall_type {
>   STUB_SYSCALL_UNSET = 0,
>   STUB_SYSCALL_MMAP,
>   STUB_SYSCALL_MUNMAP,
> - STUB_SYSCALL_MPROTECT,
>  };
>  
>  struct stub_syscall {
> diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c
> index 5d52ffa682dc..796fc266d3bb 100644
> --- a/arch/um/kernel/skas/stub.c
> +++ b/arch/um/kernel/skas/stub.c
> @@ -35,16 +35,6 @@ static __always_inline int syscall_handler(struct stub_data *d)
>   return -1;
>   }
>   break;
> - case STUB_SYSCALL_MPROTECT:
> - res = stub_syscall3(__NR_mprotect,
> -     sc->mem.addr, sc->mem.length,
> -     sc->mem.prot);
> - if (res) {
> - d->err = res;
> - d->syscall_data_len = i;
> - return -1;
> - }
> - break;
>   default:
>   d->err = -95; /* EOPNOTSUPP */
>   d->syscall_data_len = i;
> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
> index 548af31d4111..23c1f550cd7c 100644
> --- a/arch/um/kernel/tlb.c
> +++ b/arch/um/kernel/tlb.c
> @@ -23,9 +23,6 @@ struct vm_ops {
>       int phys_fd, unsigned long long offset);
>   int (*unmap)(struct mm_id *mm_idp,
>        unsigned long virt, unsigned long len);
> - int (*mprotect)(struct mm_id *mm_idp,
> - unsigned long virt, unsigned long len,
> - unsigned int prot);
>  };
>  
>  static int kern_map(struct mm_id *mm_idp,
> @@ -44,15 +41,6 @@ static int kern_unmap(struct mm_id *mm_idp,
>   return os_unmap_memory((void *)virt, len);
>  }
>  
> -static int kern_mprotect(struct mm_id *mm_idp,
> - unsigned long virt, unsigned long len,
> - unsigned int prot)
> -{
> - return os_protect_memory((void *)virt, len,
> - prot & UM_PROT_READ, prot & UM_PROT_WRITE,
> - 1);
> -}
> -
>  void report_enomem(void)
>  {
>   printk(KERN_ERR "UML ran out of memory on the host side! "
> @@ -65,33 +53,37 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
>      struct vm_ops *ops)
>  {
>   pte_t *pte;
> - int r, w, x, prot, ret = 0;
> + int ret = 0;
>  
>   pte = pte_offset_kernel(pmd, addr);
>   do {
> - r = pte_read(*pte);
> - w = pte_write(*pte);
> - x = pte_exec(*pte);
> - if (!pte_young(*pte)) {
> - r = 0;
> - w = 0;
> - } else if (!pte_dirty(*pte))
> - w = 0;
> -
> - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
> - (x ? UM_PROT_EXEC : 0));
> - if (pte_newpage(*pte)) {
> - if (pte_present(*pte)) {
> - __u64 offset;
> - unsigned long phys = pte_val(*pte) & PAGE_MASK;
> - int fd = phys_mapping(phys, &offset);
> -
> - ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE,
> - prot, fd, offset);
> - } else
> - ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE);
> - } else if (pte_newprot(*pte))
> - ret = ops->mprotect(ops->mm_idp, addr, PAGE_SIZE, prot);
> + if (!pte_newpage(*pte))
> + continue;
> +
> + if (pte_present(*pte)) {
> + __u64 offset;
> + unsigned long phys = pte_val(*pte) & PAGE_MASK;
> + int fd = phys_mapping(phys, &offset);
> + int r, w, x, prot;
> +
> + r = pte_read(*pte);
> + w = pte_write(*pte);
> + x = pte_exec(*pte);
> + if (!pte_young(*pte)) {
> + r = 0;
> + w = 0;
> + } else if (!pte_dirty(*pte))
> + w = 0;
> +
> + prot = (r ? UM_PROT_READ : 0) |
> +        (w ? UM_PROT_WRITE : 0) |
> +        (x ? UM_PROT_EXEC : 0);
> +
> + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE,
> + prot, fd, offset);
> + } else
> + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE);
> +
>   *pte = pte_mkuptodate(*pte);
>   } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret));
>   return ret;
> @@ -180,11 +172,9 @@ int um_tlb_sync(struct mm_struct *mm)
>   if (mm == &init_mm) {
>   ops.mmap = kern_map;
>   ops.unmap = kern_unmap;
> - ops.mprotect = kern_mprotect;
>   } else {
>   ops.mmap = map;
>   ops.unmap = unmap;
> - ops.mprotect = protect;
>   }
>  
>   pgd = pgd_offset(mm, addr);
> diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
> index 9a13ac23c606..d7f1814b0e5a 100644
> --- a/arch/um/os-Linux/skas/mem.c
> +++ b/arch/um/os-Linux/skas/mem.c
> @@ -217,24 +217,3 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len)
>  
>   return 0;
>  }
> -
> -int protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len,
> -     unsigned int prot)
> -{
> - struct stub_syscall *sc;
> -
> - /* Compress with previous syscall if that is possible */
> - sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MPROTECT, addr);
> - if (sc && sc->mem.prot == prot) {
> - sc->mem.length += len;
> - return 0;
> - }
> -
> - sc = syscall_stub_alloc(mm_idp);
> - sc->syscall = STUB_SYSCALL_MPROTECT;
> - sc->mem.addr = addr;
> - sc->mem.length = len;
> - sc->mem.prot = prot;
> -
> - return 0;
> -}