[PATCH v1 1/2] mm: Implement memory-deny-write-execute as a prctl

From: Joey Gouly
Date: Wed Oct 26 2022 - 11:08:55 EST


The aim of such policy is to prevent a user task from creating an
executable mapping that is also writeable.

An example of mmap() returning -EACCESS if the policy is enabled:

mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0);

Similarly, mprotect() would return -EACCESS below:

addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);

The BPF filter that systemd MDWE uses is stateless, and disallows
mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to
be enabled if it was already PROT_EXEC, which allows the following case:

addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI);

where PROT_BTI enables branch tracking identification on arm64.

Signed-off-by: Joey Gouly <joey.gouly@xxxxxxx>
Co-developed-by: Catalin Marinas <catalin.marinas@xxxxxxx>
Signed-off-by: Catalin Marinas <catalin.marinas@xxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---
include/linux/mman.h | 15 +++++++++++++++
include/linux/sched/coredump.h | 6 +++++-
include/uapi/linux/prctl.h | 6 ++++++
kernel/sys.c | 18 ++++++++++++++++++
mm/mmap.c | 3 +++
mm/mprotect.c | 5 +++++
6 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 58b3abd457a3..d84fdeab6b5e 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,4 +156,19 @@ calc_vm_flag_bits(unsigned long flags)
}

unsigned long vm_commit_limit(void);
+
+static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+{
+ if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ return false;
+
+ if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+ return true;
+
+ if (vma && !(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+ return true;
+
+ return false;
+}
+
#endif /* _LINUX_MMAN_H */
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 8270ad7ae14c..0e17ae7fbfd3 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
* lifecycle of this mm, just for simplicity.
*/
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
+
+#define MMF_HAS_MDWE 28
+#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
+
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)

#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
- MMF_DISABLE_THP_MASK)
+ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)

#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a5e06dcbba13..ab9db1e86230 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -281,6 +281,12 @@ struct prctl_mm_map {
# define PR_SME_VL_LEN_MASK 0xffff
# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */

+/* Memory deny write / execute */
+#define PR_SET_MDWE 65
+# define PR_MDWE_FLAG_MMAP 1
+
+#define PR_GET_MDWE 66
+
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0

diff --git a/kernel/sys.c b/kernel/sys.c
index 5fd54bf0e886..08e1dd6d2533 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2348,6 +2348,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif /* CONFIG_ANON_VMA_NAME */

+static inline int prctl_set_mdwe(void)
+{
+ set_bit(MMF_HAS_MDWE, &current->mm->flags);
+
+ return 0;
+}
+
+static inline int prctl_get_mdwe(void)
+{
+ return test_bit(MMF_HAS_MDWE, &current->mm->flags);
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2623,6 +2635,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_MDWE:
+ error = prctl_set_mdwe();
+ break;
+ case PR_GET_MDWE:
+ error = prctl_get_mdwe();
+ break;
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
diff --git a/mm/mmap.c b/mm/mmap.c
index 099468aee4d8..42eaf6683216 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1409,6 +1409,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_NORESERVE;
}

+ if (map_deny_write_exec(NULL, vm_flags))
+ return -EACCES;
+
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8d770855b591..af71ef0788fd 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -766,6 +766,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}

+ if (map_deny_write_exec(vma, newflags)) {
+ error = -EACCES;
+ goto out;
+ }
+
/* Allow architectures to sanity-check the new flags */
if (!arch_validate_flags(newflags)) {
error = -EINVAL;
--
2.17.1