[RFC PATCH] fs/exec: Add support for NUMA replication of ELF programs

From: Huang Shijie
Date: Mon Sep 06 2021 - 04:18:33 EST


This patch adds the AT_NUMA_REPLICATION flag for execveat().

If this flag is set, the kernel triggers COW (copy-on-write) on the
mmapped ELF binary, so the program gets its own copy of each page
on its NUMA node, even if the original page in the page cache
resides on another NUMA node.

Signed-off-by: Huang Shijie <shijie@xxxxxxxxxxxxxxxxxxxxxx>
---
fs/binfmt_elf.c | 27 ++++++++++++++++++++++-----
fs/exec.c | 5 ++++-
include/linux/binfmts.h | 1 +
include/linux/mm.h | 2 ++
include/uapi/linux/fcntl.h | 2 ++
mm/mprotect.c | 2 +-
6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 439ed81e755a..fac8f4a4555a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,

static unsigned long elf_map(struct file *filep, unsigned long addr,
const struct elf_phdr *eppnt, int prot, int type,
- unsigned long total_size)
+ unsigned long total_size, int numa_replication)
{
unsigned long map_addr;
unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
addr = ELF_PAGESTART(addr);
size = ELF_PAGEALIGN(size);
+ int ret;

/* mmap() will return -EINVAL if given a zero size, but a
* segment with zero filesize is perfectly valid */
@@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
*/
if (total_size) {
total_size = ELF_PAGEALIGN(total_size);
- map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+
+ if (numa_replication) {
+ /* Trigger the COW for this ELF code section */
+ map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
+ type | MAP_POPULATE, off);
+ if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
+ /* Change back */
+ ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
+ if (ret)
+ return ret;
+ }
+ } else {
+ map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+ }
+
if (!BAD_ADDR(map_addr))
vm_munmap(map_addr+size, total_size-size);
- } else
+ } else {
map_addr = vm_mmap(filep, addr, size, prot, type, off);
+ }

if ((type & MAP_FIXED_NOREPLACE) &&
PTR_ERR((void *)map_addr) == -EEXIST)
@@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
load_addr = -vaddr;

map_addr = elf_map(interpreter, load_addr + vaddr,
- eppnt, elf_prot, elf_type, total_size);
+ eppnt, elf_prot, elf_type, total_size, 0);
total_size = 0;
error = map_addr;
if (BAD_ADDR(map_addr))
@@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
}

error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
- elf_prot, elf_flags, total_size);
+ elf_prot, elf_flags, total_size,
+ bprm->support_numa_replication);
if (BAD_ADDR(error)) {
retval = IS_ERR((void *)error) ?
PTR_ERR((void*)error) : -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..d27efa540641 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
.lookup_flags = LOOKUP_FOLLOW,
};

- if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
return ERR_PTR(-EINVAL);
if (flags & AT_SYMLINK_NOFOLLOW)
open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
@@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
if (retval)
goto out;

+ /* Do we support NUMA replication for this program? */
+ bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
+
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 049cf9421d83..1874e1732f20 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -64,6 +64,7 @@ struct linux_binprm {
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */

char buf[BINPRM_BUF_SIZE];
+ int support_numa_replication;
} __randomize_layout;

#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..76611381be2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
#endif

extern int sysctl_nr_trim_pages;
+int do_mprotect_pkey(unsigned long start, size_t len,
+ unsigned long prot, int pkey);

#ifdef CONFIG_PRINTK
void mem_dump_obj(void *object);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..de99c5ae8eca 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,4 +111,6 @@

#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */

+#define AT_NUMA_REPLICATION 0x10000 /* Support NUMA replication for the ELF program */
+
#endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..d1f8cececfed 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
/*
* pkey==-1 when doing a legacy mprotect()
*/
-static int do_mprotect_pkey(unsigned long start, size_t len,
+int do_mprotect_pkey(unsigned long start, size_t len,
unsigned long prot, int pkey)
{
unsigned long nstart, end, tmp, reqprot;
--
2.30.2