[RFC PATCH 17/19] mm: Convert mmap and mlock to use account_locked_vm
From: Alistair Popple
Date: Tue Jan 24 2023 - 00:50:34 EST
A future change introduces a cgroup to control the amount of
locked/pinned memory on the system. To ensure memory pinned via mlock
and mmap is accounted for, use the common account_locked_vm()
function.
As cgroups can outlive individual processes, also unaccount the
locked memory during process teardown.
This patch should introduce no user-visible change.
Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
mm/internal.h | 2 +-
mm/mlock.c | 76 ++++++++++-----------------------------------------
mm/mmap.c | 76 +++++++++++++++++++++++++--------------------------
mm/mremap.c | 54 ++++++++++++++++++++++++++----------
mm/secretmem.c | 6 +---
5 files changed, 95 insertions(+), 119 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8..7c8c3f2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -489,8 +489,6 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len);
/*
* mlock_vma_page() and munlock_vma_page():
* should be called with vma's mmap_lock held for read or write,
diff --git a/mm/mlock.c b/mm/mlock.c
index 7032f6d..a97c8c5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -416,6 +416,20 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
+ /*
+ * Keep track of amount of locked VM.
+ */
+ nr_pages = (end - start) >> PAGE_SHIFT;
+ if (!(newflags & VM_LOCKED)) {
+ __unaccount_locked_vm(mm, nr_pages);
+ } else if (!(oldflags & VM_LOCKED)) {
+ if (__account_locked_vm(mm, nr_pages, current,
+ capable(CAP_IPC_LOCK))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
@@ -439,16 +453,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
success:
/*
- * Keep track of amount of locked VM.
- */
- nr_pages = (end - start) >> PAGE_SHIFT;
- if (!(newflags & VM_LOCKED))
- nr_pages = -nr_pages;
- else if (oldflags & VM_LOCKED)
- nr_pages = 0;
- mm->locked_vm += nr_pages;
-
- /*
* vm_flags is protected by the mmap_lock held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
@@ -517,42 +521,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
}
/*
- * Go through vma areas and sum size of mlocked
- * vma pages, as return value.
- * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
- * is also counted.
- * Return value: previously mlocked page counts
- */
-static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
- unsigned long start, size_t len)
-{
- struct vm_area_struct *vma;
- unsigned long count = 0;
- unsigned long end;
- VMA_ITERATOR(vmi, mm, start);
-
- /* Don't overflow past ULONG_MAX */
- if (unlikely(ULONG_MAX - len < start))
- end = ULONG_MAX;
- else
- end = start + len;
-
- for_each_vma_range(vmi, vma, end) {
- if (vma->vm_flags & VM_LOCKED) {
- if (start > vma->vm_start)
- count -= (start - vma->vm_start);
- if (end < vma->vm_end) {
- count += end - vma->vm_start;
- break;
- }
- count += vma->vm_end - vma->vm_start;
- }
- }
-
- return count >> PAGE_SHIFT;
-}
-
-/*
* convert get_user_pages() return value to posix mlock() error
*/
static int __mlock_posix_error_return(long retval)
@@ -585,21 +553,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (mmap_write_lock_killable(current->mm))
return -EINTR;
- locked += current->mm->locked_vm;
- if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
- /*
- * It is possible that the regions requested intersect with
- * previously mlocked areas, that part area in "mm->locked_vm"
- * should not be counted to new mlock increment count. So check
- * and adjust locked count if necessary.
- */
- locked -= count_mm_mlocked_page_nr(current->mm,
- start, len);
- }
-
- /* check against resource limits */
- if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
- error = apply_vma_lock_flags(start, len, flags);
+ error = apply_vma_lock_flags(start, len, flags);
mmap_write_unlock(current->mm);
if (error)
diff --git a/mm/mmap.c b/mm/mmap.c
index 425a934..2c05582 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,7 +160,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
- return mlock_future_check(current->mm, current->mm->def_flags, len);
+ return 0;
}
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
unsigned long newbrk, unsigned long oldbrk,
@@ -1184,23 +1184,6 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len)
-{
- unsigned long locked, lock_limit;
-
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
-}
-
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
if (S_ISREG(inode->i_mode))
@@ -1310,9 +1293,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
- return -EAGAIN;
-
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
@@ -1882,22 +1862,27 @@ static int acct_stack_growth(struct vm_area_struct *vma,
if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
- /* mlock limit tests */
- if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
- return -ENOMEM;
-
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
+ /* mlock limit tests */
+ if (vma->vm_flags & VM_LOCKED)
+ if (__account_locked_vm(mm, grow << PAGE_SHIFT, current,
+ capable(CAP_IPC_LOCK)))
+ return -ENOMEM;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory_mm(mm, grow))
+ if (security_vm_enough_memory_mm(mm, grow)) {
+ if (vma->vm_flags & VM_LOCKED)
+ __unaccount_locked_vm(mm, grow << PAGE_SHIFT);
return -ENOMEM;
+ }
return 0;
}
@@ -1975,8 +1960,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
@@ -2056,8 +2039,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
* to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
@@ -2281,7 +2262,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
return -ENOMEM;
if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
+ __unaccount_locked_vm(vma->vm_mm, vma_pages(vma));
return 0;
}
@@ -2525,6 +2506,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct vm_area_struct *next, *prev, *merge;
pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
+ unsigned long locked = 0;
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
pgoff_t vm_pgoff;
@@ -2560,6 +2542,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}
+ if (vm_flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ if (__account_locked_vm(mm, locked, current,
+ capable(CAP_IPC_LOCK))) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+ }
+
next = mas_next(&mas, ULONG_MAX);
prev = mas_prev(&mas, 0);
if (vm_flags & VM_SPECIAL)
@@ -2605,7 +2596,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
- goto unacct_error;
+ goto unlock_error;
}
vma->vm_start = addr;
@@ -2725,8 +2716,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
@@ -2759,6 +2748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
+unlock_error:
+ if (locked)
+ __unaccount_locked_vm(mm, locked);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -2942,8 +2934,13 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (flags & VM_LOCKED)
+ if (__account_locked_vm(mm, len >> PAGE_SHIFT, current,
+ capable(CAP_IPC_LOCK)))
+ return -ENOMEM;
+
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
+ goto unacct_locked;
/*
* Expand the existing vma if possible; Note that singular lists do not
@@ -2993,8 +2990,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
validate_mm(mm);
return 0;
@@ -3003,6 +2998,8 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
vm_area_free(vma);
unacct_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
+unacct_locked:
+ __unaccount_locked_vm(mm, len >> PAGE_SHIFT);
return -ENOMEM;
}
@@ -3064,7 +3061,7 @@ void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- unsigned long nr_accounted = 0;
+ unsigned long nr_accounted = 0, nr_locked = 0;
MA_STATE(mas, &mm->mm_mt, 0, 0);
int count = 0;
@@ -3107,6 +3104,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
+ if (vma->vm_flags & VM_LOCKED)
+ nr_locked += vma_pages(vma);
remove_vma(vma);
count++;
cond_resched();
@@ -3116,6 +3115,7 @@ void exit_mmap(struct mm_struct *mm)
trace_exit_mmap(mm);
__mt_destroy(&mm->mm_mt);
+ __unaccount_locked_vm(mm, nr_locked);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5..67cc4f3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -574,7 +574,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
- long to_account = new_len - old_len;
+ long to_account = (new_len - old_len) >> PAGE_SHIFT;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
@@ -594,7 +594,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return -ENOMEM;
if (unlikely(flags & MREMAP_DONTUNMAP))
- to_account = new_len;
+ to_account = new_len >> PAGE_SHIFT;
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
@@ -618,16 +618,36 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return err;
if (vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
+ if (security_vm_enough_memory_mm(mm, to_account))
return -ENOMEM;
}
+ /*
+ * MREMAP_DONTUNMAP clears VM_LOCKED on the old vma and
+ * implies new_len == old_len so no need to account locked
+ * pages.
+ */
+ if ((vm_flags & VM_LOCKED) && likely(!(flags & MREMAP_DONTUNMAP))) {
+ if (__account_locked_vm(mm, to_account, current,
+ capable(CAP_IPC_LOCK))) {
+ if (vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(to_account);
+ return -ENOMEM;
+ }
+ *locked = true;
+ }
+
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
if (!new_vma) {
if (vm_flags & VM_ACCOUNT)
- vm_unacct_memory(to_account >> PAGE_SHIFT);
+ vm_unacct_memory(to_account);
+ if ((vm_flags & VM_LOCKED) &&
+ likely(!(flags & MREMAP_DONTUNMAP))) {
+ __unaccount_locked_vm(mm, to_account);
+ *locked = false;
+ }
return -ENOMEM;
}
@@ -696,10 +716,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_end == (old_addr + old_len))
unlink_anon_vmas(vma);
- /* Because we won't unmap we don't need to touch locked_vm */
return new_addr;
}
+ /* Make sure do_munmap() doesn't unaccount locked pages */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
@@ -707,15 +728,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
excess = 0;
}
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
-
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
+ vma->vm_flags = vm_flags;
vma->vm_flags |= VM_ACCOUNT;
if (split)
find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
@@ -768,9 +785,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
- return ERR_PTR(-EAGAIN);
-
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
@@ -1026,6 +1040,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
+ if (vma->vm_flags & VM_LOCKED) {
+ if (__account_locked_vm(mm, pages, current,
+ capable(CAP_IPC_LOCK))) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(pages);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/*
* Function vma_merge() is called on the extension we are adding to
* the already existing vma, vma_merge() will merge this extension with
@@ -1038,14 +1062,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (!vma) {
+ // TODO: We always unacct memory
+ // regardless of VM_ACCOUNT flags?
vm_unacct_memory(pages);
+ __unaccount_locked_vm(mm, pages);
ret = -ENOMEM;
goto out;
}
vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
locked = true;
new_addr = addr;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 04c3ac9..4515eb4 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,13 +120,11 @@ static int secretmem_release(struct inode *inode, struct file *file)
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
{
- unsigned long len = vma->vm_end - vma->vm_start;
-
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
- return -EAGAIN;
+ if (account_locked_vm(vma->vm_mm, vma->vm_end - vma->vm_start))
+ return -ENOMEM;
vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
vma->vm_ops = &secretmem_vm_ops;
--
git-series 0.9.1