[PATCH v2] mm/madvise: fix madvise_pageout for private file mappings

From: Pavankumar Kondeti
Date: Fri Dec 02 2022 - 01:30:22 EST


When MADV_PAGEOUT is called on a private file mapping VMA region,
we bail out early if the process is neither the owner of the file nor
able to write to it. However, this VMA may contain both private/shared
clean pages and private dirty pages, so the opportunity to page out the
private dirty pages (anon pages) is missed. Fix this by letting private
file mappings proceed to the page walk and performing the file access
check together with PageAnon() there.

We observe ~10% improvement in zram usage, thus leaving more available
memory on a 4GB RAM system running Android.

Signed-off-by: Pavankumar Kondeti <quic_pkondeti@xxxxxxxxxxx>
---

V2:

- As per David's suggestion, removed the new argument introduced in
struct madvise_walk_private and call can_do_file_pageout() directly

- As per Mark's suggestions, optimized PageAnon() checks. Also bailed out
early if the file mapping is not private.

mm/madvise.c | 53 +++++++++++++++++++++++++++++++++++------------------
1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index c7105ec..02f12f4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,6 +321,21 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}

+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+{
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(&init_user_ns,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -334,10 +349,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
spinlock_t *ptl;
struct page *page = NULL;
LIST_HEAD(page_list);
+ bool pageout_anon_only_filter;

if (fatal_signal_pending(current))
return -EINTR;

+ pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
+ !can_do_file_pageout(vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
@@ -364,6 +383,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (page_mapcount(page) != 1)
goto huge_unlock;

+ if (pageout_anon_only_filter && !PageAnon(page))
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;

@@ -432,6 +454,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (PageTransCompound(page)) {
if (page_mapcount(page) != 1)
break;
+ if (pageout_anon_only_filter && !PageAnon(page))
+ break;
get_page(page);
if (!trylock_page(page)) {
put_page(page);
@@ -459,6 +483,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!PageLRU(page) || page_mapcount(page) != 1)
continue;

+ if (pageout_anon_only_filter && !PageAnon(page))
+ continue;
+
VM_BUG_ON_PAGE(PageTransCompound(page), page);

if (pte_young(ptent)) {
@@ -553,23 +580,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}

-static inline bool can_do_pageout(struct vm_area_struct *vma)
-{
- if (vma_is_anonymous(vma))
- return true;
- if (!vma->vm_file)
- return false;
- /*
- * paging out pagecache only for non-anonymous mappings that correspond
- * to the files the calling process could (if tried) open for writing;
- * otherwise we'd be including shared non-exclusive mappings, which
- * opens a side channel.
- */
- return inode_owner_or_capable(&init_user_ns,
- file_inode(vma->vm_file)) ||
- file_permission(vma->vm_file, MAY_WRITE) == 0;
-}
-
static long madvise_pageout(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
@@ -581,7 +591,14 @@ static long madvise_pageout(struct vm_area_struct *vma,
if (!can_madv_lru_vma(vma))
return -EINVAL;

- if (!can_do_pageout(vma))
+ /*
+ * If the VMA belongs to a private file mapping, there can be private
+ * dirty pages which can be paged out even if this process is neither
+ * owner nor write capable of the file. Let private file mappings
+ * proceed so that their dirty anon pages can still be paged out.
+ */
+ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
+ (vma->vm_flags & VM_MAYSHARE)))
return 0;

lru_add_drain();
--
2.7.4