Re: [RFC 3/6] mm: support madvise(MADV_FREE)
From: Minchan Kim
Date: Fri Mar 14 2014 - 03:48:45 EST
On Fri, Mar 14, 2014 at 03:37:47PM +0900, Minchan Kim wrote:
> Linux doesn't have the ability to free pages lazily, while other OSes
> have already supported this, named madvise(MADV_FREE).
>
> The gain is clear that kernel can evict freed pages rather than
> swapping out or OOM if memory pressure happens.
>
> Without memory pressure, freed pages would be reused by userspace
> without additional overhead (e.g. page fault + page allocation
> + page zeroing).
>
> Firstly, heavy users would be general allocators (e.g. jemalloc;
> I hope ptmalloc supports it, too), and jemalloc has already supported
> the feature for other OSes (e.g. FreeBSD).
>
> At the moment, this patch would break the build on other ARCHs which
> have their own TLB flush scheme other than that of x86, but if there
> is no objection to this direction, I will add patches handling the
> other ARCHs in the next iteration.
>
> Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> ---
> include/asm-generic/tlb.h | 9 ++++++++
> include/linux/mm.h | 35 ++++++++++++++++++++++++++++++-
> include/linux/rmap.h | 1 +
> include/linux/swap.h | 15 ++++++++++++++
> include/uapi/asm-generic/mman-common.h | 1 +
> mm/madvise.c | 17 +++++++++++++--
> mm/memory.c | 12 ++++++++++-
> mm/rmap.c | 21 +++++++++++++++++--
> mm/swap_state.c | 38 +++++++++++++++++++++++++++++++++-
> mm/vmscan.c | 22 +++++++++++++++++++-
> 10 files changed, 163 insertions(+), 8 deletions(-)
>
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 5672d7ea1fa0..b82ee729a065 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -116,8 +116,17 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
> void tlb_flush_mmu(struct mmu_gather *tlb);
> void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
> unsigned long end);
> +int __tlb_madvfree_page(struct mmu_gather *tlb, struct page *page);
> int __tlb_remove_page(struct mmu_gather *tlb, struct page *page);
>
> +static inline void tlb_madvfree_page(struct mmu_gather *tlb, struct page *page)
> +{
> + /* Prevent page free */
> + get_page(page);
> + if (!__tlb_remove_page(tlb, MarkLazyFree(page)))
> + tlb_flush_mmu(tlb);
> +}
> +
> /* tlb_remove_page
> * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
> * required.
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c1b7414c7bef..9b048cabce27 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -933,10 +933,16 @@ void page_address_init(void);
> * Please note that, confusingly, "page_mapping" refers to the inode
> * address_space which maps the page from disk; whereas "page_mapped"
> * refers to user virtual address space into which the page is mapped.
> + *
> + * PAGE_MAPPING_LZFREE bit is set along with PAGE_MAPPING_ANON bit
> + * and then page->mapping points to an anon_vma. This flag is used
> + * for lazy freeing the page instead of swap.
> */
> #define PAGE_MAPPING_ANON 1
> #define PAGE_MAPPING_KSM 2
> -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
> +#define PAGE_MAPPING_LZFREE 4
> +#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM | \
> + PAGE_MAPPING_LZFREE)
>
> extern struct address_space *page_mapping(struct page *page);
>
> @@ -962,6 +968,32 @@ static inline int PageAnon(struct page *page)
> return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
> }
>
> +static inline void SetPageLazyFree(struct page *page)
> +{
> + BUG_ON(!PageAnon(page));
> + BUG_ON(!PageLocked(page));
> +
> + page->mapping = (void *)((unsigned long)page->mapping |
> + PAGE_MAPPING_LZFREE);
> +}
> +
> +static inline void ClearPageLazyFree(struct page *page)
> +{
> + BUG_ON(!PageAnon(page));
> + BUG_ON(!PageLocked(page));
> +
> + page->mapping = (void *)((unsigned long)page->mapping &
> + ~PAGE_MAPPING_LZFREE);
> +}
> +
> +static inline int PageLazyFree(struct page *page)
> +{
> + if (((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
> + (PAGE_MAPPING_ANON|PAGE_MAPPING_LZFREE))
> + return 1;
> + return 0;
> +}
> +
> /*
> * Return the pagecache index of the passed page. Regular pagecache pages
> * use ->index whereas swapcache pages use ->private
> @@ -1054,6 +1086,7 @@ struct zap_details {
> struct address_space *check_mapping; /* Check page->mapping if set */
> pgoff_t first_index; /* Lowest page->index to unmap */
> pgoff_t last_index; /* Highest page->index to unmap */
> + int lazy_free; /* do lazy free */
> };
>
> struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 1da693d51255..19e74aebb3d5 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -75,6 +75,7 @@ enum ttu_flags {
> TTU_UNMAP = 0, /* unmap mode */
> TTU_MIGRATION = 1, /* migration mode */
> TTU_MUNLOCK = 2, /* munlock mode */
> + TTU_LAZYFREE = 3, /* free lazyfree page */
> TTU_ACTION_MASK = 0xff,
>
> TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 46ba0c6c219f..223909c14703 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -13,6 +13,21 @@
> #include <linux/page-flags.h>
> #include <asm/page.h>
>
> +static inline struct page *MarkLazyFree(struct page *p)
> +{
> + return (struct page *)((unsigned long)p | 0x1UL);
> +}
> +
> +static inline struct page *ClearLazyFree(struct page *p)
> +{
> + return (struct page *)((unsigned long)p & ~0x1UL);
> +}
> +
> +static inline bool LazyFree(struct page *p)
> +{
> + return ((unsigned long)p & 0x1UL) ? true : false;
> +}
> +
> struct notifier_block;
>
> struct bio;
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index 4164529a94f9..7e257e49be2e 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -34,6 +34,7 @@
> #define MADV_SEQUENTIAL 2 /* expect sequential page references */
> #define MADV_WILLNEED 3 /* will need these pages */
> #define MADV_DONTNEED 4 /* don't need these pages */
> +#define MADV_FREE 5 /* do lazy free */
>
> /* common parameters: try to keep these consistent across architectures */
> #define MADV_REMOVE 9 /* remove these pages & resources */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 539eeb96b323..2e904289a2bb 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -31,6 +31,7 @@ static int madvise_need_mmap_write(int behavior)
> case MADV_REMOVE:
> case MADV_WILLNEED:
> case MADV_DONTNEED:
> + case MADV_FREE:
> return 0;
> default:
> /* be safe, default to 1. list exceptions explicitly */
> @@ -272,7 +273,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
> */
> static long madvise_dontneed(struct vm_area_struct *vma,
> struct vm_area_struct **prev,
> - unsigned long start, unsigned long end)
> + unsigned long start, unsigned long end,
> + int behavior)
> {
> *prev = vma;
> if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
> @@ -284,8 +286,17 @@ static long madvise_dontneed(struct vm_area_struct *vma,
> .last_index = ULONG_MAX,
> };
> zap_page_range(vma, start, end - start, &details);
> + } else if (behavior == MADV_FREE) {
> + struct zap_details details = {
> + .lazy_free = 1,
> + };
> +
> + if (vma->vm_file)
> + return -EINVAL;
> + zap_page_range(vma, start, end - start, &details);
> } else
> zap_page_range(vma, start, end - start, NULL);
> +
> return 0;
> }
>
> @@ -384,8 +395,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
> return madvise_remove(vma, prev, start, end);
> case MADV_WILLNEED:
> return madvise_willneed(vma, prev, start, end);
> + case MADV_FREE:
> case MADV_DONTNEED:
> - return madvise_dontneed(vma, prev, start, end);
> + return madvise_dontneed(vma, prev, start, end, behavior);
> default:
> return madvise_behavior(vma, prev, start, end, behavior);
> }
> @@ -403,6 +415,7 @@ madvise_behavior_valid(int behavior)
> case MADV_REMOVE:
> case MADV_WILLNEED:
> case MADV_DONTNEED:
> + case MADV_FREE:
> #ifdef CONFIG_KSM
> case MADV_MERGEABLE:
> case MADV_UNMERGEABLE:
> diff --git a/mm/memory.c b/mm/memory.c
> index 22dfa617bddb..f1f0dc13e8d1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1093,6 +1093,15 @@ again:
>
> page = vm_normal_page(vma, addr, ptent);
> if (unlikely(details) && page) {
> + if (details->lazy_free && PageAnon(page)) {
> + ptent = pte_mkold(ptent);
> + ptent = pte_mkclean(ptent);
> + set_pte_at(mm, addr, pte, ptent);
> + tlb_remove_tlb_entry(tlb, pte, addr);
> + tlb_madvfree_page(tlb, page);
> + continue;
> + }
> +
> /*
> * unmap_shared_mapping_pages() wants to
> * invalidate cache without truncating:
> @@ -1276,7 +1285,8 @@ static void unmap_page_range(struct mmu_gather *tlb,
> pgd_t *pgd;
> unsigned long next;
>
> - if (details && !details->check_mapping && !details->nonlinear_vma)
> + if (details && !details->check_mapping && !details->nonlinear_vma &&
> + !details->lazy_free)
> details = NULL;
>
> BUG_ON(addr >= end);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 76069afa6b81..7712f39acfee 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -377,6 +377,15 @@ void __init anon_vma_init(void)
> anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
> }
>
> +static inline bool is_anon_vma(unsigned long mapping)
> +{
> + unsigned long anon_mapping = mapping & PAGE_MAPPING_FLAGS;
> + if ((anon_mapping != PAGE_MAPPING_ANON) &&
> + (anon_mapping != (PAGE_MAPPING_ANON|PAGE_MAPPING_LZFREE)))
> + return false;
> + return true;
> +}
> +
> /*
> * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
> *
> @@ -407,7 +416,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
>
> rcu_read_lock();
> anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
> - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
> + if (!is_anon_vma(anon_mapping))
> goto out;
> if (!page_mapped(page))
> goto out;
> @@ -450,7 +459,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
>
> rcu_read_lock();
> anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
> - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
> + if (!is_anon_vma(anon_mapping))
> goto out;
> if (!page_mapped(page))
> goto out;
> @@ -1165,6 +1174,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> }
> set_pte_at(mm, address, pte,
> swp_entry_to_pte(make_hwpoison_entry(page)));
> + } else if ((flags & TTU_LAZYFREE) && PageLazyFree(page)) {
> + BUG_ON(!PageAnon(page));
> + if (unlikely(pte_dirty(pteval))) {
> + set_pte_at(mm, address, pte, pteval);
> + ret = SWAP_FAIL;
> + goto out_unmap;
> + }
> + dec_mm_counter(mm, MM_ANONPAGES);
> } else if (PageAnon(page)) {
> swp_entry_t entry = { .val = page_private(page) };
> pte_t swp_pte;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index e76ace30d436..0718ecd166dc 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -18,6 +18,7 @@
> #include <linux/pagevec.h>
> #include <linux/migrate.h>
> #include <linux/page_cgroup.h>
> +#include <linux/ksm.h>
>
> #include <asm/pgtable.h>
>
> @@ -256,8 +257,36 @@ void free_page_and_swap_cache(struct page *page)
> }
>
> /*
> + * move @page to inactive LRU's tail so that VM can discard it
> + * rather than swapping hot pages out when memory pressure happens.
> + */
> +static bool move_lazyfree(struct page *page)
> +{
> + if (!trylock_page(page))
> + return false;
> +
> + if (PageKsm(page)) {
> + unlock_page(page);
> + return false;
> + }
> +
> + if (PageSwapCache(page) &&
> + try_to_free_swap(page))
> + ClearPageDirty(page);
> +
> + if (!PageLazyFree(page)) {
> + SetPageLazyFree(page);
> + deactivate_page(page);
> + }
> +
> + unlock_page(page);
> + return true;
> +}
> +
> +/*
> * Passed an array of pages, drop them all from swapcache and then release
> * them. They are removed from the LRU and freed if this is their last use.
> * If pages passed are lazyfree, deactivate them instead of freeing.
> */
> void free_pages_and_swap_cache(struct page **pages, int nr)
> {
> @@ -269,7 +298,14 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
> int i;
>
> for (i = 0; i < todo; i++)
> - free_swap_cache(pagep[i]);
> + if (LazyFree(pagep[i])) {
> + pagep[i] = ClearLazyFree(pagep[i]);
> + /* If we failed, just free */
> + if (!move_lazyfree(pagep[i]))
> + free_swap_cache(pagep[i]);
Oops, patchset was confused by older version in my git tree.
Fix goes.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0718ecd166dc..882f1c8e5bd2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,9 +300,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
for (i = 0; i < todo; i++)
if (LazyFree(pagep[i])) {
pagep[i] = ClearLazyFree(pagep[i]);
- /* If we failed, just free */
- if (!move_lazyfree(pagep[i]))
- free_swap_cache(pagep[i]);
+ move_lazyfree(pagep[i]);
} else {
free_swap_cache(pagep[i]);
}
--
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/