Re: [PATCH V6 3/4] powerpc/mm/iommu: Allow migration of cma allocated pages during mm_iommu_get

From: Aneesh Kumar K.V
Date: Wed Jan 09 2019 - 03:40:38 EST


Andrea Arcangeli <aarcange@xxxxxxxxxx> writes:

> Hello,
>
> On Tue, Jan 08, 2019 at 10:21:09AM +0530, Aneesh Kumar K.V wrote:
>> @@ -187,41 +149,25 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
>> 		goto unlock_exit;
>> 	}
>>
>> +	ret = get_user_pages_cma_migrate(ua, entries, 1, mem->hpages);
>
> In terms of gup APIs, I've been wondering if this shall become
> get_user_pages_longterm(FOLL_CMA_MIGRATE). So basically moving this
> CMA migrate logic inside get_user_pages_longterm.

Do we need a FOLL_CMA_MIGRATE flag at all? I am wondering whether a long
term pin shouldn't simply imply a CMA migrate; what would the benefit of
a separate FOLL_CMA_MIGRATE flag be? We can do better by collecting the
pages to migrate in a list, and I guess it is much simpler if we limit
that migration logic to get_user_pages_longterm().
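
To make the comparison concrete, the difference at the call site would be
roughly the below (only a sketch: FOLL_CMA_MIGRATE does not exist today, and
the arguments are borrowed from the mm_iommu_do_alloc() hunk quoted above):

	/* Andrea's suggestion: the caller opts in via a (hypothetical) flag */
	ret = get_user_pages_longterm(ua, entries, FOLL_WRITE | FOLL_CMA_MIGRATE,
				      mem->hpages, NULL);

	/* vs. letting the long term pin itself imply the CMA migration */
	ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);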

I ended up with something like the diff below. Are you suggesting we should
hide the isolate_lru_page() and related details behind a FOLL_CMA_MIGRATE
flag and do the migration when we take the page reference, instead of
iterating over the page array in get_user_pages_longterm() as the diff
below does?

diff --git a/mm/gup.c b/mm/gup.c
index 05acd7e2eb22..6e8152594e83 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -13,6 +13,9 @@
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
@@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);

+#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
+
#ifdef CONFIG_FS_DAX
+static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+	long i;
+	struct vm_area_struct *vma_prev = NULL;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct vm_area_struct *vma = vmas[i];
+
+		if (vma == vma_prev)
+			continue;
+
+		vma_prev = vma;
+
+		if (vma_is_fsdax(vma))
+			return true;
+	}
+	return false;
+}
+#else
+static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_CMA
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+	/*
+	 * We want to make sure we allocate the new page from the same node
+	 * as the source page.
+	 */
+	int nid = page_to_nid(page);
+	/*
+	 * Trying to allocate a page for migration. Ignore allocation
+	 * failure warnings. We don't force __GFP_THISNODE here because
+	 * this node is the one holding the CMA reservation, and in some
+	 * cases such nodes have very little non-movable memory left for
+	 * allocations.
+	 */
+	gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageHuge(page)) {
+		struct hstate *h = page_hstate(page);
+		/*
+		 * We don't want to dequeue from the pool because pool pages
+		 * will mostly be from the CMA region.
+		 */
+		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+	}
+#endif
+	if (PageTransHuge(page)) {
+		struct page *thp;
+		/*
+		 * Ignore allocation failure warnings.
+		 */
+		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
+
+		/*
+		 * Remove the movable mask so that we don't allocate from
+		 * the CMA area again.
+		 */
+		thp_gfpmask &= ~__GFP_MOVABLE;
+		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+		if (!thp)
+			return NULL;
+		prep_transhuge_page(thp);
+		return thp;
+	}
+
+	return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+					unsigned int gup_flags,
+					struct page **pages,
+					struct vm_area_struct **vmas)
+{
+	long i;
+	bool drain_allow = true;
+	bool migrate_allow = true;
+	LIST_HEAD(cma_page_list);
+
+check_again:
+	for (i = 0; i < nr_pages; i++) {
+		/*
+		 * If we get a page from the CMA zone, since we are going to
+		 * be pinning these entries, we might as well move them out
+		 * of the CMA zone if possible.
+		 */
+		if (is_migrate_cma_page(pages[i])) {
+
+			struct page *head = compound_head(pages[i]);
+
+			if (PageHuge(head)) {
+				isolate_huge_page(head, &cma_page_list);
+			} else {
+				if (!PageLRU(head) && drain_allow) {
+					lru_add_drain_all();
+					drain_allow = false;
+				}
+
+				if (!isolate_lru_page(head)) {
+					list_add_tail(&head->lru, &cma_page_list);
+					mod_node_page_state(page_pgdat(head),
+							    NR_ISOLATED_ANON +
+							    page_is_file_cache(head),
+							    hpage_nr_pages(head));
+				}
+			}
+		}
+	}
+
+	if (!list_empty(&cma_page_list)) {
+		/*
+		 * Drop the above get_user_pages reference.
+		 */
+		for (i = 0; i < nr_pages; i++)
+			put_page(pages[i]);
+
+		if (migrate_pages(&cma_page_list, new_non_cma_page,
+				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+			/*
+			 * Some of the pages failed migration. Do get_user_pages
+			 * without migration.
+			 */
+			migrate_allow = false;
+
+			if (!list_empty(&cma_page_list))
+				putback_movable_pages(&cma_page_list);
+		}
+		/*
+		 * We did migrate all the pages; try to get the page references
+		 * again, migrating any new CMA pages which we failed to
+		 * isolate earlier.
+		 */
+		nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+		if ((nr_pages > 0) && migrate_allow) {
+			drain_allow = true;
+			goto check_again;
+		}
+	}
+
+	return nr_pages;
+}
+#else
+static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+					       unsigned int gup_flags,
+					       struct page **pages,
+					       struct vm_area_struct **vmas)
+{
+	return nr_pages;
+}
+#endif
+
/*
* This is the same as get_user_pages() in that it assumes we are
* operating on the current task's mm, but it goes further to validate
@@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages);
* Contrast this to iov_iter_get_pages() usages which are transient.
*/
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas_arg)
{
	struct vm_area_struct **vmas = vmas_arg;
-	struct vm_area_struct *vma_prev = NULL;
+	unsigned long flags;
	long rc, i;

	if (!pages)
@@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
			return -ENOMEM;
	}

+	flags = memalloc_nocma_save();
	rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+	memalloc_nocma_restore(flags);
+	if (rc < 0)
+		goto out;

-	for (i = 0; i < rc; i++) {
-		struct vm_area_struct *vma = vmas[i];
-
-		if (vma == vma_prev)
-			continue;
-
-		vma_prev = vma;
-
-		if (vma_is_fsdax(vma))
-			break;
-	}
-
-	/*
-	 * Either get_user_pages() failed, or the vma validation
-	 * succeeded, in either case we don't need to put_page() before
-	 * returning.
-	 */
-	if (i >= rc)
+	if (check_dax_vmas(vmas, rc)) {
+		for (i = 0; i < rc; i++)
+			put_page(pages[i]);
+		rc = -EOPNOTSUPP;
		goto out;
+	}

-	for (i = 0; i < rc; i++)
-		put_page(pages[i]);
-	rc = -EOPNOTSUPP;
+	rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
out:
	if (vmas != vmas_arg)
		kfree(vmas);
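
For completeness, if get_user_pages_longterm() grows this migration logic,
the powerpc side of this series could drop get_user_pages_cma_migrate()
entirely, and mm_iommu_do_alloc() would end up with something like the
sketch below (untested; the FOLL_WRITE flag, the local index i, and the
cleanup of a short pin are my assumptions on top of the hunk quoted above):

	ret = get_user_pages_longterm(ua, entries, FOLL_WRITE,
				      mem->hpages, NULL);
	if (ret != entries) {
		/* release a short pin; if ret is negative nothing was pinned */
		for (i = 0; i < ret; i++)
			put_page(mem->hpages[i]);
		ret = -EFAULT;
		goto unlock_exit;
	}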