Re: [PATCH 2/2] mm/damon: Add a new scheme to support demotion on tiered memory system
From: SeongJae Park
Date: Tue Dec 21 2021 - 08:24:47 EST
Hi Baolin,
Thank you for this great patch! I left some comments below.
On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx> wrote:
> On tiered memory system, the reclaim path in shrink_page_list() already
> support demoting pages to slow memory node instead of discarding the
> pages. However, at that time the fast memory node memory wartermark is
> already tense, which will increase the memory allocation latency during
> page demotion.
>
> We can rely on the DAMON in user space to help to monitor the cold
> memory on fast memory node, and demote the cold pages to slow memory
> node proactively to keep the fast memory node in a healthy state.
> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support
> this feature.
>
> Signed-off-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
> ---
> include/linux/damon.h | 3 +
> mm/damon/dbgfs.c | 1 +
> mm/damon/vaddr.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 160 insertions(+)
>
> diff --git a/include/linux/damon.h b/include/linux/damon.h
> index af64838..da9957c 100644
> --- a/include/linux/damon.h
> +++ b/include/linux/damon.h
> @@ -87,6 +87,8 @@ struct damon_target {
> * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT.
> * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE.
> * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
> + * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow
> + * memory type (persistent memory).
s/Migate/Migrate/
Also, could you make the second line of the description aligned with the first
line? e.g.,
+ * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow
+ * memory type (persistent memory).
> * @DAMOS_STAT: Do nothing but count the stat.
> */
> enum damos_action {
> @@ -95,6 +97,7 @@ enum damos_action {
> DAMOS_PAGEOUT,
> DAMOS_HUGEPAGE,
> DAMOS_NOHUGEPAGE,
> + DAMOS_DEMOTE,
> DAMOS_STAT, /* Do nothing but only record the stat */
This would make user space people who were using 'DAMOS_STAT' be confused.
Could you put 'DAMOS_DEMOTE' after 'DAMOS_STAT'?
> };
>
> diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
> index 58dbb96..43355ab 100644
> --- a/mm/damon/dbgfs.c
> +++ b/mm/damon/dbgfs.c
> @@ -168,6 +168,7 @@ static bool damos_action_valid(int action)
> case DAMOS_PAGEOUT:
> case DAMOS_HUGEPAGE:
> case DAMOS_NOHUGEPAGE:
> + case DAMOS_DEMOTE:
> case DAMOS_STAT:
> return true;
> default:
> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
> index 9e213a1..b354d3e 100644
> --- a/mm/damon/vaddr.c
> +++ b/mm/damon/vaddr.c
> @@ -14,6 +14,10 @@
> #include <linux/page_idle.h>
> #include <linux/pagewalk.h>
> #include <linux/sched/mm.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_inline.h>
> +#include <linux/swapops.h>
> +#include "../internal.h"
>
> #include "prmtv-common.h"
>
> @@ -693,6 +697,156 @@ static unsigned long damos_madvise(struct damon_target *target,
> }
> #endif /* CONFIG_ADVISE_SYSCALLS */
>
> +static bool damos_isolate_page(struct page *page, struct list_head *demote_list)
> +{
> + struct page *head = compound_head(page);
> +
> + /* Do not interfere with other mappings of this page */
> + if (page_mapcount(head) != 1)
> + return false;
> +
> + /* No need migration if the target demotion node is empty. */
> + if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE)
> + return false;
> +
> + if (isolate_lru_page(head))
> + return false;
> +
> + list_add_tail(&head->lru, demote_list);
> + mod_node_page_state(page_pgdat(head),
> + NR_ISOLATED_ANON + page_is_file_lru(head),
> + thp_nr_pages(head));
> + return true;
The return value is not used by callers. If not needed, let's remove it.
> +}
> +
> +static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
> + unsigned long end, struct mm_walk *walk)
> +{
> + struct vm_area_struct *vma = walk->vma;
> + struct list_head *demote_list = walk->private;
> + spinlock_t *ptl;
> + struct page *page;
> + pte_t *pte, *mapped_pte;
> +
> + if (!vma_migratable(vma))
> + return -EFAULT;
> +
> + ptl = pmd_trans_huge_lock(pmd, vma);
> + if (ptl) {
> + /* Bail out if THP migration is not supported. */
> + if (!thp_migration_supported())
> + goto thp_out;
> +
> + /* If the THP pte is under migration, do not bother it. */
> + if (unlikely(is_pmd_migration_entry(*pmd)))
> + goto thp_out;
> +
> + page = damon_get_page(pmd_pfn(*pmd));
> + if (!page)
> + goto thp_out;
> +
> + damos_isolate_page(page, demote_list);
> +
> + put_page(page);
> +thp_out:
> + spin_unlock(ptl);
> + return 0;
> + }
Could we wrap above THP handling code with '#ifdef CONFIG_TRANSPARENT_HUGEPAGE'?
> +
> + /* regular page handling */
> + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> + return -EINVAL;
> +
> + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> + for (; addr != end; pte++, addr += PAGE_SIZE) {
> + if (pte_none(*pte) || !pte_present(*pte))
> + continue;
> +
> + page = damon_get_page(pte_pfn(*pte));
> + if (!page)
> + continue;
> +
> + damos_isolate_page(page, demote_list);
> + put_page(page);
> + }
> + pte_unmap_unlock(mapped_pte, ptl);
> + cond_resched();
> +
> + return 0;
> +}
> +
> +static const struct mm_walk_ops damos_migrate_pages_walk_ops = {
> + .pmd_entry = damos_migrate_pmd_entry,
The names are little bit confusing to me. How about 's/migrate/isolate/'?
> +};
> +
> +/*
> + * damos_demote() - demote cold pages from fast memory to slow memory
> + * @target: the given target
> + * @r: region of the target
> + *
> + * On tiered memory system, if DAMON monitored cold pages on fast memory
> + * node (DRAM), we can demote them to slow memory node proactively in case
> + * accumulating much more cold memory on fast memory node (DRAM) to reclaim.
> + *
> + * Return:
> + * = 0 means no pages were demoted.
> + * > 0 means how many cold pages were demoted successfully.
> + * < 0 means errors happened.
damon_va_apply_scheme(), which returns what this function returns when
DAMOS_DEMOTE action is given, should return bytes of the region that the action
is successfully applied.
> + */
> +static int damos_demote(struct damon_target *target, struct damon_region *r)
> +{
> + struct mm_struct *mm;
> + LIST_HEAD(demote_pages);
> + LIST_HEAD(pagelist);
> + int target_nid, nr_pages, ret = 0;
> + unsigned int nr_succeeded, demoted_pages = 0;
> + struct page *page, *page2;
> +
> + /* Validate if allowing to do page demotion */
> + if (!numa_demotion_enabled)
> + return -EINVAL;
> +
> + mm = damon_get_mm(target);
> + if (!mm)
> + return -ENOMEM;
> +
> + mmap_read_lock(mm);
> + walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end),
I guess PAGE_ALIGN()s are not necessary here?
> + &damos_migrate_pages_walk_ops, &demote_pages);
> + mmap_read_unlock(mm);
> +
> + mmput(mm);
> + if (list_empty(&demote_pages))
> + return 0;
> +
> + list_for_each_entry_safe(page, page2, &demote_pages, lru) {
I'd prefer 'next' or 'next_page' instead of 'page2'.
> + list_add(&page->lru, &pagelist);
> + target_nid = next_demotion_node(page_to_nid(page));
> + nr_pages = thp_nr_pages(page);
> +
> + ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
> + target_nid, MIGRATE_SYNC, MR_DEMOTION,
> + &nr_succeeded);
> + if (ret) {
> + if (!list_empty(&pagelist)) {
> + list_del(&page->lru);
> + mod_node_page_state(page_pgdat(page),
> + NR_ISOLATED_ANON + page_is_file_lru(page),
> + -nr_pages);
> + putback_lru_page(page);
> + }
> + } else {
> + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
> + demoted_pages += nr_succeeded;
> + }
Why should we do above work for each page on our own instead of exporting and
calling 'demote_page_list()'?
Thanks,
SJ
> +
> + INIT_LIST_HEAD(&pagelist);
> + cond_resched();
> + }
> +
> + return demoted_pages;
> +}
> +
> static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
> struct damon_target *t, struct damon_region *r,
> struct damos *scheme)
> @@ -715,6 +869,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
> case DAMOS_NOHUGEPAGE:
> madv_action = MADV_NOHUGEPAGE;
> break;
> + case DAMOS_DEMOTE:
> + return damos_demote(t, r);
> case DAMOS_STAT:
> return 0;
> default:
> --
> 1.8.3.1