Re: [PATCH] mm: Free per cpu pages async to shorten program exit time

From: Claudio Imbrenda
Date: Fri Oct 08 2021 - 04:52:58 EST


On Fri, 8 Oct 2021 10:17:50 +0200
David Hildenbrand <david@xxxxxxxxxx> wrote:

> On 08.10.21 08:39, ultrachin@xxxxxxx wrote:
> > From: chen xiaoguang <xiaoggchen@xxxxxxxxxxx>
> >
> > The exit time is long when program allocated big memory and
> > the most time consuming part is free memory which takes 99.9%
> > of the total exit time. By using async free we can save 25% of
> > exit time.
> >
> > Signed-off-by: chen xiaoguang <xiaoggchen@xxxxxxxxxxx>
> > Signed-off-by: zeng jingxiang <linuszeng@xxxxxxxxxxx>
> > Signed-off-by: lu yihui <yihuilu@xxxxxxxxxxx>
>
> I recently discussed with Claudio if it would be possible to tear down
> the process MM deferred, because for some use cases (secure/encrypted
> virtualization, very large mmaps) tearing down the page tables is
> already the much more expensive operation.
>
> There is mmdrop_async(), and I wondered if one could reuse that concept
> when tearing down a process -- I didn't look into feasibility, however,
> so it's just some very rough idea.

I have done some experiments by unconditionally replacing mmdrop with
mmdrop_async in exit.c and nothing broke, and exit time of large
processes was almost instant (with the actual cleanup being performed in
background)

my approach is probably simpler/cleaner:

diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
index 91727065bacb..900931a6a105 100644
--- a/include/asm-generic/mmu_context.h
+++ b/include/asm-generic/mmu_context.h
@@ -73,4 +73,8 @@ static inline void deactivate_mm(struct task_struct *tsk,
}
#endif

+#ifndef arch_exit_mm_mmput
+#define arch_exit_mm_mmput mmput
+#endif
+
#endif /* __ASM_GENERIC_MMU_CONTEXT_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 9a89e7f36acb..604cb9c740fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -498,7 +498,7 @@ static void exit_mm(void)
task_unlock(current);
mmap_read_unlock(mm);
mm_update_next_owner(mm);
- mmput(mm);
+ arch_exit_mm_mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
}

these are the minimal changes to common code, then each architecture can
define their own arch_exit_mm_mmput as they see fit (for example, to free
asynchronously only for certain classes of mm, like big ones, VMs, or so).

Another option is to simply always replace mmput with mmput_async, which I
expect will raise more eyebrows.

> > ---
> > include/linux/mm.h | 1 +
> > kernel/exit.c | 2 ++
> > mm/page_alloc.c | 89 +++++++++++++++++++++++++++++++++++++++++++---
> > 3 files changed, 87 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 73a52aba448f..2add3b635eee 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -908,6 +908,7 @@ void put_pages_list(struct list_head *pages);
> >
> > void split_page(struct page *page, unsigned int order);
> > void copy_huge_page(struct page *dst, struct page *src);
> > +void kfreepcp_set_run(unsigned int cpu);
> >
> > /*
> > * Compound pages have a destructor function. Provide a
> > diff --git a/kernel/exit.c b/kernel/exit.c
> > index 91a43e57a32e..269eb81acbe9 100644
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -167,10 +167,12 @@ static void __exit_signal(struct task_struct *tsk)
> > static void delayed_put_task_struct(struct rcu_head *rhp)
> > {
> > struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
> > + unsigned int cpu = tsk->cpu;
> >
> > perf_event_delayed_put(tsk);
> > trace_sched_process_free(tsk);
> > put_task_struct(tsk);
> > + kfreepcp_set_run(cpu);
> > }
> >
> > void put_task_struct_rcu_user(struct task_struct *task)
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index b37435c274cf..8a748ea9156b 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -72,6 +72,7 @@
> > #include <linux/padata.h>
> > #include <linux/khugepaged.h>
> > #include <linux/buffer_head.h>
> > +#include <linux/smpboot.h>
> > #include <asm/sections.h>
> > #include <asm/tlbflush.h>
> > #include <asm/div64.h>
> > @@ -147,6 +148,12 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
> > EXPORT_PER_CPU_SYMBOL(_numa_mem_);
> > #endif
> >
> > +struct freepcp_stat {
> > + struct task_struct *thread;
> > + bool should_run;
> > +};
> > +DEFINE_PER_CPU(struct freepcp_stat, kfreepcp);
> > +
> > /* work_structs for global per-cpu drains */
> > struct pcpu_drain {
> > struct zone *zone;
> > @@ -3361,6 +3368,81 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
> > return min(READ_ONCE(pcp->batch) << 2, high);
> > }
> >
> > +void kfreepcp_set_run(unsigned int cpu)
> > +{
> > + struct task_struct *tsk;
> > + struct freepcp_stat *stat = this_cpu_ptr(&kfreepcp);
> > +
> > + tsk = stat->thread;
> > + per_cpu(kfreepcp.should_run, cpu) = true;
> > +
> > + if (tsk && !task_is_running(tsk))
> > + wake_up_process(tsk);
> > +}
> > +EXPORT_SYMBOL_GPL(kfreepcp_set_run);
> > +
> > +static int kfreepcp_should_run(unsigned int cpu)
> > +{
> > + struct freepcp_stat *stat = this_cpu_ptr(&kfreepcp);
> > +
> > + return stat->should_run;
> > +}
> > +
> > +static void run_kfreepcp(unsigned int cpu)
> > +{
> > + struct zone *zone;
> > + struct per_cpu_pages *pcp;
> > + unsigned long flags;
> > + struct freepcp_stat *stat = this_cpu_ptr(&kfreepcp);
> > + bool need_free_more = false;
> > +
> > +
> > +
> > +again:
> > + need_free_more = false;
> > + for_each_populated_zone(zone) {
> > + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
> > + if (pcp->count && pcp->high && pcp->count > pcp->high) {
> > + unsigned long batch = READ_ONCE(pcp->batch);
> > + int high;
> > +
> > + high = nr_pcp_high(pcp, zone);
> > + local_irq_save(flags);
> > + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch),
> > + pcp);
> > + local_irq_restore(flags);
> > + if (pcp->count > pcp->high)
> > + need_free_more = true;
> > + }
> > +
> > + cond_resched();
> > + }
> > + if (need_free_more)
> > + goto again;
> > +
> > + stat->should_run = false;
> > +}
> > +
> > +static struct smp_hotplug_thread freepcp_threads = {
> > + .store = &kfreepcp.thread,
> > + .thread_should_run = kfreepcp_should_run,
> > + .thread_fn = run_kfreepcp,
> > + .thread_comm = "kfreepcp/%u",
> > +};
> > +
> > +static int __init freepcp_init(void)
> > +{
> > + int cpu;
> > +
> > + for_each_possible_cpu(cpu)
> > + per_cpu(kfreepcp.should_run, cpu) = false;
> > +
> > + BUG_ON(smpboot_register_percpu_thread(&freepcp_threads));
> > +
> > + return 0;
> > +}
> > +late_initcall(freepcp_init);
> > +
> > static void free_unref_page_commit(struct page *page, unsigned long pfn,
> > int migratetype, unsigned int order)
> > {
> > @@ -3375,11 +3457,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
> > list_add(&page->lru, &pcp->lists[pindex]);
> > pcp->count += 1 << order;
> > high = nr_pcp_high(pcp, zone);
> > - if (pcp->count >= high) {
> > - int batch = READ_ONCE(pcp->batch);
> > -
> > - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
> > - }
> > + if (pcp->count >= high)
> > + this_cpu_ptr(&kfreepcp)->should_run = false;
> > }
> >
> > /*
> >
>
>