Re: [PATCH v5 04/11] thp: do_huge_pmd_wp_page(): handle huge zeropage

From: Kirill A. Shutemov
Date: Thu Nov 15 2012 - 03:28:21 EST


On Wed, Nov 14, 2012 at 03:08:45PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index fa06804..fe329da 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -516,6 +516,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> > }
> > #endif
> >
> > +#ifndef my_zero_pfn
> > +static inline unsigned long my_zero_pfn(unsigned long addr)
> > +{
> > + extern unsigned long zero_pfn;
>
> I don't think you should be declaring this inside an inline function;
> you should probably protect the declarations of the variable and the
> function instead. Perhaps by CONFIG_MMU?

mips and s390 use a declaration inside an inline function to implement
is_zero_pfn(). I wanted to be consistent with that.
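
For reference, the pattern on those architectures looks roughly like this
(a sketch from memory, not the exact arch code):

#define is_zero_pfn is_zero_pfn
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        unsigned long offset_from_zero_pfn = pfn - zero_pfn;
        return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
}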

I have a patch that cleans up the zero page helpers a bit. It's on top of
this patchset:

http://article.gmane.org/gmane.linux.kernel.mm/87387

> > + return zero_pfn;
> > +}
> > +#endif
> > +
> > /*
> > * Multiple processes may "see" the same page. E.g. for untouched
> > * mappings of /dev/null, all processes see the same page full of
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 0d903bf..d767a7c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -824,6 +824,88 @@ out:
> > return ret;
> > }
> >
> > +/* no "address" argument so destroys page coloring of some arch */
> > +pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
> > +{
>
> Umm, this is a copy and paste of pgtable_trans_huge_withdraw() from the
> generic page table handling. Why can't you reuse that and support (and/or
> modify) the s390 and sparc code?

My bad. It's a mistake from resolving a conflict. I'll fix that.
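
The fallback path should just call the generic helper instead, i.e.:

        pgtable = pgtable_trans_huge_withdraw(mm);

and the duplicated get_pmd_huge_pte() above goes away.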

> > + pgtable_t pgtable;
> > +
> > + assert_spin_locked(&mm->page_table_lock);
> > +
> > + /* FIFO */
> > + pgtable = mm->pmd_huge_pte;
> > + if (list_empty(&pgtable->lru))
> > + mm->pmd_huge_pte = NULL;
> > + else {
> > + mm->pmd_huge_pte = list_entry(pgtable->lru.next,
> > + struct page, lru);
> > + list_del(&pgtable->lru);
> > + }
> > + return pgtable;
> > +}
> > +
> > +static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
> > + struct vm_area_struct *vma, unsigned long address,
> > + pmd_t *pmd, unsigned long haddr)
>
> This whole function is extremely similar to the implementation of
> do_huge_pmd_wp_page_fallback(), there really is no way to fold the two?

It's similar in structure (I used do_huge_pmd_wp_page_fallback() as a
template), but the details differ in many places and I don't see how to
combine the two without making the result ugly.

> Typically in cases like this it's helpful to split out different logical
> segments of a function into smaller functions that would handle both
> page and !page accordingly.
>
> > +{
> > + pgtable_t pgtable;
> > + pmd_t _pmd;
> > + struct page *page;
> > + int i, ret = 0;
> > + unsigned long mmun_start; /* For mmu_notifiers */
> > + unsigned long mmun_end; /* For mmu_notifiers */
> > +
> > + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
> > + if (!page) {
> > + ret |= VM_FAULT_OOM;
> > + goto out;
> > + }
> > +
> > + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
> > + put_page(page);
> > + ret |= VM_FAULT_OOM;
> > + goto out;
> > + }
> > +
> > + clear_user_highpage(page, address);
> > + __SetPageUptodate(page);
> > +
> > + mmun_start = haddr;
> > + mmun_end = haddr + HPAGE_PMD_SIZE;
> > + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> > +
> > + spin_lock(&mm->page_table_lock);
> > + pmdp_clear_flush(vma, haddr, pmd);
> > + /* leave pmd empty until pte is filled */
> > +
> > + pgtable = get_pmd_huge_pte(mm);
> > + pmd_populate(mm, &_pmd, pgtable);
> > +
> > + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> > + pte_t *pte, entry;
> > + if (haddr == (address & PAGE_MASK)) {
> > + entry = mk_pte(page, vma->vm_page_prot);
> > + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> > + page_add_new_anon_rmap(page, vma, haddr);
> > + } else {
> > + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
> > + entry = pte_mkspecial(entry);
> > + }
> > + pte = pte_offset_map(&_pmd, haddr);
> > + VM_BUG_ON(!pte_none(*pte));
> > + set_pte_at(mm, haddr, pte, entry);
> > + pte_unmap(pte);
> > + }
> > + smp_wmb(); /* make pte visible before pmd */
> > + pmd_populate(mm, pmd, pgtable);
> > + spin_unlock(&mm->page_table_lock);
> > +
> > + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> > +
> > + ret |= VM_FAULT_WRITE;
> > +out:
> > + return ret;
> > +}
> > +
> > static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
> > struct vm_area_struct *vma,
> > unsigned long address,
> > @@ -930,19 +1012,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
> > {
> > int ret = 0;
> > - struct page *page, *new_page;
> > + struct page *page = NULL, *new_page;
> > unsigned long haddr;
> > unsigned long mmun_start; /* For mmu_notifiers */
> > unsigned long mmun_end; /* For mmu_notifiers */
> >
> > VM_BUG_ON(!vma->anon_vma);
> > + haddr = address & HPAGE_PMD_MASK;
> > + if (is_huge_zero_pmd(orig_pmd))
> > + goto alloc;
> > spin_lock(&mm->page_table_lock);
> > if (unlikely(!pmd_same(*pmd, orig_pmd)))
> > goto out_unlock;
> >
> > page = pmd_page(orig_pmd);
> > VM_BUG_ON(!PageCompound(page) || !PageHead(page));
> > - haddr = address & HPAGE_PMD_MASK;
> > if (page_mapcount(page) == 1) {
> > pmd_t entry;
> > entry = pmd_mkyoung(orig_pmd);
> > @@ -954,7 +1038,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > }
> > get_page(page);
> > spin_unlock(&mm->page_table_lock);
> > -
> > +alloc:
>
> This could all use a minor restructuring to make it much cleaner,
> perhaps by extracting the page_mapcount(page) == 1 case to be a separate
> function that deals with non-copying writes?

Makes sense. I'll do it as a separate patch on top of the series.
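
Something along these lines, I think (untested sketch, the helper name is
made up):

static int do_huge_pmd_wp_page_reuse(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
                unsigned long haddr)
{
        pmd_t entry;

        /* reuse the page in place: no copy, just mark it writable */
        entry = pmd_mkyoung(orig_pmd);
        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
        if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
                update_mmu_cache_pmd(vma, address, pmd);
        return VM_FAULT_WRITE;
}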

>
> > if (transparent_hugepage_enabled(vma) &&
> > !transparent_hugepage_debug_cow())
> > new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
> > @@ -964,24 +1048,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >
> > if (unlikely(!new_page)) {
> > count_vm_event(THP_FAULT_FALLBACK);
> > - ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
> > - pmd, orig_pmd, page, haddr);
> > - if (ret & VM_FAULT_OOM)
> > - split_huge_page(page);
> > - put_page(page);
> > + if (is_huge_zero_pmd(orig_pmd)) {
> > + ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
> > + address, pmd, haddr);
> > + } else {
> > + ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
> > + pmd, orig_pmd, page, haddr);
> > + if (ret & VM_FAULT_OOM)
> > + split_huge_page(page);
> > + put_page(page);
> > + }
> > goto out;
> > }
> > count_vm_event(THP_FAULT_ALLOC);
> >
> > if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
> > put_page(new_page);
> > - split_huge_page(page);
> > - put_page(page);
> > + if (page) {
> > + split_huge_page(page);
> > + put_page(page);
> > + }
> > ret |= VM_FAULT_OOM;
> > goto out;
> > }
> >
> > - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
> > + if (is_huge_zero_pmd(orig_pmd))
> > + clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
> > + else
> > + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
> > __SetPageUptodate(new_page);
> >
> > mmun_start = haddr;
> > @@ -989,7 +1083,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> >
> > spin_lock(&mm->page_table_lock);
> > - put_page(page);
> > + if (page)
> > + put_page(page);
> > if (unlikely(!pmd_same(*pmd, orig_pmd))) {
> > spin_unlock(&mm->page_table_lock);
> > mem_cgroup_uncharge_page(new_page);
> > @@ -997,7 +1092,6 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > goto out_mn;
> > } else {
> > pmd_t entry;
> > - VM_BUG_ON(!PageHead(page));
> > entry = mk_pmd(new_page, vma->vm_page_prot);
> > entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > entry = pmd_mkhuge(entry);
> > @@ -1005,8 +1099,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > page_add_new_anon_rmap(new_page, vma, haddr);
> > set_pmd_at(mm, haddr, pmd, entry);
> > update_mmu_cache_pmd(vma, address, pmd);
> > - page_remove_rmap(page);
> > - put_page(page);
> > + if (is_huge_zero_pmd(orig_pmd))
> > + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > + if (page) {
>
> Couldn't this be an "else" instead?

Yes. I'll update.
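
I.e. the tail of the block becomes:

        if (is_huge_zero_pmd(orig_pmd))
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
        else {
                VM_BUG_ON(!PageHead(page));
                page_remove_rmap(page);
                put_page(page);
        }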

> > + VM_BUG_ON(!PageHead(page));
> > + page_remove_rmap(page);
> > + put_page(page);
> > + }
> > ret |= VM_FAULT_WRITE;
> > }
> > spin_unlock(&mm->page_table_lock);

--
Kirill A. Shutemov
