Re: [PATCH v4 06/11] mm: thp: check pmd migration entry in common path
From: Zi Yan
Date: Fri Mar 24 2017 - 12:10:12 EST
Kirill A. Shutemov wrote:
> On Mon, Mar 13, 2017 at 11:45:02AM -0400, Zi Yan wrote:
>> From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
>>
>> If one of callers of page migration starts to handle thp,
>> memory management code start to see pmd migration entry, so we need
>> to prepare for it before enabling. This patch changes various code
>> point which checks the status of given pmds in order to prevent race
>> between thp migration and the pmd-related works.
>>
>> ChangeLog v1 -> v2:
>> - introduce pmd_related() (I know the naming is not good, but can't
>> think up no better name. Any suggesntion is welcomed.)
>>
>> Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
>>
>> ChangeLog v2 -> v3:
>> - add is_swap_pmd()
>> - a pmd entry should be pmd pointing to pte pages, is_swap_pmd(),
>> pmd_trans_huge(), pmd_devmap(), or pmd_none()
>> - use pmdp_huge_clear_flush() instead of pmdp_huge_get_and_clear()
>> - flush_cache_range() while set_pmd_migration_entry()
>> - pmd_none_or_trans_huge_or_clear_bad() and pmd_trans_unstable() return
>> true on pmd_migration_entry, so that migration entries are not
>> treated as pmd page table entries.
>>
>> Signed-off-by: Zi Yan <zi.yan@xxxxxxxxxxxxxx>
>> ---
>> arch/x86/mm/gup.c | 4 +--
>> fs/proc/task_mmu.c | 22 +++++++++------
>> include/asm-generic/pgtable.h | 3 +-
>> include/linux/huge_mm.h | 14 +++++++--
>> mm/gup.c | 22 +++++++++++++--
>> mm/huge_memory.c | 66 ++++++++++++++++++++++++++++++++++++++-----
>> mm/madvise.c | 2 ++
>> mm/memcontrol.c | 2 ++
>> mm/memory.c | 9 ++++--
>> mm/mprotect.c | 6 ++--
>> mm/mremap.c | 2 +-
>> 11 files changed, 124 insertions(+), 28 deletions(-)
>>
<snip>
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 94fab8fa432b..2b1effb16242 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -272,6 +272,15 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
>> return page;
>> return no_page_table(vma, flags);
>> }
>> + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
>> + return no_page_table(vma, flags);
>> + if (!pmd_present(*pmd)) {
>> +retry:
>> + if (likely(!(flags & FOLL_MIGRATION)))
>> + return no_page_table(vma, flags);
>> + pmd_migration_entry_wait(mm, pmd);
>> + goto retry;
>
> This looks a lot like endless loop if flags contain FOLL_MIGRATION. Hm?
>
> I guess retry label should be on previous line.
You are right. It should be:
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ return no_page_table(vma, flags);
+retry:
+ if (!pmd_present(*pmd)) {
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry;
>
>> + }
>> if (pmd_devmap(*pmd)) {
>> ptl = pmd_lock(mm, pmd);
>> page = follow_devmap_pmd(vma, address, pmd, flags);
>> @@ -286,6 +295,15 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
>> return no_page_table(vma, flags);
>>
>> ptl = pmd_lock(mm, pmd);
>> + if (unlikely(!pmd_present(*pmd))) {
>> +retry_locked:
>> + if (likely(!(flags & FOLL_MIGRATION))) {
>> + spin_unlock(ptl);
>> + return no_page_table(vma, flags);
>> + }
>> + pmd_migration_entry_wait(mm, pmd);
>> + goto retry_locked;
>
> Again. That's doesn't look right..
It will be changed:
ptl = pmd_lock(mm, pmd);
+retry_locked:
+ if (unlikely(!pmd_present(*pmd))) {
+ if (likely(!(flags & FOLL_MIGRATION))) {
+ spin_unlock(ptl);
+ return no_page_table(vma, flags);
+ }
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry_locked;
>
>> + }
>> if (unlikely(!pmd_trans_huge(*pmd))) {
>> spin_unlock(ptl);
>> return follow_page_pte(vma, address, pmd, flags);
>> @@ -341,7 +359,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
>> pud = pud_offset(pgd, address);
>> BUG_ON(pud_none(*pud));
>> pmd = pmd_offset(pud, address);
>> - if (pmd_none(*pmd))
>> + if (!pmd_present(*pmd))
>> return -EFAULT;
>> VM_BUG_ON(pmd_trans_huge(*pmd));
>> pte = pte_offset_map(pmd, address);
>> @@ -1369,7 +1387,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>> pmd_t pmd = READ_ONCE(*pmdp);
>>
>> next = pmd_addr_end(addr, end);
>> - if (pmd_none(pmd))
>> + if (!pmd_present(pmd))
>> return 0;
>>
>> if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index a9c2a0ef5b9b..3f18452f3eb1 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -898,6 +898,21 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>
>> ret = -EAGAIN;
>> pmd = *src_pmd;
>> +
>> + if (unlikely(is_pmd_migration_entry(pmd))) {
>
> Shouldn't you first check that the pmd is not present?
is_pmd_migration_entry() checks !pmd_present().
in linux/swapops.h, is_pmd_migration_entry is defined as:
static inline int is_pmd_migration_entry(pmd_t pmd)
{
return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
>
>> + swp_entry_t entry = pmd_to_swp_entry(pmd);
>> +
>> + if (is_write_migration_entry(entry)) {
>> + make_migration_entry_read(&entry);
>> + pmd = swp_entry_to_pmd(entry);
>> + set_pmd_at(src_mm, addr, src_pmd, pmd);
>> + }
>> + set_pmd_at(dst_mm, addr, dst_pmd, pmd);
>> + ret = 0;
>> + goto out_unlock;
>> + }
>> + WARN_ONCE(!pmd_present(pmd), "Uknown non-present format on pmd.\n");
>
> Typo.
Got it.
>
>> +
>> if (unlikely(!pmd_trans_huge(pmd))) {
>> pte_free(dst_mm, pgtable);
>> goto out_unlock;
>> @@ -1204,6 +1219,9 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
>> if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
>> goto out_unlock;j
>>
>> + if (unlikely(!pmd_present(orig_pmd)))
>> + goto out_unlock;
>> +
>> page = pmd_page(orig_pmd);
>> VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
>> /*
>> @@ -1338,7 +1356,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>> if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
>> goto out;
>>
>> - page = pmd_page(*pmd);
>> + if (is_pmd_migration_entry(*pmd)) {
>
> Again, I don't think it's it's safe to check if pmd is migration entry
> before checking if it's present.
>
>> + swp_entry_t entry;
>> +
>> + entry = pmd_to_swp_entry(*pmd);
>> + page = pfn_to_page(swp_offset(entry));
>> + if (!is_migration_entry(entry))
>> + goto out;
>
> I don't understand how it suppose to work.
> You take swp_offset() of entry before checking if it's migration entry.
> What's going on?
This chunk of change inside follow_trans_huge_pmd() is not needed.
Because two callers, smaps_pmd_entry() and follow_page_mask(), guarantee
that the pmd points to a present entry.
I will drop this chunk in the next version.
>
>> + } else
>> + page = pmd_page(*pmd);
>> VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
>> if (flags & FOLL_TOUCH)
>> touch_pmd(vma, addr, pmd);
>> @@ -1534,6 +1560,9 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>> if (is_huge_zero_pmd(orig_pmd))
>> goto out;
>>
>> + if (unlikely(!pmd_present(orig_pmd)))
>> + goto out;
>> +
>> page = pmd_page(orig_pmd);
>> /*
>> * If other processes are mapping this page, we couldn't discard
>> @@ -1766,6 +1795,20 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>> if (prot_numa && pmd_protnone(*pmd))
>> goto unlock;
>>
>> + if (is_pmd_migration_entry(*pmd)) {
>> + swp_entry_t entry = pmd_to_swp_entry(*pmd);
>> +
>> + if (is_write_migration_entry(entry)) {
>> + pmd_t newpmd;
>> +
>> + make_migration_entry_read(&entry);
>> + newpmd = swp_entry_to_pmd(entry);
>> + set_pmd_at(mm, addr, pmd, newpmd);
>> + }
>> + goto unlock;
>> + } else if (!pmd_present(*pmd))
>> + WARN_ONCE(1, "Uknown non-present format on pmd.\n");
>
> Another typo.
Got it.
Thanks for all your comments.
--
Best Regards,
Yan Zi
Attachment:
signature.asc
Description: OpenPGP digital signature