Re: [PATCH 1/2] mm/numa: no task_numa_fault() call if page table is changed
From: Zi Yan
Date: Thu Aug 08 2024 - 10:23:47 EST
On 8 Aug 2024, at 10:14, David Hildenbrand wrote:
> On 08.08.24 16:13, Zi Yan wrote:
>> On 8 Aug 2024, at 4:22, David Hildenbrand wrote:
>>
>>> On 08.08.24 05:19, Baolin Wang wrote:
>>>>
>>>>
>>>> On 2024/8/8 02:47, Zi Yan wrote:
>>>>> When handling a numa page fault, task_numa_fault() should be called by a
>>>>> process that restores the page table of the faulted folio to avoid
>>>>> duplicated stats counting. Commit b99a342d4f11 ("NUMA balancing: reduce
>>>>> TLB flush via delaying mapping on hint page fault") restructured
>>>>> do_numa_page() and do_huge_pmd_numa_page() and did not avoid the
>>>>> task_numa_fault() call in the second page table check after a numa
>>>>> migration failure. Fix it by making all !pte_same()/!pmd_same() checks
>>>>> return immediately.
>>>>>
>>>>> This issue can cause task_numa_fault() to be called more than necessary
>>>>> and lead to unexpected numa balancing results (it is hard to tell whether
>>>>> the issue will have a positive or negative performance impact due to
>>>>> duplicated numa fault counting).
>>>>>
>>>>> Reported-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
>>>>> Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
>>>>> Fixes: b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault")
>>>>> Cc: <stable@xxxxxxxxxxxxxxx>
>>>>> Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
>>>>
>>>> The fix looks reasonable to me. Feel free to add:
>>>> Reviewed-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
>>>>
>>>> (Nit: These goto labels are a bit confusing and might need some cleanup
>>>> in the future.)
>>>
>>> Agreed, maybe we should simply handle that right away and replace the "goto out;" users with "return 0;".
>>>
>>> Then, just copy the 3 LOC.
>>>
>>> For mm/memory.c that would be:
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 67496dc5064f..410ba50ca746 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -5461,7 +5461,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>> if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>> - goto out;
>>> + return 0;
>>> }
>>> pte = pte_modify(old_pte, vma->vm_page_prot);
>>> @@ -5528,15 +5528,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>> vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>> vmf->address, &vmf->ptl);
>>> if (unlikely(!vmf->pte))
>>> - goto out;
>>> + return 0;
>>> if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>> - goto out;
>>> + return 0;
>>> }
>>> goto out_map;
>>> }
>>> -out:
>>> if (nid != NUMA_NO_NODE)
>>> task_numa_fault(last_cpupid, nid, nr_pages, flags);
>>> return 0;
>>> @@ -5552,7 +5551,9 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>> numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
>>> writable);
>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>> - goto out;
>>> + if (nid != NUMA_NO_NODE)
>>> + task_numa_fault(last_cpupid, nid, nr_pages, flags);
>>> + return 0;
>>> }
>>
>> Looks good to me. Thanks.
>>
>> Hi Andrew,
>>
>> Should I resend this for easy backporting? Or do you want to fold David’s
>> changes in directly?
>
> Note that I didn't touch huge_memory.c. So maybe just send a fixup on top?
Got it. The fixup is attached.
Best Regards,
Yan, Zi
From c0494d569e77291f7f51abb16c2ceff0976371f4 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@xxxxxxxxxx>
Date: Thu, 8 Aug 2024 10:18:42 -0400
Subject: [PATCH] fixup! mm/numa: no task_numa_fault() call if page table is
changed
---
mm/huge_memory.c | 11 +++++------
mm/memory.c | 12 ++++++------
2 files changed, 11 insertions(+), 12 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3c018f2b554..4e8746769a97 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1681,7 +1681,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
spin_unlock(vmf->ptl);
- goto out;
+ return 0;
}
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
@@ -1729,16 +1729,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
spin_unlock(vmf->ptl);
- goto out;
+ return 0;
}
goto out_map;
}
-count_fault:
if (nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
-
-out:
return 0;
out_map:
@@ -1750,7 +1747,9 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- goto count_fault;
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
+ return 0;
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 503d493263df..410ba50ca746 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5461,7 +5461,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- goto out;
+ return 0;
}
pte = pte_modify(old_pte, vma->vm_page_prot);
@@ -5528,18 +5528,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte))
- goto out;
+ return 0;
if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- goto out;
+ return 0;
}
goto out_map;
}
-count_fault:
if (nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, nid, nr_pages, flags);
-out:
return 0;
out_map:
/*
@@ -5553,7 +5551,9 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
writable);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- goto count_fault;
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, nr_pages, flags);
+ return 0;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
--
2.43.0
Attachment:
signature.asc
Description: OpenPGP digital signature