Re: [PATCH] mm: Remove double faults once write a device pfn

From: Christian König
Date: Mon Jan 22 2024 - 04:18:52 EST


Am 22.01.24 um 04:32 schrieb Xianrong Zhou:
The vmf_insert_pfn_prot could cause unnecessary double faults
on a device pfn. Because currently the vmf_insert_pfn_prot does
not make the pfn writable so the pte entry is normally read-only
or dirty catching.

What? How do you got to this conclusion?

The first fault only sets up the pte entry which actually is
dirty catching. And the second immediate fault to the pfn due
to first dirty catching when the cpu re-execute the store
instruction.

It could be that this is done to work around some hw behavior, but not because of dirty catching.

Normally if the drivers call vmf_insert_pfn_prot and also supply
'pfn_mkwrite' callback within vm_operations_struct which requires
the pte to be dirty catching then the vmf_insert_pfn_prot and the
double fault are reasonable. It is not a problem.

Well, as far as I can see that behavior absolutely doesn't make sense.

When pfn_mkwrite is requested then the driver should use PAGE_COPY, which is exactly what VMWGFX (the only driver using dirty tracking) is doing.

Everybody else uses PAGE_SHARED which should make the pte writeable immediately.

Regards,
Christian.


However the most of drivers calling vmf_insert_pfn_prot do not
supply the 'pfn_mkwrite' callback so that the second fault is
unnecessary.

So just like vmf_insert_mixed and vmf_insert_mixed_mkwrite pair,
we should also supply vmf_insert_pfn_mkwrite for drivers as well.

Signed-off-by: Xianrong Zhou <Xianrong.Zhou@xxxxxxx>
---
arch/x86/entry/vdso/vma.c | 3 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +-
drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_gem.c | 2 +-
drivers/gpu/drm/radeon/radeon_gem.c | 2 +-
drivers/gpu/drm/ttm/ttm_bo_vm.c | 8 +++++---
drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 8 +++++---
include/drm/ttm/ttm_bo.h | 3 ++-
include/linux/mm.h | 2 +-
mm/memory.c | 14 +++++++++++---
10 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 7645730dc228..dd2431c2975f 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -185,7 +185,8 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
return vmf_insert_pfn_prot(vma, vmf->address,
__pa(pvti) >> PAGE_SHIFT,
- pgprot_decrypted(vma->vm_page_prot));
+ pgprot_decrypted(vma->vm_page_prot),
+ true);
}
} else if (sym_offset == image->sym_hvclock_page) {
pfn = hv_get_tsc_pfn();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 49a5f1c73b3e..adcb20d9e624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -64,7 +64,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
}
ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
- TTM_BO_VM_NUM_PREFAULT);
+ TTM_BO_VM_NUM_PREFAULT, true);
drm_dev_exit(idx);
} else {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index 9227f8146a58..c6f13ae6c308 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -1114,7 +1114,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
if (drm_dev_enter(dev, &idx)) {
ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
- TTM_BO_VM_NUM_PREFAULT);
+ TTM_BO_VM_NUM_PREFAULT, true);
drm_dev_exit(idx);
} else {
ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 49c2bcbef129..7e1453762ec9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault *vmf)
nouveau_bo_del_io_reserve_lru(bo);
prot = vm_get_page_prot(vma->vm_flags);
- ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
+ ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, true);
nouveau_bo_add_io_reserve_lru(bo);
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
return ret;
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 3fec3acdaf28..b21cf00ae162 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -62,7 +62,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault *vmf)
goto unlock_resv;
ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
- TTM_BO_VM_NUM_PREFAULT);
+ TTM_BO_VM_NUM_PREFAULT, true);
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
goto unlock_mclk;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 4212b8c91dd4..7d14a7d267aa 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -167,6 +167,7 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
* @num_prefault: Maximum number of prefault pages. The caller may want to
* specify this based on madvice settings and the size of the GPU object
* backed by the memory.
+ * @mkwrite: make the pfn or page writable
*
* This function inserts one or more page table entries pointing to the
* memory backing the buffer object, and then returns a return code
@@ -180,7 +181,8 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
*/
vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgprot_t prot,
- pgoff_t num_prefault)
+ pgoff_t num_prefault,
+ bool mkwrite)
{
struct vm_area_struct *vma = vmf->vma;
struct ttm_buffer_object *bo = vma->vm_private_data;
@@ -263,7 +265,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
* at arbitrary times while the data is mmap'ed.
* See vmf_insert_pfn_prot() for a discussion.
*/
- ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
+ ret = vmf_insert_pfn_prot(vma, address, pfn, prot, mkwrite);
/* Never error on prefaulted PTEs */
if (unlikely((ret & VM_FAULT_ERROR))) {
@@ -312,7 +314,7 @@ vm_fault_t ttm_bo_vm_dummy_page(struct vm_fault *vmf, pgprot_t prot)
/* Prefault the entire VMA range right away to avoid further faults */
for (address = vma->vm_start; address < vma->vm_end;
address += PAGE_SIZE)
- ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
+ ret = vmf_insert_pfn_prot(vma, address, pfn, prot, true);
return ret;
}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
index 74ff2812d66a..bb8e4b641681 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
@@ -452,12 +452,14 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
* sure the page protection is write-enabled so we don't get
* a lot of unnecessary write faults.
*/
- if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
+ if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE) {
prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
- else
+ ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, false);
+ } else {
prot = vm_get_page_prot(vma->vm_flags);
+ ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, true);
+ }
- ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
return ret;
diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
index 0223a41a64b2..66e293db69ee 100644
--- a/include/drm/ttm/ttm_bo.h
+++ b/include/drm/ttm/ttm_bo.h
@@ -386,7 +386,8 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
struct vm_fault *vmf);
vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgprot_t prot,
- pgoff_t num_prefault);
+ pgoff_t num_prefault,
+ bool mkwrite);
vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
void ttm_bo_vm_open(struct vm_area_struct *vma);
void ttm_bo_vm_close(struct vm_area_struct *vma);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..f8868e28ea04 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3553,7 +3553,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, pgprot_t pgprot);
+ unsigned long pfn, pgprot_t pgprot, bool mkwrite);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..2c28f1a349ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2195,6 +2195,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
+ * @mkwrite: make the pfn writable
*
* This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
@@ -2223,7 +2224,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, pgprot_t pgprot)
+ unsigned long pfn, pgprot_t pgprot, bool mkwrite)
{
/*
* Technically, architectures with pte_special can avoid all these
@@ -2246,7 +2247,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
- false);
+ mkwrite);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
@@ -2273,10 +2274,17 @@ EXPORT_SYMBOL(vmf_insert_pfn_prot);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
- return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn);
+vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, true);
+}
+EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
+
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
/* these checks mirror the abort conditions in vm_normal_page */