Re: [RFC PATCH 4/7] drm/ttm: Support huge pagefaults

From: Christian KÃnig
Date: Wed Nov 27 2019 - 04:13:06 EST


Am 27.11.19 um 09:31 schrieb Thomas HellstrÃm (VMware):
From: Thomas Hellstrom <thellstrom@xxxxxxxxxx>

Support huge (PMD-size and PUD-size) page-table entries by providing a
huge_fault() callback.
We still support private mappings and write-notify by splitting the huge
page-table entries on write-access.

Note that for huge page-faults to occur, either the kernel needs to be
compiled with trans-huge-pages always enabled, or the kernel needs to be
compiled with trans-huge-pages enabled using madvise, and the user-space
app needs to call madvise() to enable trans-huge pages on a per-mapping
basis.

Furthermore huge page-faults will not occur unless buffer objects and
user-space addresses are aligned on huge page size boundaries.

Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: "Matthew Wilcox (Oracle)" <willy@xxxxxxxxxxxxx>
Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
Cc: Ralph Campbell <rcampbell@xxxxxxxxxx>
Cc: "JÃrÃme Glisse" <jglisse@xxxxxxxxxx>
Cc: "Christian KÃnig" <christian.koenig@xxxxxxx>
Signed-off-by: Thomas Hellstrom <thellstrom@xxxxxxxxxx>
---
drivers/gpu/drm/ttm/ttm_bo_vm.c | 139 +++++++++++++++++++++++++++++++-
include/drm/ttm/ttm_bo_api.h | 3 +-
2 files changed, 138 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 2098f8d4dfc5..8d6089880e39 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -150,6 +150,84 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
}
EXPORT_SYMBOL(ttm_bo_vm_reserve);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * ttm_bo_vm_insert_huge - Insert a pfn for PUD or PMD faults
+ * @vmf: Fault data
+ * @bo: The buffer object
+ * @page_offset: Page offset from bo start
+ * @fault_page_size: The size of the fault in pages.
+ * @pgprot: The page protections.
+ * Does additional checking whether it's possible to insert a PUD or PMD
+ * pfn and performs the insertion.
+ *
+ * Return: VM_FAULT_NOPAGE on successful insertion, VM_FAULT_FALLBACK if
+ * a huge fault was not possible, and a VM_FAULT_ERROR code otherwise.
+ */
+static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
+ struct ttm_buffer_object *bo,
+ pgoff_t page_offset,
+ pgoff_t fault_page_size,
+ pgprot_t pgprot)
+{
+ pgoff_t i;
+ vm_fault_t ret;
+ unsigned long pfn;
+ pfn_t pfnt;
+ struct ttm_tt *ttm = bo->ttm;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+
+ /* Fault should not cross bo boundary */
+ page_offset &= ~(fault_page_size - 1);
+ if (page_offset + fault_page_size > bo->num_pages)
+ goto out_fallback;
+
+ if (bo->mem.bus.is_iomem)
+ pfn = ttm_bo_io_mem_pfn(bo, page_offset);
+ else
+ pfn = page_to_pfn(ttm->pages[page_offset]);
+
+ /* pfn must be fault_page_size aligned. */
+ if ((pfn & (fault_page_size - 1)) != 0)
+ goto out_fallback;
+
+ /* IO memory is OK now, TT memory must be contigous. */

That won't work correctly, IO mem might not be contiguous either.

We either need to call ttm_bo_io_mem_pfn() multiple times and check that the addresses are linear or return the length additional to the pfn.

Regards,
Christian.

+ if (!bo->mem.bus.is_iomem)
+ for (i = 1; i < fault_page_size; ++i) {
+ if (page_to_pfn(ttm->pages[page_offset + i]) != pfn + i)
+ goto out_fallback;
+ }
+
+ pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
+ if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
+ ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ else if (fault_page_size == (HPAGE_PUD_SIZE >> PAGE_SHIFT))
+ ret = vmf_insert_pfn_pud_prot(vmf, pfnt, pgprot, write);
+#endif
+ else
+ WARN_ON_ONCE(ret = VM_FAULT_FALLBACK);
+
+ if (ret != VM_FAULT_NOPAGE)
+ goto out_fallback;
+
+ return VM_FAULT_NOPAGE;
+out_fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+}
+#else
+static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
+ struct ttm_buffer_object *bo,
+ pgoff_t page_offset,
+ pgoff_t fault_page_size,
+ pgprot_t pgprot)
+{
+ return VM_FAULT_NOPAGE;
+}
+#endif
+
/**
* ttm_bo_vm_fault_reserved - TTM fault helper
* @vmf: The struct vm_fault given as argument to the fault callback
@@ -170,7 +248,8 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
*/
vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgprot_t prot,
- pgoff_t num_prefault)
+ pgoff_t num_prefault,
+ pgoff_t fault_page_size)
{
struct vm_area_struct *vma = vmf->vma;
struct ttm_buffer_object *bo = vma->vm_private_data;
@@ -262,6 +341,13 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
prot = pgprot_decrypted(prot);
}
+ /* We don't prefault on huge faults. Yet. */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && fault_page_size != 1) {
+ ret = ttm_bo_vm_insert_huge(vmf, bo, page_offset,
+ fault_page_size, prot);
+ goto out_io_unlock;
+ }
+
/*
* Speculatively prefault a number of pages. Only error on
* first page.
@@ -320,7 +406,7 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
return ret;
prot = vma->vm_page_prot;
- ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
+ ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, 1);
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
return ret;
@@ -330,6 +416,50 @@ vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(ttm_bo_vm_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static vm_fault_t ttm_bo_vm_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ pgprot_t prot;
+ struct ttm_buffer_object *bo = vma->vm_private_data;
+ vm_fault_t ret;
+ pgoff_t fault_page_size = 0;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ switch (pe_size) {
+ case PE_SIZE_PMD:
+ fault_page_size = HPAGE_PMD_SIZE >> PAGE_SHIFT;
+ break;
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ case PE_SIZE_PUD:
+ fault_page_size = HPAGE_PUD_SIZE >> PAGE_SHIFT;
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ return VM_FAULT_FALLBACK;
+ }
+
+ /* Fallback on write dirty-tracking or COW */
+ if (write && !(pgprot_val(vmf->vma->vm_page_prot) & _PAGE_RW))
+ return VM_FAULT_FALLBACK;
+
+ ret = ttm_bo_vm_reserve(bo, vmf);
+ if (ret)
+ return ret;
+
+ prot = vm_get_page_prot(vma->vm_flags);
+ ret = ttm_bo_vm_fault_reserved(vmf, prot, 1, fault_page_size);
+ if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
+ return ret;
+
+ dma_resv_unlock(bo->base.resv);
+
+ return ret;
+}
+#endif
+
void ttm_bo_vm_open(struct vm_area_struct *vma)
{
struct ttm_buffer_object *bo = vma->vm_private_data;
@@ -431,7 +561,10 @@ static const struct vm_operations_struct ttm_bo_vm_ops = {
.fault = ttm_bo_vm_fault,
.open = ttm_bo_vm_open,
.close = ttm_bo_vm_close,
- .access = ttm_bo_vm_access
+ .access = ttm_bo_vm_access,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ .huge_fault = ttm_bo_vm_huge_fault,
+#endif
};
static struct ttm_buffer_object *ttm_bo_vm_lookup(struct ttm_bo_device *bdev,
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 66ca49db9633..4fc90d53aa15 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -732,7 +732,8 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgprot_t prot,
- pgoff_t num_prefault);
+ pgoff_t num_prefault,
+ pgoff_t fault_page_size);
vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);