[RFC PATCH v3 6/8] hugetlb: add vma based lock for pmd sharing synchronization

From: Mike Kravetz
Date: Sun May 08 2022 - 15:06:07 EST


Allocate a rw semaphore and hang it off vm_private_data for
synchronization use by vmas that could be involved in pmd sharing. Only
add infrastructure for the new lock here. Actual use will be added in a
subsequent patch.

Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
include/linux/hugetlb.h | 34 +++++++++-
kernel/fork.c | 6 +-
mm/hugetlb.c | 138 ++++++++++++++++++++++++++++++++++++----
mm/rmap.c | 8 ++-
4 files changed, 169 insertions(+), 17 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e8e837c15eb9..a02bce4a1575 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -123,7 +123,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);

-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -210,6 +210,13 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
pgd_t *pgd, int flags);

+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -332,6 +339,31 @@ static inline int prepare_hugepage_range(struct file *file,
return -EINVAL;
}

+/*
+ * Stub for builds without the real implementation in mm/hugetlb.c
+ * (presumably !CONFIG_HUGETLB_PAGE): there is no vma lock to take.
+ * Must be inline: a plain static function in a header is emitted, and
+ * warned about as unused, in every file that includes it.
+ */
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+/*
+ * Stub for builds without the real implementation in mm/hugetlb.c
+ * (presumably !CONFIG_HUGETLB_PAGE): there is no vma lock to release.
+ * Inline so that including the header does not emit an unused function.
+ */
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+/*
+ * Stub for builds without the real implementation in mm/hugetlb.c
+ * (presumably !CONFIG_HUGETLB_PAGE): there is no vma lock to take.
+ * Inline so that including the header does not emit an unused function.
+ */
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+/*
+ * Stub for builds without the real implementation in mm/hugetlb.c
+ * (presumably !CONFIG_HUGETLB_PAGE): there is no vma lock to release.
+ * Inline so that including the header does not emit an unused function.
+ */
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+/*
+ * Stub for builds without the real implementation in mm/hugetlb.c
+ * (presumably !CONFIG_HUGETLB_PAGE).  With no lock to contend on the
+ * trylock always reports success (1).
+ * Inline so that including the header does not emit an unused function.
+ */
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	return 1;
+}
+
+/*
+ * Stub for builds without the real implementation in mm/hugetlb.c
+ * (presumably !CONFIG_HUGETLB_PAGE): there is no vma lock to assert on.
+ * Inline so that including the header does not emit an unused function.
+ */
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
static inline int pmd_huge(pmd_t pmd)
{
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 254ab63c1106..359ed6a3e3e8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -677,12 +677,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}

/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);

/*
* Link in the new vma and copy the page table entries.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d278c95e9135..e8861603b069 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,6 +95,7 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static bool vma_pmd_shareable(struct vm_area_struct *vma);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -909,6 +910,89 @@ resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
#endif
}

+/*
+ * Does this vma have a pmd sharing semaphore to operate on?
+ *
+ * True only when the vma carries VM_MAYSHARE or VM_SHARED and
+ * vm_private_data is non-NULL.
+ */
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+	if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+		return false;
+
+	return vma->vm_private_data != NULL;
+}
+
+/*
+ * Take the vma's pmd sharing semaphore for read.
+ *
+ * No-op for vmas without a semaphore (see __vma_shareable_flags_pmd()).
+ * vm_private_data IS the pointer to the kmalloc'ed rw_semaphore, so it
+ * is used by value; casting its address would make the rwsem code
+ * operate on the pointer field inside the vma itself.
+ */
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma)) {
+		struct rw_semaphore *vma_sema = vma->vm_private_data;
+
+		down_read(vma_sema);
+	}
+}
+
+/*
+ * Release the vma's pmd sharing semaphore taken for read.
+ *
+ * No-op for vmas without a semaphore (see __vma_shareable_flags_pmd()).
+ * vm_private_data holds the semaphore pointer and is used by value, not
+ * by address.
+ */
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma)) {
+		struct rw_semaphore *vma_sema = vma->vm_private_data;
+
+		up_read(vma_sema);
+	}
+}
+
+/*
+ * Take the vma's pmd sharing semaphore for write.
+ *
+ * No-op for vmas without a semaphore (see __vma_shareable_flags_pmd()).
+ * vm_private_data holds the semaphore pointer and is used by value, not
+ * by address.
+ */
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma)) {
+		struct rw_semaphore *vma_sema = vma->vm_private_data;
+
+		down_write(vma_sema);
+	}
+}
+
+/*
+ * Release the vma's pmd sharing semaphore taken for write.
+ *
+ * No-op for vmas without a semaphore (see __vma_shareable_flags_pmd()).
+ * vm_private_data holds the semaphore pointer and is used by value, not
+ * by address.
+ */
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma)) {
+		struct rw_semaphore *vma_sema = vma->vm_private_data;
+
+		up_write(vma_sema);
+	}
+}
+
+/*
+ * Try to take the vma's pmd sharing semaphore for write without
+ * sleeping.
+ *
+ * Returns 1 when the vma has no semaphore (nothing to contend on),
+ * otherwise the result of down_write_trylock() on the semaphore that
+ * vm_private_data points to.  The pointer is used by value; its address
+ * is a field of the vma, not a rw_semaphore.
+ */
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	struct rw_semaphore *vma_sema;
+
+	if (!__vma_shareable_flags_pmd(vma))
+		return 1;
+
+	vma_sema = vma->vm_private_data;
+	return down_write_trylock(vma_sema);
+}
+
+/*
+ * Lockdep assertion that the vma's pmd sharing semaphore is held.
+ *
+ * No-op for vmas without a semaphore (see __vma_shareable_flags_pmd()).
+ * The assertion is made on the semaphore vm_private_data points to, not
+ * on the address of the vm_private_data field.
+ */
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+	if (__vma_shareable_flags_pmd(vma)) {
+		struct rw_semaphore *vma_sema = vma->vm_private_data;
+
+		lockdep_assert_held(vma_sema);
+	}
+}
+
+/*
+ * Free the semaphore hanging off vm_private_data, if any, and clear the
+ * pointer.  Does nothing for a NULL vma or a vma without VM_MAYSHARE or
+ * VM_SHARED.
+ */
+static void hugetlb_free_vma_lock(struct vm_area_struct *vma)
+{
+	if (!vma)
+		return;
+
+	/* Only present in shareable vmas */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) {
+		/* kfree(NULL) is a no-op, so no separate NULL test is needed */
+		kfree(vma->vm_private_data);
+		vma->vm_private_data = NULL;
+	}
+}
+
+/*
+ * Set up the pmd sharing semaphore for a vma.
+ *
+ * Does nothing for a NULL vma or a vma without VM_MAYSHARE or VM_SHARED.
+ * Otherwise vm_private_data is always rewritten: it points to a freshly
+ * initialised rw_semaphore when vma_pmd_shareable() says the vma may
+ * share pmds and the allocation succeeds, and is NULL in every other
+ * case.  A NULL result means the vma can not participate in pmd
+ * sharing.  Any previous vm_private_data value is overwritten without
+ * being freed.
+ */
+static void hugetlb_alloc_vma_lock(struct vm_area_struct *vma)
+{
+	struct rw_semaphore *sema = NULL;
+
+	if (!vma)
+		return;
+
+	/* Only establish in (flags) shareable vmas */
+	if (!(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+		return;
+
+	if (vma_pmd_shareable(vma)) {
+		sema = kmalloc(sizeof(*sema), GFP_KERNEL);
+		/* On allocation failure sema stays NULL: no pmd sharing */
+		if (sema)
+			init_rwsem(sema);
+	}
+
+	vma->vm_private_data = sema;
+}
+
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -1012,12 +1096,22 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}

-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
+ /*
+ * Clear hugetlb-related page reserves for children. This only
+ * affects MAP_PRIVATE mappings: vm_private_data is reset below
+ * when VM_MAYSHARE is not set. Faults generated by the child
+ * are not guaranteed to succeed, even if read-only.
+ */
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
+
+ /*
+ * Allocate semaphore if pmd sharing is possible. Private mappings
+ * are ignored by hugetlb_alloc_vma_lock().
+ *
+ * NOTE(review): hugetlb_alloc_vma_lock() overwrites vm_private_data
+ * without freeing a previous semaphore; confirm no caller (e.g.
+ * clear_vma_resv_huge_pages()) gets here with one still attached.
+ */
+ hugetlb_alloc_vma_lock(vma);
}

/*
@@ -1048,7 +1142,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
kref_put(&reservations->refs, resv_map_release);
}

- reset_vma_resv_huge_pages(vma);
+ hugetlb_dup_vma_private(vma);
}

/* Returns true if the VMA has associated reserve pages */
@@ -4581,16 +4675,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
+
+ hugetlb_alloc_vma_lock(vma);
}

static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;

+ hugetlb_free_vma_lock(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;

@@ -6399,6 +6498,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
return false;
}

+ /*
+ * vma specific semaphore used for pmd sharing synchronization
+ */
+ hugetlb_alloc_vma_lock(vma);
+
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
@@ -6422,12 +6526,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);

chg = region_chg(resv_map, from, to, &regions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return false;
+ goto out_err;

chg = to - from;

@@ -6522,6 +6625,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_free_vma_lock(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -6603,14 +6707,30 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
+#ifdef CONFIG_USERFAULTFD
+ if (uffd_disable_huge_pmd_share(vma))
+ return false;
+#endif
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
+ if (vma->vm_flags & VM_MAYSHARE && vma->vm_private_data &&
+ range_in_vma(vma, start, end))
return true;
return false;
}

+static bool vma_pmd_shareable(struct vm_area_struct *vma)
+{
+ unsigned long start = ALIGN(vma->vm_start, PUD_SIZE),
+ end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+ if (start >= end)
+ return false;
+
+ return __vma_aligned_range_pmd_shareable(vma, start, end);
+}
+
static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -6622,10 +6742,6 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
-#ifdef CONFIG_USERFAULTFD
- if (uffd_disable_huge_pmd_share(vma))
- return false;
-#endif
+ /*
+ * The userfaultfd check is now made in
+ * __vma_aligned_range_pmd_shareable(), presumably reached through
+ * vma_addr_pmd_shareable().
+ */
return vma_addr_pmd_shareable(vma, addr);
}

diff --git a/mm/rmap.c b/mm/rmap.c
index e0ede92830a9..d0d87b9cf58b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
* page->flags PG_locked (lock_page)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
@@ -44,6 +44,12 @@
* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
+ *
+ * hugetlbfs PageHuge() pages take locks in this order:
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * vma_lock (hugetlb specific lock for pmd_sharing)
+ * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ * page->flags PG_locked (lock_page)
*/

#include <linux/mm.h>
--
2.35.3