[PATCH 6/8] hugetlb: add vma based lock for pmd sharing
From: Mike Kravetz
Date: Wed Aug 24 2022 - 13:59:27 EST
Allocate a rw semaphore and hang off vm_private_data for
synchronization use by vmas that could be involved in pmd sharing. Only
add infrastructure for the new lock here. Actual use will be added in
subsequent patch.
Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
include/linux/hugetlb.h | 36 ++++++++-
kernel/fork.c | 6 +-
mm/hugetlb.c | 170 ++++++++++++++++++++++++++++++++++++----
mm/rmap.c | 8 +-
4 files changed, 197 insertions(+), 23 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index acace1a25226..852f911d676e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -126,7 +126,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -214,6 +214,13 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
pgd_t *pgd, int flags);
+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -225,7 +232,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
-static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}
@@ -336,6 +343,31 @@ static inline int prepare_hugepage_range(struct file *file,
return -EINVAL;
}
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
static inline int pmd_huge(pmd_t pmd)
{
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9470220e8f43..421c143286d2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -675,12 +675,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
+ hugetlb_dup_vma_private(tmp);
/*
* Link in the new vma and copy the page table entries.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 758b6844d566..6fb0bff2c7ee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -1008,12 +1010,25 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ /*
+ * Clear vm_private_data
+ * - For MAP_PRIVATE mappings, this is the reserve map which does
+ * not apply to children. Faults generated by the children are
+ * not guaranteed to succeed, even if read-only.
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated below.
+ */
+ vma->vm_private_data = (void *)0;
if (!(vma->vm_flags & VM_MAYSHARE))
- vma->vm_private_data = (void *)0;
+ return;
+
+ /*
+ * Allocate semaphore if pmd sharing is possible.
+ */
+ hugetlb_vma_lock_alloc(vma);
}
/*
@@ -1044,7 +1059,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
kref_put(&reservations->refs, resv_map_release);
}
- reset_vma_resv_huge_pages(vma);
+ hugetlb_dup_vma_private(vma);
}
/* Returns true if the VMA has associated reserve pages */
@@ -4623,16 +4638,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
+
+ hugetlb_vma_lock_alloc(vma);
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ hugetlb_vma_lock_free(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -6447,6 +6467,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
return false;
}
+ /*
+ * vma specific semaphore used for pmd sharing synchronization
+ */
+ hugetlb_vma_lock_alloc(vma);
+
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
@@ -6470,12 +6495,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, ®ions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return false;
+ goto out_err;
chg = to - from;
@@ -6570,6 +6594,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_vma_lock_free(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -6649,14 +6674,34 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
}
static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ bool check_vma_lock)
{
+#ifdef CONFIG_USERFAULTFD
+ if (uffd_disable_huge_pmd_share(vma))
+ return false;
+#endif
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end))
- return true;
- return false;
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return false;
+ if (check_vma_lock && !vma->vm_private_data)
+ return false;
+ if (!range_in_vma(vma, start, end))
+ return false;
+ return true;
+}
+
+static bool vma_pmd_shareable(struct vm_area_struct *vma)
+{
+ unsigned long start = ALIGN(vma->vm_start, PUD_SIZE),
+ end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+ if (start >= end)
+ return false;
+
+ return __vma_aligned_range_pmd_shareable(vma, start, end, false);
}
static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
@@ -6665,15 +6710,11 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma,
unsigned long start = addr & PUD_MASK;
unsigned long end = start + PUD_SIZE;
- return __vma_aligned_range_pmd_shareable(vma, start, end);
+ return __vma_aligned_range_pmd_shareable(vma, start, end, true);
}
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
-#ifdef CONFIG_USERFAULTFD
- if (uffd_disable_huge_pmd_share(vma))
- return false;
-#endif
return vma_addr_pmd_shareable(vma, addr);
}
@@ -6704,6 +6745,95 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
*end = ALIGN(*end, PUD_SIZE);
}
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+ vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ down_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ up_read((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ down_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ up_write((struct rw_semaphore *)vma->vm_private_data);
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ if (!__vma_shareable_flags_pmd(vma))
+ return 1;
+
+ return down_write_trylock((struct rw_semaphore *)vma->vm_private_data);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma))
+ lockdep_assert_held((struct rw_semaphore *)
+ vma->vm_private_data);
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+ /*
+ * Only present in sharable vmas. See comment in
+ * __unmap_hugepage_range_final about the neeed to check both
+ * VM_SHARED and VM_MAYSHARE in free path
+ */
+ if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED)))
+ return;
+
+ if (vma->vm_private_data) {
+ kfree(vma->vm_private_data);
+ vma->vm_private_data = NULL;
+ }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ struct rw_semaphore *vma_sema;
+
+ /* Only establish in (flags) sharable vmas */
+ if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ /* Should never get here with non-NULL vm_private_data */
+ if (vma->vm_private_data)
+ return;
+
+ /* Check size/alignment for pmd sharing possible */
+ if (!vma_pmd_shareable(vma))
+ return;
+
+ vma_sema = kmalloc(sizeof(*vma_sema), GFP_KERNEL);
+ if (!vma_sema)
+ /*
+ * If we can not allocate semaphore, then vma can not
+ * participate in pmd sharing.
+ */
+ return;
+
+ init_rwsem(vma_sema);
+ vma->vm_private_data = vma_sema;
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
@@ -6790,6 +6920,14 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index ad9c97c6445c..55209e029847 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
* page->flags PG_locked (lock_page)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
@@ -44,6 +44,12 @@
* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
+ *
+ * hugetlbfs PageHuge() take locks in this order:
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * vma_lock (hugetlb specific lock for pmd_sharing)
+ * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
--
2.37.1