Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

From: Guillaume Morin
Date: Tue Apr 30 2024 - 11:22:39 EST


On 26 Apr 21:55, Guillaume Morin wrote:
>
> On 26 Apr 9:19, David Hildenbrand wrote:
> > A couple of points:
> >
> > a) Don't use page_mapcount(). Either folio_mapcount(), but likely you want
> > to check PageAnonExclusive.
> >
> > b) If you're not following the can_follow_write_pte/_pmd model, you are
> > doing something wrong :)
> >
> > c) The code was heavily changed in mm/mm-unstable. It was merged with t
> > the common code.
> >
> > Likely, in mm/mm-unstable, the existing can_follow_write_pte and
> > can_follow_write_pmd checks will already cover what you want in most cases.
> >
> > We'd need a can_follow_write_pud() to cover follow_huge_pud() and
> > (unfortunately) something to handle follow_hugepd() as well similarly.
> >
> > Copy-pasting what we do in can_follow_write_pte() and adjusting for
> > different PTE types is the right thing to do. Maybe now it's time to factor
> > out the common checks into a separate helper.
>
> I tried to get the hugepd stuff right but this was the first I heard
> about it :-) Afaict follow_huge_pmd and friends were already DTRT

I got it working on top of your uprobes-cow branch with the foll force
patch sent friday. Still pretty lightly tested

I went with using one write uprobe function with some additional
branches. I went back and forth between that and making them 2 different
functions.

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2f4e88552d3f..8a33e380f7ea 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};

+bool hugetlbfs_mapping(struct address_space *mapping) {
+ return mapping->a_ops == &hugetlbfs_aops;
+}
+
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 68244bb3637a..66fdcc0b5db2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
umode_t mode;
};

+bool hugetlbfs_mapping(struct address_space *mapping);
+
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
{
return NULL;
}
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4237a7477cca..e6e93a574c39 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@

#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+ vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;

if (is_register)
flags |= VM_WRITE;
@@ -163,21 +164,21 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}

-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len, unsigned long page_mask)
{
void *kaddr = kmap_atomic(page);
- memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ memcpy(dst, kaddr + (vaddr & ~page_mask), len);
kunmap_atomic(kaddr);
}

-static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len, unsigned long page_mask)
{
void *kaddr = kmap_atomic(page);
- memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ memcpy(kaddr + (vaddr & ~page_mask), src, len);
kunmap_atomic(kaddr);
}

-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, unsigned long page_mask)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -191,7 +192,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE,
+ page_mask);
is_swbp = is_swbp_insn(&old_opcode);

if (is_swbp_insn(new_opcode)) {
@@ -376,8 +378,8 @@ struct uwo_data {
uprobe_opcode_t opcode;
};

-static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
- unsigned long next, struct mm_walk *walk)
+static int __write_opcode(pte_t *ptep, unsigned long vaddr,
+ unsigned long page_mask, struct mm_walk *walk)
{
struct uwo_data *data = walk->private;;
const bool is_register = !!is_swbp_insn(&data->opcode);
@@ -415,9 +417,12 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,

/* Unmap + flush the TLB, such that we can write atomically .*/
flush_cache_page(vma, vaddr, pte_pfn(pte));
- pte = ptep_clear_flush(vma, vaddr, ptep);
+ if (folio_test_hugetlb(folio))
+ pte = huge_ptep_clear_flush(vma, vaddr, ptep);
+ else
+ pte = ptep_clear_flush(vma, vaddr, ptep);
copy_to_page(page, data->opcode_vaddr, &data->opcode,
- UPROBE_SWBP_INSN_SIZE);
+ UPROBE_SWBP_INSN_SIZE, page_mask);

/* When unregistering, we may only zap a PTE if uffd is disabled ... */
if (is_register || userfaultfd_missing(vma))
@@ -443,13 +448,18 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
if (!identical || folio_maybe_dma_pinned(folio))
goto remap;

- /* Zap it and try to reclaim swap space. */
- dec_mm_counter(mm, MM_ANONPAGES);
- folio_remove_rmap_pte(folio, page, vma);
- if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
- folio_trylock(folio)) {
- folio_free_swap(folio);
- folio_unlock(folio);
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_remove_rmap(folio);
+ large = false;
+ } else {
+ /* Zap it and try to reclaim swap space. */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
}
folio_put(folio);

@@ -461,11 +471,29 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
*/
smp_wmb();
/* We modified the page. Make sure to mark the PTE dirty. */
- set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm , vaddr, ptep, huge_pte_mkdirty(pte),
+ (~page_mask) + 1);
+ else
+ set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
return UWO_DONE;
}

+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long hmask,
+ unsigned long vaddr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return __write_opcode(ptep, vaddr, hmask, walk);
+}
+
+static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return __write_opcode(ptep, vaddr, PAGE_MASK, walk);
+}
+
static const struct mm_walk_ops write_opcode_ops = {
+ .hugetlb_entry = __write_opcode_hugetlb,
.pte_entry = __write_opcode_pte,
.walk_lock = PGWALK_WRLOCK,
};
@@ -492,7 +520,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
struct uprobe *uprobe = container_of(auprobe, struct uprobe, arch);
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ unsigned long vaddr = opcode_vaddr & PAGE_MASK;
const bool is_register = !!is_swbp_insn(&opcode);
struct uwo_data data = {
.opcode = opcode,
@@ -503,6 +531,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
struct mmu_notifier_range range;
int ret, ref_ctr_updated = 0;
struct page *page;
+ unsigned long page_size = PAGE_SIZE;

if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
return -EINVAL;
@@ -521,7 +550,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
if (ret != 1)
goto out;

- ret = verify_opcode(page, opcode_vaddr, &opcode);
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+ page_size = huge_page_size(h);
+ vaddr &= huge_page_mask(h);
+ page = compound_head(page);
+ }
+ ret = verify_opcode(page, opcode_vaddr, &opcode, ~(page_size - 1));
put_page(page);
if (ret <= 0)
goto out;
@@ -541,12 +577,12 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* be able to do it under PTL.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- vaddr, vaddr + PAGE_SIZE);
+ vaddr, vaddr + page_size);
mmu_notifier_invalidate_range_start(&range);
}

/* Walk the page tables and perform the actual magic. */
- ret = walk_page_range_vma(vma, vaddr, vaddr + PAGE_SIZE,
+ ret = walk_page_range_vma(vma, vaddr, vaddr + page_size,
&write_opcode_ops, &data);

if (!is_register)
@@ -816,6 +852,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
+ unsigned long mask = PAGE_MASK;
/*
* Ensure that the page that has the original instruction is populated
* and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
@@ -823,12 +860,17 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
*/
if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
- else
+ else if (!is_file_hugepages(filp))
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+ else {
+ struct hstate *h = hstate_file(filp);
+ mask = huge_page_mask(h);
+ page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+ }
if (IS_ERR(page))
return PTR_ERR(page);

- copy_from_page(page, offset, insn, nbytes);
+ copy_from_page(page, offset, insn, nbytes, mask);
put_page(page);

return 0;
@@ -1175,9 +1217,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (!uc->handler && !uc->ret_handler)
return -EINVAL;

- /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ /* copy_insn() uses read_mapping_page() or shmem/hugetlbfs specific
+ * logic
+ */
if (!inode->i_mapping->a_ops->read_folio &&
- !shmem_mapping(inode->i_mapping))
+ !shmem_mapping(inode->i_mapping) &&
+ !hugetlbfs_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
@@ -1698,7 +1743,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *src, unsigned long len)
{
/* Initialize the slot */
- copy_to_page(page, vaddr, src, len);
+ copy_to_page(page, vaddr, src, len, PAGE_MASK);

/*
* We probably need flush_icache_user_page() but it needs vma.
@@ -2062,7 +2107,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;

- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, PAGE_MASK);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */

--
Guillaume Morin <guillaume@xxxxxxxxxxx>