Re: [PATCH 01/13] mm: Update ptep_get_lockless()'s comment

From: Linus Torvalds
Date: Sat Oct 29 2022 - 14:37:44 EST


On Sat, Oct 29, 2022 at 11:05 AM Nadav Amit <nadav.amit@xxxxxxxxx> wrote:
>
> Anyhow, I am not sure whether the solution that you propose would work.
> Please let me know if my understanding makes sense.

Let me include my (UNTESTED! PROBABLY GARBAGE!) patch here just as a
"I meant something like this".

Note that it's untested, but I intentionally made it use different
names and an opaque type in the 'mmu_gather' data structure, so that
at least these bit games end up being type-safe and are more likely
to be correct if it compiles.
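
(If the trick isn't obvious: because the struct is never actually
defined, the compiler keeps the encoded and plain pointers strictly
apart. Here's a stand-alone userspace toy version of the same game -
names all made up, obviously not the kernel code:

#include <stdbool.h>
#include <stdio.h>

struct opaque;	/* intentionally never defined - a pure type marker */

static inline struct opaque *encode(void *p, bool flag)
{
	/* assumes 'p' is at least 2-byte aligned, so bit 0 is free */
	return (struct opaque *)((unsigned long)p | flag);
}

static inline bool decode_flag(const struct opaque *e)
{
	return (unsigned long)e & 1;
}

static inline void *decode_ptr(const struct opaque *e)
{
	return (void *)((unsigned long)e & ~1ul);
}

int main(void)
{
	int x = 42;
	struct opaque *e = encode(&x, true);

	/* "int *bad = e;" would be an incompatible-pointer-type
	 * warning/error: encoded and plain pointers can no longer
	 * be mixed up silently. */
	printf("value=%d flag=%d\n", *(int *)decode_ptr(e), decode_flag(e));
	return 0;
}

so if somebody passes a plain "struct page *" where an encoded one is
expected - or vice versa - the build complains instead of the kernel
corrupting a pointer at runtime.)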

And I will say that it does compile for me - but only in my normal
x86-64 config. I haven't tested anything else, and I guarantee it
won't build on s390, for example, because s390 has its own mmu_gather
functions.

> Let’s assume that we do not call set_page_dirty() before we remove the rmap
> but only after we invalidate the page [*]. Let’s assume that
> shrink_page_list() is called after the page’s rmap is removed and the page
> is no longer mapped, but before set_page_dirty() was actually called.
>
> In such a case, shrink_page_list() would consider the page clean, and would
> indeed keep the page (since __remove_mapping() would find elevated page
> refcount), which appears to give us a chance to mark the page as dirty
> later.

Right. That is no different from any other function (like "write()")
having looked up the page.

> However, IIUC, in this case shrink_page_list() might still call
> filemap_release_folio() and release the buffers, so calling set_page_dirty()
> afterwards - after the actual TLB invalidation took place - would fail.

I'm not seeing why.

That would imply that any "look up page, do set_page_dirty()"
sequence is broken. Those users don't have the rmap either, and we
have a number of them all over (eg think "GUP users" etc).
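
Think of the usual GUP-ish pattern (hand-wavy sketch from memory, not
from any particular driver - newer code would use pin_user_pages(),
but the shape is the same):

#include <linux/mm.h>

/*
 * Look the page up, write to it, and only mark it dirty afterwards -
 * no rmap anywhere in sight.
 */
static int touch_user_page(unsigned long uaddr)
{
	struct page *page;
	int ret;

	ret = get_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... DMA into the page, or scribble on it via kmap_local_page() ... */

	set_page_dirty_lock(page);	/* dirtied "late", like zap would */
	put_page(page);
	return 0;
}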

But hey, you were right about the stale TLB case, so I may just be
missing something.

I *think* the important thing is just that we need to mark the page
dirty from the page tables _after_ we've flushed the TLB, just to
make sure that there can be no subsequent dirtiers that then get lost.
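
Schematically, per page (hand-wavy - the patch obviously batches this
through the mmu_gather and flushes once for the whole range, but the
ordering invariant is the same):

#include <linux/mm.h>
#include <asm/tlbflush.h>

static void zap_one_page(struct mm_struct *mm, struct vm_area_struct *vma,
			 unsigned long addr, pte_t *pte, struct page *page)
{
	pte_t ptent = ptep_get_and_clear_full(mm, addr, pte, 0);	/* 1: unmap */
	bool dirty = pte_dirty(ptent);					/* save the HW dirty bit */

	flush_tlb_page(vma, addr);					/* 2: no stale TLB entry left */

	if (dirty)
		set_page_dirty(page);					/* 3: safe - no concurrent dirtier */
}

That's exactly what the encoded_page games in the patch below are
for: remembering "dirty" across step 2 for a whole batch of pages.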

Anyway, I think the best documentation for "this is what I meant" is
simply the patch. Does this affect your PoC on your setup?

I tried to run your program on my machine (WITHOUT this patch - I have
compiled this patch, but I haven't even booted it - it really could be
horribly broken).

But it doesn't fail for me even without the patch - I just get
"Finished without an error" over and over again. But you said it had
to be run on a RAM block device, which I didn't do, so my testing is
very suspect.

Again: THIS PATCH IS UNTESTED. I feel like it might actually work, if
only because I tried to be so careful with the type system.

But that "might actually work" is probably me being wildly optimistic,
and also depends on the whole concept being ok in the first place.

So realistically, think of this patch more as a "document in code what
Linus meant with his incoherent ramblings" rather than anything else.

Hmm?

Linus
 include/asm-generic/tlb.h | 28 +++++++++++++++++++++++-----
 mm/memory.c               | 17 ++++++++---------
 mm/mmu_gather.c           | 36 ++++++++++++++++++++++++++++++++----
 3 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236e..a95085f6dd47 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -238,11 +238,29 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
  */
 #define MMU_GATHER_BUNDLE	8
 
+/* Fake type for an encoded page pointer with the dirty bit in the low bit */
+struct encoded_page;
+
+static inline struct encoded_page *encode_page(struct page *page, bool dirty)
+{
+	return (struct encoded_page *)(dirty | (unsigned long)page);
+}
+
+static inline bool encoded_page_dirty(struct encoded_page *page)
+{
+	return 1 & (unsigned long)page;
+}
+
+static inline struct page *encoded_page_ptr(struct encoded_page *page)
+{
+	return (struct page *)(~1ul & (unsigned long)page);
+}
+
 struct mmu_gather_batch {
 	struct mmu_gather_batch	*next;
 	unsigned int		nr;
 	unsigned int		max;
-	struct page		*pages[];
+	struct encoded_page	*encoded_pages[];
 };
 
 #define MAX_GATHER_BATCH	\
@@ -257,7 +275,7 @@ struct mmu_gather_batch {
 #define MAX_GATHER_BATCH_COUNT	(10000UL/MAX_GATHER_BATCH)
 
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
-				   int page_size);
+				   int page_size, bool dirty);
 #endif
 
 /*
@@ -431,13 +449,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
-	if (__tlb_remove_page_size(tlb, page, page_size))
+	if (__tlb_remove_page_size(tlb, page, page_size, false))
 		tlb_flush_mmu(tlb);
 }
 
-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, bool dirty)
 {
-	return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
+	return __tlb_remove_page_size(tlb, page, PAGE_SIZE, dirty);
 }
 
 /* tlb_remove_page
diff --git a/mm/memory.c b/mm/memory.c
index f88c351aecd4..8ab4c0d7e99e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1423,7 +1423,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
-		struct page *page;
 
 		if (pte_none(ptent))
 			continue;
@@ -1432,7 +1431,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			break;
 
 		if (pte_present(ptent)) {
-			page = vm_normal_page(vma, addr, ptent);
+			struct page *page = vm_normal_page(vma, addr, ptent);
+			int dirty;
+
 			if (unlikely(!should_zap_page(details, page)))
 				continue;
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -1443,11 +1444,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(!page))
 				continue;
 
+			dirty = 0;
 			if (!PageAnon(page)) {
-				if (pte_dirty(ptent)) {
-					force_flush = 1;
-					set_page_dirty(page);
-				}
+				dirty = pte_dirty(ptent);
 				if (pte_young(ptent) &&
 				    likely(!(vma->vm_flags & VM_SEQ_READ)))
 					mark_page_accessed(page);
@@ -1456,7 +1455,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			page_remove_rmap(page, vma, false);
 			if (unlikely(page_mapcount(page) < 0))
 				print_bad_pte(vma, addr, ptent, page);
-			if (unlikely(__tlb_remove_page(tlb, page))) {
+			if (unlikely(__tlb_remove_page(tlb, page, dirty))) {
 				force_flush = 1;
 				addr += PAGE_SIZE;
 				break;
@@ -1467,7 +1466,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		entry = pte_to_swp_entry(ptent);
 		if (is_device_private_entry(entry) ||
 		    is_device_exclusive_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
+			struct page *page = pfn_swap_entry_to_page(entry);
 			if (unlikely(!should_zap_page(details, page)))
 				continue;
 			/*
@@ -1489,7 +1488,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(!free_swap_and_cache(entry)))
 				print_bad_pte(vma, addr, ptent, NULL);
 		} else if (is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
+			struct page *page = pfn_swap_entry_to_page(entry);
 			if (!should_zap_page(details, page))
 				continue;
 			rss[mm_counter(page)]--;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index add4244e5790..fa79e054413a 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -43,12 +43,40 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
 	return true;
 }
 
+/*
+ * We get an 'encoded page' array, which has page pointers with
+ * the dirty bit in the low bit of the array.
+ *
+ * The TLB has been flushed, now we need to move the dirty bit into
+ * the 'struct page', clean the array in-place, and then free the
+ * pages and their swap cache.
+ */
+static void clean_and_free_pages_and_swap_cache(struct encoded_page **pages, unsigned int nr)
+{
+	for (unsigned int i = 0; i < nr; i++) {
+		struct encoded_page *encoded = pages[i];
+		if (encoded_page_dirty(encoded)) {
+			struct page *page = encoded_page_ptr(encoded);
+			/* Clean the dirty pointer in-place */
+			pages[i] = encode_page(page, 0);
+			set_page_dirty(page);
+		}
+	}
+
+	/*
+	 * Now all entries have been un-encoded, and changed to plain
+	 * page pointers, so we can cast the 'encoded_page' array to
+	 * a plain page array and free them
+	 */
+	free_pages_and_swap_cache((struct page **)pages, nr);
+}
+
 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 {
 	struct mmu_gather_batch *batch;
 
 	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-		struct page **pages = batch->pages;
+		struct encoded_page **pages = batch->encoded_pages;
 
 		do {
 			/*
@@ -56,7 +84,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 			 */
 			unsigned int nr = min(512U, batch->nr);
 
-			free_pages_and_swap_cache(pages, nr);
+			clean_and_free_pages_and_swap_cache(pages, nr);
 			pages += nr;
 			batch->nr -= nr;
 
@@ -77,7 +105,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
 	tlb->local.next = NULL;
 }
 
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size, bool dirty)
 {
 	struct mmu_gather_batch *batch;
 
@@ -92,7 +120,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	 * Add the page and check if we are full. If so
 	 * force a flush.
 	 */
-	batch->pages[batch->nr++] = page;
+	batch->encoded_pages[batch->nr++] = encode_page(page, dirty);
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return true;