Re: scheduling while atomic in z3fold

From: Mike Galbraith
Date: Wed Dec 09 2020 - 01:15:46 EST


On Wed, 2020-12-09 at 00:26 +0100, Vitaly Wool wrote:
> Hi Mike,
>
> On 2020-12-07 16:41, Mike Galbraith wrote:
> > On Mon, 2020-12-07 at 16:21 +0100, Vitaly Wool wrote:
> >> On Mon, Dec 7, 2020 at 1:34 PM Mike Galbraith <efault@xxxxxx> wrote:
> >>>
> >>
> >>> Unfortunately, that made zero difference.
> >>
> >> Okay, I suggest that you submit the patch that changes read_lock() to
> >> write_lock() in __release_z3fold_page() and I'll ack it then.
> >> I would like to rewrite the code so that write_lock is not necessary
> >> there but I don't want to hold you back and it isn't likely that I'll
> >> complete this today.
> >
> > Nah, I'm in no rush... especially not to sign off on "Because the
> > little voices in my head said this bit should look like that bit over
> > yonder, and testing _seems_ to indicate they're right about that" :)
> >
> > -Mike
> >
>
> okay, thanks. Would this make things better:

Yup, z3fold became RT tolerant with this (un-munged and) applied.

BTW, I don't think my write_lock() hacklet actually plugged the hole
that leads to the below. I think it just reduced the odds of the two
meeting enough to make it look ~solid in fairly limited test time.

[ 3369.373023] kworker/-7413 4.....12 3368809247us : do_compact_page: zhdr: ffff95d93abd8000 zhdr->slots: ffff95d951f5df80 zhdr->slots->slot[0]: 0
[ 3369.373025] kworker/-7413 4.....12 3368809248us : do_compact_page: old_handle ffff95d951f5df98 *old_handle was ffff95d93abd804f now is ffff95da3ab8104c
[ 3369.373027] kworker/-7413 4.....11 3368809249us : __release_z3fold_page.constprop.25: freed ffff95d951f5df80
[ 3369.373028] ---------------------------------
[ 3369.373029] CR2: 0000000000000018
crash> p debug_handle | grep '\[2'
[2]: ffff95dc1ecac0d0
crash> rd ffff95dc1ecac0d0
ffff95dc1ecac0d0: ffff95d951f5df98 ...Q....
crash> p debug_zhdr | grep '\[2'
[2]: ffff95dc1ecac0c8
crash> rd ffff95dc1ecac0c8
ffff95dc1ecac0c8: ffff95da3ab81000 ...:.... <== kworker is not working on same zhdr as free_handle()..
crash> p debug_slots | grep '\[2'
[2]: ffff95dc1ecac0c0
crash> rd ffff95dc1ecac0c0
ffff95dc1ecac0c0: ffff95d951f5df80 ...Q.... <== ..but is the same slots, and frees it under free_handle().
crash> bt -sx leading to use after free+corruption+explosion 1us later.
PID: 9334 TASK: ffff95d95a1eb3c0 CPU: 2 COMMAND: "swapoff"
#0 [ffffb4248847f8f0] machine_kexec+0x16e at ffffffffa605f8ce
#1 [ffffb4248847f938] __crash_kexec+0xd2 at ffffffffa614c162
#2 [ffffb4248847f9f8] crash_kexec+0x30 at ffffffffa614d350
#3 [ffffb4248847fa08] oops_end+0xca at ffffffffa602680a
#4 [ffffb4248847fa28] no_context+0x14d at ffffffffa606d7cd
#5 [ffffb4248847fa88] exc_page_fault+0x2b8 at ffffffffa68bdb88
#6 [ffffb4248847fae0] asm_exc_page_fault+0x1e at ffffffffa6a00ace
#7 [ffffb4248847fb68] mark_wakeup_next_waiter+0x51 at ffffffffa60ea121
#8 [ffffb4248847fbd0] rt_mutex_futex_unlock+0x4f at ffffffffa68c93ef
#9 [ffffb4248847fc10] z3fold_zpool_free+0x593 at ffffffffc0ecb663 [z3fold]
#10 [ffffb4248847fc78] zswap_free_entry+0x43 at ffffffffa627c823
#11 [ffffb4248847fc88] zswap_frontswap_invalidate_page+0x8a at ffffffffa627c92a
#12 [ffffb4248847fcb0] __frontswap_invalidate_page+0x48 at ffffffffa627c018
#13 [ffffb4248847fcd8] swapcache_free_entries+0x1ee at ffffffffa6276f5e
#14 [ffffb4248847fd20] free_swap_slot+0x9f at ffffffffa627b8ff
#15 [ffffb4248847fd40] delete_from_swap_cache+0x61 at ffffffffa6274621
#16 [ffffb4248847fd60] try_to_free_swap+0x70 at ffffffffa6277520
#17 [ffffb4248847fd70] unuse_vma+0x55c at ffffffffa627869c
#18 [ffffb4248847fe90] try_to_unuse+0x139 at ffffffffa6278e89
#19 [ffffb4248847fee8] __x64_sys_swapoff+0x1eb at ffffffffa62798cb
#20 [ffffb4248847ff40] do_syscall_64+0x33 at ffffffffa68b9ab3
#21 [ffffb4248847ff50] entry_SYSCALL_64_after_hwframe+0x44 at ffffffffa6a0007c
RIP: 00007fbd835a5d17 RSP: 00007ffd60634458 RFLAGS: 00000202
RAX: ffffffffffffffda RBX: 0000559540e34b60 RCX: 00007fbd835a5d17
RDX: 0000000000000001 RSI: 0000000000000001 RDI: 0000559540e34b60
RBP: 0000000000000001 R8: 00007ffd606344c0 R9: 0000000000000003
R10: 0000559540e34721 R11: 0000000000000202 R12: 0000000000000001
R13: 0000000000000000 R14: 00007ffd606344c0 R15: 0000000000000000
ORIG_RAX: 00000000000000a8 CS: 0033 SS: 002b
crash>

>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 18feaa0bc537..340c38a5ffac 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct
> z3fold_header *zhdr)
> z3fold_page_unlock(zhdr);
> }
>
> -static inline void free_handle(unsigned long handle)
> +static inline void free_handle(unsigned long handle, struct
> z3fold_header *zhdr)
> {
> struct z3fold_buddy_slots *slots;
> - struct z3fold_header *zhdr;
> int i;
> bool is_free;
>
> @@ -316,22 +315,13 @@ static inline void free_handle(unsigned long handle)
> if (WARN_ON(*(unsigned long *)handle == 0))
> return;
>
> - zhdr = handle_to_z3fold_header(handle);
> slots = handle_to_slots(handle);
> write_lock(&slots->lock);
> *(unsigned long *)handle = 0;
> - if (zhdr->slots == slots) {
> - write_unlock(&slots->lock);
> - return; /* simple case, nothing else to do */
> - }
> + if (zhdr->slots != slots)
> + zhdr->foreign_handles--;
>
> - /* we are freeing a foreign handle if we are here */
> - zhdr->foreign_handles--;
> is_free = true;
> - if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
> - write_unlock(&slots->lock);
> - return;
> - }
> for (i = 0; i <= BUDDY_MASK; i++) {
> if (slots->slot[i]) {
> is_free = false;
> @@ -343,6 +333,8 @@ static inline void free_handle(unsigned long handle)
> if (is_free) {
> struct z3fold_pool *pool = slots_to_pool(slots);
>
> + if (zhdr->slots == slots)
> + zhdr->slots = NULL;
> kmem_cache_free(pool->c_handle, slots);
> }
> }
> @@ -525,8 +517,6 @@ static void __release_z3fold_page(struct
> z3fold_header *zhdr, bool locked)
> {
> struct page *page = virt_to_page(zhdr);
> struct z3fold_pool *pool = zhdr_to_pool(zhdr);
> - bool is_free = true;
> - int i;
>
> WARN_ON(!list_empty(&zhdr->buddy));
> set_bit(PAGE_STALE, &page->private);
> @@ -536,21 +526,6 @@ static void __release_z3fold_page(struct
> z3fold_header *zhdr, bool locked)
> list_del_init(&page->lru);
> spin_unlock(&pool->lock);
>
> - /* If there are no foreign handles, free the handles array */
> - read_lock(&zhdr->slots->lock);
> - for (i = 0; i <= BUDDY_MASK; i++) {
> - if (zhdr->slots->slot[i]) {
> - is_free = false;
> - break;
> - }
> - }
> - if (!is_free)
> - set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
> - read_unlock(&zhdr->slots->lock);
> -
> - if (is_free)
> - kmem_cache_free(pool->c_handle, zhdr->slots);
> -
> if (locked)
> z3fold_page_unlock(zhdr);
>
> @@ -973,6 +948,9 @@ static inline struct z3fold_header
> *__z3fold_alloc(struct z3fold_pool *pool,
> }
> }
>
> + if (zhdr && !zhdr->slots)
> + zhdr->slots = alloc_slots(pool,
> + can_sleep ? GFP_NOIO : GFP_ATOMIC);
> return zhdr;
> }
>
> @@ -1270,7 +1248,7 @@ static void z3fold_free(struct z3fold_pool *pool,
> unsigned long handle)
> }
>
> if (!page_claimed)
> - free_handle(handle);
> + free_handle(handle, zhdr);
> if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
> atomic64_dec(&pool->pages_nr);
> return;
> @@ -1429,19 +1407,19 @@ static int z3fold_reclaim_page(struct
> z3fold_pool *pool, unsigned int retries)
> ret = pool->ops->evict(pool, middle_handle);
> if (ret)
> goto next;
> - free_handle(middle_handle);
> + free_handle(middle_handle, zhdr);
> }
> if (first_handle) {
> ret = pool->ops->evict(pool, first_handle);
> if (ret)
> goto next;
> - free_handle(first_handle);
> + free_handle(first_handle, zhdr);
> }
> if (last_handle) {
> ret = pool->ops->evict(pool, last_handle);
> if (ret)
> goto next;
> - free_handle(last_handle);
> + free_handle(last_handle, zhdr);
> }
> next:
> if (test_bit(PAGE_HEADLESS, &page->private)) {
>
> --
>
> Best regards,
> Vitaly