Re: Hard and soft lockups with FIO and LTP runs on a large system

From: zhaoyang.huang
Date: Thu Jul 25 2024 - 06:29:54 EST


>However during the weekend mglru-enabled run (with above fix to
>isolate_lru_folios() and also the previous two patches: truncate.patch
>and mglru.patch and the inode fix provided by Mateusz), another hard
>lockup related to lruvec spinlock was observed.
>
>Here is the hardlock up:
>
>watchdog: Watchdog detected hard LOCKUP on cpu 466
>CPU: 466 PID: 3103929 Comm: fio Not tainted
>6.10.0-rc3-trnct_nvme_lruvecresched_sirq_inode_mglru #32
>RIP: 0010:native_queued_spin_lock_slowpath+0x2b4/0x300
>Call Trace:
> <NMI>
> ? show_regs+0x69/0x80
> ? watchdog_hardlockup_check+0x1b4/0x3a0
><SNIP>
> ? native_queued_spin_lock_slowpath+0x2b4/0x300
> </NMI>
> <IRQ>
> _raw_spin_lock_irqsave+0x5b/0x70
> folio_lruvec_lock_irqsave+0x62/0x90
> folio_batch_move_lru+0x9d/0x160
> folio_rotate_reclaimable+0xab/0xf0
> folio_end_writeback+0x60/0x90
> end_buffer_async_write+0xaa/0xe0
> end_bio_bh_io_sync+0x2c/0x50
> bio_endio+0x108/0x180
> blk_mq_end_request_batch+0x11f/0x5e0
> nvme_pci_complete_batch+0xb5/0xd0 [nvme]
> nvme_irq+0x92/0xe0 [nvme]
> __handle_irq_event_percpu+0x6e/0x1e0
> handle_irq_event+0x39/0x80
> handle_edge_irq+0x8c/0x240
> __common_interrupt+0x4e/0xf0
> common_interrupt+0x49/0xc0
> asm_common_interrupt+0x27/0x40
>
>Here is the lock holder details captured by all-cpu-backtrace:
>
>NMI backtrace for cpu 75
>CPU: 75 PID: 3095650 Comm: fio Not tainted
>6.10.0-rc3-trnct_nvme_lruvecresched_sirq_inode_mglru #32
>RIP: 0010:folio_inc_gen+0x142/0x430
>Call Trace:
> <NMI>
> ? show_regs+0x69/0x80
> ? nmi_cpu_backtrace+0xc5/0x130
> ? nmi_cpu_backtrace_handler+0x11/0x20
> ? nmi_handle+0x64/0x180
> ? default_do_nmi+0x45/0x130
> ? exc_nmi+0x128/0x1a0
> ? end_repeat_nmi+0xf/0x53
> ? folio_inc_gen+0x142/0x430
> ? folio_inc_gen+0x142/0x430
> ? folio_inc_gen+0x142/0x430
> </NMI>
> <TASK>
> isolate_folios+0x954/0x1630
> evict_folios+0xa5/0x8c0
> try_to_shrink_lruvec+0x1be/0x320
> shrink_one+0x10f/0x1d0
> shrink_node+0xa4c/0xc90
> do_try_to_free_pages+0xc0/0x590
> try_to_free_pages+0xde/0x210
> __alloc_pages_noprof+0x6ae/0x12c0
> alloc_pages_mpol_noprof+0xd9/0x220
> folio_alloc_noprof+0x63/0xe0
> filemap_alloc_folio_noprof+0xf4/0x100
> page_cache_ra_unbounded+0xb9/0x1a0
> page_cache_ra_order+0x26e/0x310
> ondemand_readahead+0x1a3/0x360
> page_cache_sync_ra+0x83/0x90
> filemap_get_pages+0xf0/0x6a0
> filemap_read+0xe7/0x3d0
> blkdev_read_iter+0x6f/0x140
> vfs_read+0x25b/0x340
> ksys_read+0x67/0xf0
> __x64_sys_read+0x19/0x20
> x64_sys_call+0x1771/0x20d0
> do_syscall_64+0x7e/0x130

From the call stack of the lock holder, this looks like a scalability issue rather than a deadlock. Unlike legacy LRU management, there is so far no throttling mechanism for global reclaim under MGLRU. Could we apply a similar method to throttle reclaim when it is too aggressive? I am wondering whether this patch, which is only a rough version, could help with this.

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34de9cd0d4..827036e21f24 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4520,6 +4520,50 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}

+static void lru_gen_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+ struct lruvec *target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ if (current_is_kswapd()) {
+ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+ /* Allow kswapd to start writing pages during reclaim.*/
+ if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+ if (sc->nr.immediate)
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
+
+ /*
+ * Tag a node/memcg as congested if all the dirty pages were marked
+ * for writeback and immediate reclaim (counted in nr.congested).
+ *
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling in reclaim_throttle().
+ */
+ if (sc->nr.dirty && (sc->nr.dirty / 2 < sc->nr.congested)) {
+ if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
+ set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
+
+ if (current_is_kswapd())
+ set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
+ }
+
+ /*
+ * Stall direct reclaim for IO completions if the lruvec or
+ * node is congested. Allow kswapd to continue until it
+ * starts encountering unqueued dirty pages or cycling through
+ * the LRU too quickly.
+ */
+ if (!current_is_kswapd() && current_may_throttle() &&
+ !sc->hibernation_mode &&
+ (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
+ test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
+}
+
static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
@@ -4552,6 +4596,16 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
sc->nr_reclaimed += reclaimed;
+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+ sc->nr.writeback += stat.nr_writeback;
+ sc->nr.immediate += stat.nr_immediate;
+ sc->nr.taken += scanned;
+
+ if (type)
+ sc->nr.file_taken += scanned;
+
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
scanned, reclaimed, &stat, sc->priority,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
@@ -5908,6 +5962,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)

if (lru_gen_enabled() && root_reclaim(sc)) {
lru_gen_shrink_node(pgdat, sc);
+ lru_gen_throttle(pgdat, sc);
return;
}

--
2.25.1