[PATCH v3] blk-cgroup: Replace u64_stats_sync with spinlock for iostat updates

From: Boy Wu
Date: Tue Jul 16 2024 - 03:53:04 EST

From: Boy Wu <boy.wu@xxxxxxxxxxxx>

On 32-bit SMP systems, multiple CPUs calling blkcg_print_stat() can run
blkcg_fill_root_iostats() concurrently. The u64_stats writer side must
not run concurrently on 32-bit, so the unserialized
u64_stats_update_begin()/_end() calls can corrupt the sequence counter,
which in turn makes u64_stats_fetch_begin() in blkcg_print_one_stat()
spin forever, i.e. deadlock.

Thus, replace the u64_stats_sync synchronization with a spinlock to
protect the iostat updates.

Fixes: ef45fe470e1e ("blk-cgroup: show global disk stats in root cgroup io.stat")
Signed-off-by: Boy Wu <boy.wu@xxxxxxxxxxxx>
---
Changes in v2:
- Update commit message
- Remove u64_sync
- Replace spin_lock_irq with guard statement
- Replace blkg->q->queue_lock with blkg_stat_lock
Changes in v3:
- Update commit message
- Add spinlock in blkg_iostat_set structure
- Replace all u64_sync with spinlock for iostat
- Replace blkg_stat_lock with iostat.spinlock
block/blk-cgroup.c | 62 +++++++++++++++++++---------------------------
block/blk-cgroup.h | 1 +
2 files changed, 26 insertions(+), 37 deletions(-)
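
The conversion uses the scope-based lock guards from <linux/cleanup.h>.
As a rough equivalence for review, assuming nothing in the function
needs the lock dropped early:

    guard(spinlock_irqsave)(&blkg->iostat.spinlock);
    /* stays locked until the enclosing scope (here, the function) ends */

behaves like the open-coded form:

    unsigned long flags;

    spin_lock_irqsave(&blkg->iostat.spinlock, flags);
    ...
    spin_unlock_irqrestore(&blkg->iostat.spinlock, flags);

and scoped_guard(spinlock_irqsave, &bis->spinlock) { ... } limits the
locked region to the braced block, which is why the new code has no
explicit unlock and no flags variable.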

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 37e6cc91d576..4b66f37c45a0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -329,7 +329,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

- u64_stats_init(&blkg->iostat.sync);
+ spin_lock_init(&blkg->iostat.spinlock);
for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
@@ -995,15 +995,13 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
struct blkg_iostat *last)
{
struct blkg_iostat delta;
- unsigned long flags;

/* propagate percpu delta to global */
- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ guard(spinlock_irqsave)(&blkg->iostat.spinlock);
blkg_iostat_set(&delta, cur);
blkg_iostat_sub(&delta, last);
blkg_iostat_add(&blkg->iostat.cur, &delta);
blkg_iostat_add(last, &delta);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}

static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
@@ -1034,7 +1032,6 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
struct blkcg_gq *blkg = bisc->blkg;
struct blkcg_gq *parent = blkg->parent;
struct blkg_iostat cur;
- unsigned int seq;

/*
* Order assignment of `next_bisc` from `bisc->lnode.next` in
@@ -1051,10 +1048,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
goto propagate_up; /* propagate up to parent only */

/* fetch the current per-cpu values */
- do {
- seq = u64_stats_fetch_begin(&bisc->sync);
+ scoped_guard(spinlock_irqsave, &bisc->spinlock)
blkg_iostat_set(&cur, &bisc->cur);
- } while (u64_stats_fetch_retry(&bisc->sync, seq));

blkcg_iostat_update(blkg, &cur, &bisc->last);

@@ -1112,7 +1107,6 @@ static void blkcg_fill_root_iostats(void)
struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
struct blkg_iostat tmp;
int cpu;
- unsigned long flags;

memset(&tmp, 0, sizeof(tmp));
for_each_possible_cpu(cpu) {
@@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
cpu_dkstats->sectors[STAT_DISCARD] << 9;
}

- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ guard(spinlock_irqsave)(&blkg->iostat.spinlock);
blkg_iostat_set(&blkg->iostat.cur, &tmp);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
}

@@ -1145,7 +1138,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
struct blkg_iostat_set *bis = &blkg->iostat;
u64 rbytes, wbytes, rios, wios, dbytes, dios;
const char *dname;
- unsigned seq;
int i;

if (!blkg->online)
@@ -1157,16 +1149,14 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)

seq_printf(s, "%s ", dname);

- do {
- seq = u64_stats_fetch_begin(&bis->sync);
-
+ scoped_guard(spinlock_irqsave, &bis->spinlock) {
rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
rios = bis->cur.ios[BLKG_IOSTAT_READ];
wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
- } while (u64_stats_fetch_retry(&bis->sync, seq));
+ }

if (rbytes || wbytes || rios || wios) {
seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
@@ -2141,7 +2131,6 @@ void blk_cgroup_bio_start(struct bio *bio)
struct blkcg *blkcg = bio->bi_blkg->blkcg;
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
- unsigned long flags;

if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
return;
@@ -2152,30 +2141,29 @@ void blk_cgroup_bio_start(struct bio *bio)

cpu = get_cpu();
bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
- flags = u64_stats_update_begin_irqsave(&bis->sync);
-
- /*
- * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
- * bio and we would have already accounted for the size of the bio.
- */
- if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
- bio_set_flag(bio, BIO_CGROUP_ACCT);
- bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
- }
- bis->cur.ios[rwd]++;
+ scoped_guard(spinlock_irqsave, &bis->spinlock) {
+ /*
+ * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
+ * bio and we would have already accounted for the size of the bio.
+ */
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+ }
+ bis->cur.ios[rwd]++;

- /*
- * If the iostat_cpu isn't in a lockless list, put it into the
- * list to indicate that a stat update is pending.
- */
- if (!READ_ONCE(bis->lqueued)) {
- struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+ /*
+ * If the iostat_cpu isn't in a lockless list, put it into the
+ * list to indicate that a stat update is pending.
+ */
+ if (!READ_ONCE(bis->lqueued)) {
+ struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);

- llist_add(&bis->lnode, lhead);
- WRITE_ONCE(bis->lqueued, true);
+ llist_add(&bis->lnode, lhead);
+ WRITE_ONCE(bis->lqueued, true);
+ }
}

- u64_stats_update_end_irqrestore(&bis->sync, flags);
cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index bd472a30bc61..b9544969a131 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -44,6 +44,7 @@ struct blkg_iostat {
};

struct blkg_iostat_set {
+ spinlock_t spinlock;
struct u64_stats_sync sync;
struct blkcg_gq *blkg;
struct llist_node lnode;
--
2.18.0