Re: [PATCH v4] blk-cgroup: Replace u64 sync with spinlock for iostat

From: Boy Wu (吳勃誼)
Date: Thu Jul 25 2024 - 23:43:49 EST


On Fri, 2024-07-19 at 07:31 -1000, tj@xxxxxxxxxx wrote:
>
> External email : Please do not click links or open attachments until
> you have verified the sender or the content.
> Hello, Boy.
>
> On Fri, Jul 19, 2024 at 01:47:35AM +0000, Boy Wu (吳勃誼) wrote:
> ...
> > If it is for readability, I think keeping using u64 sync instead of
> > replacing it with spinlock is better, because what u64 sync
> protects is
> > 64-bit data for 32-bit systems, while spinlock can be used for many
> > other reasons. The root cause of this issue is just the incorrect
> use
>
> Yeah, but it can't be used when there are multiple updaters.
>
> > of u64 sync. Adding back the missing spinlock for the correct usage
> of
> > u64 sync is simpler. Is there any benefit to replacing u64 sync
> with
> > spinlock?
>
> It doesn't make sense to protect u64_sync with a spinlock. Both are
> only
> needed on 32bit. What's the point of having both? Also, note that
> iostat_cpu
> is also updated from two paths - bio issue and stat reset. If you
> want to
> keep that u64_sync, the only way to avoid possible deadlocks is
> adding
> spinlock in the bio issue path too, which will be pretty expensive.
>

The use of a spinlock with u64 sync is suggested in
include/linux/u64_stats_sync.h:33.

* Usage :
*
* Stats producer (writer) should use following template granted it already got
* an exclusive access to counters (a lock is already taken, or per cpu
* data is used [in a non preemptable context])
*
* spin_lock_bh(...) or other synchronization to get exclusive access
* ...
* u64_stats_update_begin(&stats->syncp);
* u64_stats_add(&stats->bytes64, len); // non atomic operation
* u64_stats_inc(&stats->packets64); // non atomic operation
* u64_stats_update_end(&stats->syncp);

Is this an incorrect statement?

> > > Also, for blkg_clear_stat(), we're running a slight chance of
> > > clearing of
> > > iostat_cpu racing against state updates from the hot path. Given
> the
> > > circumstances - stat reset is an cgroup1-only feature which isn't
> > > used
> > > widely and a problematic interface to begin with, I believe this
> is a
> > > valid
> > > choice but it needs to be noted.
> >
> > I don't get this part, but if this is another issue, maybe another
> > patch would be better?
>
> It's the same issue. Reset is another writer and whenever you have
> more than
> one writers w/ u64_sync, there's a chance of deadlocks. So,
> iostat_cpu also
> has two writers - bio issue and stat reset. If you want to keep using
> u64_sync in both places, the only way to do it is adding spinlock
> protection
> to both paths, which is an expensive thing to do for the bio issue
> path. So,
> here, we'd rather just give up and let stat resetting be racy on
> 32bit.
>
> Thanks.
>
> --
> tejun

There are three places that update iostat and two places that update
iostat_cpu in blk-cgroup.c in version 6.10.1.

I assume the suggestion in u64_stats_sync.h is correct. For
readability, how about the following change?

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 85b3b9051455..7472cfa9a506 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -62,6 +62,10 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
 
+#if BITS_PER_LONG == 32
+static DEFINE_RAW_SPINLOCK(blkg_stat_cpu_lock);
+#endif
+
 #define BLKG_DESTROY_BATCH_SIZE 64
 
 /*
@@ -535,5 +539,55 @@ static void blkg_destroy_all(struct gendisk *disk)
 	spin_unlock_irq(&q->queue_lock);
 }
 
+static inline unsigned long blkcg_iostats_update_begin_irqsave(struct u64_stats_sync *syncp)
+{
+	unsigned long flags;
+
+#if BITS_PER_LONG == 64
+	flags = u64_stats_update_begin_irqsave(syncp);
+#else /* 64 bit */
+	raw_spin_lock_irqsave(&blkg_stat_lock, flags);
+	u64_stats_update_begin(syncp);
+#endif
+
+	return flags;
+}
+
+static inline void blkcg_iostats_update_end_irqrestore(struct u64_stats_sync *syncp,
+						       unsigned long flags)
+{
+#if BITS_PER_LONG == 64
+	u64_stats_update_end_irqrestore(syncp, flags);
+#else /* 64 bit */
+	u64_stats_update_end(syncp);
+	raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
+#endif
+}
+
+static unsigned long blkcg_iostats_cpu_update_begin_irqsave(struct u64_stats_sync *syncp)
+{
+	unsigned long flags;
+
+#if BITS_PER_LONG == 64
+	flags = u64_stats_update_begin_irqsave(syncp);
+#else /* 64 bit */
+	raw_spin_lock_irqsave(&blkg_stat_cpu_lock, flags);
+	u64_stats_update_begin(syncp);
+#endif
+
+	return flags;
+}
+
+static inline void blkcg_iostats_cpu_update_end_irqrestore(struct u64_stats_sync *syncp,
+							   unsigned long flags)
+{
+#if BITS_PER_LONG == 64
+	u64_stats_update_end_irqrestore(syncp, flags);
+#else /* 64 bit */
+	u64_stats_update_end(syncp);
+	raw_spin_unlock_irqrestore(&blkg_stat_cpu_lock, flags);
+#endif
+}
+
 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
 {
 	int i;
@@ -632,24 +686,26 @@ static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
 static void __blkg_clear_stat(struct blkg_iostat_set *bis)
 {
 	struct blkg_iostat cur = {0};
-	unsigned long flags;
 
-	flags = u64_stats_update_begin_irqsave(&bis->sync);
 	blkg_iostat_set(&bis->cur, &cur);
 	blkg_iostat_set(&bis->last, &cur);
-	u64_stats_update_end_irqrestore(&bis->sync, flags);
 }
 
 static void blkg_clear_stat(struct blkcg_gq *blkg)
 {
 	int cpu;
+	unsigned long flags;
 
 	for_each_possible_cpu(cpu) {
 		struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);
 
+		flags = blkcg_iostats_cpu_update_begin_irqsave(&s->sync);
 		__blkg_clear_stat(s);
+		blkcg_iostats_cpu_update_end_irqrestore(&s->sync, flags);
 	}
+	flags = blkcg_iostats_update_begin_irqsave(&blkg->iostat.sync);
 	__blkg_clear_stat(&blkg->iostat);
+	blkcg_iostats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
@@ -1134,9 +1190,9 @@ static void blkcg_fill_root_iostats(void)
 				cpu_dkstats->sectors[STAT_DISCARD] << 9;
 		}
 
-		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+		flags = blkcg_iostats_update_begin_irqsave(&blkg->iostat.sync);
 		blkg_iostat_set(&blkg->iostat.cur, &tmp);
-		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+		blkcg_iostats_update_end_irqrestore(&blkg->iostat.sync, flags);
 	}
 }
 
@@ -2152,7 +2208,7 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 	cpu = get_cpu();
 	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
-	flags = u64_stats_update_begin_irqsave(&bis->sync);
+	flags = blkcg_iostats_cpu_update_begin_irqsave(&bis->sync);
 
 	/*
 	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
@@ -2175,7 +2231,7 @@ void blk_cgroup_bio_start(struct bio *bio)
 		WRITE_ONCE(bis->lqueued, true);
 	}
 
-	u64_stats_update_end_irqrestore(&bis->sync, flags);
+	blkcg_iostats_cpu_update_end_irqrestore(&bis->sync, flags);
 	cgroup_rstat_updated(blkcg->css.cgroup, cpu);
 	put_cpu();
 }

--
boy.wu