[PATCH v2] erofs: kill hooked chains to avoid loops on deduplicated compressed images

From: Gao Xiang
Date: Mon May 22 2023 - 03:22:54 EST


After heavily stressing EROFS with several images which include a
hand-crafted image of repeated patterns for more than 46 days, I found
two chains could be linked with each other almost simultaneously and
form a loop so that the entire loop won't be submitted. As a
consequence, the corresponding file pages will remain locked forever.

It can be _only_ observed on data-deduplicated compressed images.
For example, consider two chains with five pclusters in total:
Chain 1: 2->3->4->5 -- The tail pcluster is 5;
Chain 2: 5->1->2 -- The tail pcluster is 2.

Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link
to Chain 2 at the same time with pcluster 2.

Since hooked chains are all linked locklessly now, I have no idea how
to simply avoid the race. Instead, let's avoid hooked chains entirely
until a proper way could be worked out to fix this and end users finally
tell us that it's needed to add it back.

Actually, this optimization can be found with multi-threaded workloads
(especially even more often on deduplicated compressed images), yet I'm
not sure about the overall system impacts of not having this compared
with implementation complexity.

Fixes: 267f2492c8f7 ("erofs: introduce multi-reference pclusters (fully-referenced)")
Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxxxxxxxxx>
---
changes since v1:
- previous fix is still racy, so get rid of hooked chains entirely.

fs/erofs/zdata.c | 69 +++++++-----------------------------------------
1 file changed, 10 insertions(+), 59 deletions(-)

diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 45f21db2303a..92f3a01262cf 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -96,8 +96,6 @@ struct z_erofs_pcluster {

/* the chained workgroup has't submitted io (still open) */
#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
-/* the chained workgroup has already submitted io */
-#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)

#define Z_EROFS_PCLUSTER_NIL (NULL)

@@ -501,20 +499,6 @@ int __init z_erofs_init_zip_subsystem(void)

enum z_erofs_pclustermode {
Z_EROFS_PCLUSTER_INFLIGHT,
- /*
- * The current pclusters was the tail of an exist chain, in addition
- * that the previous processed chained pclusters are all decided to
- * be hooked up to it.
- * A new chain will be created for the remaining pclusters which are
- * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
- * the next pcluster cannot reuse the whole page safely for inplace I/O
- * in the following scenario:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (belongs to the next pcl) | (belongs to the current pcl) |
- * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
- */
- Z_EROFS_PCLUSTER_HOOKED,
/*
* a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
* could be dispatched into bypass queue later due to uptodated managed
@@ -532,8 +516,8 @@ enum z_erofs_pclustermode {
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
- * | PCLUSTER_FOLLOWED or | |
- * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
+ * | | |
+ * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
*
* [ (*) the above page can be used as inplace I/O. ]
*/
@@ -546,7 +530,7 @@ struct z_erofs_decompress_frontend {
struct z_erofs_bvec_iter biter;

struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl, *tailpcl;
+ struct z_erofs_pcluster *pcl;
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;

@@ -752,19 +736,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
return;
}

- /*
- * type 2, link to the end of an existing open chain, be careful
- * that its submission is controlled by the original attached chain.
- */
- if (*owned_head != &pcl->next && pcl != f->tailpcl &&
- cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
- *owned_head = Z_EROFS_PCLUSTER_TAIL;
- f->mode = Z_EROFS_PCLUSTER_HOOKED;
- f->tailpcl = NULL;
- return;
- }
- /* type 3, it belongs to a chain, but it isn't the end of the chain */
+ /* type 2, it belongs to an ongoing chain */
f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}

@@ -825,9 +797,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
goto err_out;
}
}
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = pcl;
fe->owned_head = &pcl->next;
fe->pcl = pcl;
return 0;
@@ -848,7 +817,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)

/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

if (!(map->m_flags & EROFS_MAP_META)) {
grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -867,10 +835,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)

if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = fe->pcl;
-
z_erofs_try_to_claim_pcluster(fe);
} else if (ret) {
return ret;
@@ -1027,8 +991,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
- fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);

cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1406,10 +1369,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
};
z_erofs_next_pcluster_t owned = io->head;

- while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
- /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
+ while (owned != Z_EROFS_PCLUSTER_TAIL) {
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1426,7 +1386,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
container_of(work, struct z_erofs_decompressqueue, u.work);
struct page *pagepool = NULL;

- DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
z_erofs_decompress_queue(bgq, &pagepool);
erofs_release_pages(&pagepool);
kvfree(bgq);
@@ -1614,7 +1574,7 @@ static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
q->sync = true;
}
q->sb = sb;
- q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ q->head = Z_EROFS_PCLUSTER_TAIL;
return q;
}

@@ -1632,11 +1592,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (owned_head == Z_EROFS_PCLUSTER_TAIL)
- owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
-
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);

WRITE_ONCE(*submit_qtail, owned_head);
WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1707,15 +1663,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
unsigned int i = 0;
bool bypass = true;

- /* no possible 'owned_head' equals the following */
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
-
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+ owned_head = READ_ONCE(pcl->next);

- /* close the main owned chain at first */
- owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (z_erofs_is_inline_pcluster(pcl)) {
move_to_bypass_jobqueue(pcl, qtail, owned_head);
continue;
--
2.24.4