[PATCH] erofs: handle overlapped pclusters out of crafted images properly
From: Gao Xiang
Date: Wed Sep 04 2024 - 07:17:22 EST
syzbot reported a task hang issue due to a deadlock case where it is
waiting for the folio lock of a cached folio that will be used for
cache I/Os.
After looking into the crafted fuzzed image, I found it's formed with
several overlapped big pclusters as below:
Ext: logical offset | length : physical offset | length
0: 0.. 16384 | 16384 : 151552.. 167936 | 16384
1: 16384.. 32768 | 16384 : 155648.. 172032 | 16384
2: 32768.. 49152 | 16384 : 537223168.. 537239552 | 16384
...
Here, extent 0/1 are physically overlapped although it's entirely
_impossible_ for normal filesystem images generated by mkfs.
First, managed folios containing compressed data will be marked as
up-to-date and then unlocked immediately (unlike in-place folios) when
compressed I/Os are complete. If physical blocks are not submitted in
the incremental order, there should be separate BIOs to avoid dependency
issues. However, the current code mis-arranges z_erofs_fill_bio_vec()
and BIO submission which causes unexpected BIO waits.
Second, managed folios will be connected to their own pclusters for
efficient inter-queries. However, this is somewhat hard to implement
easily if overlapped big pclusters exist. Again, these only appear in
fuzzed images so let's simply fall back to temporary short-lived pages
for correctness.
Additionally, it justifies that referenced managed folios cannot be
truncated for now and reverts part of commit 2080ca1ed3e4 ("erofs: tidy
up `struct z_erofs_bvec`") for simplicity although it shouldn't be any
difference.
Reported-by: syzbot+4fc98ed414ae63d1ada2@xxxxxxxxxxxxxxxxxxxxxxxxx
Closes: https://lore.kernel.org/r/0000000000002fda01061e334873@xxxxxxxxxx
Fixes: 8e6c8fa9f2e9 ("erofs: enable big pcluster feature")
Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxxxxxxxxx>
---
fs/erofs/zdata.c | 56 +++++++++++++++++++++++++-----------------------
1 file changed, 29 insertions(+), 27 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 424f656cd765..382e2e4e60e5 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1428,6 +1428,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
struct z_erofs_bvec zbv;
struct address_space *mapping;
struct folio *folio;
+ struct page *page;
int bs = i_blocksize(f->inode);
/* Except for inplace folios, the entire folio can be used for I/Os */
@@ -1450,7 +1451,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
* file-backed folios will be used instead.
*/
if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
- folio->private = 0;
tocache = true;
goto out_tocache;
}
@@ -1468,7 +1468,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
}
folio_lock(folio);
- if (folio->mapping == mc) {
+ if (likely(folio->mapping == mc)) {
/*
* The cached folio is still in managed cache but without
* a valid `->private` pcluster hint. Let's reconnect them.
@@ -1478,41 +1478,44 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
/* compressed_bvecs[] already takes a ref before */
folio_put(folio);
}
-
- /* no need to submit if it is already up-to-date */
- if (folio_test_uptodate(folio)) {
- folio_unlock(folio);
- bvec->bv_page = NULL;
+ if (likely(folio->private == pcl)) {
+ /* don't submit cache I/Os again if already uptodate */
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ bvec->bv_page = NULL;
+ }
+ return;
}
- return;
+ /*
+ * Already linked with another pcluster, which only appears in
+ * crafted images by fuzzers for now. But handle this anyway.
+ */
+ tocache = false; /* use temporary short-lived pages */
+ } else {
+ DBG_BUGON(1); /* referenced managed folios can't be truncated */
+ tocache = true;
}
-
- /*
- * It has been truncated, so it's unsafe to reuse this one. Let's
- * allocate a new page for compressed data.
- */
- DBG_BUGON(folio->mapping);
- tocache = true;
folio_unlock(folio);
folio_put(folio);
out_allocfolio:
- zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
+ page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
spin_lock(&pcl->obj.lockref.lock);
- if (pcl->compressed_bvecs[nr].page) {
- erofs_pagepool_add(&f->pagepool, zbv.page);
+ if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+ erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
- bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page;
- folio = page_folio(zbv.page);
- /* first mark it as a temporary shortlived folio (now 1 ref) */
- folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+ bvec->bv_page = pcl->compressed_bvecs[nr].page = page;
+ folio = page_folio(page);
spin_unlock(&pcl->obj.lockref.lock);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp))
+ filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+ /* turn into a temporary shortlived folio (1 ref) */
+ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
+ }
folio_attach_private(folio, pcl);
/* drop a refcount added by allocpage (then 2 refs in total here) */
folio_put(folio);
@@ -1647,10 +1650,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
cur = mdev.m_pa;
end = cur + pcl->pclustersize;
do {
- z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
- if (!bvec.bv_page)
- continue;
-
if (bio && (cur != last_pa ||
bio->bi_bdev != mdev.m_bdev)) {
io_retry:
@@ -1665,6 +1664,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
}
bio = NULL;
}
+ z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+ if (!bvec.bv_page)
+ continue;
if (unlikely(PageWorkingset(bvec.bv_page)) &&
!memstall) {
--
2.43.5