[PATCH 4/5] zram: batch IOs during writeback to improve performance

From: Pankaj Raghav
Date: Mon Sep 11 2023 - 17:38:51 EST


From: Pankaj Raghav <p.raghav@xxxxxxxxxxx>

This crosses off one of the TODOs noted in the writeback_store()
function:

    A single page IO would be inefficient for write...

This reduces the writeback time for 4G of data to an NVMe backing
device from 68 secs to 15 secs (more than a 4x improvement).

The idea is to batch the IOs up to a certain limit before the data is
flushed to the backing device. The batch limit is initially chosen
based on the bdi->io_pages value, with an upper limit of 32 pages (128k
on x86). The limit is further capped by writeback_limit, if set.
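
For reference, the per-flush batch size ends up being computed roughly
as below (a condensed sketch of the writeback_store() changes further
down in this patch; io_pages and bd_wb_limit_pages are the local
variables introduced there):

	/*
	 * Batch order: at most 2^5 = 32 pages, further limited by the
	 * backing device's bdi->io_pages.
	 */
	order = min(MAX_INDEX_ENTRIES_ORDER,
		    ilog2(zram->bdev->bd_disk->bdi->io_pages));

	/* Do not batch past the remaining indices or the writeback_limit. */
	io_pages = min(1UL << order, nr_pages);
	io_pages = min_t(u64, bd_wb_limit_pages, io_pages);

With 4K pages this caps a single flush at 32 pages (128k of data).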

Signed-off-by: Pankaj Raghav <p.raghav@xxxxxxxxxxx>
---
drivers/block/zram/zram_drv.c | 186 +++++++++++++++++++++-------------
1 file changed, 113 insertions(+), 73 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0b8f814e11dd..27313c2d781d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -551,22 +551,6 @@ static ssize_t backing_dev_store(struct device *dev,
return err;
}

-static unsigned long alloc_block_bdev(struct zram *zram)
-{
- unsigned long blk_idx = 1;
-retry:
- /* skip 0 bit to confuse zram.handle = 0 */
- blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
- if (blk_idx == zram->nr_pages)
- return 0;
-
- if (test_and_set_bit(blk_idx, zram->bitmap))
- goto retry;
-
- atomic64_inc(&zram->stats.bd_count);
- return blk_idx;
-}
-
static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
{
int was_set;
@@ -628,6 +612,15 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
#define IDLE_WRITEBACK (1<<1)
#define INCOMPRESSIBLE_WRITEBACK (1<<2)

+#define MAX_INDEX_ENTRIES_ORDER 5
+#define MAX_INDEX_ENTRIES (1U << MAX_INDEX_ENTRIES_ORDER)
+struct index_mapping {
+ /* Cap the number of batched indices at MAX_INDEX_ENTRIES (32) before we flush */
+ unsigned long arr[MAX_INDEX_ENTRIES];
+ unsigned int nr_of_entries;
+};
+
/*
* Returns: true if the index was prepared for further processing
* false if the index can be skipped
@@ -668,39 +661,36 @@ static bool writeback_prep_or_skip_index(struct zram *zram, int mode,
return ret;
}

-static int writeback_flush_to_bdev(struct zram *zram, unsigned long index,
- struct page *page, unsigned long *blk_idx)
+static int writeback_flush_to_bdev(struct zram *zram, struct folio *folio,
+ struct index_mapping *map,
+ unsigned long blk_idx, unsigned int io_pages)
{
struct bio bio;
struct bio_vec bio_vec;
- int ret;
+ int ret = 0;
+
+ if (!map->nr_of_entries)
+ return ret;

bio_init(&bio, zram->bdev, &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = *blk_idx * (PAGE_SIZE >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
+ bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
+
+ if (!bio_add_folio(&bio, folio, io_pages * PAGE_SIZE, 0)) {
+ ret = -EIO;
+ goto cleanup;
+ }

- /*
- * XXX: A single page IO would be inefficient for write
- * but it would be not bad as starter.
- */
ret = submit_bio_wait(&bio);
- if (ret) {
- zram_slot_lock(zram, index);
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
- zram_clear_flag(zram, index, ZRAM_IDLE);
- zram_slot_unlock(zram, index);
- /*
- * BIO errors are not fatal, we continue and simply
- * attempt to writeback the remaining objects (pages).
- * At the same time we need to signal user-space that
- * some writes (at least one, but also could be all of
- * them) were not successful and we do so by returning
- * the most recent BIO error.
- */
- return ret;
- }
+ /*
+ * BIO errors are not fatal, we continue and simply
+ * attempt to writeback the remaining objects (pages).
+ * At the same time we need to signal user-space that
+ * some writes (at least one, but also could be all of
+ * them) were not successful and we do so by returning
+ * the most recent BIO error.
+ */
+ if (ret)
+ goto cleanup;

- atomic64_inc(&zram->stats.bd_writes);
+ atomic64_add(map->nr_of_entries, &zram->stats.bd_writes);
/*
* We released zram_slot_lock so need to check if the slot was
* changed. If there is freeing for the slot, we can catch it
@@ -710,28 +700,40 @@ static int writeback_flush_to_bdev(struct zram *zram, unsigned long index,
* mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB.
* Thus, we could close the race by checking ZRAM_IDLE bit.
*/
- zram_slot_lock(zram, index);
- if (!zram_allocated(zram, index) ||
- !zram_test_flag(zram, index, ZRAM_IDLE)) {
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
- zram_clear_flag(zram, index, ZRAM_IDLE);
- goto skip;
+ for (int iter = 0; iter < map->nr_of_entries; iter++) {
+ zram_slot_lock(zram, map->arr[iter]);
+ if (!zram_allocated(zram, map->arr[iter]) ||
+ !zram_test_flag(zram, map->arr[iter], ZRAM_IDLE)) {
+ zram_clear_flag(zram, map->arr[iter], ZRAM_UNDER_WB);
+ zram_clear_flag(zram, map->arr[iter], ZRAM_IDLE);
+ zram_slot_unlock(zram, map->arr[iter]);
+ free_block_bdev(zram, blk_idx + iter);
+ continue;
+ }
+
+ zram_free_page(zram, map->arr[iter]);
+ zram_clear_flag(zram, map->arr[iter], ZRAM_UNDER_WB);
+ zram_set_flag(zram, map->arr[iter], ZRAM_WB);
+ zram_set_element(zram, map->arr[iter], blk_idx + iter);
+ zram_slot_unlock(zram, map->arr[iter]);
+ atomic64_inc(&zram->stats.pages_stored);
+
+ spin_lock(&zram->wb_limit_lock);
+ if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+ zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+ spin_unlock(&zram->wb_limit_lock);
}
+ return ret;

- zram_free_page(zram, index);
- zram_clear_flag(zram, index, ZRAM_UNDER_WB);
- zram_set_flag(zram, index, ZRAM_WB);
- zram_set_element(zram, index, *blk_idx);
- atomic64_inc(&zram->stats.pages_stored);
- *blk_idx = 0;
-
- spin_lock(&zram->wb_limit_lock);
- if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
- zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
-skip:
- zram_slot_unlock(zram, index);
- return 0;
+cleanup:
+ for (int iter = 0; iter < map->nr_of_entries; iter++) {
+ zram_slot_lock(zram, map->arr[iter]);
+ zram_clear_flag(zram, map->arr[iter], ZRAM_UNDER_WB);
+ zram_clear_flag(zram, map->arr[iter], ZRAM_IDLE);
+ zram_slot_unlock(zram, map->arr[iter]);
+ }
+ free_block_bdev_range(zram, blk_idx, map->nr_of_entries);
+ return ret;
}

static ssize_t writeback_store(struct device *dev,
@@ -741,9 +743,15 @@ static ssize_t writeback_store(struct device *dev,
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
unsigned long index = 0;
struct page *page;
+ struct folio *folio;
ssize_t ret = len;
int mode, err;
unsigned long blk_idx = 0;
+ unsigned int io_pages;
+ u64 bd_wb_limit_pages = ULONG_MAX;
+ struct index_mapping map = {};
+ unsigned int order = min(MAX_INDEX_ENTRIES_ORDER,
+ ilog2(zram->bdev->bd_disk->bdi->io_pages));

if (sysfs_streq(buf, "idle"))
mode = IDLE_WRITEBACK;
@@ -776,32 +784,48 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}

- page = alloc_page(GFP_KERNEL);
- if (!page) {
+ folio = folio_alloc(GFP_KERNEL, order);
+ if (!folio) {
ret = -ENOMEM;
goto release_init_lock;
}

for (; nr_pages != 0; index++, nr_pages--) {
spin_lock(&zram->wb_limit_lock);
- if (zram->wb_limit_enable && !zram->bd_wb_limit) {
- spin_unlock(&zram->wb_limit_lock);
- ret = -EIO;
- break;
+ if (zram->wb_limit_enable) {
+ if (!zram->bd_wb_limit) {
+ spin_unlock(&zram->wb_limit_lock);
+ ret = -EIO;
+ break;
+ }
+ bd_wb_limit_pages = zram->bd_wb_limit >>
+ (PAGE_SHIFT - 12);
}
spin_unlock(&zram->wb_limit_lock);

if (!blk_idx) {
- blk_idx = alloc_block_bdev(zram);
+ io_pages = min(1UL << order, nr_pages);
+ io_pages = min_t(u64, bd_wb_limit_pages, io_pages);
+
+ blk_idx = alloc_block_bdev_range(zram, &io_pages);
if (!blk_idx) {
ret = -ENOSPC;
break;
}
}

- if (!writeback_prep_or_skip_index(zram, mode, index))
- continue;
+ if (!writeback_prep_or_skip_index(zram, mode, index)) {
+ if (nr_pages > 1 || map.nr_of_entries == 0)
+ continue;
+ /*
+ * There are still some unfinished IOs that need to be flushed.
+ */
+ err = writeback_flush_to_bdev(zram, folio, &map,
+ blk_idx, io_pages);
+ goto next;
+ }

+ page = folio_page(folio, map.nr_of_entries);
if (zram_read_page(zram, page, index, NULL)) {
zram_slot_lock(zram, index);
zram_clear_flag(zram, index, ZRAM_UNDER_WB);
@@ -810,15 +834,31 @@ static ssize_t writeback_store(struct device *dev,
continue;
}

- err = writeback_flush_to_bdev(zram, index, page, &blk_idx);
+ map.arr[map.nr_of_entries++] = index;
+ if (map.nr_of_entries < io_pages)
+ continue;

+ err = writeback_flush_to_bdev(zram, folio, &map, blk_idx,
+ io_pages);
+next:
if (err)
ret = err;
+
+ /*
+ * Only reset blk_idx once all blocks in the current batch have
+ * been consumed, so that any unused blocks can still be freed
+ * after the loop over all indices.
+ */
+ if (map.nr_of_entries == io_pages) {
+ blk_idx = 0;
+ map.nr_of_entries = 0;
+ }
}

if (blk_idx)
- free_block_bdev(zram, blk_idx);
- __free_page(page);
+ free_block_bdev_range(zram, blk_idx + map.nr_of_entries,
+ io_pages - map.nr_of_entries);
+ folio_put(folio);
release_init_lock:
up_read(&zram->init_lock);

--
2.40.1